Package org.cyberneko.html.parsers

Examples of org.cyberneko.html.parsers.SAXParser


    }
    return new ParseImpl(newLink.getId(),linkExtractor.getLinks());
  }

  private SAXParser getParser(XMLDocumentFilter[] filters) {
    SAXParser parser = new SAXParser();
    try {
      parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
      parser.setFeature(
        "http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
        false);
      parser.setFeature(
        "http://cyberneko.org/html/features/balance-tags/document-fragment",
        true);
      parser.setFeature(
        "http://cyberneko.org/html/features/report-errors",
        false);
    } catch (SAXNotRecognizedException ex) {
      throw new IllegalStateException(ex);
    } catch (SAXNotSupportedException ex) {
View Full Code Here


    /**
     * Creates a parser from a character stream by wrapping it in an
     * {@link InputSource} and delegating to the {@code InputSource} constructor.
     *
     * @param reader the character stream handed to the {@link InputSource}
     * @throws IOException propagated from the delegated constructor
     * @throws SAXException propagated from the delegated constructor
     */
    public Parser(Reader reader) throws IOException, SAXException {
      // All parsing work lives in the InputSource-based constructor.
      this(new InputSource(reader));
    }
   
    public Parser(InputSource source) throws IOException, SAXException {
      final SAXParser parser = new SAXParser();
      parser.setFeature("http://xml.org/sax/features/namespaces", true);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
      parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
      parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");

      final StringBuilder title = new StringBuilder(), body = new StringBuilder();
      final DefaultHandler handler = new DefaultHandler() {
        private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0;

        @Override
        public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
          if (inHEAD > 0) {
            if ("title".equals(localName)) {
              inTITLE++;
            } else {
              if ("meta".equals(localName)) {
                String name = atts.getValue("name");
                if (name == null) {
                  name = atts.getValue("http-equiv");
                }
                final String val = atts.getValue("content");
                if (name != null && val != null) {
                  metaTags.setProperty(name.toLowerCase(Locale.ROOT), val);
                }
              }
            }
          } else if (inBODY > 0) {
            if (SUPPRESS_ELEMENTS.contains(localName)) {
              suppressed++;
            } else if ("img".equals(localName)) {
              // the original javacc-based parser preserved <IMG alt="..."/>
              // attribute as body text in [] parenthesis:
              final String alt = atts.getValue("alt");
              if (alt != null) {
                body.append('[').append(alt).append(']');
              }
            }
          } else if ("body".equals(localName)) {
            inBODY++;
          } else if ("head".equals(localName)) {
            inHEAD++;
          } else if ("frameset".equals(localName)) {
            throw new SAXException("This parser does not support HTML framesets.");
          }
        }

        @Override
        public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
          if (inBODY > 0) {
            if ("body".equals(localName)) {
              inBODY--;
            } else if (ENDLINE_ELEMENTS.contains(localName)) {
              body.append('\n');
            } else if (SUPPRESS_ELEMENTS.contains(localName)) {
              suppressed--;
            }
          } else if (inHEAD > 0) {
            if ("head".equals(localName)) {
              inHEAD--;
            } else if (inTITLE > 0 && "title".equals(localName)) {
              inTITLE--;
            }
          }
        }
       
        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
          if (inBODY > 0 && suppressed == 0) {
            body.append(ch, start, length);
          } else if (inTITLE > 0) {
            title.append(ch, start, length);
          }
        }

        @Override
        public InputSource resolveEntity(String publicId, String systemId) {
          // disable network access caused by DTDs
          return new InputSource(new StringReader(""));
        }
      };
     
      parser.setContentHandler(handler);
      parser.setErrorHandler(handler);
      parser.parse(source);
     
      // the javacc-based parser trimmed title (which should be done for HTML in all cases):
      this.title = title.toString().trim();
     
      // assign body text
View Full Code Here

        }
    }

    private static Document parsePage(HttpMethodBase page) throws IOException {
        // configure a non-validating parser that ignores DTD
        SAXParser sp = new SAXParser();
//                sp.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd",false);

        try {
            return new SAXReader(sp).read(
                    new InputStreamReader(page.getResponseBodyAsStream(), page.getResponseCharSet()));
View Full Code Here

  public void parse(InputSource is, ContentHandler consumer)
      throws IOException, SAXException {
    if (is == null)
      throw new NullPointerException("is argument is required.");

    SAXParser parser = new SAXParser();
    parser.setFeature("http://xml.org/sax/features/namespaces", true);
    parser
        .setFeature(
            "http://cyberneko.org/html/features/override-namespaces",
            false);
    parser.setFeature(
        "http://cyberneko.org/html/features/insert-namespaces", false);
    parser
        .setFeature(
            "http://cyberneko.org/html/features/scanner/ignore-specified-charset",
            true);
    parser.setProperty(
        "http://cyberneko.org/html/properties/default-encoding",
        "UTF-8");
    parser.setProperty("http://cyberneko.org/html/properties/names/elems",
        "lower");
    parser.setProperty("http://cyberneko.org/html/properties/names/attrs",
        "lower");

    parser.setContentHandler(new RemoveNamespacesHandler(
        new MergeCharacterEventsHandler(consumer)));
    parser.parse(is);
  }
View Full Code Here

            throw new ResourceException(String.format("Unsupported ContentType %s for DirectoryListing", contentType));
        }
        String contentEncoding = UriResource.extractCharacterEncoding(contentType, "utf-8");
        final Reader htmlText = new InputStreamReader(content, contentEncoding);
        final InputSource inputSource = new InputSource(htmlText);
        final SAXParser htmlParser = new SAXParser();
        final AnchorListerHandler anchorListerHandler = new AnchorListerHandler();
        htmlParser.setContentHandler(anchorListerHandler);
        htmlParser.parse(inputSource);

        List<String> hrefs = anchorListerHandler.getHrefs();
        List<URI> uris = resolveURIs(baseURI, hrefs);
        return filterNonDirectChilds(baseURI, uris);
    }
View Full Code Here

        super(ruby, rubyClass);
    }
   
    @Override
    protected AbstractSAXParser createParser() throws SAXException {
        SAXParser parser = new SAXParser();

        try{
            parser.setProperty(
                "http://cyberneko.org/html/properties/names/elems", "lower");
            parser.setProperty(
                "http://cyberneko.org/html/properties/names/attrs", "lower");
            return parser;
        } catch(SAXException ex) {
            throw new SAXException(
                "Problem while creating HTML SAX Parser: " + ex.toString());
View Full Code Here

TOP

Related Classes of org.cyberneko.html.parsers.SAXParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.