Package org.cyberneko.html.parsers

Examples of org.cyberneko.html.parsers.SAXParser


public class NekoHTMLFilter extends LogDelegator implements Filter {

  @Override
  public String filter(String original) {
    try {
      SAXParser parser = new SAXParser();
      HTMLHandler contentHandler = new HTMLHandler((int)((float)original.length() * 0.66f));
      parser.setContentHandler(contentHandler);
      parser.parse(new InputSource(new StringReader(original)));
      return contentHandler.toString();
    } catch (SAXException e) {
      logError("", e);
      return null;
    } catch (IOException e) {
View Full Code Here


    }
  }

  public String filter(InputStream in) {
    try {
      SAXParser parser = new SAXParser();
      HTMLHandler contentHandler = new HTMLHandler((int)(1000 * 0.66f));
      parser.setContentHandler(contentHandler);
      parser.parse(new InputSource(in));
      return contentHandler.toString();
    } catch (SAXException e) {
      logError("", e);
      return null;
    } catch (IOException e) {
View Full Code Here

        }
    }

    private static Document parsePage(HttpMethodBase page) throws IOException {
        // configure a non-validating parser that ignores DTD
        SAXParser sp = new SAXParser();
//                sp.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd",false);

        try {
            return new SAXReader(sp).read(
                    new InputStreamReader(page.getResponseBodyAsStream(), page.getResponseCharSet()));
View Full Code Here

public class HtmlParser implements Parser {

    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata) throws IOException, SAXException, TikaException {

        final SAXParser parser = new SAXParser();

        final InputSource source;

        Reader utf8Reader;
       
        try {
            utf8Reader = org.apache.tika.utils.Utils.getUTF8Reader(
                    stream, metadata);
        } catch (TikaException ex) {
            utf8Reader = null;
        }

        if (utf8Reader == null) {
            source = new InputSource(stream);
        } else {
            source = new InputSource(utf8Reader);
        }

       
        parser.setContentHandler(new TitleExtractingContentHandler(handler,
                metadata));
        parser.parse(source);
    }
View Full Code Here

    /**
     * Creates a parser for the given character stream by wrapping it in an
     * {@link InputSource} and delegating to {@link #Parser(InputSource)}.
     *
     * @param reader the character stream handed to the {@code InputSource}
     * @throws IOException  declared by the delegated constructor
     * @throws SAXException declared by the delegated constructor
     */
    public Parser(Reader reader) throws IOException, SAXException {
      this(new InputSource(reader));
    }
   
    public Parser(InputSource source) throws IOException, SAXException {
      final SAXParser parser = new SAXParser();
      parser.setFeature("http://xml.org/sax/features/namespaces", true);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
      parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
      parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");

      final StringBuilder title = new StringBuilder(), body = new StringBuilder();
      final DefaultHandler handler = new DefaultHandler() {
        private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0;

        @Override
        public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
          if (inHEAD > 0) {
            if ("title".equals(localName)) {
              inTITLE++;
            } else {
              if ("meta".equals(localName)) {
                String name = atts.getValue("name");
                if (name == null) {
                  name = atts.getValue("http-equiv");
                }
                final String val = atts.getValue("content");
                if (name != null && val != null) {
                  metaTags.setProperty(name.toLowerCase(Locale.ROOT), val);
                }
              }
            }
          } else if (inBODY > 0) {
            if (SUPPRESS_ELEMENTS.contains(localName)) {
              suppressed++;
            } else if ("img".equals(localName)) {
              // the original javacc-based parser preserved <IMG alt="..."/>
              // attribute as body text in [] parenthesis:
              final String alt = atts.getValue("alt");
              if (alt != null) {
                body.append('[').append(alt).append(']');
              }
            }
          } else if ("body".equals(localName)) {
            inBODY++;
          } else if ("head".equals(localName)) {
            inHEAD++;
          } else if ("frameset".equals(localName)) {
            throw new SAXException("This parser does not support HTML framesets.");
          }
        }

        @Override
        public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
          if (inBODY > 0) {
            if ("body".equals(localName)) {
              inBODY--;
            } else if (ENDLINE_ELEMENTS.contains(localName)) {
              body.append('\n');
            } else if (SUPPRESS_ELEMENTS.contains(localName)) {
              suppressed--;
            }
          } else if (inHEAD > 0) {
            if ("head".equals(localName)) {
              inHEAD--;
            } else if (inTITLE > 0 && "title".equals(localName)) {
              inTITLE--;
            }
          }
        }
       
        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
          if (inBODY > 0 && suppressed == 0) {
            body.append(ch, start, length);
          } else if (inTITLE > 0) {
            title.append(ch, start, length);
          }
        }

        @Override
        public InputSource resolveEntity(String publicId, String systemId) {
          // disable network access caused by DTDs
          return new InputSource(new StringReader(""));
        }
      };
     
      parser.setContentHandler(handler);
      parser.setErrorHandler(handler);
      parser.parse(source);
     
      // the javacc-based parser trimmed title (which should be done for HTML in all cases):
      this.title = title.toString().trim();
     
      // assign body text
View Full Code Here

   * Initializes a CyberNeko parser configured to return lower-case element names
   *
   * @return
   */
  private SAXParser initParser() {
    parser = new SAXParser();
    try {
      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
      parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
View Full Code Here

  @Override
  public Parse parse(ContentEntity entity, Link newLink) throws DroidsException, IOException {
    // setup filter chain
    XMLDocumentFilter[] filters = { getRemover() };
    // create HTML parser
    SAXParser parser = getParser(filters);
    LinkExtractor linkExtractor = new LinkExtractor(newLink, elements);
    parser.setContentHandler(linkExtractor);
    InputStream instream = entity.obtainContent();
    try {
      parser.parse(new InputSource(instream));
    } catch (SAXException ex) {
      throw new ContentFormatViolationException("Failure parsing HTML content", ex);
    } finally {
      instream.close();
    }
View Full Code Here

    }
    return new ParseImpl(newLink.getId(),linkExtractor.getLinks());
  }

  private SAXParser getParser(XMLDocumentFilter[] filters) {
    SAXParser parser = new SAXParser();
    try {
      parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
      parser.setFeature(
        "http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
        false);
      parser.setFeature(
        "http://cyberneko.org/html/features/balance-tags/document-fragment",
        true);
      parser.setFeature(
        "http://cyberneko.org/html/features/report-errors",
        false);
    } catch (SAXNotRecognizedException ex) {
      throw new IllegalStateException(ex);
    } catch (SAXNotSupportedException ex) {
View Full Code Here

     */
    public void rewrite(Rewriter rewriter, java.io.Reader reader, java.io.Writer writer)
            throws RewriterException
    {
        // use a cyberneko SAXParser
        SAXParser parser = new SAXParser() ;

        // setup filter chain
        XMLDocumentFilter[] filters = {
            new Purifier(),                                                                                  // [1] standard neko purifications (tag balancing, etc)
            new CallbackElementRemover( rewriter ),                                                          // [2] accept / reject tags based on advice from rewriter
            writer != null ? new org.cyberneko.html.filters.Writer( writer, null ) : new DefaultFilter()     // [3] propagate results to specified writer (or do nothing -- Default -- when writer is null)
        };
       
        String filtersPropName = "http://cyberneko.org/html/properties/filters";
  
        try
        {
            parser.setProperty(filtersPropName, filters);
        }
        catch (SAXException e)
        {
            // either no longer supported (SAXNotSupportedException), or no longer recognized (SAXNotRecognizedException)
            log.error(filtersPropName + " is, unexpectedly, no longer defined for the cyberneko HTML parser",e);
            throw new RewriterException("cyberneko parser version not supported",e);
        }

        try
        {
            // parse from reader
            parser.parse(new XMLInputSource( null, null, null, reader, null )) ;
        }
        catch (IOException e)
        {
            String msg = "cyberneko HTML parsing failure";
            log.error(msg,e);
View Full Code Here

    /**
     * Factory method to create a new SAXReader
     */   
    protected SAXReader createSAXReader() throws Exception {
        // installs the NeckHTML parser
        SAXParser parser = new SAXParser();
        parser.setProperty(
            "http://cyberneko.org/html/properties/names/elems",
            "match"
        );
        parser.setProperty(
            "http://cyberneko.org/html/properties/names/attrs",
            "match"
        );
        return new SAXReader( parser );
    }
View Full Code Here

TOP

Related Classes of org.cyberneko.html.parsers.SAXParser

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.