Package org.cyberneko.html

Examples of org.cyberneko.html.HTMLConfiguration

This configuration recognizes the following properties:

For complete usage information, refer to the documentation. See also: HTMLScanner, HTMLTagBalancer, HTMLErrorReporter. Author: Andy Clark. Version: $Id: HTMLConfiguration.java,v 1.9 2005/02/14 03:56:54 andyc Exp $


    htmlScanner.scanDocument(true);
    return handler;
  }

  protected HTMLConfiguration newConfiguration() {
    HTMLConfiguration config = new HTMLConfiguration();
    // Maintain original case for elements and attributes
    config.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
    config.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
    // Parse as fragment.
    config.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
    // Get notified of entity and character references
    config.setFeature("http://apache.org/xml/features/scanner/notify-char-refs", true);
    config.setFeature("http://cyberneko.org/html/features/scanner/notify-builtin-refs", true);
    return config;
  }
View Full Code Here


    return true;
  }

  @Override
  protected HTMLConfiguration newConfiguration() {
    HTMLConfiguration config = super.newConfiguration();
    config.setFeature("http://xml.org/sax/features/namespaces", true);
    return config;
  }
View Full Code Here

  @Override
  protected Document parseDomImpl(String source) throws GadgetException {
    DocumentHandler handler;

    HTMLConfiguration config = newConfiguration();
    try {
      handler = parseHtmlImpl(source, config, new NormalizingTagBalancer());
    } catch (IOException ioe) {
      return null;
    }
View Full Code Here

  @Override
  protected DocumentFragment parseFragmentImpl(String source) throws GadgetException {
    DocumentHandler handler;

    HTMLConfiguration config = newConfiguration();
    // http://cyberneko.org/html/features/balance-tags/document-fragment
    // deprecated http://cyberneko.org/html/features/document-fragment
    config.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
    config.setProperty("http://cyberneko.org/html/properties/balance-tags/fragment-context-stack",
        new QName[]{new QName(null, "HTML", "HTML", null), new QName(null, "BODY", "BODY", null)});

    try {
      handler = parseHtmlImpl(source, config, new NekoPatchTagBalancer());
    } catch (IOException ioe) {
View Full Code Here

      bodyFirst = headScript;
    }
  }

  protected HTMLConfiguration newConfiguration() {
    HTMLConfiguration config = new HTMLConfiguration();
    // Maintain original case for elements and attributes
    config.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
    config.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
    // Get notified of entity and character references
    config.setFeature("http://apache.org/xml/features/scanner/notify-char-refs", true);
    config.setFeature("http://cyberneko.org/html/features/scanner/notify-builtin-refs", true);
    config.setFeature("http://xml.org/sax/features/namespaces", true);
    return config;
  }
View Full Code Here

        /**
         * Creates a parser whose configuration is built by {@code getConfig}
         * from the supplied properties.
         *
         * @param properties settings passed to {@code getConfig}; may be
         *     {@code null}, which {@code getConfig} checks for
         */
        public HtmlSaxParser(Properties properties) {
            super(getConfig(properties));
        }

        private static HTMLConfiguration getConfig(Properties properties) {
            HTMLConfiguration config = new HTMLConfiguration();
            config.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
            if (properties != null) {
                for (Iterator i = properties.keySet().iterator();i.hasNext();) {
                    String name = (String) i.next();
                    if (name.indexOf("/features/") > -1) {
                        config.setFeature(name, Boolean.getBoolean(properties.getProperty(name)));
                    } else if (name.indexOf("/properties/") > -1) {
                        config.setProperty(name, properties.getProperty(name));
                    }
                }
            }
            return config;
        }
View Full Code Here

        /**
         * Creates a parser whose configuration is built by {@code getConfig}
         * from the supplied properties.
         *
         * @param properties settings passed to {@code getConfig}; may be
         *     {@code null}, which {@code getConfig} checks for
         */
        public HtmlSaxParser(Properties properties) {
            super(getConfig(properties));
        }
   
        private static HTMLConfiguration getConfig(Properties properties) {
            HTMLConfiguration config = new HTMLConfiguration();
            config.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
            if (properties != null) {
                for (Iterator i = properties.keySet().iterator(); i.hasNext();) {
                    String name = (String) i.next();
                    if (name.indexOf("/features/") > -1) {
                        config.setFeature(name, Boolean.getBoolean(properties.getProperty(name)));
                    } else if (name.indexOf("/properties/") > -1) {
                        config.setProperty(name, properties.getProperty(name));
                    }
                }
            }

            return config;
View Full Code Here

    /**
     * Creates a parser whose configuration is built by {@code getConfig}
     * from the supplied properties.
     *
     * @param properties settings passed to {@code getConfig}; may be
     *     {@code null}, which {@code getConfig} checks for
     */
    public NekoHtmlSaxParser(Properties properties) {
        super(getConfig(properties));
    }

    private static HTMLConfiguration getConfig(Properties properties) {
        HTMLConfiguration config = new HTMLConfiguration();
        config.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        if (properties != null) {
            for (Iterator i = properties.keySet().iterator(); i.hasNext();) {
                String name = (String) i.next();
                if (name.indexOf("/features/") > -1) {
                    config.setFeature(name, Boolean.getBoolean(properties.getProperty(name)));
                } else if (name.indexOf("/properties/") > -1) {
                    config.setProperty(name, properties.getProperty(name));
                }
            }
        }
        return config;
    }
View Full Code Here

        ElementRemover remover = new ElementRemover();
        remover.removeElement(REMOVE_SCRIPT);
        StringWriter contentWriter = new StringWriter();
        Writer writer = new Writer(contentWriter, CHAR_ENCODING);
        XMLDocumentFilter[] filters = { remover, writer, };
        XMLParserConfiguration parser = new HTMLConfiguration();
        parser.setProperty("http://cyberneko.org/html/properties/filters",
                filters);
        XMLInputSource source = new XMLInputSource(null, null, null,
                contentReader, CHAR_ENCODING);
        try {
            parser.parse(source);
        } catch (XNIException e) {
            throw new NotIndexableException("Can not parse html -- ", e);

        } catch (IOException e) {
            throw new NotIndexableException("Can not parse html -- ", e);
View Full Code Here

      new org.cyberneko.html.filters.Writer(new ByteArrayOutputStream(), "UTF-8"),
      new org.cyberneko.html.filters.Writer(new ByteArrayOutputStream(), "UTF-8")
    };
   
        // create HTML parser
    final XMLParserConfiguration parser = new HTMLConfiguration();
        parser.setProperty("http://cyberneko.org/html/properties/filters", filters);

    XMLInputSource source = new XMLInputSource(null, "currentUrl", null, inputStream, "UTF-8");
       
      parser.parse(source);
        inputStream.close();
  }
View Full Code Here

TOP

Related Classes of org.cyberneko.html.HTMLConfiguration

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.