Package org.cyberneko.html.parsers

Examples of org.cyberneko.html.parsers.DOMFragmentParser


    properties = OutputPropertiesFactory.getDefaultMethodProperties(Method.HTML);
    _HTMLserialiser = SerializerFactory.getSerializer(properties);
    _serializer = _HTMLserialiser;
    // serializer.setOutputStream(output);
    // _parser.setContentHandler(serializer.asContentHandler());
    viewStateParser = new DOMFragmentParser();
    // Set parser features
    try {
      viewStateParser
          .setProperty(
              "http://cyberneko.org/html/properties/names/elems",
View Full Code Here


    // OutputPropertiesFactory.getDefaultMethodProperties(Method.XML);
    // properties.put("encoding",_encoding);
    _serializer = SerializerFactory.getSerializer(properties);
    // serializer.setOutputStream(output);
    // _parser.setContentHandler(serializer.asContentHandler());
    viewStateParser = new DOMFragmentParser();
    // Set parser features
    try {
      viewStateParser
          .setProperty(
              "http://cyberneko.org/html/properties/names/elems",
View Full Code Here

      Document htmlDoc = documentProvider.createDocument(null, null, null);
      // Workaround for error check failure adding text node to entity ref as a child
      htmlDoc.setStrictErrorChecking(false);
      DocumentFragment fragment = htmlDoc.createDocumentFragment();
      InputSource input = new InputSource(new StringReader(source));
      DOMFragmentParser parser = new DOMFragmentParser();
      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "default");
      parser.setFeature("http://cyberneko.org/html/features/document-fragment", true);
      parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
      parser.setFeature("http://apache.org/xml/features/scanner/notify-char-refs", true);
      parser.setFeature("http://cyberneko.org/html/features/scanner/notify-builtin-refs", true);
      parser.parse(input, fragment);
      return fragment;
    } catch (Exception e) {
      throw new GadgetException(GadgetException.Code.HTML_PARSE_ERROR, e);
    }
  }
View Full Code Here

  @Before
  public void setup() throws Exception {
    conf = NutchConfiguration.create();
    conf.setBoolean("parser.html.form.use_action", true);
    utils = new DOMContentUtils(conf);
    DOMFragmentParser parser = new DOMFragmentParser();
    parser.setFeature(
        "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
        true);
    for (int i = 0; i < testPages.length; i++) {
      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
      try {
        parser.parse(
            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
            node);
        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
      } catch (Exception e) {
        Assert.assertTrue("caught exception: " + e, false);
View Full Code Here

  private URL[][] currURLsAndAnswers;

  @Test
  public void testRobotsMetaProcessor() {
    DOMFragmentParser parser= new DOMFragmentParser();;

    try {
      currURLsAndAnswers= new URL[][] {
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org"), null},
          {new URL("http://www.nutch.org/foo/"),
            new URL("http://www.nutch.org/")},
            {new URL("http://www.nutch.org"),
              new URL("http://www.nutch.org/base/")}
      };
    } catch (Exception e) {
      Assert.assertTrue("couldn't make test URLs!", false);
    }

    for (int i= 0; i < tests.length; i++) {
      byte[] bytes= tests[i].getBytes();

      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();

      try {
        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
      } catch (Exception e) {
        e.printStackTrace();
      }

      HTMLMetaTags robotsMeta= new HTMLMetaTags();
View Full Code Here

  private static void setup() throws Exception {
    conf = NutchConfiguration.create();
    conf.setBoolean("parser.html.form.use_action", true);
    utils = new DOMContentUtils(conf);
    DOMFragmentParser parser = new DOMFragmentParser();
    parser.setFeature(
        "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
        true);
    for (int i = 0; i < testPages.length; i++) {
      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
      try {
        parser.parse(
            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
            node);
        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
      } catch (Exception e) {
        assertTrue("caught exception: " + e, false);
View Full Code Here

    // MAIN
    //

    /** Main. */
    public static void main(String[] argv) throws Exception {
        DOMFragmentParser parser = new DOMFragmentParser();
        HTMLDocument document = new HTMLDocumentImpl();
        for (int i = 0; i < argv.length; i++) {
            DocumentFragment fragment = document.createDocumentFragment();
            parser.parse(argv[i], fragment);
            print(fragment, "");
        }
    } // main(String[])
View Full Code Here

    public void testInvalidAttributeName() throws Exception {
      doTest("<html 9='id'></html>", "<HTML/>");
    }

    private void doTest(final String html, final String expected) throws Exception {
        DOMFragmentParser parser = new DOMFragmentParser();
        HTMLDocument document = new HTMLDocumentImpl();

        DocumentFragment fragment = document.createDocumentFragment();
        InputSource source = new InputSource(new StringReader(html));
        parser.parse(source, fragment);
//        final OutputFormat of = new OutputFormat();
//        of.setOmitXMLDeclaration(true);
//        XMLSerializer s = new XMLSerializer(of);
//        StringWriter sw = new StringWriter();
//        s.setOutputCharStream(sw);
View Full Code Here

    reader.parse(input);
    return frag;
  }

  private DocumentFragment parseNeko(InputSource input) throws Exception {
    DOMFragmentParser parser = new DOMFragmentParser();
    try {
      parser.setFeature("http://cyberneko.org/html/features/augmentations",
          true);
      parser.setProperty("http://cyberneko.org/html/properties/default-encoding",
          defaultCharEncoding);
      parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset",
          true);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
          false);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
          true);
      parser.setFeature("http://cyberneko.org/html/features/report-errors",
          LOG.isTraceEnabled());
    } catch (SAXException e) {}
    // convert Document to DocumentFragment
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment res = doc.createDocumentFragment();
    DocumentFragment frag = doc.createDocumentFragment();
    parser.parse(input, frag);
    res.appendChild(frag);

    try {
      while(true) {
        frag = doc.createDocumentFragment();
        parser.parse(input, frag);
        if (!frag.hasChildNodes()) break;
        if (LOG.isInfoEnabled()) {
          LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes.");
        }
        res.appendChild(frag);
View Full Code Here

    // OutputPropertiesFactory.getDefaultMethodProperties(Method.XML);
    // properties.put("encoding",_encoding);
    _serializer = SerializerFactory.getSerializer(properties);
    // serializer.setOutputStream(output);
    // _parser.setContentHandler(serializer.asContentHandler());
    viewStateParser = new DOMFragmentParser();
    // Set parser features
    try {
      viewStateParser
          .setProperty(
              "http://cyberneko.org/html/properties/names/elems",
View Full Code Here

TOP

Related Classes of org.cyberneko.html.parsers.DOMFragmentParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.