Package org.cyberneko.html.parsers

Examples of org.cyberneko.html.parsers.DOMFragmentParser


            IOException {
        if (html == null) {
            return null;
        }

        DOMFragmentParser parser = new DOMFragmentParser();

        // フィルターの設定
        ElementRemover remover = new ElementRemover();
        XMLDocumentFilter[] filters = { remover };
        parser.setProperty(
            "http://cyberneko.org/html/properties/filters",
            filters);
        HTMLDocument document = new HTMLDocumentImpl();
        DocumentFragment fragment = document.createDocumentFragment();

        InputSource inputSource = new InputSource(new StringReader(html));
        parser.parse(inputSource, fragment);
        StringWriter writer = new StringWriter();
        OutputFormat format = new OutputFormat();

        format.setOmitXMLDeclaration(true);
        XMLSerializer serializer = new XMLSerializer();
View Full Code Here


    properties = OutputPropertiesFactory.getDefaultMethodProperties(Method.HTML);
    _HTMLserialiser = SerializerFactory.getSerializer(properties);
    _serializer = _HTMLserialiser;
    // serializer.setOutputStream(output);
    // _parser.setContentHandler(serializer.asContentHandler());
    viewStateParser = new DOMFragmentParser();
    // Set parser features
    try {
      viewStateParser
          .setProperty(
              "http://cyberneko.org/html/properties/names/elems",
View Full Code Here

    // MAIN
    //

    /** Main. */
    public static void main(String[] argv) throws Exception {
        DOMFragmentParser parser = new DOMFragmentParser();
        HTMLDocument document = new HTMLDocumentImpl();
        for (int i = 0; i < argv.length; i++) {
            DocumentFragment fragment = document.createDocumentFragment();
            parser.parse(argv[i], fragment);
            print(fragment, "");
        }
    } // main(String[])
View Full Code Here

    // MAIN
    //

    /** Main. */
    public static void main(String[] argv) throws Exception {
        DOMFragmentParser parser = new DOMFragmentParser();
        HTMLDocument document = new HTMLDocumentImpl();
        for (int i = 0; i < argv.length; i++) {
            DocumentFragment fragment = document.createDocumentFragment();
            parser.parse(argv[i], fragment);
            print(fragment, "");
        }
    } // main(String[])
View Full Code Here

      return parser.getDocument();
    } else {
      Document htmlDoc = documentProvider.createDocument(null, null, null);
      // Workaround for error check failure adding text node to entity ref as a child
      htmlDoc.setStrictErrorChecking(false);
      DOMFragmentParser parser = new DOMFragmentParser();
      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "default");
      parser.setFeature("http://cyberneko.org/html/features/document-fragment", true);
      parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
      parser.setFeature("http://apache.org/xml/features/scanner/notify-char-refs", true);
      parser.setFeature("http://cyberneko.org/html/features/scanner/notify-builtin-refs", true);
      DocumentFragment fragment = htmlDoc.createDocumentFragment();
      parser.parse(input, fragment);
      normalizeFragment(htmlDoc, fragment);
      return htmlDoc;
    }
  }
View Full Code Here

    reader.parse(input);
    return frag;
  }

  private DocumentFragment parseNeko(InputSource input) throws Exception {
    DOMFragmentParser parser = new DOMFragmentParser();
    try {
      parser.setFeature("http://cyberneko.org/html/features/augmentations",
          true);
      parser.setProperty("http://cyberneko.org/html/properties/default-encoding",
          defaultCharEncoding);
      parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset",
          true);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
          false);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
          true);
      parser.setFeature("http://cyberneko.org/html/features/report-errors",
          LOG.isTraceEnabled());
    } catch (SAXException e) {}
    // convert Document to DocumentFragment
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment res = doc.createDocumentFragment();
    DocumentFragment frag = doc.createDocumentFragment();
    parser.parse(input, frag);
    res.appendChild(frag);

    try {
      while(true) {
        frag = doc.createDocumentFragment();
        parser.parse(input, frag);
        if (!frag.hasChildNodes()) break;
        if (LOG.isInfoEnabled()) {
          LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes.");
        }
        res.appendChild(frag);
View Full Code Here

      throw new RuntimeException(e);
    }
  }

  public SeleniumScript(String scriptUrl, String url) throws IOException, SAXException, XPathExpressionException {
    DOMFragmentParser parser = new DOMFragmentParser();
    HTMLDocument document = new HTMLDocumentImpl();
    DocumentFragment fragment = document.createDocumentFragment();
    try {
      parser.parse(scriptUrl, fragment);
      // not nice, it seems that parse also throws a FileNotFoundException sometimes.
      // XXX I don't know why
      if (fragment.getTextContent().contains("The page was not found!")) {
        throw new FileNotFoundException(scriptUrl);
      }
View Full Code Here

    // OutputPropertiesFactory.getDefaultMethodProperties(Method.XML);
    // properties.put("encoding",_encoding);
    _serializer = SerializerFactory.getSerializer(properties);
    // serializer.setOutputStream(output);
    // _parser.setContentHandler(serializer.asContentHandler());
    viewStateParser = new DOMFragmentParser();
    // Set parser features
    try {
      viewStateParser
          .setProperty(
              "http://cyberneko.org/html/properties/names/elems",
View Full Code Here

    properties = OutputPropertiesFactory.getDefaultMethodProperties(Method.HTML);
    _HTMLserialiser = SerializerFactory.getSerializer(properties);
    _serializer = _HTMLserialiser;
    // serializer.setOutputStream(output);
    // _parser.setContentHandler(serializer.asContentHandler());
    viewStateParser = new DOMFragmentParser();
    // Set parser features
    try {
      viewStateParser
          .setProperty(
              "http://cyberneko.org/html/properties/names/elems",
View Full Code Here

  private static void setup() throws Exception {
    conf = NutchConfiguration.create();
    conf.setBoolean("parser.html.form.use_action", true);
    utils = new DOMContentUtils(conf);
    DOMFragmentParser parser = new DOMFragmentParser();
    for (int i = 0; i < testPages.length; i++) {
      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
      try {
        parser.parse(
            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
            node);
        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
      } catch (Exception e) {
        assertTrue("caught exception: " + e, false);
View Full Code Here

TOP

Related Classes of org.cyberneko.html.parsers.DOMFragmentParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.