Package org.cyberneko.html.parsers

Examples of org.cyberneko.html.parsers.DOMParser


    public static Logger logger = Logger.getLogger(Validator.class);
   
    public Model validate(InputStream html) {
      Model model = ModelFactory.createDefaultModel();
     
        DOMParser parser = new DOMParser();
        try {
      parser.setFeature("http://xml.org/sax/features/namespaces", false);
    } catch (SAXNotRecognizedException e1) {
      logger.error(e1);
    } catch (SAXNotSupportedException e1) {
      logger.error(e1);
    }

    try {
      parser.parse(new InputSource(html));
      XmlObject xmlBean = XmlObject.Factory.parse(parser.getDocument().getDocumentElement());
      XmlCursor cursor = xmlBean.newCursor();
      cursor.toFirstContentToken();
      cursor.selectPath("//SCRIPT[@type = 'application/rdf+n3']");
      while(cursor.hasNextSelection()) {
        cursor.toNextSelection();
View Full Code Here


            }
        }
    }

    public static Document getDocument(InputStream is) throws IOException, SAXException {
        DOMParser parser = new DOMParser();
        InputSource inputSource = new InputSource(is);
        inputSource.setEncoding(PluginConstants.UTF_ENCODING);
        parser.parse(inputSource);
        return parser.getDocument();
    }
View Full Code Here

       
        if (homePageMethod.getStatusCode() != 200) {
            throw new HttpException("HTTP Status Code is " + homePageMethod.getStatusCode());
        }
       
        DOMParser parser = new DOMParser();
        parser.setFeature("http://xml.org/sax/features/namespaces", false);
       
        InputSource inputSource;
        if (homePageMethod.getResponseCharSet() != null) {
            Reader reader = new InputStreamReader(homePageMethod.getResponseBodyAsStream(), homePageMethod.getResponseCharSet());
            inputSource = new InputSource(reader);
        }
        else {
            inputSource = new InputSource(homePageMethod.getResponseBodyAsStream());   
        }
       
        parser.parse(inputSource);
        org.w3c.dom.Document w3cDoc = parser.getDocument();
       
        DOMReader xmlReader = new DOMReader();
        return xmlReader.read(w3cDoc);
       
    }
View Full Code Here

    }
   
    @CodeCompletion
    public Document parsehtml(String html) throws SAXException, IOException {
       
        DOMParser parser = new DOMParser();
        parser.parse(new InputSource(new StringReader(html)));
        org.w3c.dom.Document w3cDoc = parser.getDocument();
       
        DOMReader xmlReader = new DOMReader();
        return xmlReader.read(w3cDoc);
       
    }
View Full Code Here

    public static Document readHtmlDocument(String str) {
        Document document = null;
        try {
            URL url = FlexibleLocation.resolveLocation(str);
            if (url != null) {
                DOMParser parser = new DOMParser();
                parser.setFeature("http://xml.org/sax/features/namespaces", false);
                parser.parse(url.toExternalForm());
                document = parser.getDocument();
            } else {
                Debug.logError("Unable to locate HTML document " + str, module);
            }
        } catch (Exception e) {
            Debug.logError(e, "Error while reading HTML document " + str, module);
View Full Code Here

    // MAIN
    //

    /** Main. */
    public static void main(String[] argv) throws Exception {
        DOMParser parser = new DOMParser();
        for (int i = 0; i < argv.length; i++) {
            parser.parse(argv[i]);
            print(parser.getDocument(), "");
        }
    } // main(String[])
View Full Code Here

    doTest("hello</span>world", new String[] { "html", "body", "div", "span" }, expected);
  }

  private void doTest(final String html, final String[] contextStack,
      final String expected) throws Exception {
    final DOMParser parser = new DOMParser();
    parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
    if (contextStack != null) {
      parser.setProperty("http://cyberneko.org/html/properties/balance-tags/fragment-context-stack", toQNames(contextStack));
    }

    final StringWriter out = new StringWriter();
    final XMLDocumentFilter[] filters = { new Writer(out) };
    parser.setProperty("http://cyberneko.org/html/properties/filters", filters);

    final StringReader sr = new StringReader(html);
    final XMLInputSource in = new XMLInputSource(null, "foo", null, sr, null);
    parser.parse(in);

    assertEquals(expected.trim(), out.toString().trim());
  }
View Full Code Here

        Document document = getDOM();
        return new DocumentReport( validator.validate(dURI, document, applyFix), document );
    }

    /**
     * Parses {@code input} with a customised {@link DOMParser} and returns the resulting document.
     * <p>
     * The parser is an anonymous subclass that, when it creates an element node, may attach an
     * {@code ElementLocation} to it as user data under the {@code ELEMENT_LOCATION} key. The
     * location is derived from the augmentation item stored under {@code AUGMENTATIONS_FEATURE}.
     *
     * @return the document built by the parser
     * @throws IOException declared by this method
     * @throws SAXException declared by this method
     * @throws TransformerException declared by this method
     */
    private Document parse() throws IOException, SAXException, TransformerException {
        final DOMParser parser = new DOMParser() {

            // Name and augmentations recorded by the most recently completed startElement call.
            private QName currentQName;
            private Augmentations currentAugmentations;

            /**
             * Creates the element node via the superclass and, when {@code qName} equals the
             * recorded {@code currentQName} and augmentations are available, stores the computed
             * location on the node as user data (the stored value may be {@code null}).
             */
            @Override
            protected Element createElementNode(QName qName) {
                final Element created = super.createElementNode(qName);
                if (qName.equals(currentQName) && currentAugmentations != null) {
                    final ElementLocation elementLocation = createElementLocation(
                        currentAugmentations.getItem(AUGMENTATIONS_FEATURE)
                    );
                    created.setUserData(ELEMENT_LOCATION, elementLocation, null);
                }
                return created;
            }

            /**
             * Delegates to the superclass, then records the element name and augmentations.
             */
            @Override
            public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations augmentations)
            throws XNIException {
                // NOTE(review): the fields are assigned only AFTER super.startElement returns. If
                // the superclass invokes createElementNode during that call, it will observe the
                // values left by the previous startElement — confirm this ordering is intended
                // (it may rely on the parser reusing the same QName/Augmentations instances).
                super.startElement(qName, xmlAttributes, augmentations);
                currentQName = qName;
                currentAugmentations = augmentations;
            }

            /**
             * Converts an augmentation item into an {@code ElementLocation}.
             * Returns {@code null} when the item is {@code null}, when its string form is
             * {@code "synthesized"}, or when the string cannot be parsed (a warning is logged).
             */
            private ElementLocation createElementLocation(Object obj) {
                if(obj == null) return null;
                String pattern = null;
                try {
                    pattern = obj.toString();
                    if( "synthesized".equals(pattern) ) return null;
                    // The string is split on ':' and fields 0, 1, 3 and 4 are read as integers.
                    // Field 2 is skipped — presumably a character offset between a begin
                    // line/column pair and an end line/column pair; TODO confirm against the
                    // augmentation item's toString() format.
                    final String[] parts = pattern.split(":");
                    return new ElementLocation(
                            Integer.parseInt(parts[0]),
                            Integer.parseInt(parts[1]),
                            Integer.parseInt(parts[3]),
                            Integer.parseInt(parts[4])

                    );
                } catch (Exception e) {
                    // Covers both too few fields and non-numeric fields; the location is dropped.
                    logger.warn(
                            String.format("Unexpected string format for given augmentation: [%s]", pattern),
                            e
                    );
                    return null;
                }
            }
        };
        // Parser configuration: namespaces off, CDATA delimiters stripped from scripts, and the
        // augmentations feature on so location items are available to createElementLocation.
        parser.setFeature("http://xml.org/sax/features/namespaces", false);
        parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims", true);
        parser.setFeature(AUGMENTATIONS_FEATURE, true);
        // Only override the parser's default encoding when one has been configured on this object.
        if (this.encoding != null)
            parser.setProperty("http://cyberneko.org/html/properties/default-encoding", this.encoding);

        /*
         * NOTE: the SpanCloserInputStream has been added to wrap the stream passed to the CyberNeko
         *       parser. This will ensure the correct handling of inline HTML SPAN tags.
         *       This fix is documented at issue #78.      
         */
        parser.parse(new InputSource( new SpanCloserInputStream(input)));
        return parser.getDocument();
    }
View Full Code Here

     * @param commandFactory command factory.
     * @return TestCase or TestSuite.
     */
    public static Selenese parse(String filename, InputStream is, ICommandFactory commandFactory) {
        try {
            DOMParser dp = new DOMParser();
            dp.setEntityResolver(null);
            dp.setFeature("http://xml.org/sax/features/namespaces", false);
            dp.setFeature(XERCES_FEATURE_PREFIX + INCLUDE_COMMENTS_FEATURE, true);
            dp.parse(new InputSource(is));
            Document document = dp.getDocument();
            Node seleniumBase = XPathAPI.selectSingleNode(document, "/HTML/HEAD/LINK[@rel='selenium.base']/@href");
            if (seleniumBase != null) {
                String baseURL = seleniumBase.getNodeValue();
                return new TestCaseParser(filename, document, baseURL).parse(commandFactory);
            }
View Full Code Here

        m_ThumbNails.add("no image");
        HttpEntity entity = response.getEntity();
        // String content = EntityUtils.toString(entity);

        try {
          DOMParser dpHTML = new DOMParser();
          dpHTML
              .setProperty(
                  "http://cyberneko.org/html/properties/default-encoding",
                  "utf-8");
          // dpHTML.parse(new
          // InputSource(EntityUtils.toString(entity)));
          dpHTML.parse(new InputSource(entity.getContent()));
          doc = dpHTML.getDocument();
          NodeList ndlMet = doc.getElementsByTagName("meta");
          NodeList ndlTitle = doc.getElementsByTagName("title");
          NodeList ndlImage = doc.getElementsByTagName("img");
          check4OpenGraphTags(ndlMet, strBaseURL);
          if (m_Description.equals("")) {
View Full Code Here

TOP

Related Classes of org.cyberneko.html.parsers.DOMParser

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.