Examples of TagSoupParser

cascading.operation.xml.TagSoupParser
Uses the Tag Soup library (ccil.org/~cowan/XML/tagsoup/) to convert incoming HTML to clean XHTML.
org.apache.any23.extractor.html.TagSoupParser
NekoHTML (nekohtml.sourceforge.net/) based TagSoup parser. By default it uses the Xerces HTML DOM implementation, which doesn't support namespaces and forces uppercase element names. This works with the RDFa XSLT Converter and with XPath, so we left it this way. @author Richard Cyganiak (richard at cyganiak dot de) @author Michele Mostarda (mostarda@fbk.eu) @author Davide Palmisano (palmisano@fbk.eu)

Examples of cascading.operation.xml.TagSoupParser

      {
      // create a new pipe assembly to create the word count across all the pages, and the word count in a single page
      Pipe pipe = new Pipe( sourceName );


      // convert the html to xhtml using the TagSoupParser. return only the fields "url" and "xml", discard the rest
      pipe = new Each( pipe, new Fields( "page" ), new TagSoupParser( new Fields( "xml" ) ), new Fields( "url", "xml" ) );
      // apply the given XPath expression to the xml in the "xml" field. this expression extracts the 'body' element.
      XPathGenerator bodyExtractor = new XPathGenerator( new Fields( "body" ), XPathOperation.NAMESPACE_XHTML, "//xhtml:body" );
      pipe = new Each( pipe, new Fields( "xml" ), bodyExtractor, new Fields( "url", "body" ) );
      // apply another XPath expression. this expression removes all elements from the xml, leaving only text nodes.
      // text nodes in a 'script' element are removed.

View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

            ensureHasLocalCopy();
            final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() );
            is.mark(Integer.MAX_VALUE);
            final String candidateEncoding = getParserEncoding();
            is.reset();
            final TagSoupParser tagSoupParser = new TagSoupParser(
                    is,
                    documentURI.stringValue(),
                    candidateEncoding
            );
            if(extractionParameters.isValidate()) {
                documentReport = tagSoupParser.getValidatedDOM( extractionParameters.isFix() );
            } else {
                documentReport = new DocumentReport( EmptyValidationReport.getInstance(), tagSoupParser.getDOM() );
            }
            tagSoupDOMRelatedParameters = extractionParameters;
        }
        return documentReport;
    }

View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

        xPathExtractionRule.add(template1);
        xPathExtractionRule.add(template2);


        final String documentURI = "http://www.page.com/test-uri";
        final InputStream testData = this.getClass().getResourceAsStream("xpathextractor-test.html");
        final TagSoupParser tagSoupParser = new TagSoupParser(testData, documentURI);
        final ExtractionResult extractionResult = mock(ExtractionResult.class);
        xPathExtractionRule.process(tagSoupParser.getDOM(), extractionResult);


        verify(extractionResult).writeTriple(
                ValueFactoryImpl.getInstance().createURI("http://sub1"),
                ValueFactoryImpl.getInstance().createURI("http://pred1"),
                ValueFactoryImpl.getInstance().createLiteral("value1"),

View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

        Assert.assertEquals("Unexpected value for this_root"    , "http://example.org/"       , vars[1]);
        Assert.assertEquals("Unexpected value for html_base"    , "http://example.org/john-d/", vars[2]);
    }


    private String[] checkPageBaseHandling(String testFile) throws IOException, XSLTStylesheetException {
        final TagSoupParser tagSoupParser = new TagSoupParser(
                this.getClass().getResourceAsStream(testFile),
                "http://test/document/uri"
        );
        final StringWriter sw = new StringWriter();
        RDFaExtractor.getXSLT().applyTo(tagSoupParser.getDOM(), sw);
        final String content = sw.toString();
        logger.debug(content);
        final Pattern pattern = Pattern.compile("<!--this_location: '(.+)' this_root: '(.+)' html_base: '(.+)'-->");
        final Matcher matcher = pattern.matcher(content);
        Assert.assertTrue("Cannot find comment matching within generated output.", matcher.find());

View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

    }


    private Document getDom(String document) throws IOException {
        final InputStream is = this.getClass().getResourceAsStream(document);
        try {
            final TagSoupParser tagSoupParser = new TagSoupParser(is, "http://test-document");
            return tagSoupParser.getDOM();
        } finally {
            is.close();
        }
    }

View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

    }


    private DOMDocument loadDocument(String document) throws IOException, URISyntaxException {
        InputStream is = this.getClass().getResourceAsStream(document);
        final String documentURI = "http://test.com";
        TagSoupParser tsp = new TagSoupParser(is, documentURI);
        return new DefaultDOMDocument( new URI(documentURI), tsp.getDOM() );
    }

View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

        }
        InputStream documentInputInputStream = null;
        try {
            final DocumentSource documentSource = document.get(0);
            documentInputInputStream = documentSource.openInputStream();
            final TagSoupParser tagSoupParser = new TagSoupParser(
                    documentInputInputStream,
                    documentSource.getDocumentURI()
            );
            org.apache.any23.extractor.microdata.MicrodataParser.getMicrodataAsJSON(tagSoupParser.getDOM(), System.out);
        } finally {
            if (documentInputInputStream != null) StreamUtils.closeGracefully(documentInputInputStream);
        }
    }

View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

        }
        InputStream documentInputInputStream = null;
        try {
            final DocumentSource documentSource = document.get(0);
            documentInputInputStream = documentSource.openInputStream();
            final TagSoupParser tagSoupParser = new TagSoupParser(
                    documentInputInputStream,
                    documentSource.getDocumentURI()
            );
            org.apache.any23.extractor.microdata.MicrodataParser.getMicrodataAsJSON(tagSoupParser.getDOM(), System.out);
        } finally {
            if (documentInputInputStream != null) StreamUtils.closeGracefully(documentInputInputStream);
        }
    }

View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

            ensureHasLocalCopy();
            final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() );
            is.mark(Integer.MAX_VALUE);
            final String candidateEncoding = getParserEncoding();
            is.reset();
            final TagSoupParser tagSoupParser = new TagSoupParser(
                    is,
                    documentURI.stringValue(),
                    candidateEncoding
            );
            if(extractionParameters.isValidate()) {
                documentReport = tagSoupParser.getValidatedDOM( extractionParameters.isFix() );
            } else {
                documentReport = new DocumentReport( EmptyValidationReport.getInstance(), tagSoupParser.getDOM() );
            }
            tagSoupDOMRelatedParameters = extractionParameters;
        }
        return documentReport;
    }

View Full Code Here

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.