Examples of TagSoupParser


Examples of cascading.operation.xml.TagSoupParser

      {
      // create a new pipe assembly to create the word count across all the pages, and the word count in a single page
      Pipe pipe = new Pipe( sourceName );

      // convert the html to xhtml using the TagSoupParser. return only the fields "url" and "xml", discard the rest
      pipe = new Each( pipe, new Fields( "page" ), new TagSoupParser( new Fields( "xml" ) ), new Fields( "url", "xml" ) );
      // apply the given XPath expression to the xml in the "xml" field. this expression extracts the 'body' element.
      XPathGenerator bodyExtractor = new XPathGenerator( new Fields( "body" ), XPathOperation.NAMESPACE_XHTML, "//xhtml:body" );
      pipe = new Each( pipe, new Fields( "xml" ), bodyExtractor, new Fields( "url", "body" ) );
      // apply another XPath expression. this expression removes all elements from the xml, leaving only text nodes.
      // text nodes in a 'script' element are removed.
View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

            ensureHasLocalCopy();
            final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() );
            is.mark(Integer.MAX_VALUE);
            final String candidateEncoding = getParserEncoding();
            is.reset();
            final TagSoupParser tagSoupParser = new TagSoupParser(
                    is,
                    documentURI.stringValue(),
                    candidateEncoding
            );
            if(extractionParameters.isValidate()) {
                documentReport = tagSoupParser.getValidatedDOM( extractionParameters.isFix() );
            } else {
                documentReport = new DocumentReport( EmptyValidationReport.getInstance(), tagSoupParser.getDOM() );
            }
            tagSoupDOMRelatedParameters = extractionParameters;
        }
        return documentReport;
    }
View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

        xPathExtractionRule.add(template1);
        xPathExtractionRule.add(template2);

        final String documentURI = "http://www.page.com/test-uri";
        final InputStream testData = this.getClass().getResourceAsStream("xpathextractor-test.html");
        final TagSoupParser tagSoupParser = new TagSoupParser(testData, documentURI);
        final ExtractionResult extractionResult = mock(ExtractionResult.class);
        xPathExtractionRule.process(tagSoupParser.getDOM(), extractionResult);

        verify(extractionResult).writeTriple(
                ValueFactoryImpl.getInstance().createURI("http://sub1"),
                ValueFactoryImpl.getInstance().createURI("http://pred1"),
                ValueFactoryImpl.getInstance().createLiteral("value1"),
View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

        Assert.assertEquals("Unexpected value for this_root"    , "http://example.org/"       , vars[1]);
        Assert.assertEquals("Unexpected value for html_base"    , "http://example.org/john-d/", vars[2]);
    }

    private String[] checkPageBaseHandling(String testFile) throws IOException, XSLTStylesheetException {
        final TagSoupParser tagSoupParser = new TagSoupParser(
                this.getClass().getResourceAsStream(testFile),
                "http://test/document/uri"
        );
        final StringWriter sw = new StringWriter();
        RDFaExtractor.getXSLT().applyTo(tagSoupParser.getDOM(), sw);
        final String content = sw.toString();
        logger.debug(content);
        final Pattern pattern = Pattern.compile("<!--this_location: '(.+)' this_root: '(.+)' html_base: '(.+)'-->");
        final Matcher matcher = pattern.matcher(content);
        Assert.assertTrue("Cannot find comment matching within generated output.", matcher.find());
View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

    }

    private Document getDom(String document) throws IOException {
        final InputStream is = this.getClass().getResourceAsStream(document);
        try {
            final TagSoupParser tagSoupParser = new TagSoupParser(is, "http://test-document");
            return tagSoupParser.getDOM();
        } finally {
            is.close();
        }
    }
View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

    }

    private DOMDocument loadDocument(String document) throws IOException, URISyntaxException {
        InputStream is = this.getClass().getResourceAsStream(document);
        final String documentURI = "http://test.com";
        TagSoupParser tsp = new TagSoupParser(is, documentURI);
        return new DefaultDOMDocument( new URI(documentURI), tsp.getDOM() );
    }
View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

        }
        InputStream documentInputInputStream = null;
        try {
            final DocumentSource documentSource = document.get(0);
            documentInputInputStream = documentSource.openInputStream();
            final TagSoupParser tagSoupParser = new TagSoupParser(
                    documentInputInputStream,
                    documentSource.getDocumentURI()
            );
            org.apache.any23.extractor.microdata.MicrodataParser.getMicrodataAsJSON(tagSoupParser.getDOM(), System.out);
        } finally {
            if (documentInputInputStream != null) StreamUtils.closeGracefully(documentInputInputStream);
        }
    }
View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

        }
        InputStream documentInputInputStream = null;
        try {
            final DocumentSource documentSource = document.get(0);
            documentInputInputStream = documentSource.openInputStream();
            final TagSoupParser tagSoupParser = new TagSoupParser(
                    documentInputInputStream,
                    documentSource.getDocumentURI()
            );
            org.apache.any23.extractor.microdata.MicrodataParser.getMicrodataAsJSON(tagSoupParser.getDOM(), System.out);
        } finally {
            if (documentInputInputStream != null) StreamUtils.closeGracefully(documentInputInputStream);
        }
    }
View Full Code Here

Examples of org.apache.any23.extractor.html.TagSoupParser

            ensureHasLocalCopy();
            final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() );
            is.mark(Integer.MAX_VALUE);
            final String candidateEncoding = getParserEncoding();
            is.reset();
            final TagSoupParser tagSoupParser = new TagSoupParser(
                    is,
                    documentURI.stringValue(),
                    candidateEncoding
            );
            if(extractionParameters.isValidate()) {
                documentReport = tagSoupParser.getValidatedDOM( extractionParameters.isFix() );
            } else {
                documentReport = new DocumentReport( EmptyValidationReport.getInstance(), tagSoupParser.getDOM() );
            }
            tagSoupDOMRelatedParameters = extractionParameters;
        }
        return documentReport;
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.