Package org.apache.any23.source

Examples of org.apache.any23.source.DocumentSource


    @Test
    public void testExtractionParametersWithNestingDisabled()
    throws IOException, ExtractionException, TripleHandlerException {
        final int EXPECTED_TRIPLES = 19;
        Any23 runner = new Any23();
        DocumentSource source = new FileDocumentSource(
                new File("src/test/resources/microformats/nested-microformats-a1.html"),
                "http://www.test.com"
        );

        ByteArrayOutputStream baos = new ByteArrayOutputStream();
View Full Code Here


    }

    @Test
    public void testExceptionPropagation() throws IOException {
        Any23 any23 = new Any23();
        DocumentSource source = new FileDocumentSource(
                new File("src/test/resources/application/turtle/geolinkeddata.ttl"),
                "http://www.test.com"
        );
        CountingTripleHandler cth1 = new CountingTripleHandler();
        try {
View Full Code Here

    @Test
    public void testXMLMimeTypeManagement() throws IOException, ExtractionException {
        final String documentURI = "http://www.test.com/resource.xml";
        final String contentType = "application/xml";
        final String in = StreamUtils.asString( this.getClass().getResourceAsStream("any23-xml-mimetype.xml") );
        final DocumentSource doc = new StringDocumentSource(in, documentURI, contentType);
        final Any23 any23 = new Any23();
        final CountingTripleHandler cth = new CountingTripleHandler(false);
        final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
        final ExtractionReport report = any23.extract(doc, rth);
        Assert.assertFalse(report.hasMatchingExtractors());
View Full Code Here

    @Test
    public void testAbstractMethodErrorIssue186_1() throws IOException, ExtractionException{
        final Any23 runner = new Any23();
        final String content = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-1.xhtml");
        final DocumentSource source = new StringDocumentSource(content, "http://base.com");
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final TripleHandler handler = new NTriplesWriter(out);
        runner.extract(source, handler);
        String n3 = out.toString("UTF-8");
        logger.debug(n3);
View Full Code Here

    @Test
    public void testAbstractMethodErrorIssue186_2() throws IOException, ExtractionException{
        final Any23 runner = new Any23();
        final String content = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-2.xhtml");
        final DocumentSource source = new StringDocumentSource(content, "http://richard.cyganiak.de/");
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final TripleHandler handler = new NTriplesWriter(out);
        runner.extract(source, handler);
        final String n3 = out.toString("UTF-8");
        logger.debug(n3);
View Full Code Here

        // Excerpt (enclosing method header not shown): runs an extraction using a
        // customised copy of the default configuration and captures the output.
        // Start from a copy returned by DefaultConfiguration.copy() and set the
        // "any23.extraction.metadata.timesize" property to "off" on that copy.
        final ModifiableConfiguration modifiableConf = DefaultConfiguration.copy();
        modifiableConf.setProperty("any23.extraction.metadata.timesize", "off");
        final Any23 any23 = new Any23(modifiableConf);

        // Load the Turtle test resource as a string and wrap it as an in-memory
        // document source with the document URI "http://base.com".
        final String content = FileUtils.readResourceContent("/rdf/rdf-issue183.ttl");
        final DocumentSource source = new StringDocumentSource(content, "http://base.com");
        // The NTriplesWriter is given the byte buffer as its output target.
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final TripleHandler handler = new NTriplesWriter(out);
        any23.extract(source, handler);
        // The handler is closed before the buffer is read
        // (presumably so pending output is flushed -- TODO confirm).
        handler.close();
        // Decode the captured bytes as UTF-8 text.
        final String n3 = out.toString("UTF-8");
View Full Code Here

        // Excerpt (enclosing method header not shown): parses the first input
        // document and hands its DOM to MicrodataParser.getMicrodataAsJSON.
        // Reject an empty document list up front.
        if (document.isEmpty()) {
            throw new IllegalArgumentException("No input document URL specified");
        }
        // Declared outside the try block so the finally clause can close it.
        InputStream documentInputInputStream = null;
        try {
            // Only the first entry of the document list is processed here.
            final DocumentSource documentSource = document.get(0);
            documentInputInputStream = documentSource.openInputStream();
            // The parser receives both the raw stream and the source's document URI.
            final TagSoupParser tagSoupParser = new TagSoupParser(
                    documentInputInputStream,
                    documentSource.getDocumentURI()
            );
            // Output target is System.out.
            org.apache.any23.extractor.microdata.MicrodataParser.getMicrodataAsJSON(tagSoupParser.getDOM(), System.out);
        } finally {
            // The stream is still null if openInputStream() threw, hence the guard.
            if (documentInputInputStream != null) StreamUtils.closeGracefully(documentInputInputStream);
        }
View Full Code Here

        /* 2 */final String content = "@prefix foo: <http://example.org/ns#> .   "
                + "@prefix : <http://other.example.org/ns#> ."
                + "foo:bar foo: : .                          "
                + ":bar : foo:bar .                           ";
        // The second argument of StringDocumentSource() must be a valid URI.
        /* 3 */DocumentSource source = new StringDocumentSource(content,
                "http://host.com/service");
        /* 4 */ByteArrayOutputStream out = new ByteArrayOutputStream();
        /* 5 */TripleHandler handler = new NTriplesWriter(out);
        try {
            /* 6 */runner.extract(source, handler);
View Full Code Here

        assumeOnlineAllowed();

        /* 1 */Any23 runner = new Any23();
        /* 2 */runner.setHTTPUserAgent("test-user-agent");
        /* 3 */HTTPClient httpClient = runner.getHTTPClient();
        /* 4 */DocumentSource source = new HTTPDocumentSource(httpClient,
                "http://dbpedia.org/resource/Trento");
        /* 5 */ByteArrayOutputStream out = new ByteArrayOutputStream();
        /* 6 */TripleHandler handler = new NTriplesWriter(out);
        try {
            /* 7 */runner.extract(source, handler);
View Full Code Here

        // Excerpt (enclosing method header not shown; `any23` is defined outside
        // this view): builds a chain of triple handlers and asserts that the
        // extraction reports matching extractors.
        // Chain, outermost first: ReportingTripleHandler -> IgnoreAccidentalRDFa
        // -> NTriplesWriter, with the writer targeting the in-memory byte buffer.
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        TripleHandler handler = new NTriplesWriter(byteArrayOutputStream);
        TripleHandler rdfWriter = new IgnoreAccidentalRDFa(handler);
        ReportingTripleHandler reporting = new ReportingTripleHandler(rdfWriter);

        // Build the document source from a test resource path and a document URI
        // via a helper that is not visible in this excerpt.
        DocumentSource source = getDocumentSourceFromResource(
                "/html/rdfa/ansa_2010-02-26_12645863.html",
                "http://host.com/service");

        // The value returned by extract() must report matching extractors.
        Assert.assertTrue(any23.extract(source, reporting)
                .hasMatchingExtractors());
View Full Code Here

TOP

Related Classes of org.apache.any23.source.DocumentSource

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.