Package org.apache.stanbol.enhancer.engines.htmlextractor.impl

Examples of org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor


     * @throws ExtractorException if there is an error during extraction
     * @throws IOException if there is an error when reading the document
     */
    @Test
    public void testMFExtraction() throws Exception {
        HtmlExtractor extractor = new HtmlExtractor(registry, parser);
        MGraph model = new SimpleMGraph();
        String testFile = "test-MF.html";

        // extract text from RDFa annotated html
        InputStream in = getResourceAsStream(testFile);
        assertNotNull("failed to load resource " + testFile, in);

        extractor.extract("file://" + testFile,in,null, "text/html", model);

        // show triples
        int tripleCounter = model.size();
        LOG.debug("Microformat triples: {}",tripleCounter);
        printTriples(model);
View Full Code Here


     *
     * @throws Exception
     */
    @Test
    public void testMicrodataExtraction() throws Exception {
      HtmlExtractor extractor = new HtmlExtractor(registry, parser);
      MGraph model = new SimpleMGraph();
      String testFile = "test-microdata.html";

      // extract text from RDFa annotated html
      InputStream in = getResourceAsStream(testFile);
      assertNotNull("failed to load resource " + testFile, in);

      extractor.extract("file://" + testFile,in,null, "text/html", model);

      // show triples
      int tripleCounter = model.size();
      LOG.debug("Microdata triples: {}",tripleCounter);
      printTriples(model);
View Full Code Here

     *
     * @throws Exception
     */
    @Test
    public void testRootExtraction() throws Exception {
        HtmlExtractor extractor = new HtmlExtractor(registry, parser);
        MGraph model = new SimpleMGraph();
        String testFile = "test-MultiRoot.html";

        // extract text from RDFa annotated html
        InputStream in = getResourceAsStream(testFile);
        assertNotNull("failed to load resource " + testFile, in);

        extractor.extract("file://" + testFile,in,null, "text/html", model);

        // show triples
        int tripleCounter = model.size();
        LOG.debug("Triples: {}",tripleCounter);
        printTriples(model);
View Full Code Here

        return CANNOT_ENHANCE;
    }
   
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        HtmlExtractor extractor = new HtmlExtractor(htmlExtractorRegistry, htmlParser);
        MGraph model = new SimpleMGraph();
        ci.getLock().readLock().lock();
        try {
            extractor.extract(ci.getUri().getUnicodeString(), ci.getStream(),null, ci.getMimeType(), model);
        } catch (ExtractorException e) {
            throw new EngineException("Error while processing ContentItem "
                    + ci.getUri()+" with HtmlExtractor",e);
        } finally {
            ci.getLock().readLock().unlock();
View Full Code Here

     * @throws ExtractorException if there is an error during extraction
     * @throws IOException if there is an error when reading the document
     */
    @Test
    public void testRdfaExtraction() throws Exception {
        HtmlExtractor extractor = new HtmlExtractor(registry, parser);
        MGraph model = new SimpleMGraph();
        String testFile = "test-rdfa.html";
        // extract text from RDFa annotated html
        InputStream in = getResourceAsStream(testFile);
        assertNotNull("failed to load resource " + testFile, in);

        extractor.extract("file://" + testFile,in,null, "text/html", model);

        // show triples
        int tripleCounter = model.size();
        LOG.debug("RDFa triples: {}",tripleCounter);
        printTriples(model);
View Full Code Here

TOP

Related Classes of org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.