Package org.apache.stanbol.enhancer.engines.htmlextractor

Source Code of org.apache.stanbol.enhancer.engines.htmlextractor.TestHtmlExtractor

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.htmlextractor;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.io.InputStream;
import java.util.Set;

import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.ClerezzaRDFUtils;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.ExtractorException;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractionRegistry;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlParser;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.InitializationException;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
*
* @author <a href="mailto:kasper@dfki.de">Walter Kasper</a>
*
*/

public class TestHtmlExtractor {
    private static final Logger LOG = LoggerFactory.getLogger(TestHtmlExtractor.class);
   
    private static HtmlParser parser;
   
    private static HtmlExtractionRegistry registry;
   
    // define the Nepomuks NIE namespace locally here
    private static final String NIE_NS = "http://www.semanticdesktop.org/ontologies/2007/01/19/nie#";
   
    @BeforeClass
    public static void oneTimeSetup() throws IOException {
        try {
            registry = new HtmlExtractionRegistry("htmlextractors.xml");
        }
        catch (InitializationException e) {
            LOG.error("Registry Initialization Error: " + e.getMessage());
            throw new IOException(e.getMessage());
        }
        parser = new HtmlParser();

    }
   
    /**
     * This tests the RDFa extraction.
     *
     * @throws ExtractorException if there is an error during extraction
     * @throws IOException if there is an error when reading the document
     */
    @Test
    public void testRdfaExtraction() throws Exception {
        HtmlExtractor extractor = new HtmlExtractor(registry, parser);
        MGraph model = new SimpleMGraph();
        String testFile = "test-rdfa.html";
        // extract text from RDFa annotated html
        InputStream in = getResourceAsStream(testFile);
        assertNotNull("failed to load resource " + testFile, in);

        extractor.extract("file://" + testFile,in,null, "text/html", model);

        // show triples
        int tripleCounter = model.size();
        LOG.debug("RDFa triples: {}",tripleCounter);
        printTriples(model);
        assertEquals(8, tripleCounter);
        ClerezzaRDFUtils.makeConnected(model, new UriRef("file://" + testFile), new UriRef(NIE_NS+"contains"));
    }
   
    /** This tests some Microformat extraction
     *
     * @throws ExtractorException if there is an error during extraction
     * @throws IOException if there is an error when reading the document
     */
    @Test
    public void testMFExtraction() throws Exception {
        HtmlExtractor extractor = new HtmlExtractor(registry, parser);
        MGraph model = new SimpleMGraph();
        String testFile = "test-MF.html";

        // extract text from RDFa annotated html
        InputStream in = getResourceAsStream(testFile);
        assertNotNull("failed to load resource " + testFile, in);

        extractor.extract("file://" + testFile,in,null, "text/html", model);

        // show triples
        int tripleCounter = model.size();
        LOG.debug("Microformat triples: {}",tripleCounter);
        printTriples(model);
        assertEquals(127, tripleCounter);
        ClerezzaRDFUtils.makeConnected(model, new UriRef("file://" + testFile), new UriRef(NIE_NS+"contains"));
    }

    /** This test some extraction of microdata from an HTML-5 document
     *
     * @throws Exception
     */
    @Test
    public void testMicrodataExtraction() throws Exception {
      HtmlExtractor extractor = new HtmlExtractor(registry, parser);
      MGraph model = new SimpleMGraph();
      String testFile = "test-microdata.html";

      // extract text from RDFa annotated html
      InputStream in = getResourceAsStream(testFile);
      assertNotNull("failed to load resource " + testFile, in);

      extractor.extract("file://" + testFile,in,null, "text/html", model);

      // show triples
      int tripleCounter = model.size();
      LOG.debug("Microdata triples: {}",tripleCounter);
      printTriples(model);
      assertEquals(91, tripleCounter);
      ClerezzaRDFUtils.makeConnected(model, new UriRef("file://" + testFile), new UriRef(NIE_NS+"contains"));
    }
   
    /** This tests the merging of disconnected graphs under a single root
     *
     * @throws Exception
     */
    @Test
    public void testRootExtraction() throws Exception {
        HtmlExtractor extractor = new HtmlExtractor(registry, parser);
        MGraph model = new SimpleMGraph();
        String testFile = "test-MultiRoot.html";

        // extract text from RDFa annotated html
        InputStream in = getResourceAsStream(testFile);
        assertNotNull("failed to load resource " + testFile, in);

        extractor.extract("file://" + testFile,in,null, "text/html", model);

        // show triples
        int tripleCounter = model.size();
        LOG.debug("Triples: {}",tripleCounter);
        printTriples(model);
        Set<NonLiteral> roots = ClerezzaRDFUtils.findRoots(model);
        assertTrue(roots.size() > 1);
        ClerezzaRDFUtils.makeConnected(model, new UriRef("file://" + testFile), new UriRef(NIE_NS+"contains"));
        roots = ClerezzaRDFUtils.findRoots(model);
        assertEquals(1,roots.size());
    }
   
    private InputStream getResourceAsStream(String testResultFile) {
        return this.getClass().getClassLoader().getResourceAsStream(
                testResultFile);
    }

    private void printTriples(MGraph model) {
        for (Triple t: model) {
            LOG.debug(t.toString());
        }
    }   
}
TOP

Related Classes of org.apache.stanbol.enhancer.engines.htmlextractor.TestHtmlExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.