Examples of HTMLExtractor

com.ontometrics.scraper.extraction.HtmlExtractor
Provides a means of collecting {@link Manipulator}s and performing progressive harvesting of html from an original source. This is done through an implementation of the Chain of Responsibility Pattern: the manipulators are held in a LinkedList and when new ones are added, they are bolted on to the end; then, when the source is requested, the first {@link Manipulator} is invoked, setting off the chain of operations. Then the resulting source is extracted. The extractor fetches content from a {@link URL} using a {@link UrlContentProvider}. The default implementation is {@link UrlConnectionContentProvider}, but it can be changed; see {@link #urlContentProvider(UrlContentProvider)}. @author Rob
org.apache.jmeter.extractor.HtmlExtractor
org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor
HtmlExtractor.java @author Walter Kasper
org.vietspider.html.path2.HTMLExtractor
Author : Nhu Dinh Thuan nhudinhthuan@yahoo.com Dec 6, 2007

Examples of com.ontometrics.scraper.extraction.HtmlExtractor

      public URL getBaseUrl() {
        // TODO Auto-generated method stub
        return PagedListingFolder.getUrl();
      }
    };
    HtmlExtractor htmlExtractor = html().url(PagedListingTable.getUrl());


    List<Record> records = new ListingDetailScraper().setConvertURLs(true)
        .iterator(pageIterator)
        .listing(new LinkExtractor().source(htmlExtractor))
        .details(new DefaultFieldExtractor().source(htmlExtractor))

View Full Code Here

Examples of org.apache.jmeter.extractor.HtmlExtractor


    @Override
    public void configure(TestElement el) {
        super.configure(el);
        if (el instanceof HtmlExtractor){
            HtmlExtractor htmlExtractor = (HtmlExtractor) el;
            showScopeSettings(htmlExtractor, true);
            expressionField.setText(htmlExtractor.getExpression());
            attributeField.setText(htmlExtractor.getAttribute());
            defaultField.setText(htmlExtractor.getDefaultValue());
            matchNumberField.setText(htmlExtractor.getMatchNumberAsString());
            refNameField.setText(htmlExtractor.getRefName());
            extractorImplName.setSelectedItem(htmlExtractor.getExtractor());
        }
    }

View Full Code Here

Examples of org.apache.jmeter.extractor.HtmlExtractor

    /**
     * @see org.apache.jmeter.gui.JMeterGUIComponent#createTestElement()
     */
    @Override
    public TestElement createTestElement() {
        AbstractScopedTestElement extractor = new HtmlExtractor();
        modifyTestElement(extractor);
        return extractor;
    }

View Full Code Here

Examples of org.apache.jmeter.extractor.HtmlExtractor

     */
    @Override
    public void modifyTestElement(TestElement extractor) {
        super.configureTestElement(extractor);
        if (extractor instanceof HtmlExtractor) {
            HtmlExtractor htmlExtractor = (HtmlExtractor) extractor;
            saveScopeSettings(htmlExtractor);
            htmlExtractor.setRefName(refNameField.getText());
            htmlExtractor.setExpression(expressionField.getText());
            htmlExtractor.setAttribute(attributeField.getText());
            htmlExtractor.setDefaultValue(defaultField.getText());
            htmlExtractor.setMatchNumber(matchNumberField.getText());
            if(extractorImplName.getSelectedIndex()< HtmlExtractor.getImplementations().length) {
                htmlExtractor.setExtractor(HtmlExtractor.getImplementations()[extractorImplName.getSelectedIndex()]);
            } else {
                htmlExtractor.setExtractor(USE_DEFAULT_EXTRACTOR_IMPL);               
            }


        }
    }

View Full Code Here

Examples of org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor

     * @throws ExtractorException if there is an error during extraction
     * @throws IOException if there is an error when reading the document
     */
    @Test
    public void testMFExtraction() throws Exception {
        HtmlExtractor extractor = new HtmlExtractor(registry, parser);
        MGraph model = new SimpleMGraph();
        String testFile = "test-MF.html";


        // extract text from RDFa annotated html
        InputStream in = getResourceAsStream(testFile);
        assertNotNull("failed to load resource " + testFile, in);


        extractor.extract("file://" + testFile,in,null, "text/html", model);


        // show triples
        int tripleCounter = model.size();
        LOG.debug("Microformat triples: {}",tripleCounter);
        printTriples(model);

View Full Code Here

Examples of org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor

     * 
     * @throws Exception
     */
    @Test
    public void testMicrodataExtraction() throws Exception {
      HtmlExtractor extractor = new HtmlExtractor(registry, parser);
      MGraph model = new SimpleMGraph();
      String testFile = "test-microdata.html";


      // extract text from RDFa annotated html
      InputStream in = getResourceAsStream(testFile);
      assertNotNull("failed to load resource " + testFile, in);


      extractor.extract("file://" + testFile,in,null, "text/html", model);


      // show triples
      int tripleCounter = model.size();
      LOG.debug("Microdata triples: {}",tripleCounter);
      printTriples(model);

View Full Code Here

Examples of org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor

     * 
     * @throws Exception
     */
    @Test
    public void testRootExtraction() throws Exception {
        HtmlExtractor extractor = new HtmlExtractor(registry, parser);
        MGraph model = new SimpleMGraph();
        String testFile = "test-MultiRoot.html";


        // extract text from RDFa annotated html
        InputStream in = getResourceAsStream(testFile);
        assertNotNull("failed to load resource " + testFile, in);


        extractor.extract("file://" + testFile,in,null, "text/html", model);


        // show triples
        int tripleCounter = model.size();
        LOG.debug("Triples: {}",tripleCounter);
        printTriples(model);

View Full Code Here

Examples of org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor

        return CANNOT_ENHANCE;
    }
    
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        HtmlExtractor extractor = new HtmlExtractor(htmlExtractorRegistry, htmlParser);
        MGraph model = new SimpleMGraph();
        ci.getLock().readLock().lock();
        try {
            extractor.extract(ci.getUri().getUnicodeString(), ci.getStream(),null, ci.getMimeType(), model);
        } catch (ExtractorException e) {
            throw new EngineException("Error while processing ContentItem "
                    + ci.getUri()+" with HtmlExtractor",e);
        } finally {
            ci.getLock().readLock().unlock();

View Full Code Here

Examples of org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor

     * @throws ExtractorException if there is an error during extraction
     * @throws IOException if there is an error when reading the document
     */
    @Test
    public void testRdfaExtraction() throws Exception {
        HtmlExtractor extractor = new HtmlExtractor(registry, parser);
        MGraph model = new SimpleMGraph();
        String testFile = "test-rdfa.html";
        // extract text from RDFa annotated html
        InputStream in = getResourceAsStream(testFile);
        assertNotNull("failed to load resource " + testFile, in);


        extractor.extract("file://" + testFile,in,null, "text/html", model);


        // show triples
        int tripleCounter = model.size();
        LOG.debug("RDFa triples: {}",tripleCounter);
        printTriples(model);

View Full Code Here

Examples of org.vietspider.html.path2.HTMLExtractor

        tree.removeAll();
        if(document != null) handler.createTreeItem(tree, document); 
        new AutoSelectDataNode2(document, url, handler, tree);
        
        if(paths.length < 1) {
          HTMLExtractor extractor  = new HTMLExtractor();
          NodePathParser pathParser = new NodePathParser();
          if(hyperlinkUtil == null) hyperlinkUtil = new HyperLinkUtil();
          HTMLNode header = null;
          HTMLNode body = null;
          try {
            NodePath nodePath  = pathParser.toPath("HEAD");
            header = extractor.lookNode(document.getRoot(), nodePath);
            nodePath  = pathParser.toPath("BODY");
            body = extractor.lookNode(document.getRoot(), nodePath);
          } catch (Exception e) {
            ClientLog.getInstance().setException(getShell(), e);
          }
          
          if(header == null || body == null) return;

View Full Code Here

0 1 2

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.