Examples of org.apache.nutch.parse.ParseData

org.apache.nutch.parse.ParseData
Data extracted from a page's content. @see Parse#getData()

    String class1 = "NonExistingFilter";
    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);


    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new Document(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
  }

View Full Code Here

        }
        // if (LOG.isInfoEnabled()) {
        //   LOG.info("Outlinks: "+outlinks);
        // }


        ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
                contentTitle.toString(), outlinks, content.getMetadata());
        parseData.setConf(this.conf);
        return new ParseImpl(indexText.toString(), parseData);
    }

View Full Code Here

      title = "";


    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata());
    parseData.setConf(this.conf);
    return new ParseImpl(text, parseData);
  }

View Full Code Here

    }


    String text = delegate.getText();


    return new ParseImpl(text,
                         new ParseData(ParseStatus.STATUS_SUCCESS,
                                       title,
                                       OutlinkExtractor
        .                              getOutlinks(text, this.conf),
                                       content.getMetadata(),
                                       metadata));

View Full Code Here

            //check that there are 3 outlinks:
            //http://test.channel.com
            //http://www-scf.usc.edu/~mattmann/
            //http://www.nutch.org


            ParseData theParseData = parse.getData();


            Outlink[] theOutlinks = theParseData.getOutlinks();


            assertTrue("There aren't 3 outlinks read!", theOutlinks.length == 3);


            //now check to make sure that those are the three outlinks
            boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;

View Full Code Here

  public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
    String lastCname = null;

View Full Code Here

    }
    Path parseDir = new Path(segment, ParseData.DIR_NAME);
    if (fs.exists(fetchDir) && fs.isDirectory(fetchDir)) {
      cnt = 0L;
      long errors = 0L;
      ParseData value = new ParseData();
      MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
      for (int i = 0; i < mreaders.length; i++) {
        while (mreaders[i].next(key, value)) {
          cnt++;
          if (!value.getStatus().isSuccess()) errors++;
        }
        mreaders[i].close();
      }
      stats.parsed = cnt;
      stats.parseErrors = errors;

View Full Code Here

  public void testContent() throws Exception {


    Parse parse = new ParseUtil(NutchConfiguration.create())
                        .parseByExtensionId("parse-mspowerpoint", this.content);


    ParseData data = parse.getData();
    String text = parse.getText();


    assertTrue("No content extracted length ==0", text.length() > 0);
    
    this.dumpToFile(this.testFile.getName(), data, text);

View Full Code Here

  public void testMeta() throws Exception {


    Parse parse = new ParseUtil(NutchConfiguration.create())
                        .parseByExtensionId("parse-mspowerpoint", content);
    
    ParseData data = parse.getData();


    final FileExtensionFilter titleFilter = new FileExtensionFilter(
        this.testFile.getName() + ".meta");
    final File[] titleFiles = this.sampleDir.listFiles(titleFilter);


    if (titleFiles.length > 0) {
      assertEquals("Document Title", this.fileToString(titleFiles[0]),
          "Title: " + data.getTitle() + LINE_SEPARATOR +
          "Outlinks: " + data.getOutlinks().length + LINE_SEPARATOR);
    } else {
      assertTrue("Document Title length ==0", data.getTitle().length() > 0);
      LOG.info("Comparison file for Title not available: "
          + this.testFile.getName() + ".meta");
    }
  }

View Full Code Here

      List list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getData().getStatus();
      String text = parse.getText();
      Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
      ParseData parseData = new ParseData(status, title, newlinks,
                                          parse.getData().getContentMeta(),
                                          parse.getData().getParseMeta());
      parseData.setConf(this.conf);
      parse = new ParseImpl(text, parseData);
    }
    return parse;
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.parse.ParseData

com.atlantbh.nutch.filter.index.omit.OmitIndexingFilterTest

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilterTest

com.atlantbh.nutch.filter.xpath.XPathIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.AlternativeDataFlowIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlowTest

org.apache.commons.cli.Options

org.apache.hadoop.conf.Configuration

org.apache.hadoop.fs.FileSystem

org.apache.hadoop.util.GenericOptionsParser

org.apache.nutch.fetcher.TestFetcher

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.