Examples of org.apache.nutch.parse.ParseData

org.apache.nutch.parse.ParseData
Data extracted from a page's content. @see Parse#getData()

   * index.
   *  
   */
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
                         Inlinks inlinks) throws IndexingException {
    ParseData parseData = parse.getData();
    Metadata parseMeta = parseData.getParseMeta();
    
    String[] authors = parseMeta.getValues(Feed.FEED_AUTHOR);
    String[] tags = parseMeta.getValues(Feed.FEED_TAGS);
    String published = parseMeta.get(Feed.FEED_PUBLISHED);
    String updated = parseMeta.get(Feed.FEED_UPDATED);

View Full Code Here

      OutputCollector<Text, FieldsWritable> output, Reporter reporter)
      throws IOException {


      Node nodeDb = null;
      List<CrawlDatum> fetchDatums = new ArrayList<CrawlDatum>();
      ParseData parseData = null;
      ParseText parseText = null;
      List<FieldWritable> fieldsList = new ArrayList<FieldWritable>();


      // assign values, url must be successfully fetched and parsed
      while (values.hasNext()) {


        ObjectWritable objWrite = values.next();
        Object value = objWrite.get();
        if (value instanceof CrawlDatum) {
          CrawlDatum datum = (CrawlDatum)value;
          if (datum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
            fetchDatums.add(datum);
          }
        }
        else if (value instanceof Node) {
          nodeDb = (Node)value;
        }
        else if (value instanceof ParseData
          && ((ParseData)value).getStatus().isSuccess()) {
          parseData = (ParseData)value;
        }
        else if (value instanceof ParseText) {
          parseText = (ParseText)value;
        }
      }


      // if not successfully fetched and parsed then stop processing
      int numDatums = fetchDatums.size();
      if (numDatums == 0 || nodeDb == null || parseText == null
        || parseData == null) {
        return;
      }


      // get the most recent fetch time, this is duplicates inside of a single
      // segment, usually due to redirects
      CrawlDatum fetchDatum = null;
      long mostRecent = 0L;
      for (CrawlDatum cur : fetchDatums) {
        long fetchTime = cur.getFetchTime();
        if (fetchDatum == null || fetchTime > mostRecent) {
          fetchDatum = cur;
          mostRecent = fetchTime;
        }
      }


      // get parse metadata
      Metadata metadata = parseData.getContentMeta();
      Parse parse = new ParseImpl(parseText, parseData);


      // handle redirect urls
      Text reprUrlText = (Text)fetchDatum.getMetaData().get(
        Nutch.WRITABLE_REPR_URL_KEY);

View Full Code Here

      title = "";


    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata());
    return ParseResult.createParseResult(content.getUrl(), 
                                         new ParseImpl(text, parseData));
  }

View Full Code Here

      title = "";


    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata(),
                                        metadata);
    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
    // any filter?
    //return HtmlParseFilters.filter(content, parse, root);

View Full Code Here

          }


          Parse parse = (Parse)value;


          textOut.append(key, new ParseText(parse.getText()));
          ParseData parseData = parse.getData();


          // recover the signature prepared by Fetcher or ParseSegment
          String sig = parseData.getContentMeta().get(
            Nutch.SIGNATURE_KEY);
            
          if (sig != null)
          {
            byte[] signature = (sha1)?
              Base32.decode(sig): StringUtil.fromHexString(sig);
            
            if (signature != null)
            {
              // append a CrawlDatum with a signature
              CrawlDatum d = new CrawlDatum(
                CrawlDatum.STATUS_SIGNATURE, 0.0f);
              d.setSignature(signature);
              crawlOut.append(key, d);
            }
          }


          // collect outlinks for subsequent db update
          Outlink[] links = parseData.getOutlinks();
          if (ignoreExternalLinks)
          {
            try
            {
              fromHost = new URL(fromUrl).getHost().toLowerCase();

View Full Code Here

    
    if (fromUrl == null) { // discard all outlinks    
      return;
    }


    ParseData parseData = (ParseData)value;
    Outlink[] outlinks = parseData.getOutlinks();
    Inlinks inlinks = new Inlinks();
    
    String fromUrlCriginalColectionName=null; 
    String fromUrlTimestamp=null;
    if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {

View Full Code Here

    }
    Path parseDir = new Path(segment, ParseData.DIR_NAME);
    if (fs.exists(fetchDir) && fs.isDirectory(fetchDir)) {
      cnt = 0L;
      long errors = 0L;
      ParseData value = new ParseData();
      MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
      for (int i = 0; i < mreaders.length; i++) {
        while (mreaders[i].next(key, value)) {
          cnt++;
          if (!value.getStatus().isSuccess()) errors++;
        }
        mreaders[i].close();
      }
      stats.parsed = cnt;
      stats.parseErrors = errors;

View Full Code Here

      title = "";


    // collect outlink
    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());


    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata(),
                                        metadata);
    parseData.setConf(this.conf);
    return new ParseImpl(text, parseData);
    // any filter?
    //return HtmlParseFilters.filter(content, parse, root);
  }

View Full Code Here

    
  public Parse getParse(Content content)
  {
//    return new ParseImpl(content.getUrl(),  TODO MC BUG - don't index url as content
    return new ParseImpl("",
      new ParseData(ParseStatus.STATUS_SUCCESS,
      "", new Outlink[0], content.getMetadata()));
  }

View Full Code Here

  public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
    String lastCname = null;

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.parse.ParseData

com.atlantbh.nutch.filter.index.omit.OmitIndexingFilterTest

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilterTest

com.atlantbh.nutch.filter.xpath.XPathIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.AlternativeDataFlowIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlowTest

org.apache.commons.cli.Options

org.apache.hadoop.conf.Configuration

org.apache.hadoop.fs.FileSystem

org.apache.hadoop.util.GenericOptionsParser

org.apache.nutch.fetcher.TestFetcher

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.