Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.ParseData


                bodyValue);
          }
        }
      }
    }
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
        metadataCollector.getTitle(), metadataCollector.getOutlinks(),
        contentMeta, metadataCollector.getData());
    ParseResult parseResult = ParseResult.createParseResult(content
        .getUrl(),
        new ParseImpl(metadataCollector.getText(), parseData));
View Full Code Here


    reader = new SequenceFile.Reader(fs, parseData, conf);
   
    READ_PARSE_DATA:
      do {
      Text key = new Text();
      ParseData value = new ParseData();
      if(!reader.next(key, value)) break READ_PARSE_DATA;
      // make sure they all contain "nutch.segment.name" and "nutch.content.digest"
      // keys in parse metadata
      Metadata contentMeta = value.getContentMeta();
      if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null
            && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
        handledurls.add(key.toString());
      }
    } while(true);
View Full Code Here

                     OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;

    while (values.hasNext()) {
      final Writable value = values.next().get(); // unwrap
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
      } else if (value instanceof CrawlDatum) {
        final CrawlDatum datum = (CrawlDatum)value;
        if (CrawlDatum.hasDbStatus(datum)) {
          dbDatum = datum;
        }
        else if (CrawlDatum.hasFetchStatus(datum)) {
          // don't index unmodified (empty) pages
          if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
            fetchDatum = datum;
          }
        } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
                   CrawlDatum.STATUS_SIGNATURE == datum.getStatus() ||
                   CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
          continue;
        } else {
          throw new RuntimeException("Unexpected status: "+datum.getStatus());
        }
      } else if (value instanceof ParseData) {
        parseData = (ParseData)value;

        // Handle robots meta? https://issues.apache.org/jira/browse/NUTCH-1434
        if (deleteRobotsNoIndex) {
          // Get the robots meta data
          String robotsMeta = parseData.getMeta("robots");

          // Has it a noindex for this url?
          if (robotsMeta != null && robotsMeta.toLowerCase().indexOf("noindex") != -1) {
            // Delete it!
            NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
            output.collect(key, action);
            return;
          }
        }
      } else if (value instanceof ParseText) {
        parseText = (ParseText)value;
      } else if (LOG.isWarnEnabled()) {
        LOG.warn("Unrecognized type: "+value.getClass());
      }
    }
   
    // Whether to delete GONE or REDIRECTS
    if (delete && fetchDatum != null && dbDatum != null) {   
      if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
        reporter.incrCounter("IndexerStatus", "Documents deleted", 1);

        NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
        output.collect(key, action);
        return;
      }
     
      if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
          fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
          dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
          dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
        reporter.incrCounter("IndexerStatus", "Deleted redirects", 1);
        reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);

        NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
        output.collect(key, action);
        return;
      }
    }

    if (fetchDatum == null || dbDatum == null
        || parseText == null || parseData == null) {
      return;                                     // only have inlinks
    }

    // Whether to delete pages marked as duplicates
    if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
      reporter.incrCounter("IndexerStatus", "Duplicates deleted", 1);
      NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
      output.collect(key, action);
      return;
    }
   
    // Whether to skip DB_NOTMODIFIED pages
    if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
      reporter.incrCounter("IndexerStatus", "Skipped", 1);
      return;
    }

    if (!parseData.getStatus().isSuccess() ||
        fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
      return;
    }

    NutchDocument doc = new NutchDocument();
    final Metadata metadata = parseData.getContentMeta();

    // add segment, used to map from merged index back to segment files
    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));

    // add digest, used by dedup
View Full Code Here

                     OutputCollector<Text, NutchDocument> output, Reporter reporter)
    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;
    while (values.hasNext()) {
      final Writable value = values.next().get(); // unwrap
      if (value instanceof Inlinks) {
        inlinks = (Inlinks)value;
      } else if (value instanceof CrawlDatum) {
        final CrawlDatum datum = (CrawlDatum)value;
        if (CrawlDatum.hasDbStatus(datum))
          dbDatum = datum;
        else if (CrawlDatum.hasFetchStatus(datum)) {
          // don't index unmodified (empty) pages
          if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED)
            fetchDatum = datum;
        } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
                   CrawlDatum.STATUS_SIGNATURE == datum.getStatus() ||
                   CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
          continue;
        } else {
          throw new RuntimeException("Unexpected status: "+datum.getStatus());
        }
      } else if (value instanceof ParseData) {
        parseData = (ParseData)value;
      } else if (value instanceof ParseText) {
        parseText = (ParseText)value;
      } else if (LOG.isWarnEnabled()) {
        LOG.warn("Unrecognized type: "+value.getClass());
      }
    }

    if (fetchDatum == null || dbDatum == null
        || parseText == null || parseData == null) {
      return;                                     // only have inlinks
    }

    if (!parseData.getStatus().isSuccess() ||
        fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
      return;
    }

    NutchDocument doc = new NutchDocument();
    final Metadata metadata = parseData.getContentMeta();

    // add segment, used to map from merged index back to segment files
    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));

    // add digest, used by dedup
View Full Code Here

      OutputCollector<Text, MetaWrapper> output, Reporter reporter) throws IOException {
    CrawlDatum lastG = null;
    CrawlDatum lastF = null;
    CrawlDatum lastSig = null;
    Content lastC = null;
    ParseData lastPD = null;
    ParseText lastPT = null;
    String lastGname = null;
    String lastFname = null;
    String lastSigname = null;
    String lastCname = null;
View Full Code Here

      if (value instanceof ParseData) {

        // get the parse data and the outlinks from the parse data, along with
        // the fetch time for those links
        ParseData data = (ParseData)value;
        long fetchTime = getFetchTime(data);
        Outlink[] outlinkAr = data.getOutlinks();
        Map<String, String> outlinkMap = new LinkedHashMap<String, String>();

        // normalize urls and put into map
        if (outlinkAr != null && outlinkAr.length > 0) {
          for (int i = 0; i < outlinkAr.length; i++) {
View Full Code Here

      OutputCollector<Text, FieldsWritable> output, Reporter reporter)
      throws IOException {

      Node nodeDb = null;
      List<CrawlDatum> fetchDatums = new ArrayList<CrawlDatum>();
      ParseData parseData = null;
      ParseText parseText = null;
      List<FieldWritable> fieldsList = new ArrayList<FieldWritable>();

      // assign values, url must be successfully fetched and parsed
      while (values.hasNext()) {

        ObjectWritable objWrite = values.next();
        Object value = objWrite.get();
        if (value instanceof CrawlDatum) {
          CrawlDatum datum = (CrawlDatum)value;
          if (datum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
            fetchDatums.add(datum);
          }
        }
        else if (value instanceof Node) {
          nodeDb = (Node)value;
        }
        else if (value instanceof ParseData
          && ((ParseData)value).getStatus().isSuccess()) {
          parseData = (ParseData)value;
        }
        else if (value instanceof ParseText) {
          parseText = (ParseText)value;
        }
      }

      // if not successfully fetched and parsed then stop processing
      int numDatums = fetchDatums.size();
      if (numDatums == 0 || nodeDb == null || parseText == null
        || parseData == null) {
        return;
      }

      // get the most recent fetch time, this is duplicates inside of a single
      // segment, usually due to redirects
      CrawlDatum fetchDatum = null;
      long mostRecent = 0L;
      for (CrawlDatum cur : fetchDatums) {
        long fetchTime = cur.getFetchTime();
        if (fetchDatum == null || fetchTime > mostRecent) {
          fetchDatum = cur;
          mostRecent = fetchTime;
        }
      }

      // get parse metadata
      Metadata metadata = parseData.getContentMeta();
      Parse parse = new ParseImpl(parseText, parseData);

      // handle redirect urls
      Text reprUrlText = (Text)fetchDatum.getMetaData().get(
        Nutch.WRITABLE_REPR_URL_KEY);
View Full Code Here

    }

    if (text == null) { text = ""; }
    if (title == null) { title = ""; }

    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
                                        outlinks, content.getMetadata(),
                                        metadata);
    return ParseResult.createParseResult(content.getUrl(),
                                         new ParseImpl(text, parseData));
  }
View Full Code Here

            //check that there are 3 outlinks:
            //http://test.channel.com
            //http://www-scf.usc.edu/~mattmann/
            //http://www.nutch.org

            ParseData theParseData = parse.getData();

            Outlink[] theOutlinks = theParseData.getOutlinks();

            assertTrue("There aren't 3 outlinks read!", theOutlinks.length == 3);

            //now check to make sure that those are the two outlinks
            boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
View Full Code Here

  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.ParseData

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact coftware#gmail.com.