Examples of ARCReporter


Examples of org.archive.mapred.ARCReporter

    // NOTE(review): this is an excerpt from the body of a Mapper map()
    // implementation (the method signature is not visible in this fragment).
    // It processes one ARC record: URL normalization/filtering, mimetype
    // detection, bounded content read, metadata assembly into a CrawlDatum,
    // then parsing via a pooled worker thread.
    // Assumption is that this map is being run by ARCMapRunner.
    // Otherwise, the below casts fail.
    String url = key.toString();
       
    ARCRecord rec = (ARCRecord)((ObjectWritable)value).get();
    ARCReporter reporter = (ARCReporter)r;      

    // Its null first time map is called on an ARC.
    checkArcName(rec);  
    if (! isIndex(rec))
    {
      return;
    }
    checkCollectionName();
   
    final ARCRecordMetaData arcData = rec.getMetaData();
    // Keep the pre-normalization URL around for the warning message below.
    String oldUrl = url;
   
    try
    {
      url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_FETCHER);
      url = filters.filter(url); // filter the url
    }
    catch (Exception e)
    {
      // Record is skipped (not failed) when the URL does not survive
      // normalization/filtering.
      LOG.warn("Skipping record. Didn't pass normalization/filter " +
        oldUrl + ": " + e.toString());

      return;
    }

    // Record length: bytes after the content-begin offset (i.e. past the
    // HTTP header) when that offset is sane; otherwise the full length.
    final long b = arcData.getContentBegin();
    final long l = arcData.getLength();
    final long recordLength = (l > b)? (l - b): l;

    // Look at ARCRecord meta data line mimetype. It can be empty.  If so,
    // two more chances at figuring it either by looking at HTTP headers or
    // by looking at first couple of bytes of the file.  See below.
    String mimetype =
      getMimetype(arcData.getMimetype(), this.mimeTypes, url);
   
    if (skip(mimetype))
    {
      return;
    }

    // Copy http headers to nutch metadata.
    final Metadata metaData = new Metadata();
    final Header[] headers = rec.getHttpHeaders();
    for (int j = 0; j < headers.length; j++)
    {
      final Header header = headers[j];
     
      if (mimetype == null)
      {
        // Special handling. If mimetype is still null, try getting it
        // from the http header. I've seen arc record lines with empty
        // content-type and a MIME unparseable file ending; i.e. .MID.
        if ((header.getName() != null) &&
          header.getName().toLowerCase().equals(ImportArcs.CONTENT_TYPE_KEY))
        {
          mimetype = getMimetype(header.getValue(), null, null);
         
          if (skip(mimetype))
          {
            return;
          }
        }
      }
     
      metaData.set(header.getName(), header.getValue());
    }

    // This call to reporter setStatus pings the tasktracker telling it our
    // status and telling the task tracker we're still alive (so it doesn't
    // time us out).
    // Whitespace in the mimetype is replaced with '-' so the status line
    // stays a single token; a null/empty mimetype becomes the placeholder
    // "TODO".
    final String noSpacesMimetype =
      TextUtils.replaceAll(ImportArcs.WHITESPACE,
      ((mimetype == null || mimetype.length() <= 0)?
      "TODO": mimetype),
      "-");
    final String recordLengthAsStr = Long.toString(recordLength);
   
    reporter.setStatus(getStatus(url, oldUrl, recordLengthAsStr, noSpacesMimetype));

    // This is a nutch 'more' field.
    metaData.set("contentLength", recordLengthAsStr);

    rec.skipHttpHeader();
    reporter.setStatusIfElapse("read headers on " + url);

    // TODO: Skip if unindexable type.
    int total = 0;
   
    // Read in first block. If mimetype still null, look for MAGIC.
    int len = rec.read(this.buffer, 0, this.buffer.length);
   
    if (mimetype == null)
    {
      MimeType mt = this.mimeTypes.getMimeType(this.buffer);
     
      if (mt == null || mt.getName() == null)
      {
        LOG.warn("Failed to get mimetype for: " + url);
       
        return;
      }
     
      mimetype = mt.getName();
    }
   
    metaData.set(ImportArcs.CONTENT_TYPE_KEY, mimetype);

    // How much do we read total? If pdf, we will read more. If equal to -1,
    // read all.
    int readLimit = (ImportArcs.PDF_TYPE.equals(mimetype))?
      this.pdfContentLimit : this.contentLimit;
   
    // Reset our contentBuffer so can reuse.  Over the life of an ARC
    // processing will grow to maximum record size.
    this.contentBuffer.reset();
    // Accumulate record content until EOF or until the per-type read limit
    // is reached (a limit of -1 means unbounded).
    while ((len != -1) && ((readLimit == -1) || (total < readLimit)))
    {
      total += len;
      this.contentBuffer.write(this.buffer, 0, len);
      len = rec.read(this.buffer, 0, this.buffer.length);
      reporter.setStatusIfElapse("reading " + url);
    }

    // Close the Record.  We're done with it.  Side-effect is calculation
    // of digest -- if we're digesting.
    rec.close();
    reporter.setStatusIfElapse("closed " + url);

    final byte[] contentBytes = this.contentBuffer.toByteArray();
    final CrawlDatum datum = new CrawlDatum();
    datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);

    // Calculate digest or use precalculated sha1.
    String digest = (this.sha1)? rec.getDigestStr():
    MD5Hash.digest(contentBytes).toString();
    metaData.set(Nutch.SIGNATURE_KEY, digest);
   
    // Set digest back into the arcData so available later when we write
    // CDX line.
    arcData.setDigest(digest);

    metaData.set(Nutch.SEGMENT_NAME_KEY, this.segmentName);
   
    // Score at this stage is 1.0f.
    metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore()));

    // NOTE(review): startTime appears unused within this excerpt --
    // presumably referenced further down in the full method; verify against
    // the complete source.
    final long startTime = System.currentTimeMillis();
    final Content content = new Content(url, url, contentBytes, mimetype,
      metaData, getConf());
    datum.setFetchTime(Nutchwax.getDate(arcData.getDate()));

    MapWritable mw = datum.getMetaData();
   
    if (mw == null)
    {
      mw = new MapWritable();
    }
           
    // For "multiple"-type collections the stored collection name is
    // qualified with the record's timestamp; otherwise the plain name is
    // stored.
    if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
      mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY), new Text(SqlSearcher.getCollectionNameWithTimestamp(collectionName,arcData.getDate())));  
    }
    else {
      mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY), new Text(collectionName));
    }   
    mw.put(new Text(ImportArcs.ARCFILENAME_KEY), new Text(arcName));
    mw.put(new Text(ImportArcs.ARCFILEOFFSET_KEY),
      new Text(Long.toString(arcData.getOffset())));
    datum.setMetaData(mw);
         
  // NOTE(review): the next statement is missing its trailing semicolon --
  // almost certainly an artifact of how this snippet was extracted; it will
  // not compile as-is.
  // Parsing is delegated to a pooled thread with a per-document timeout.
  TimeoutParsingThread tout=threadPool.getThread(Thread.currentThread().getId(),timeoutIndexingDocument)
  tout.setUrl(url);
    tout.setContent(content);
    tout.setParseUtil(parseUtil);         
    tout.wakeupAndWait();       
 
  ParseStatus parseStatus=tout.getParseStatus();
  Parse parse=tout.getParse();    
  reporter.setStatusIfElapse("parsed " + url);
    
  // On parse failure, log it and null out the parse so the (not shown)
  // remainder of the method can handle the record as unparsed.
  if (!parseStatus.isSuccess()) {
      final String status = formatToOneLine(parseStatus.toString());
      LOG.warn("Error parsing: " + mimetype + " " + url + ": " + status);
      parse = null;
  // NOTE(review): excerpt ends here; the rest of the method is not shown.
View Full Code Here (the snippet above is truncated at this point)

Examples of org.archive.mapred.ARCReporter

      // NOTE(review): this excerpt begins mid-signature -- the leading
      // parameters (key/value) of this map() method were cut off by the
      // extraction, and the body is truncated at the end as well.
      final OutputCollector output, final Reporter r) throws IOException {
    // Assumption is that this map is being run by ARCMapRunner.
    // Otherwise, the below casts fail.
    String url = key.toString();
    ARCRecord rec = (ARCRecord) ((ObjectWritable) value).get();
    ARCReporter reporter = (ARCReporter) r;
   
    // Counters track total records seen and index (filedesc) records skipped.
    reporter.incrCounter(Counter.TOTALFILES, 1);   
        if (!isIndex(rec)) {  // If it is the first record skip it so there are no errors from this record.
          reporter.incrCounter(Counter.ISINDEX, 1);
            return;
        }
       
    final ARCRecordMetaData arcData = rec.getMetaData();
    // Look at ARCRecord meta data line mimetype. It can be empty. If so,
    // two more chances at figuring it either by looking at HTTP headers or
    // by looking at first couple of bytes of the file. See below.
    String mimetype = getMimetype(arcData.getMimetype(), this.mimeTypes, url);
    rec.skipHttpHeader();
    reporter.setStatusIfElapse("read headers on " + url);

    // Read in first block. If mimetype still null, look for MAGIC.
    int len = rec.read(this.buffer, 0, this.buffer.length);
    // check mimetype
    if (mimetype == null) {
      MimeType mt = this.mimeTypes.getMimeType(this.buffer);

      if (mt == null || mt.getName() == null) {
        // NOTE(review): the two concatenated literals produce
        // "ProcessArcsFailed to get mimetype for: ..." with no separator --
        // likely a missing space/colon in the original message.
        LOG.warn("ProcessArcs" + "Failed to get mimetype for: " + url);

        return;
      }

      mimetype = mt.getName();
    }
   
    // filter documents
    if (filter(mimetype)) {
      return;
    }   
   
    // Reset our contentBuffer so can reuse. Over the life of an ARC
    // processing will grow to maximum record size.
    this.contentBuffer.reset();

    // Unlike limit-bounded importers, this loop reads the record's content
    // in full (until EOF).
    int total = 0;
    while ((len != -1)) {
      total += len;
      this.contentBuffer.write(this.buffer, 0, len);
      len = rec.read(this.buffer, 0, this.buffer.length);
      reporter.setStatusIfElapse("reading " + url);
    }

    // Close the Record. We're done with it. Side-effect is calculation
    // of digest -- if we're digesting.
    rec.close();
    reporter.setStatusIfElapse("closed " + url);

    final byte[] contentBytes = this.contentBuffer.toByteArray();

    // Html file from ARC
    // NOTE(review): excerpt ends here; the rest of the method is not shown.
    ByteArrayInputStream in = new ByteArrayInputStream(contentBytes);
View Full Code Here (the snippet above is truncated at this point)
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.