Examples of org.archive.io.arc.ARCRecordMetaData

  private SearchResult adaptInner(ARCRecord rec) throws IOException {
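    // Closing the record before reading metadata is intentional: close()
    // drains the record and finalizes its digest, so getDigestStr() below
    // returns a value (see the "side-effect is calculation of digest"
    // comment in the NutchWAX example further down).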
    rec.close();
    ARCRecordMetaData meta = rec.getMetaData();
   
    SearchResult result = new SearchResult();
    String arcName = meta.getArc();
    int index = arcName.lastIndexOf(File.separator);
    if (index > 0 && (index + 1) < arcName.length()) {
        arcName = arcName.substring(index + 1);
    }
    result.put(WaybackConstants.RESULT_ARC_FILE, arcName);
    result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta
        .getOffset()));
   
    // initialize with default HTTP code...
    result.put(WaybackConstants.RESULT_HTTP_CODE, "-");
   
    result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());
    result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype());
    result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate());
   
    String uriStr = meta.getUrl();
    if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) {
      // skip filedesc record altogether...
      return null;
    }
    if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) {
      // skip URL + HTTP header processing for dns records...
   
      String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX
          .length());
      result.put(WaybackConstants.RESULT_ORIG_HOST, origHost);
      result.put(WaybackConstants.RESULT_REDIRECT_URL, "-");
      result.put(WaybackConstants.RESULT_URL, uriStr);
      result.put(WaybackConstants.RESULT_URL_KEY, uriStr);
   
    } else {
   
      UURI uri = UURIFactory.getInstance(uriStr);
      result.put(WaybackConstants.RESULT_URL, uriStr);
   
      String uriHost = uri.getHost();
      if (uriHost == null) {
        LOGGER.info("No host in " + uriStr + " in " + meta.getArc());
      } else {
        result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost);
   
        String statusCode = (meta.getStatusCode() == null) ? "-" : meta
            .getStatusCode();
        result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode);
   
        String redirectUrl = "-";
        Header[] headers = rec.getHttpHeaders();
        if (headers != null) {
   
          for (int i = 0; i < headers.length; i++) {
            if (headers[i].getName().equals(
                WaybackConstants.LOCATION_HTTP_HEADER)) {

              String locationStr = headers[i].getValue();
              // TODO: "Location" is supposed to be absolute:
              // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
              // (section 14.30) but Content-Location can be
              // relative.
              // is it correct to resolve a relative Location, as
              // we are?
              // it's also possible to have both in the HTTP
              // headers...
              // should we prefer one over the other?
              // right now, we're ignoring "Content-Location"
              try {
                UURI uriRedirect = UURIFactory.getInstance(uri,
                    locationStr);
                redirectUrl = uriRedirect.getEscapedURI();
   
              } catch (URIException e) {
                LOGGER.info("Bad Location: " + locationStr
                    + " for " + uriStr + " in "
                    + meta.getArc() + " Skipped");
              }
              break;
            }
          }
        }
        result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl);
   
        String indexUrl = canonicalizer.urlStringToKey(meta.getUrl());
        result.put(WaybackConstants.RESULT_URL_KEY, indexUrl);
      }
   
    }
    return result;
  }
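
Outside the adapter, the same metadata fields are reachable from any ARCRecord. Below is a minimal, hypothetical sketch of iterating an ARC file and printing the fields the adapter consumes; it assumes the Heritrix/Wayback archive libraries are on the classpath, and the class name and argument handling are illustrative only.

import java.io.File;
import java.util.Iterator;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;

public class ArcMetaDump {
  public static void main(String[] args) throws Exception {
    ARCReader reader = ARCReaderFactory.get(new File(args[0]));
    for (Iterator<?> it = reader.iterator(); it.hasNext();) {
      ARCRecord rec = (ARCRecord) it.next();
      ARCRecordMetaData meta = rec.getMetaData();
      // The same fields the adapter maps into a SearchResult:
      System.out.println(meta.getDate() + " " + meta.getUrl() + " "
          + meta.getMimetype() + " "
          + ((meta.getStatusCode() == null) ? "-" : meta.getStatusCode())
          + " " + meta.getOffset() + " " + meta.getArc());
      rec.close();
    }
    reader.close();
  }
}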

Examples of org.archive.io.arc.ARCRecordMetaData

    checkCollectionName();
   
    final ARCRecordMetaData arcData = rec.getMetaData();
    String oldUrl = url;
   
    try
    {
      url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_FETCHER);
      url = filters.filter(url); // filter the url
    }
    catch (Exception e)
    {
      LOG.warn("Skipping record. Didn't pass normalization/filter " +
        oldUrl + ": " + e.toString());

      return;
    }

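    // getContentBegin() marks where the record body starts (just past the
    // HTTP response header), so the usable length excludes those header bytes.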
    final long b = arcData.getContentBegin();
    final long l = arcData.getLength();
    final long recordLength = (l > b)? (l - b): l;

    // Look at ARCRecord meta data line mimetype. It can be empty.  If so,
    // two more chances at figuring it either by looking at HTTP headers or
    // by looking at first couple of bytes of the file.  See below.
    String mimetype =
      getMimetype(arcData.getMimetype(), this.mimeTypes, url);
   
    if (skip(mimetype))
    {
      return;
    }

    // Copy http headers to nutch metadata.
    final Metadata metaData = new Metadata();
    final Header[] headers = rec.getHttpHeaders();
    for (int j = 0; j < headers.length; j++)
    {
      final Header header = headers[j];
     
      if (mimetype == null)
      {
        // Special handling. If mimetype is still null, try getting it
        // from the http header. I've seen arc record lines with empty
        // content-type and a MIME unparseable file ending; i.e. .MID.
        if ((header.getName() != null) &&
          header.getName().toLowerCase().equals(ImportArcs.CONTENT_TYPE_KEY))
        {
          mimetype = getMimetype(header.getValue(), null, null);
         
          if (skip(mimetype))
          {
            return;
          }
        }
      }
     
      metaData.set(header.getName(), header.getValue());
    }

    // This call to reporter setStatus pings the tasktracker telling it our
    // status and telling the task tracker we're still alive (so it doesn't
    // time us out).
    final String noSpacesMimetype =
      TextUtils.replaceAll(ImportArcs.WHITESPACE,
      ((mimetype == null || mimetype.length() <= 0)?
      "TODO": mimetype),
      "-");
    final String recordLengthAsStr = Long.toString(recordLength);
   
    reporter.setStatus(getStatus(url, oldUrl, recordLengthAsStr, noSpacesMimetype));

    // This is a nutch 'more' field.
    metaData.set("contentLength", recordLengthAsStr);

    rec.skipHttpHeader();
    reporter.setStatusIfElapse("read headers on " + url);

    // TODO: Skip if unindexable type.
    int total = 0;
   
    // Read in first block. If mimetype still null, look for MAGIC.
    int len = rec.read(this.buffer, 0, this.buffer.length);
   
    if (mimetype == null)
    {
      MimeType mt = this.mimeTypes.getMimeType(this.buffer);
     
      if (mt == null || mt.getName() == null)
      {
        LOG.warn("Failed to get mimetype for: " + url);
       
        return;
      }
     
      mimetype = mt.getName();
    }
   
    metaData.set(ImportArcs.CONTENT_TYPE_KEY, mimetype);

    // How much do we read total? If pdf, we will read more. If equal to -1,
    // read all.
    int readLimit = (ImportArcs.PDF_TYPE.equals(mimetype))?
      this.pdfContentLimit : this.contentLimit;
   
    // Reset our contentBuffer so can reuse.  Over the life of an ARC
    // processing will grow to maximum record size.
    this.contentBuffer.reset();
    while ((len != -1) && ((readLimit == -1) || (total < readLimit)))
    {
      total += len;
      this.contentBuffer.write(this.buffer, 0, len);
      len = rec.read(this.buffer, 0, this.buffer.length);
      reporter.setStatusIfElapse("reading " + url);
    }

    // Close the Record.  We're done with it.  Side-effect is calculation
    // of digest -- if we're digesting.
    rec.close();
    reporter.setStatusIfElapse("closed " + url);

    final byte[] contentBytes = this.contentBuffer.toByteArray();
    final CrawlDatum datum = new CrawlDatum();
    datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);

    // Calculate digest or use precalculated sha1.
    String digest = (this.sha1) ? rec.getDigestStr()
        : MD5Hash.digest(contentBytes).toString();
    metaData.set(Nutch.SIGNATURE_KEY, digest);
   
    // Set digest back into the arcData so available later when we write
    // CDX line.
    arcData.setDigest(digest);

    metaData.set(Nutch.SEGMENT_NAME_KEY, this.segmentName);
   
    // Score at this stage is 1.0f.
    metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore()));

    final long startTime = System.currentTimeMillis();
    final Content content = new Content(url, url, contentBytes, mimetype,
      metaData, getConf());
    datum.setFetchTime(Nutchwax.getDate(arcData.getDate()));

    MapWritable mw = datum.getMetaData();
   
    if (mw == null)
    {
      mw = new MapWritable();
    }
           
    if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
      mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY),
          new Text(SqlSearcher.getCollectionNameWithTimestamp(
              collectionName, arcData.getDate())));
    }
    else {
      mw.put(new Text(ImportArcs.ARCCOLLECTION_KEY), new Text(collectionName));
    }
    mw.put(new Text(ImportArcs.ARCFILENAME_KEY), new Text(arcName));
    mw.put(new Text(ImportArcs.ARCFILEOFFSET_KEY),
      new Text(Long.toString(arcData.getOffset())));
    datum.setMetaData(mw);
         
    TimeoutParsingThread tout = threadPool.getThread(
        Thread.currentThread().getId(), timeoutIndexingDocument);
    tout.setUrl(url);
    tout.setContent(content);
    tout.setParseUtil(parseUtil);
    tout.wakeupAndWait();

    ParseStatus parseStatus = tout.getParseStatus();
    Parse parse = tout.getParse();
    reporter.setStatusIfElapse("parsed " + url);

    if (!parseStatus.isSuccess()) {
      final String status = formatToOneLine(parseStatus.toString());
      LOG.warn("Error parsing: " + mimetype + " " + url + ": " + status);
      parse = null;
    }
    else {
      // Was it a slow parse?
      final double kbPerSecond = getParseRate(startTime,
        (contentBytes != null) ? contentBytes.length : 0);
     
      if (LOG.isDebugEnabled())
      {
        LOG.debug(getParseRateLogMessage(url,
          noSpacesMimetype, kbPerSecond));
      }
      else if (kbPerSecond < this.parseThreshold)
      {
        LOG.warn(getParseRateLogMessage(url, noSpacesMimetype,
          kbPerSecond));
      }
    }

    Writable v = new FetcherOutput(datum, null,
      parse != null ? new ParseImpl(parse) : null);      
    if (collectionType.equals(Global.COLLECTION_TYPE_MULTIPLE)) {
      String collection = SqlSearcher.getCollectionNameWithTimestamp(
          this.collectionName, arcData.getDate());
      LOG.info("multiple: " + collection + " " + url);
      output.collect(Nutchwax.generateWaxKey(url, collection), v);
    }
    else {
      output.collect(Nutchwax.generateWaxKey(url, this.collectionName), v);
    }
  }
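
The while loop above is the core content-copy pattern: stream the record body into a growable buffer, stopping at a configured limit. A condensed, standalone sketch of just that pattern (here readLimit of -1 means read everything, and rec is assumed to be an open ARCRecord positioned past the HTTP header):

java.io.ByteArrayOutputStream contentBuffer = new java.io.ByteArrayOutputStream();
byte[] buffer = new byte[8 * 1024];
int readLimit = -1; // or a configured cap such as contentLimit
int total = 0;
int len = rec.read(buffer, 0, buffer.length);
while ((len != -1) && ((readLimit == -1) || (total < readLimit))) {
  total += len;
  contentBuffer.write(buffer, 0, len);
  len = rec.read(buffer, 0, buffer.length);
}
rec.close(); // side-effect: finalizes the record digest, if digesting
byte[] contentBytes = contentBuffer.toByteArray();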

Examples of org.archive.io.arc.ARCRecordMetaData

    if (!isIndex(rec)) {
      // If it is the first record, skip it so there are no errors from this record.
      reporter.incrCounter(Counter.ISINDEX, 1);
      return;
    }

    final ARCRecordMetaData arcData = rec.getMetaData();
    // Look at ARCRecord meta data line mimetype. It can be empty. If so,
    // two more chances at figuring it either by looking at HTTP headers or
    // by looking at first couple of bytes of the file. See below.
    String mimetype = getMimetype(arcData.getMimetype(), this.mimeTypes, url);
    rec.skipHttpHeader();
    reporter.setStatusIfElapse("read headers on " + url);

    // Read in first block. If mimetype still null, look for MAGIC.
    int len = rec.read(this.buffer, 0, this.buffer.length);
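
The isIndex() guard at the top of this snippet filters out the ARC file's leading "filedesc" header record, which describes the file itself rather than a capture. A hypothetical equivalent check, written against the metadata URL the way the first example above does:

// The first record of every ARC file is the "filedesc" record; its URL
// starts with the ARC magic number ("filedesc:") and has no harvested body.
private static boolean isFiledescRecord(ARCRecord rec) {
  String url = rec.getMetaData().getUrl();
  return (url != null) && url.startsWith(ARCRecord.ARC_MAGIC_NUMBER);
}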

Examples of org.archive.io.arc.ARCRecordMetaData


  private CaptureSearchResult adaptInner(ARCRecord rec) throws IOException {
    rec.close();
    ARCRecordMetaData meta = rec.getMetaData();
   
    CaptureSearchResult result = new CaptureSearchResult();
    String arcName = meta.getArc();
    int index = arcName.lastIndexOf(File.separator);
    if (index > 0 && (index + 1) < arcName.length()) {
        arcName = arcName.substring(index + 1);
    }
    result.setFile(arcName);
    result.setOffset(meta.getOffset());
   
    // initialize with default HTTP code...
    result.setHttpCode("-");
    result.setRedirectUrl("-");
   
//    result.setDigest("sha1:"+rec.getDigestStr());
    result.setDigest(rec.getDigestStr());
    result.setCaptureTimestamp(meta.getDate());
    String uriStr = meta.getUrl();
    result.setOriginalUrl(uriStr);
   
   
    if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) {
      result.setMimeType(ARC_FILEDESC_VERSION);
    } else if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) {
      // skip URL + HTTP header processing for dns records...
   
      result.setUrlKey(uriStr);
      result.setMimeType("text/dns");
      result.setCompressedLength(rec.compressedBytes);

    } else {
   
      result.setUrlKey(canonicalizer.urlStringToKey(uriStr));
   
      String statusCode = (meta.getStatusCode() == null) ? "-" : meta
          .getStatusCode();
      result.setHttpCode(statusCode);
 
      Header[] headers = rec.getHttpHeaders();
      annotater.annotateHTTPContent(result, rec, headers, meta.getMimetype());
    }
    return result;
  }
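
The urlStringToKey() call above is Wayback's canonicalization step: it reduces an original URL to the lookup key stored in the index. A hedged sketch, assuming Wayback's AggressiveUrlCanonicalizer implementation of the UrlCanonicalizer interface (the concrete class and its package vary by Wayback version, so treat both as assumptions):

UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer();
try {
  // Canonicalization typically lowercases the host and strips details such
  // as "www" prefixes and default ports; exact output depends on the rules.
  String key = canonicalizer.urlStringToKey("http://WWW.Example.COM/a/b.html");
  System.out.println(key);
} catch (URIException e) {
  // Malformed URL; the adapters above log and skip such records.
}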