Package org.archive.wayback.resourcestore.indexer

Source Code of org.archive.wayback.resourcestore.indexer.WARCRecordToSearchResultAdapter

/*
*  This file is part of the Wayback archival access software
*   (http://archive-access.sourceforge.net/projects/wayback/).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.wayback.resourcestore.indexer;

import java.io.File;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.StatusLine;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.EncodingUtil;
import org.archive.format.warc.WARCConstants;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.arc.ARCConstants;
import org.archive.io.warc.WARCRecord;
import org.archive.util.LaxHttpParser;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.resourceindex.filters.WARCRevisitAnnotationFilter;
import org.archive.wayback.util.Adapter;
import org.archive.wayback.util.url.IdentityUrlCanonicalizer;

/**
* Adapts certain WARCRecords into SearchResults. DNS and response records are
* mostly straightforward, but SearchResult objects generated from revisit
* records contain lots of "placeholder" fields, which are expected to be
* understood by later processes traversing a stream of SearchResult objects.
*
* @author brad
* @version $Date$, $Revision$
* @see WARCRevisitAnnotationFilter
*/
public class WARCRecordToSearchResultAdapter
implements Adapter<WARCRecord,CaptureSearchResult>{
 
  private static final Logger LOGGER =
        Logger.getLogger(WARCRecordToSearchResultAdapter.class.getName());

  private static final String VERSION = "0.1.0";
  private static final String WARC_FILEDESC_VERSION =
    "warc/warcinfo" + VERSION;
 
  private final static String DEFAULT_VALUE = "-";
  private UrlCanonicalizer canonicalizer = null;
  private HTTPRecordAnnotater annotater = null;
 
  private boolean processAll = false;

  public WARCRecordToSearchResultAdapter() {
    canonicalizer = new IdentityUrlCanonicalizer();
    annotater = new HTTPRecordAnnotater();
  }

  /*
   * This just calls adaptInner, returning null if an Exception is thrown:
   */
  public CaptureSearchResult adapt(WARCRecord rec) {
    try {
      return adaptInner(rec);
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    } catch (OutOfMemoryError e) {
      e.printStackTrace();
      return null;
    }
  }

  private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException {
   
    ArchiveRecordHeader header = rec.getHeader();

    String typeStr = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
    WARCRecordType type;
    try {
      type = WARCRecordType.valueOf(typeStr);
    } catch (IllegalArgumentException e) {
      LOGGER.warning("Skipping unrecognized record type : " + typeStr);
      return null;
    }

    CaptureSearchResult result = genericResult(rec);

    switch (type) {
    case response:
      String mime = annotater.transformHTTPMime(header.getMimetype());
      if(mime != null && mime.equals("text/dns")) {
        // close to complete reading, then the digest is legit
        // TODO: DO we want to use the WARC header digest for this?
        rec.close();
        result.setDigest(transformWARCDigest(rec.getDigestStr()));
        result.setMimeType(mime);
      } else {
        result = adaptWARCHTTPResponse(result,rec);
      }
      break;

     
    case revisit:
      // also set the mime type:
      result.setMimeType("warc/revisit");
      break;
     
    case request:
      if(processAll) {
        // also set the mime type:
        result.setMimeType("warc/request");
      } else {
        result = null;
      }
      break;
     
    case metadata:
      if(processAll) {
        // also set the mime type:
        result.setMimeType("warc/metadata");
      } else {
        result = null;
      }
      break;
     
    case warcinfo:
      result.setMimeType(WARC_FILEDESC_VERSION);
      break;
     
    default:
      LOGGER.info("Skipping record type : " + type);
      break;
    }
   
    return result;
  }

  // ALL HELPER METHODS BELOW:

  /*
   * Extract all common WARC fields into a CaptureSearchResult. This is the
   * same for all WARC record types:
   * 
   *    file, offset, timestamp, digest, urlKey, originalUrl
   */
  private CaptureSearchResult genericResult(WARCRecord rec) {

    CaptureSearchResult result = new CaptureSearchResult();

    result.setMimeType(DEFAULT_VALUE);
    result.setHttpCode(DEFAULT_VALUE);
    result.setRedirectUrl(DEFAULT_VALUE);

    ArchiveRecordHeader header = rec.getHeader();

    String file = transformWARCFilename(header.getReaderIdentifier());
    long offset = header.getOffset();
   
    result.setCaptureTimestamp(transformWARCDate(header.getDate()));
    result.setFile(file);
    result.setOffset(offset);
    result.setDigest(transformWARCDigest(header.getHeaderValue(
        WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
   
    String origUrl = header.getUrl();
    if(origUrl == null) {
      String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
      if(type.equals(WARCConstants.WARCRecordType.warcinfo)) {
        String filename = header.getHeaderValue(
            WARCConstants.HEADER_KEY_FILENAME).toString();
        result.setOriginalUrl("filedesc:"+filename);
        result.setUrlKey("filedesc:"+filename);       
      } else {
        result.setOriginalUrl(DEFAULT_VALUE);
        result.setUrlKey(DEFAULT_VALUE);
      }

     
    } else {
      result.setOriginalUrl(origUrl);
      try {
        String urlKey = canonicalizer.urlStringToKey(origUrl);
        result.setUrlKey(urlKey);
      } catch (URIException e) {
        String shortUrl =
          (origUrl.length() < 100)
          ? origUrl
          :origUrl.substring(0,100);
        LOGGER.warning("FAILED canonicalize(" + shortUrl + "):" +
            file + " " + offset);
        result.setUrlKey(origUrl);
      }
    }
    return result;
  }

    /**
     * borrowed(copied) from org.archive.io.arc.ARCRecord...
     *
     * @param bytes Array of bytes to examine for an EOL.
     * @return Count of end-of-line characters or zero if none.
     */
    private int getEolCharsCount(byte [] bytes) {
        int count = 0;
        if (bytes != null && bytes.length >=1 &&
                bytes[bytes.length - 1] == '\n') {
            count++;
            if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
                count++;
            }
        }
        return count;
    }

    private String transformWARCFilename(String readerIdentifier) {
    String warcName = readerIdentifier;
    int index = warcName.lastIndexOf(File.separator);
    if (index > 0 && (index + 1) < warcName.length()) {
        warcName = warcName.substring(index + 1);
    }
    return warcName;
  }

  private String transformWARCDigest(final Object o) {
    if(o == null) {
      return DEFAULT_VALUE;
    }
    String orig = o.toString();
    if(orig.startsWith("sha1:")) {
      return orig.substring(5);
    }
    return orig;
//    return (o == null) ? DEFAULT_VALUE : o.toString();
  }

  /*
   * Transform input date to 14-digit timestamp:
   * 2007-08-29T18:00:26Z => 20070829180026
   */
  private static String transformWARCDate(final String input) {
   
    StringBuilder output = new StringBuilder(14);
   
    output.append(input.substring(0,4));
    output.append(input.substring(5,7));
    output.append(input.substring(8,10));
    output.append(input.substring(11,13));
    output.append(input.substring(14,16));
    output.append(input.substring(17,19));
   
    return output.toString();
  }

    /*
     * Currently the WARCReader doesn't parse HTTP headers. This method parses
     * them then calls the common ARC/WARC shared record parsing code, which
     * addresses HTTP headers, and possibly even parses HTML content to look
     * for Robot Meta tags.
     */
  private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result,
      WARCRecord rec) throws IOException {

    ArchiveRecordHeader header = rec.getHeader();
    // need to parse the documents HTTP message and headers here: WARCReader
    // does not implement this... yet..
   
        byte [] statusBytes = LaxHttpParser.readRawLine(rec);
        int eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            throw new RecoverableIOException("Failed to read http status where one " +
                    " was expected: " +
                    ((statusBytes == null) ? "(null)" : new String(statusBytes)));
        }
        String statusLine = EncodingUtil.getString(statusBytes, 0,
            statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
        if ((statusLine == null) ||
                !StatusLine.startsWithHTTP(statusLine)) {
           throw new RecoverableIOException("Failed parse of http status line.");
        }
        StatusLine status = new StatusLine(statusLine);
    result.setHttpCode(String.valueOf(status.getStatusCode()));
       
    Header[] headers = LaxHttpParser.parseHeaders(rec,
                ARCConstants.DEFAULT_ENCODING);

   
    annotater.annotateHTTPContent(result,rec,headers,header.getMimetype());

    return result;
  }


  public UrlCanonicalizer getCanonicalizer() {
    return canonicalizer;
  }

  public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
    this.canonicalizer = canonicalizer;
  }

  public boolean isProcessAll() {
    return processAll;
  }

  public void setProcessAll(boolean processAll) {
    this.processAll = processAll;
  }
  /**
   * @return the annotater
   */
  public HTTPRecordAnnotater getAnnotater() {
    return annotater;
  }

  /**
   * @param annotater the annotater to set
   */
  public void setAnnotater(HTTPRecordAnnotater annotater) {
    this.annotater = annotater;
  }
}
TOP

Related Classes of org.archive.wayback.resourcestore.indexer.WARCRecordToSearchResultAdapter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.