Package org.archive.wayback.resourcestore

Source Code of org.archive.wayback.resourcestore.WARCRecordToSearchResultAdapter

package org.archive.wayback.resourcestore;

import java.io.File;
import java.io.IOException;
import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.StatusLine;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.EncodingUtil;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.arc.ARCConstants;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCRecord;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.WaybackConstants;
import org.archive.wayback.core.SearchResult;
import org.archive.wayback.util.Adapter;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;

/**
* Adapts certain WARCRecords into SearchResults. DNS and response records are
* mostly straightforward, but SearchResult objects generated from revisit
* records contain lots of "placeholder" fields, which are expected to be
* understood by later processes traversing a stream of SearchResult objects.
*
* See org.archive.wayback.resourceindex.DeduplicateSearchResultAnnotationAdapter.
*
* @author brad
* @version $Date$, $Revision$
*/
public class WARCRecordToSearchResultAdapter
implements Adapter<WARCRecord,SearchResult>{
 
  private final static String DEFAULT_VALUE = "-";
  private final static String SEARCH_FIELDS[] = {
      WaybackConstants.RESULT_URL,
      WaybackConstants.RESULT_URL_KEY,
      WaybackConstants.RESULT_ORIG_HOST,
      WaybackConstants.RESULT_CAPTURE_DATE,
      WaybackConstants.RESULT_MD5_DIGEST,
      WaybackConstants.RESULT_MIME_TYPE,
      WaybackConstants.RESULT_HTTP_CODE,
      WaybackConstants.RESULT_REDIRECT_URL,
      WaybackConstants.RESULT_ARC_FILE,
      WaybackConstants.RESULT_OFFSET,
  };

  private static final Logger LOGGER = Logger.getLogger(
      WARCRecordToSearchResultAdapter.class.getName());

  private UrlCanonicalizer canonicalizer = null;

  public WARCRecordToSearchResultAdapter() {
    canonicalizer = new AggressiveUrlCanonicalizer();
  }

  /* (non-Javadoc)
   * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
   */
  public SearchResult adapt(WARCRecord rec) {
    try {
      return adaptInner(rec);
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    }
  }
 
  /*
   * Transform input date to 14-digit timestamp:
   * 2007-08-29T18:00:26Z => 20070829180026
   */
  private static String transformDate(final String input) {
   
    StringBuilder output = new StringBuilder(14);
   
    output.append(input.substring(0,4));
    output.append(input.substring(5,7));
    output.append(input.substring(8,10));
    output.append(input.substring(11,13));
    output.append(input.substring(14,16));
    output.append(input.substring(17,19));
   
    return output.toString();
  }
 
  private static String transformHTTPMime(final String input) {
    int semiIdx = input.indexOf(';');
    if(semiIdx > 0) {
      return input.substring(0,semiIdx).trim();
    }
    return input.trim();
  }

  private String transformWarcFilename(String readerIdentifier) {
    String warcName = readerIdentifier;
    int index = warcName.lastIndexOf(File.separator);
    if (index > 0 && (index + 1) < warcName.length()) {
        warcName = warcName.substring(index + 1);
    }
    return warcName;
  }

  private String transformDigest(final Object o) {
    if(o == null) {
      return DEFAULT_VALUE;
    }
    String orig = o.toString();
    if(orig.startsWith("sha1:")) {
      return orig.substring(5);
    }
    return orig;
  }

  private SearchResult getBlankSearchResult() {
    SearchResult result = new SearchResult();
    for(String field : SEARCH_FIELDS) {
      result.put(field, DEFAULT_VALUE);
    }
    return result;
  }
 
  private UURI addUrlDataToSearchResult(SearchResult result, String urlStr)
  throws IOException {

    result.put(WaybackConstants.RESULT_URL, urlStr);
    result.put(WaybackConstants.RESULT_URL_KEY, urlStr);

 
    UURI uri = UURIFactory.getInstance(urlStr);
    String uriHost = uri.getHost();
    if (uriHost == null) {

      LOGGER.info("No host in " + urlStr);

    } else {

      result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost);
    }

    String urlKey = canonicalizer.urlStringToKey(urlStr);
    result.put(WaybackConstants.RESULT_URL_KEY, urlKey);

    return uri;
  }

  private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec)
  throws IOException {

    SearchResult result = getBlankSearchResult();

    result.put(WaybackConstants.RESULT_CAPTURE_DATE,
        transformDate(header.getDate()));
    result.put(WaybackConstants.RESULT_ARC_FILE,
        transformWarcFilename(header.getReaderIdentifier()));
    result.put(WaybackConstants.RESULT_OFFSET,
        String.valueOf(header.getOffset()));
   
    String uriStr = header.getUrl();
   
    String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX
        .length());
    result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype());

    result.put(WaybackConstants.RESULT_ORIG_HOST, origHost);
    result.put(WaybackConstants.RESULT_URL, uriStr);
    result.put(WaybackConstants.RESULT_URL_KEY, uriStr);

    rec.close();
    result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());

    return result;
  }

  private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec)
  throws IOException {

    SearchResult result = getBlankSearchResult();

    result.put(WaybackConstants.RESULT_CAPTURE_DATE,
        transformDate(header.getDate()));
    result.put(WaybackConstants.RESULT_MD5_DIGEST,
        transformDigest(header.getHeaderValue(
            WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
   
    addUrlDataToSearchResult(result,header.getUrl());

    return result;
  }

    /**
     * borrowed(copied) from org.archive.io.arc.ARCRecord...
     *
     * @param bytes Array of bytes to examine for an EOL.
     * @return Count of end-of-line characters or zero if none.
     */
    private int getEolCharsCount(byte [] bytes) {
        int count = 0;
        if (bytes != null && bytes.length >=1 &&
                bytes[bytes.length - 1] == '\n') {
            count++;
            if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
                count++;
            }
        }
        return count;
    }
 
  private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec)
  throws IOException {

    SearchResult result = getBlankSearchResult();

    result.put(WaybackConstants.RESULT_CAPTURE_DATE,
        transformDate(header.getDate()));
    result.put(WaybackConstants.RESULT_ARC_FILE,
        transformWarcFilename(header.getReaderIdentifier()));
    result.put(WaybackConstants.RESULT_OFFSET,
        String.valueOf(header.getOffset()));
   
    String origUrl = header.getUrl();
    UURI uri = addUrlDataToSearchResult(result,origUrl);

    // need to parse the documents HTTP message and headers here: WARCReader
    // does not implement this... yet..
   
        byte [] statusBytes = HttpParser.readRawLine(rec);
        int eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            throw new RecoverableIOException("Failed to read http status where one " +
                " was expected: " + new String(statusBytes));
        }
        String statusLine = EncodingUtil.getString(statusBytes, 0,
            statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
        if ((statusLine == null) ||
                !StatusLine.startsWithHTTP(statusLine)) {
           throw new RecoverableIOException("Failed parse of http status line.");
        }
        StatusLine status = new StatusLine(statusLine);
    result.put(WaybackConstants.RESULT_HTTP_CODE,
        String.valueOf(status.getStatusCode()));
       
    Header[] headers = HttpParser.parseHeaders(rec,
                ARCConstants.DEFAULT_ENCODING);

    rec.close();
    result.put(WaybackConstants.RESULT_MD5_DIGEST,
        transformDigest(header.getHeaderValue(
            WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));

    if (headers != null) {
 
      for (Header httpHeader : headers) {
        if (httpHeader.getName().equals(
            WaybackConstants.LOCATION_HTTP_HEADER)) {
 
          String locationStr = httpHeader.getValue();
          // TODO: "Location" is supposed to be absolute:
          // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
          // (section 14.30) but Content-Location can be
          // relative.
          // is it correct to resolve a relative Location, as
          // we are?
          // it's also possible to have both in the HTTP
          // headers...
          // should we prefer one over the other?
          // right now, we're ignoring "Content-Location"
          try {
            UURI uriRedirect = UURIFactory.getInstance(uri,
                locationStr);
            result.put(WaybackConstants.RESULT_REDIRECT_URL,
                uriRedirect.getEscapedURI());
          } catch (URIException e) {
            LOGGER.info("Bad Location: " + locationStr
                + " for " + origUrl + " in "
                + header.getReaderIdentifier() + " Skipped");
          }
        } else if(httpHeader.getName().toLowerCase().equals("content-type")) {
          result.put(WaybackConstants.RESULT_MIME_TYPE,
              transformHTTPMime(httpHeader.getValue()));
        }
      }
    }
    return result;
  }
 
  private SearchResult adaptInner(WARCRecord rec) throws IOException {
   
    SearchResult result = null;
    ArchiveRecordHeader header = rec.getHeader();
    String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
    if(type.equals(WARCConstants.RESPONSE)) {
      String mime = header.getMimetype();
      if(mime.equals("text/dns")) {
        result = adaptDNS(header,rec);
      } else {
        result = adaptResponse(header,rec);
      }
    } else if(type.equals(WARCConstants.REVISIT)) {
      result = adaptRevisit(header,rec);
    }

    return result;
  }

  public UrlCanonicalizer getCanonicalizer() {
    return canonicalizer;
  }

  public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
    this.canonicalizer = canonicalizer;
  }
}
TOP

Related Classes of org.archive.wayback.resourcestore.WARCRecordToSearchResultAdapter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.