Package org.archive.wayback.resourcestore

Source Code of org.archive.wayback.resourcestore.ARCRecordToSearchResultAdapter

/* ArcRecordToSearchResultAdapter
*
* $Id$
*
* Created on 3:27:03 PM Jul 26, 2007.
*
* Copyright (C) 2007 Internet Archive.
*
* This file is part of wayback-core.
*
* wayback-core is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* wayback-core is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with wayback-core; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package org.archive.wayback.resourcestore;

import java.io.File;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.URIException;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.WaybackConstants;
import org.archive.wayback.core.SearchResult;
import org.archive.wayback.util.Adapter;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;

/**
*
*
* @author brad
* @version $Date$, $Revision$
*/
public class ARCRecordToSearchResultAdapter
implements Adapter<ARCRecord,SearchResult>{

  private static final Logger LOGGER = Logger.getLogger(
      ARCRecordToSearchResultAdapter.class.getName());

  private UrlCanonicalizer canonicalizer = null;
 
  public ARCRecordToSearchResultAdapter() {
    canonicalizer = new AggressiveUrlCanonicalizer();
  }
//  public static SearchResult arcRecordToSearchResult(final ARCRecord rec)
//  throws IOException, ParseException {
  /* (non-Javadoc)
   * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
   */
  public SearchResult adapt(ARCRecord rec) {
    try {
      return adaptInner(rec);
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    }
  }
 
  private SearchResult adaptInner(ARCRecord rec) throws IOException {
    rec.close();
    ARCRecordMetaData meta = rec.getMetaData();
   
    SearchResult result = new SearchResult();
    String arcName = meta.getArc();
    int index = arcName.lastIndexOf(File.separator);
    if (index > 0 && (index + 1) < arcName.length()) {
        arcName = arcName.substring(index + 1);
    }
    result.put(WaybackConstants.RESULT_ARC_FILE, arcName);
    result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta
        .getOffset()));
   
    // initialize with default HTTP code...
    result.put(WaybackConstants.RESULT_HTTP_CODE, "-");
   
    result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());
    result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype());
    result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate());
   
    String uriStr = meta.getUrl();
    if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) {
      // skip filedesc record altogether...
      return null;
    }
    if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) {
      // skip URL + HTTP header processing for dns records...
   
      String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX
          .length());
      result.put(WaybackConstants.RESULT_ORIG_HOST, origHost);
      result.put(WaybackConstants.RESULT_REDIRECT_URL, "-");
      result.put(WaybackConstants.RESULT_URL, uriStr);
      result.put(WaybackConstants.RESULT_URL_KEY, uriStr);
   
    } else {
   
      UURI uri = UURIFactory.getInstance(uriStr);
      result.put(WaybackConstants.RESULT_URL, uriStr);
   
      String uriHost = uri.getHost();
      if (uriHost == null) {
        LOGGER.info("No host in " + uriStr + " in " + meta.getArc());
      } else {
        result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost);
   
        String statusCode = (meta.getStatusCode() == null) ? "-" : meta
            .getStatusCode();
        result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode);
   
        String redirectUrl = "-";
        Header[] headers = rec.getHttpHeaders();
        if (headers != null) {
   
          for (int i = 0; i < headers.length; i++) {
            if (headers[i].getName().equals(
                WaybackConstants.LOCATION_HTTP_HEADER)) {

              String locationStr = headers[i].getValue();
              // TODO: "Location" is supposed to be absolute:
              // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
              // (section 14.30) but Content-Location can be
              // relative.
              // is it correct to resolve a relative Location, as
              // we are?
              // it's also possible to have both in the HTTP
              // headers...
              // should we prefer one over the other?
              // right now, we're ignoring "Content-Location"
              try {
                UURI uriRedirect = UURIFactory.getInstance(uri,
                    locationStr);
                redirectUrl = uriRedirect.getEscapedURI();
   
              } catch (URIException e) {
                LOGGER.info("Bad Location: " + locationStr
                    + " for " + uriStr + " in "
                    + meta.getArc() + " Skipped");
              }
              break;
            }
          }
        }
        result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl);
   
        String indexUrl = canonicalizer.urlStringToKey(meta.getUrl());
        result.put(WaybackConstants.RESULT_URL_KEY, indexUrl);
      }
   
    }
    return result;
  }
  public UrlCanonicalizer getCanonicalizer() {
    return canonicalizer;
  }
  public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
    this.canonicalizer = canonicalizer;
  }
}
TOP

Related Classes of org.archive.wayback.resourcestore.ARCRecordToSearchResultAdapter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.