Package org.archive.wayback.liveweb

Source Code of org.archive.wayback.liveweb.LiveWebCache

/* LiveWebCache
*
* $Id$
*
* Created on 5:26:17 PM Mar 12, 2007.
*
* Copyright (C) 2007 Internet Archive.
*
* This file is part of wayback-svn.
*
* wayback-svn is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* wayback-svn is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with wayback-svn; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package org.archive.wayback.liveweb;

import java.io.IOException;
import java.net.URL;
import java.util.Date;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.io.arc.ARCLocation;
import org.archive.io.arc.ARCRecord;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.WaybackConstants;
import org.archive.wayback.core.CaptureSearchResults;
import org.archive.wayback.core.Resource;
import org.archive.wayback.core.SearchResult;
import org.archive.wayback.core.SearchResults;
import org.archive.wayback.core.Timestamp;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.LiveDocumentNotAvailableException;
import org.archive.wayback.exception.ResourceNotInArchiveException;
import org.archive.wayback.exception.WaybackException;
import org.archive.wayback.resourcestore.ARCRecordToSearchResultAdapter;
import org.archive.wayback.resourcestore.ArcResource;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;

/**
*
*
* @author brad
* @version $Date$, $Revision$
*/
public class LiveWebCache {
  private static final Logger LOGGER = Logger.getLogger(
      LiveWebCache.class.getName());

  private long maxFailedCacheMS = 600000;
  private ARCCacheDirectory arcCacheDir = null;
  private URLCacher cacher = null;
  private LiveWebLocalResourceIndex index = null;
  private UrlCanonicalizer canonicalizer = null;
  private ARCRecordToSearchResultAdapter adapter = null;
 
  public LiveWebCache() {
    canonicalizer = new AggressiveUrlCanonicalizer();
    adapter = new ARCRecordToSearchResultAdapter();
    adapter.setCanonicalizer(canonicalizer);
  }
 
  /**
   * closes all resources
   */
  public void shutdown() {
    arcCacheDir.shutdown();
  }
 
  private WaybackRequest makeCacheWBRequest(URL url, long maxCacheMS,
      boolean bUseOlder) throws URIException {
    WaybackRequest req = new WaybackRequest();
    req.setRequestUrl(url.toString());
    req.put(WaybackConstants.REQUEST_TYPE,
        WaybackConstants.REQUEST_CLOSEST_QUERY);
    req.put(WaybackConstants.REQUEST_EXACT_DATE,
        Timestamp.currentTimestamp().getDateStr());
    Timestamp earliest = null;
    if(bUseOlder) {
      earliest = Timestamp.earliestTimestamp();
    } else {
      Date d = new Date(System.currentTimeMillis() - maxCacheMS);
      earliest = new Timestamp(d);
    }
    req.put(WaybackConstants.REQUEST_START_DATE,earliest.getDateStr());
    // for now, assume all live web requests are only satisfiable by the
    // exact host -- no massaging.
    req.put(WaybackConstants.REQUEST_EXACT_HOST_ONLY,
        WaybackConstants.REQUEST_YES);
    return req;
  }
 
  private boolean isForgedFailRecentEnough(SearchResult result) {
    String captureDate = result.get(WaybackConstants.RESULT_CAPTURE_DATE);
    Timestamp t = new Timestamp(captureDate);
    long maxAge = System.currentTimeMillis() - maxFailedCacheMS;
    long failAge = t.getDate().getTime();
    if(failAge > maxAge) {
      return true;
    }
    return false;
  }
 
  private boolean isForgedFailedSearchResult(SearchResult result) {
    String arcFile = result.get(WaybackConstants.RESULT_ARC_FILE);
    return arcFile.equals("-");
  }
 
  private SearchResult forgeFailedSearchResult(URL url) {
    SearchResult result = new SearchResult();

    result.put(WaybackConstants.RESULT_ARC_FILE, "-");
    result.put(WaybackConstants.RESULT_OFFSET, "0");

    result.put(WaybackConstants.RESULT_HTTP_CODE, "0");

    result.put(WaybackConstants.RESULT_MD5_DIGEST, "-");
    result.put(WaybackConstants.RESULT_MIME_TYPE, "-");
    result.put(WaybackConstants.RESULT_CAPTURE_DATE,
        Timestamp.currentTimestamp().getDateStr());

    result.put(WaybackConstants.RESULT_ORIG_HOST, url.getHost());
    result.put(WaybackConstants.RESULT_REDIRECT_URL, "-");
    result.put(WaybackConstants.RESULT_URL, url.toString());

    String indexUrl;
    try {
      indexUrl = canonicalizer.urlStringToKey(url.toString());
    } catch (URIException e) {
      // not gonna happen...
      e.printStackTrace();
      indexUrl = url.toString();
    }
    result.put(WaybackConstants.RESULT_URL_KEY, indexUrl);
   
    return result;
  }
 
  private Resource getLocalCachedResource(URL url, long maxCacheMS,
      boolean bUseOlder) throws ResourceNotInArchiveException,
      IOException, LiveDocumentNotAvailableException {
   
    Resource resource = null;
    WaybackRequest wbRequest = makeCacheWBRequest(url,maxCacheMS,bUseOlder);
   
    CaptureSearchResults results = null;
    try {
      SearchResults gresults = index.query(wbRequest);
      if(!(gresults instanceof CaptureSearchResults)) {
        throw new IOException("bad result type...");
      }
      results = (CaptureSearchResults) gresults;
    } catch (ResourceNotInArchiveException e) {
//      e.printStackTrace();
      throw e;
    } catch (WaybackException e) {
      e.printStackTrace();
      throw new IOException(e.getMessage());
    }
    SearchResult result = results.getClosest(wbRequest);
    if(result != null) {
      if(isForgedFailedSearchResult(result)) {
        if(isForgedFailRecentEnough(result)) {
          LOGGER.info(url.toString() + " has failed recently");
          throw new LiveDocumentNotAvailableException("failed prev");
        } else {
          LOGGER.info(url.toString() + " failed a while ago");
          throw new ResourceNotInArchiveException("Nope");
        }
      }
      String name = (String) result.get(WaybackConstants.RESULT_ARC_FILE);
      long offset = Long.parseLong(
          (String) result.get(WaybackConstants.RESULT_OFFSET));
      resource = arcCacheDir.getResource(name, offset);
    }
    return resource;
  }
 
  private Resource getLiveCachedResource(URL url)
    throws LiveDocumentNotAvailableException, IOException {
   
    Resource resource = null;
   
    LOGGER.info("Caching URL(" + url.toString() + ")");
    ARCLocation location = null;
    try {
      location = cacher.cache(arcCacheDir, url.toString());
    } catch(LiveDocumentNotAvailableException e) {
      // record the failure, so we can fail early next time:
      SearchResult result = forgeFailedSearchResult(url);
      index.addSearchResult(result);
      LOGGER.info("Added FAIL-URL(" + url.toString() + ") to LiveIndex");
      throw e;
    }
    if(location != null) {
      String name = location.getName();
      long offset = location.getOffset();
      LOGGER.info("Cached URL(" + url.toString() + ") in " +
          "ARC(" + name + ") at (" + offset + ")");
      resource = arcCacheDir.getResource(name, offset);
      // add the result to the index:
      if(resource instanceof ArcResource) {
        ArcResource aResource = (ArcResource) resource;
        ARCRecord record = (ARCRecord) aResource.getArcRecord();
       
        SearchResult result = adapter.adapt(record);
        index.addSearchResult(result);
        LOGGER.info("Added URL(" + url.toString() + ") in " +
            "ARC(" + name + ") at (" + offset + ") to LiveIndex");
       
        // we just read thru the doc in order to index it. Reset:
        resource = arcCacheDir.getResource(name, offset);
      }

    }
   
    return resource;
  }
 
  /**
   * @param url
   * @param maxCacheMS
   * @param bUseOlder
   * @return Resource for url
   *
   * @throws LiveDocumentNotAvailableException
   * @throws IOException
   */
  public Resource getCachedResource(URL url, long maxCacheMS,
      boolean bUseOlder) throws LiveDocumentNotAvailableException,
      IOException {
   
    Resource resource = null;
    try {
      resource = getLocalCachedResource(url, maxCacheMS, false);
      LOGGER.info("Using Cached URL(" + url.toString() + ")");
     
    } catch(ResourceNotInArchiveException e) {
      try {
        LOGGER.info("URL:" + url.toString() + " has not been cached"
            + " recently enough. Attempting from Live Web");

        resource = getLiveCachedResource(url);

      } catch (LiveDocumentNotAvailableException e1) {
        if(bUseOlder) {
          // we don't have a copy that satisfies the "ideal" maxAge,
          // but the file isn't on the live web, and the caller has
          // asked to use an older cached copy if a fresh one isn't
          // available.
          LOGGER.info("Second Cached attempt for URL(" +
              url.toString() + ") allowing older...");
          try {
            resource = getLocalCachedResource(url, maxCacheMS, true);
          } catch (ResourceNotInArchiveException e2) {
            LOGGER.info("Unable to live-get and older" +
                " is not in cache...throwing LDNAE");
            // rethrow the original...
            throw e1;
          }
          LOGGER.info("Got older version of Cached URL(" +
              url.toString() + ")");
        } else {
          LOGGER.info("Unable to live-get...throwing LDNAE");
          // rethrow the original...
          throw e1;
        }
      }
    }
    return resource;
  }

  /**
   * @return the maxFailedCacheMS
   */
  public long getMaxFailedCacheMS() {
    return maxFailedCacheMS;
  }

  /**
   * @param maxFailedCacheMS the maxFailedCacheMS to set
   */
  public void setMaxFailedCacheMS(long maxFailedCacheMS) {
    this.maxFailedCacheMS = maxFailedCacheMS;
  }

  /**
   * @return the arcCacheDir
   */
  public ARCCacheDirectory getArcCacheDir() {
    return arcCacheDir;
  }

  /**
   * @param arcCacheDir the arcCacheDir to set
   */
  public void setArcCacheDir(ARCCacheDirectory arcCacheDir) {
    this.arcCacheDir = arcCacheDir;
  }

  /**
   * @return the cacher
   */
  public URLCacher getCacher() {
    return cacher;
  }

  /**
   * @param cacher the cacher to set
   */
  public void setCacher(URLCacher cacher) {
    this.cacher = cacher;
  }

  /**
   * @return the index
   */
  public LiveWebLocalResourceIndex getIndex() {
    return index;
  }

  /**
   * @param index the index to set
   */
  public void setIndex(LiveWebLocalResourceIndex index) {
    this.index = index;
  }

  public UrlCanonicalizer getCanonicalizer() {
    return canonicalizer;
  }

  public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
    this.canonicalizer = canonicalizer;
    adapter.setCanonicalizer(canonicalizer);
  }
}
TOP

Related Classes of org.archive.wayback.liveweb.LiveWebCache

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.