Package org.archive.wayback.resourceindex

Source Code of org.archive.wayback.resourceindex.LocalResourceIndex

/* LocalResourceIndex
*
* $Id: LocalResourceIndex.java 2232 2008-04-11 04:07:18Z bradtofel $
*
* Created on 5:02:21 PM Aug 17, 2006.
*
* Copyright (C) 2006 Internet Archive.
*
* This file is part of Wayback.
*
* Wayback is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Wayback is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Wayback; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package org.archive.wayback.resourceindex;

import java.io.IOException;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.wayback.ResourceIndex;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.WaybackConstants;
import org.archive.wayback.core.CaptureSearchResults;
import org.archive.wayback.core.SearchResult;
import org.archive.wayback.core.SearchResults;
import org.archive.wayback.core.Timestamp;
import org.archive.wayback.core.UrlSearchResults;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.AccessControlException;
import org.archive.wayback.exception.BadQueryException;
import org.archive.wayback.exception.ResourceIndexNotAvailableException;
import org.archive.wayback.exception.ResourceNotInArchiveException;
import org.archive.wayback.resourceindex.filters.CaptureToUrlResultFilter;
import org.archive.wayback.resourceindex.filters.CounterFilter;
import org.archive.wayback.resourceindex.filters.DateRangeFilter;
import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter;
import org.archive.wayback.resourceindex.filters.EndDateFilter;
import org.archive.wayback.resourceindex.filters.GuardRailFilter;
import org.archive.wayback.resourceindex.filters.HostMatchFilter;
import org.archive.wayback.resourceindex.filters.SelfRedirectFilter;
import org.archive.wayback.resourceindex.filters.UrlMatchFilter;
import org.archive.wayback.resourceindex.filters.UrlPrefixMatchFilter;
import org.archive.wayback.resourceindex.filters.WindowEndFilter;
import org.archive.wayback.resourceindex.filters.WindowStartFilter;
import org.archive.wayback.util.AdaptedIterator;
import org.archive.wayback.util.CloseableIterator;
import org.archive.wayback.util.ObjectFilter;
import org.archive.wayback.util.ObjectFilterChain;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;

/**
*
*
* @author brad
* @version $Date: 2008-04-11 05:07:18 +0100 (Sex, 11 Abr 2008) $, $Revision: 2232 $
*/
public class LocalResourceIndex implements ResourceIndex {

  /**
   * maximum number of records to return
   */   
  private int maxRecords = NutchResourceIndex.MAX_RECORDS;

  protected SearchResultSource source;
 
  private UrlCanonicalizer canonicalizer = null;
 
  private boolean dedupeRecords = false;

  public LocalResourceIndex() {
    canonicalizer = new AggressiveUrlCanonicalizer();
  }
 
  private void filterRecords(Iterator<SearchResult> itr,
      ObjectFilter<SearchResult> filter, SearchResults results,
      boolean forwards) throws IOException {

    if(dedupeRecords) {
      itr = new AdaptedIterator<SearchResult, SearchResult>(itr,
          new DeduplicationSearchResultAnnotationAdapter());
    }
    while (itr.hasNext()) {
      SearchResult result = itr.next();
      int ruling = filter.filterObject(result);
      if (ruling == ObjectFilter.FILTER_ABORT) {
        break;
      } else if (ruling == ObjectFilter.FILTER_INCLUDE) {
        results.addSearchResult(result, forwards);
      }
    }
    if(itr instanceof CloseableIterator) {
      CloseableIterator<SearchResult> citr =
        (CloseableIterator<SearchResult>) itr;
      source.cleanup(citr);
    }
  }

  private String getRequired(WaybackRequest wbRequest, String field,
      String defaultValue) throws BadQueryException {

    String value = wbRequest.get(field);
    if (value == null) {
      if (defaultValue == null) {
        throw new BadQueryException("No " + field + " specified");
      } else {
        value = defaultValue;
      }
    }
    return value;
  }

  private String getRequired(WaybackRequest wbRequest, String field)
      throws BadQueryException {
    return getRequired(wbRequest, field, null);
  }

  private HostMatchFilter getExactHostFilter(WaybackRequest wbRequest) {

    HostMatchFilter filter = null;
    String exactHostFlag = wbRequest.get(
        WaybackConstants.REQUEST_EXACT_HOST_ONLY);
    if(exactHostFlag != null &&
        exactHostFlag.equals(WaybackConstants.REQUEST_YES)) {

      String searchUrl = wbRequest.get(WaybackConstants.REQUEST_URL);
      try {

        UURI searchURI = UURIFactory.getInstance(searchUrl);
        String exactHost = searchURI.getHost();
        filter = new HostMatchFilter(exactHost);

      } catch (URIException e) {
        // Really, this isn't gonna happen, we've already canonicalized
        // it... should really optimize and do that just once.
        e.printStackTrace();
      }
    }
    return filter;
  }
  /*
   * (non-Javadoc)
   *
   * @see org.archive.wayback.ResourceIndex#query(org.archive.wayback.core.WaybackRequest)
   */
  public SearchResults query(WaybackRequest wbRequest)
      throws ResourceIndexNotAvailableException,
      ResourceNotInArchiveException, BadQueryException,
      AccessControlException {

    SearchResults results = null; // return value placeholder

    String startKey; // actual key where search will begin
    String keyUrl; // "purified" URL request
    int startResult; // calculated based on hits/page * pagenum

    // first grab all the info from the WaybackRequest, and validate it:

    int resultsPerPage = wbRequest.getResultsPerPage();
    int pageNum = wbRequest.getPageNum();
    startResult = (pageNum - 1) * resultsPerPage;

    if (resultsPerPage < 1) {
      throw new BadQueryException("resultsPerPage cannot be < 1");
    }
    if (resultsPerPage > maxRecords) {
      throw new BadQueryException("resultsPerPage cannot be > "
          + maxRecords);
    }
    if (pageNum < 1) {
      throw new BadQueryException("pageNum must be > 0");
    }

    String searchUrl = getRequired(wbRequest, WaybackConstants.REQUEST_URL);
    String searchType = getRequired(wbRequest,
        WaybackConstants.REQUEST_TYPE);
    String startDate = getRequired(wbRequest,
        WaybackConstants.REQUEST_START_DATE, Timestamp
            .earliestTimestamp().getDateStr());
    String endDate = getRequired(wbRequest,
        WaybackConstants.REQUEST_END_DATE, Timestamp.latestTimestamp()
            .getDateStr());
    String exactDate = getRequired(wbRequest,
        WaybackConstants.REQUEST_EXACT_DATE, Timestamp
            .latestTimestamp().getDateStr());

    try {
      keyUrl = canonicalizer.urlStringToKey(searchUrl);
    } catch (URIException e) {
      throw new BadQueryException("invalid "
          + WaybackConstants.REQUEST_URL + " " + searchUrl);
    }

    // set up the common Filters:

    // makes sure we don't inspect too many records: prevents DOS
    GuardRailFilter guardrail = new GuardRailFilter(maxRecords);

    // checks an exclusion service for every matching record
    ObjectFilter<SearchResult> exclusion = wbRequest.getExclusionFilter();

    // count how many results got to the ExclusionFilter:
    CounterFilter preExCounter = new CounterFilter();
    // count how many results got past the ExclusionFilter, or how
    // many total matched, if there was no ExclusionFilter:
    CounterFilter finalCounter = new CounterFilter();
   
    // has the user asked for only results on the exact host specified?
    HostMatchFilter hostMatchFilter = getExactHostFilter(wbRequest);

    if (searchType.equals(WaybackConstants.REQUEST_REPLAY_QUERY)
        || searchType.equals(WaybackConstants.REQUEST_CLOSEST_QUERY)) {

      results = new CaptureSearchResults();

      ObjectFilterChain<SearchResult> forwardFilters =
        new ObjectFilterChain<SearchResult>();

//      ObjectFilterChain<SearchResult> reverseFilters =
//        new ObjectFilterChain<SearchResult>();

      // use the same guardrail for both:
      forwardFilters.addFilter(guardrail);
//      reverseFilters.addFilter(guardrail);
     
      forwardFilters.addFilter(new DuplicateRecordFilter());
     
      // match URL key:
      forwardFilters.addFilter(new UrlMatchFilter(keyUrl));
//      reverseFilters.addFilter(new UrlMatchFilter(keyUrl));

      if(hostMatchFilter != null) {
        forwardFilters.addFilter(hostMatchFilter);
//        reverseFilters.addFilter(hostMatchFilter);
      }
     
      // be sure to only include records within the date range we want:
      // The bin search may start the forward filters at a record older
      // than we want. Since the fowardFilters only include an abort
      // endDateFilter, we might otherwise include a record before the
      // requested range.
      DateRangeFilter drFilter = new DateRangeFilter(startDate,endDate);
      forwardFilters.addFilter(drFilter);
//      reverseFilters.addFilter(drFilter);
     
      // abort processing if we hit a date outside the search range:
      forwardFilters.addFilter(new EndDateFilter(endDate));
//      reverseFilters.addFilter(new StartDateFilter(startDate));

      // for replay, do not include records that redirect to
      // themselves.. We'll leave this for both closest and replays,
      // because the only application of closest at the moment is
      // timeline in which case, we don't want to show captures that
      // redirect to themselves in the timeline if they are not viewable.
      SelfRedirectFilter selfRedirectFilter = new SelfRedirectFilter();
      selfRedirectFilter.setCanonicalizer(canonicalizer);
      forwardFilters.addFilter(selfRedirectFilter);
//      reverseFilters.addFilter(selfRedirectFilter);
     
      // possibly filter via exclusions:
      if(exclusion != null) {
        forwardFilters.addFilter(preExCounter);
        forwardFilters.addFilter(exclusion);

//        reverseFilters.addFilter(preExCounter);
//        reverseFilters.addFilter(exclusion);
      }
      forwardFilters.addFilter(finalCounter);
//      reverseFilters.addFilter(finalCounter);

      forwardFilters.addFilter(new WindowEndFilter(resultsPerPage));
//      int resultsPerDirection = (int) Math.floor(resultsPerPage / 2);
//      reverseFilters.addFilter(new WindowEndFilter(resultsPerDirection));

      startKey = keyUrl;

      try {
//        CloseableIterator<SearchResult> reverse =
//          new AdaptedObjectFilterIterator<SearchResult>(
//          source.getPrefixReverseIterator(startKey),
//          reverseFilters);

//        // reverse the reverseResults:
//        ArrayList<SearchResult> reverseResults =
//          new ArrayList<SearchResult>();
//        while(reverse.hasNext()) {
//          reverseResults.add(0, reverse.next());
//        }
       
        // now make a composite of the reverse and forwards:
       
        CloseableIterator<SearchResult> forward =
          source.getPrefixIterator(startKey);
//       
//        CompositeIterator<SearchResult> resultsItr =
//          new CompositeIterator<SearchResult>();
//        resultsItr.addComponent(reverseResults.iterator());
//        resultsItr.addComponent(forward);
       
        // and filter:
//        filterRecords(resultsItr, forwardFilters, results, true);
        filterRecords(forward, forwardFilters, results, true);

      } catch (IOException e) {
        throw new ResourceIndexNotAvailableException(
            e.getLocalizedMessage());
      }

    } else if (searchType.equals(WaybackConstants.REQUEST_URL_QUERY)) {

      results = new CaptureSearchResults();
      // build up the FilterChain(s):
      ObjectFilterChain<SearchResult> filters =
        new ObjectFilterChain<SearchResult>();
      filters.addFilter(guardrail);
      filters.addFilter(new DuplicateRecordFilter());

      filters.addFilter(new UrlMatchFilter(keyUrl));
      if(hostMatchFilter != null) {
        filters.addFilter(hostMatchFilter);
      }
      filters.addFilter(new EndDateFilter(endDate));
      // possibly filter via exclusions:
      if (exclusion != null) {
        filters.addFilter(preExCounter);
        filters.addFilter(exclusion);
      }
      filters.addFilter(finalCounter);
      // OPTIMIZ: beginning the search at the startDate causes problems
      // with deduplicated results. We need to be smarter about rolling
      // backwards a ways if we start on a deduped record.
//      startKey = keyUrl + " " + startDate;
      startKey = keyUrl + " ";

      // add the start and end windowing filters:
      filters.addFilter(new WindowStartFilter(startResult));
      filters.addFilter(new WindowEndFilter(resultsPerPage));
      try {
        filterRecords(source.getPrefixIterator(startKey), filters, results,
            true);
      } catch (IOException e) {
        throw new ResourceIndexNotAvailableException(
            e.getLocalizedMessage());
      }
     

    } else if (searchType.equals(WaybackConstants.REQUEST_URL_PREFIX_QUERY)) {

      results = new UrlSearchResults();
      // build up the FilterChain(s):
      ObjectFilterChain<SearchResult> filters =
        new ObjectFilterChain<SearchResult>();
      filters.addFilter(guardrail);
      filters.addFilter(new DuplicateRecordFilter());

      filters.addFilter(new UrlPrefixMatchFilter(keyUrl));
      if(hostMatchFilter != null) {
        filters.addFilter(hostMatchFilter);
      }
      filters.addFilter(new DateRangeFilter(startDate, endDate));
      // possibly filter via exclusions:
      if (exclusion != null) {
        filters.addFilter(preExCounter);
        filters.addFilter(exclusion);
      }
      filters.addFilter(new CaptureToUrlResultFilter());
      filters.addFilter(finalCounter);
      startKey = keyUrl;

      // add the start and end windowing filters:
      filters.addFilter(new WindowStartFilter(startResult));
      filters.addFilter(new WindowEndFilter(resultsPerPage));
      try {
        filterRecords(source.getPrefixIterator(startKey), filters, results,
            true);
      } catch (IOException e) {
        throw new ResourceIndexNotAvailableException(
            e.getLocalizedMessage());
      }

    } else {
      throw new BadQueryException("Unknown query type(" + searchType
          + "), must be " + WaybackConstants.REQUEST_REPLAY_QUERY
          + ", " + WaybackConstants.REQUEST_CLOSEST_QUERY + ", "
          + WaybackConstants.REQUEST_URL_QUERY + ", or "
          + WaybackConstants.REQUEST_URL_PREFIX_QUERY);
    }

    int matched = finalCounter.getNumMatched();
    if (matched == 0) {
      if (exclusion != null) {
        if(preExCounter.getNumMatched() > 0) {
          throw new AccessControlException("All results Excluded");
        }
      }
      throw new ResourceNotInArchiveException("the URL " + keyUrl
          + " is not in the archive.");
    }

    // now we need to set some filter properties on the results:
    results.putFilter(WaybackConstants.REQUEST_URL, keyUrl);
    results.putFilter(WaybackConstants.REQUEST_TYPE, searchType);
    results.putFilter(WaybackConstants.REQUEST_START_DATE, startDate);
    results.putFilter(WaybackConstants.REQUEST_EXACT_DATE, exactDate);
    results.putFilter(WaybackConstants.REQUEST_END_DATE, endDate);

    // window info
    results.putFilter(WaybackConstants.RESULTS_FIRST_RETURNED, String
        .valueOf(startResult));
    results.putFilter(WaybackConstants.RESULTS_REQUESTED, String
        .valueOf(resultsPerPage));

    // how many are actually in the results:
    results.putFilter(WaybackConstants.RESULTS_NUM_RESULTS, String
        .valueOf(matched));

    // how many matched (includes those outside window)
    results.putFilter(WaybackConstants.RESULTS_NUM_RETURNED, String
        .valueOf(results.getResultCount()));

    return results;
  }

  /**
   * @param maxRecords the maxRecords to set
   */
  public void setMaxRecords(int maxRecords) {
    this.maxRecords = maxRecords;
  }

  /**
   * @param source the source to set
   */
  public void setSource(SearchResultSource source) {
    this.source = source;
  }

  public boolean isDedupeRecords() {
    return dedupeRecords;
  }

  public void setDedupeRecords(boolean dedupeRecords) {
    this.dedupeRecords = dedupeRecords;
  }

  public UrlCanonicalizer getCanonicalizer() {
    return canonicalizer;
  }

  public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
    this.canonicalizer = canonicalizer;
  }

  public void shutdown() throws IOException {
    source.shutdown();
  }
}
TOP

Related Classes of org.archive.wayback.resourceindex.LocalResourceIndex

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.