Package org.archive.wayback.resourceindex.filterfactory

Source Code of org.archive.wayback.resourceindex.filterfactory.ExclusionCaptureFilterGroup

/*
*  This file is part of the Wayback archival access software
*   (http://archive-access.sourceforge.net/projects/wayback/).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.wayback.resourceindex.filterfactory;

import java.util.List;

import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.SearchResults;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.AccessControlException;
import org.archive.wayback.exception.AdministrativeAccessControlException;
import org.archive.wayback.exception.ResourceNotInArchiveException;
import org.archive.wayback.exception.RobotAccessControlException;
import org.archive.wayback.exception.RobotNotAvailableException;
import org.archive.wayback.exception.RobotTimedOutAccessControlException;
import org.archive.wayback.resourceindex.filters.CounterFilter;
import org.archive.wayback.resourceindex.filters.ExclusionFilter;
import org.archive.wayback.util.ObjectFilter;
import org.archive.wayback.util.ObjectFilterChain;

public class ExclusionCaptureFilterGroup implements CaptureFilterGroup {

  private ObjectFilterChain<CaptureSearchResult> chain = null;
  private CounterFilter preCounter = null;
  private CounterFilter postCounter = null;
  String requestUrl = null;
  private boolean sawRobots = false;
  private boolean passedRobots = false;
  private boolean robotTimedOut = false;
  private boolean liveWebGone = false;
  private boolean sawAdministrative = false;
  private boolean passedAdministrative = false;
  private UrlCanonicalizer canonicalizer = null;
 
  public ExclusionCaptureFilterGroup(WaybackRequest request, UrlCanonicalizer canonicalizer) {
   
    this.canonicalizer = canonicalizer;
   
    // checks an exclusion service for every matching record
    ExclusionFilter exclusion = request.getExclusionFilter();
    chain = new ObjectFilterChain<CaptureSearchResult>();
    if(exclusion != null) {
      exclusion.setFilterGroup(this);
//      preCounter = new CounterFilter();
//      // count how many results got to the ExclusionFilter:
//      chain.addFilter(preCounter);
      chain.addFilter(exclusion);
      // count how many results got past the ExclusionFilter:
      requestUrl = request.getRequestUrl();
    }
//    postCounter = new CounterFilter();
//    chain.addFilter(postCounter);
  }
 
  public UrlCanonicalizer getCaptureFilterGroupCanonicalizer()
  {
    return canonicalizer;
  }
 
  public List<ObjectFilter<CaptureSearchResult>> getFilters() {
    return chain.getFilters();
  }

  public void annotateResults(SearchResults results)
      throws AccessControlException, ResourceNotInArchiveException,
      RobotNotAvailableException {
    if(getRobotTimedOut()) {
      throw new RobotTimedOutAccessControlException("Unable to check" +
          " robots.txt for " + requestUrl);
    }
    if(getLiveWebGone()) {
      throw new RobotNotAvailableException("The URL " + requestUrl +
      " is blocked by the sites robots.txt file");
    }
    if(isSawRobots() && !isPassedRobots()) {
      throw new RobotAccessControlException("The URL " + requestUrl +
          " is blocked by the sites robots.txt file");
    }
    if(isSawAdministrative() && !isPassedAdministrative()) {
      throw new AdministrativeAccessControlException(requestUrl +
          "  is not available in the Wayback Machine.");
    }
  }

  public void setPassedRobots() {
    passedRobots = true;
  }
  public void setSawRobots() {
    sawRobots = true;
  }
 
  public void setPassedAdministrative(boolean passed) {
    passedAdministrative = passed;
  }

  public void setPassedAdministrative() {
    passedAdministrative = true;
  }
  public void setSawAdministrative() {
    sawAdministrative = true;
  }

  public void setRobotTimedOut() {
    robotTimedOut = true;
  }
  public boolean getRobotTimedOut() {
    return robotTimedOut;
  }

  public void setLiveWebGone() {
    liveWebGone = true;
  }
  public boolean getLiveWebGone() {
    return liveWebGone;
  }

  public boolean isSawRobots() {
    return sawRobots;
  }

  public boolean isPassedRobots() {
    return passedRobots;
  }

  public boolean isSawAdministrative() {
    return sawAdministrative;
  }

  public boolean isPassedAdministrative() {
    return passedAdministrative;
  }
}
TOP

Related Classes of org.archive.wayback.resourceindex.filterfactory.ExclusionCaptureFilterGroup

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.