Package org.archive.wayback.webapp

Source Code of org.archive.wayback.webapp.LiveWebAccessPoint

/*
*  This file is part of the Wayback archival access software
*   (http://archive-access.sourceforge.net/projects/wayback/).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.wayback.webapp;

import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Date;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.archive.io.arc.ARCRecord;
import org.archive.wayback.accesscontrol.robotstxt.RobotExclusionFilterFactory;
import org.archive.wayback.accesscontrol.staticmap.StaticMapExclusionFilterFactory;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.CaptureSearchResults;
import org.archive.wayback.core.FastCaptureSearchResult;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.AccessControlException;
import org.archive.wayback.exception.AdministrativeAccessControlException;
import org.archive.wayback.exception.BadQueryException;
import org.archive.wayback.exception.ConfigurationException;
import org.archive.wayback.exception.LiveDocumentNotAvailableException;
import org.archive.wayback.exception.ResourceIndexNotAvailableException;
import org.archive.wayback.exception.ResourceNotInArchiveException;
import org.archive.wayback.exception.RobotAccessControlException;
import org.archive.wayback.exception.WaybackException;
import org.archive.wayback.liveweb.LiveWebCache;
import org.archive.wayback.resourceindex.filters.ExclusionFilter;
import org.archive.wayback.resourcestore.resourcefile.ArcResource;
import org.archive.wayback.util.url.UrlOperations;

/**
* @author brad
*
* RequestHandler which satisfies all incoming requests through a LiveWebCache,
* using an internal AccessPoint to rewrite replayed documents.
*
*/
public class LiveWebAccessPoint extends LiveWebRequestHandler {
  private static final Logger LOGGER = Logger.getLogger(
      LiveWebAccessPoint.class.getName());
 
  enum PerfStat
  {
    LiveWeb;
  }

  private AccessPoint inner = null;
  private LiveWebCache cache = null;
  private RobotExclusionFilterFactory robotFactory = null;
  private StaticMapExclusionFilterFactory adminFactory = null;
 
  private Pattern skipHost = null;
  private int dnsCheckTimeout = 0;
 
  private String requireReferrer = null;
 
  public final static String LIVEWEB_RUNTIME_ERROR_HEADER = "X-Archive-Wayback-Runtime-Liveweb-Error";
 
  private long maxCacheMS = 86400000
 
   
  public boolean handleRequest(HttpServletRequest httpRequest,
      HttpServletResponse httpResponse)
  throws ServletException, IOException {
   
    String urlString = translateRequestPathQuery(httpRequest);
    urlString = UrlOperations.fixupHTTPUrlWithOneSlash(urlString);
    boolean handled = true;
   
    ArcResource r = null;
   
    WaybackRequest wbRequest = new WaybackRequest();
    wbRequest.setAccessPoint(inner);
    wbRequest.setRequestUrl(urlString);

    try {
      String ref = httpRequest.getHeader("Referer");
     
      PerfStats.clearAll();     
     
      if ((ref == null) || !skipHost.matcher(ref).find()) {
        wbRequest.setTimestampSearchKey(true);
        wbRequest.setReplayDate(new Date());
        wbRequest.setReplayRequest();
       
        try {
          inner.queryIndex(wbRequest);
          // Succeeded, so send redirect to query
          httpResponse.sendRedirect(inner.getReplayPrefix() + urlString);
          return true;
        } catch (ResourceIndexNotAvailableException e) {
          throw new LiveDocumentNotAvailableException(e.toString());
        } catch (ResourceNotInArchiveException e) {
          //Continue
        } catch (BadQueryException e) {
          throw new LiveDocumentNotAvailableException(e.toString());
        } catch (AccessControlException e) {
          //Continue
          //throw new LiveDocumentNotAvailableException(e.toString());
        } catch (ConfigurationException e) {
          throw new LiveDocumentNotAvailableException(e.toString());
        }
      }
     
      wbRequest.setLiveWebRequest(true);
     
      if (inner.isEnablePerfStatsHeader()) {
        PerfStats.timeStart(AccessPoint.PerfStat.Total);
        httpResponse = new PerfWritingHttpServletResponse(httpRequest, httpResponse, AccessPoint.PerfStat.Total, inner.getPerfStatsHeader());
      }
     
      Thread.currentThread().setName("Thread " +
          Thread.currentThread().getId() + " " + getBeanName() +
          " handling: " + urlString);
     
      CaptureSearchResult result = new FastCaptureSearchResult();
     
      r = this.getLiveWebResource(result, urlString);
     
      if (r != null) {       
        CaptureSearchResults results = new CaptureSearchResults();
        results.addSearchResult(result);
     
        wbRequest.setReplayTimestamp(result.getCaptureTimestamp());
         
        inner.getReplay().getRenderer(wbRequest, result, r).renderResource(httpRequest, httpResponse, wbRequest, result, r,
            inner.getUriConverter(), results)
      } else {
        throw new LiveDocumentNotAvailableException(urlString);
      }

    } catch(WaybackException e) {
      inner.logError(httpResponse, LIVEWEB_RUNTIME_ERROR_HEADER, e, wbRequest);
      inner.getException().renderException(httpRequest, httpResponse, wbRequest, e, inner.getUriConverter());
   
    } catch(Exception e) {
      inner.logError(httpResponse, LIVEWEB_RUNTIME_ERROR_HEADER, e, wbRequest);
    } finally {
      if (r != null) {
        r.close();
      }
    }
   
    return handled;
  }
 
  protected ArcResource getLiveWebResource(CaptureSearchResult result, String urlString) throws WaybackException, IOException
  {
    URL url = null;
    ArcResource r = null;

    if (!urlString.startsWith(UrlOperations.HTTP_SCHEME) &&
      !urlString.startsWith(UrlOperations.HTTPS_SCHEME)) {
     
      // Remove accidental calendar page requests
      if (urlString.startsWith("*/")) {
        urlString = urlString.substring(2);
      }
     
      // Assume http
      urlString = UrlOperations.HTTP_SCHEME + urlString;
    }
       
    try {
      url = new URL(urlString);
    } catch(MalformedURLException e) {
      throw new BadQueryException("Bad URL(" + urlString + ")");
    }
   
    if ((skipHost != null) && skipHost.matcher(url.getHost()).find()) {
      return null;
    }
   
    if ((dnsCheckTimeout > 0) && !checkUrlDns(url, dnsCheckTimeout)) {
      return null;
    }

    result.setOriginalUrl(urlString);
   
    String canonUrl = urlString;
   
    if (inner.getSelfRedirectCanonicalizer() != null) {
      try {
        canonUrl = inner.getSelfRedirectCanonicalizer().urlStringToKey(urlString);
      } catch (IOException io) {
        throw new BadQueryException("Bad URL(" + urlString + ")");
      }
    }
   
    result.setUrlKey(canonUrl);
   
    // check admin excludes first, if configured:
    if(adminFactory != null) {
      ExclusionFilter f = adminFactory.get();
      if(f == null) {
        LOGGER.severe("Unable to get administrative exclusion filter!");
        throw new AdministrativeAccessControlException(urlString + "is blocked.");
      }
      int ruling = f.filterObject(result);
      if(ruling == ExclusionFilter.FILTER_EXCLUDE) {
        throw new AdministrativeAccessControlException(urlString + "is blocked.");
      }       
    }
    // check robots next, if configured
    if(robotFactory != null) {
      int ruling = robotFactory.get().filterObject(result);
      if(ruling == ExclusionFilter.FILTER_EXCLUDE) {
        throw new RobotAccessControlException(urlString + "is blocked by robots.txt");
      }
    }
    // no robots check, or robots.txt says GO:
    //long start = System.currentTimeMillis();
   
    try {
      PerfStats.timeStart(PerfStat.LiveWeb);
      r = (ArcResource) cache.getCachedResource(url, maxCacheMS , false);
    } finally {
      PerfStats.timeEnd(PerfStat.LiveWeb);
    }

    ARCRecord ar = (ARCRecord) r.getArcRecord();
    int status = ar.getStatusCode();
    if ((status == 200) || ((status >= 300) && (status < 400))) {
      result.setCaptureTimestamp(ar.getMetaData().getDate());
      result.setMimeType(ar.getMetaData().getMimetype());
      return r;
    }
   
    return null;
  }
 
  @Override
  public String getLiveWebRedirect(HttpServletRequest request, WaybackRequest wbRequest, WaybackException we)
  {
    if (isLiveWebFound(request, wbRequest)) {
      return LiveWebRedirector.DEFAULT;
    }
   
    return null;
  }
 
  protected boolean checkUrlDns(URL url, int timeout)
  {   
    InetAddress addr = null;
   
    try {
      addr = InetAddress.getByName(url.getHost());
    } catch (UnknownHostException e) {
      return false;
    }
   
    if (addr.isAnyLocalAddress() || addr.isLinkLocalAddress() || addr.isLoopbackAddress()) {
      return false;
    }
   
    if (timeout == 0) {
      return true;
    }
   
    try {
      if (addr.isReachable(timeout)) {
        return true;
      }
    } catch (IOException e) {

    }
   
    return false;
  }
 
  private boolean isLiveWebFound(HttpServletRequest request, WaybackRequest wbRequest)
  {
    ArcResource r = null;
   
    String urlString = wbRequest.getRequestUrl();
   
    try {
      r = getLiveWebResource(new FastCaptureSearchResult(), urlString);
      return (r != null);
     
    } catch (Exception e) {
      return false;
    } finally {
      if (r != null) {
        try {
          r.close();
        } catch (IOException e) {
         
        }
      }
    }
  }

  /**
   * @return the cache
   */
  public LiveWebCache getCache() {
    return cache;
  }

  /**
   * @param cache the cache to set
   */
  public void setCache(LiveWebCache cache) {
    this.cache = cache;
  }

  /**
   * @return the robotFactory
   */
  public RobotExclusionFilterFactory getRobotFactory() {
    return robotFactory;
  }

  /**
   * @param robotFactory the robotFactory to set
   */
  public void setRobotFactory(RobotExclusionFilterFactory robotFactory) {
    this.robotFactory = robotFactory;
  }

  /**
   * @return the inner
   */
  public AccessPoint getInner() {
    return inner;
  }

  /**
   * @param inner the inner to set
   */
  public void setInner(AccessPoint inner) {
    this.inner = inner;
  }

  public StaticMapExclusionFilterFactory getAdminFactory() {
    return adminFactory;
  }

  public void setAdminFactory(StaticMapExclusionFilterFactory adminFactory) {
    this.adminFactory = adminFactory;
  }

  public String getSkipHost() {
    return skipHost.pattern();
  }

  public void setSkipHost(String skipHost) {
    this.skipHost = Pattern.compile(skipHost);
  }

  public int getDnsCheckTimeout() {
    return dnsCheckTimeout;
  }

  public void setDnsCheckTimeout(int dnsCheckTimeout) {
    this.dnsCheckTimeout = dnsCheckTimeout;
  }

  public String getRequireReferrer() {
    return requireReferrer;
  }

  public void setRequireReferrer(String requireReferrer) {
    this.requireReferrer = requireReferrer;
  }
}
TOP

Related Classes of org.archive.wayback.webapp.LiveWebAccessPoint

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.