// Package org.apache.manifoldcf.crawler.connectors.rss
//
// Source Code of org.apache.manifoldcf.crawler.connectors.rss.ThrottledFetcher$ExecuteMethodThread

/* $Id: ThrottledFetcher.java 988245 2010-08-23 18:39:35Z kwright $ */

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.connectors.rss;

import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.core.common.XThreadInputStream;
import org.apache.manifoldcf.core.common.InterruptibleSocketFactory;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
import org.apache.manifoldcf.crawler.system.Logging;
import org.apache.manifoldcf.crawler.system.ManifoldCF;
import java.util.*;
import java.io.*;
import java.net.*;

import org.apache.http.conn.HttpClientConnectionManager;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.protocol.HttpRequestExecutor;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.ssl.BrowserCompatHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.NTCredentials;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.DefaultRedirectStrategy;
import org.apache.http.util.EntityUtils;
import org.apache.http.HttpStatus;
import org.apache.http.HttpHost;
import org.apache.http.Header;
import org.apache.http.message.BasicHeader;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.protocol.HttpContext;

import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.client.CircularRedirectException;
import org.apache.http.NoHttpResponseException;
import org.apache.http.HttpException;

/** This class uses httpclient to fetch stuff from webservers.  However, it additionally controls the fetch
* rate in two ways: first, controlling the overall bandwidth used per server, and second, limiting the number
* of simultaneous open connections per server.  It's also capable of limiting the maximum number of fetches
* per time period per server as well; however, this functionality is not strictly necessary at this time because
* the CF scheduler does that at a higher layer.
* An instance of this class would very probably need to have a lifetime consistent with the long-term nature
* of these values, and be static.
* This class sets up a different Http connection pool for each server, so that we can foist off onto the httpclient
* library the task of limiting the number of connections.  This means that we need periodic polling to determine
* when idle pooled connections can be freed.
*/
public class ThrottledFetcher
{
  /** Revision-control identification string for this source file. */
  public static final String _rcsid = "@(#)$Id: ThrottledFetcher.java 988245 2010-08-23 18:39:35Z kwright $";

  /** This flag determines whether we record everything to the disk, as a means of doing a web snapshot.
  * NOTE(review): not referenced anywhere in the visible part of this file - confirm whether it is still used. */
  protected static final boolean recordEverything = false;

  /** The read chunk length: ThrottledInputstream.read(byte[],int,int) never asks the throttler for
  * more than this many bytes in a single basicRead() call. */
  protected static final int READ_CHUNK_LENGTH = 4096;

  /** This counter keeps track of the total outstanding handles across everything, because we do try to control that.
  * It is incremented by registerGlobalHandle() and decremented by releaseGlobalHandle(), and is only
  * touched while holding globalHandleCounterLock. */
  protected static int globalHandleCount = 0;
  /** This is the lock object for that global handle counter.  It is used purely as a monitor
  * (synchronized/wait/notifyAll); its numeric value is irrelevant.  The explicit constructor call yields a
  * private instance; do not replace it with Integer.valueOf(0) or autoboxing, which would hand back a
  * JVM-wide cached object that unrelated code could also lock on. */
  protected static Integer globalHandleCounterLock = new Integer(0);

  /** This hash maps the server string (without port) to a pool throttling object, where
  * we can track the statistics and make sure we throttle appropriately.  Entries are added lazily by
  * createConnection() and the whole map is cleared by noteConnectionReleased() when refCount reaches zero.
  * All access happens from synchronized instance methods. */
  protected final Map<String,IConnectionThrottler> serverMap = new HashMap<String,IConnectionThrottler>();

  /** Reference count for how many connections to this pool there are; maintained by
  * noteConnectionEstablished() and noteConnectionReleased(). */
  protected int refCount = 0;

  // Current host name
  private static String currentHost = null;
  static
  {
    // Find the current host name
    try
    {
      java.net.InetAddress addr = java.net.InetAddress.getLocalHost();

      // Get hostname
      currentHost = addr.getHostName();
    }
    catch (java.net.UnknownHostException e)
    {
    }
  }

  /** Note that we're about to need a handle (and make sure we have enough) */
  protected static void registerGlobalHandle(int maxHandles)
    throws ManifoldCFException
  {
    try
    {
      synchronized (globalHandleCounterLock)
      {
        while (globalHandleCount >= maxHandles)
        {
          globalHandleCounterLock.wait();
        }
        globalHandleCount++;
      }
    }
    catch (InterruptedException e)
    {
      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
    }
  }

  /** Note that we're done with a handle (so we can free it) */
  protected static void releaseGlobalHandle()
  {
    synchronized (globalHandleCounterLock)
    {
      globalHandleCount--;
      globalHandleCounterLock.notifyAll();
    }
  }

  /** Constructor.  The new fetcher starts with an empty server map and a reference count of zero
  * (both set by the field initializers); nothing is contacted until createConnection() is called.
  */
  public ThrottledFetcher()
  {
  }

  /** Establish a connection to a specified URL.
  * @param serverName is the FQDN of the server, e.g. foo.metacarta.com
  * @param connectionLimit is the maximum desired outstanding connections at any one time.
  * @param connectionTimeoutMilliseconds is the number of milliseconds to wait for the connection before timing out.
  */
  public synchronized IThrottledConnection createConnection(IThreadContext threadContext, String throttleGroupName,
    String serverName, int connectionLimit, int connectionTimeoutMilliseconds,
    String proxyHost, int proxyPort, String proxyAuthDomain, String proxyAuthUsername, String proxyAuthPassword)
    throws ManifoldCFException, ServiceInterruption
  {
    IConnectionThrottler server;
    server = serverMap.get(serverName);
    if (server == null)
    {
      // Create a connection throttler for this server
      IThrottleGroups tg = ThrottleGroupsFactory.make(threadContext);
      server = tg.obtainConnectionThrottler(RSSConnector.rssThrottleGroupType, throttleGroupName, new String[]{serverName});
      serverMap.put(serverName,server);
    }

    return new ThrottledConnection(serverName, server,
      connectionTimeoutMilliseconds,connectionLimit,
      proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword);
  }

  /** Poll.  This method is designed to allow idle connections to be closed and freed.
  * The current implementation has an empty body: there is nothing to reclaim here because each
  * ThrottledConnection shuts down its own connection manager in close().
  */
  public synchronized void poll()
    throws ManifoldCFException
  {
    // Nothing needed now; connections are released when we're done with them.
  }

  /** Note that there is a repository connection that is using this object. */
  public synchronized void noteConnectionEstablished()
  {
    refCount++;
  }

  /** Connection pool no longer needed.  Call this to indicate that this object no
  * longer needs to keep its pools available, for the moment.
  */
  public synchronized void noteConnectionReleased()
  {
    refCount--;
    if (refCount == 0)
    {
      // Since we don't have any actual pools here, this can be a no-op for now
      // MHL
      serverMap.clear();
    }
  }

  /** This class represents an established connection to a URL.
  */
  protected static class ThrottledConnection implements IThrottledConnection
  {
    // ---- Fixed for the lifetime of the connection (assigned in the constructor) ----

    /** The server fqdn; executeFetch() combines it with protocol, port and path to form the URL */
    protected final String serverName;
    /** The throttling object we use to track connections */
    protected final IConnectionThrottler connectionThrottler;
    /** The throttling object we use to track fetches; obtained from connectionThrottler in the constructor */
    protected final IFetchThrottler fetchThrottler;
    /** Connection timeout in milliseconds */
    protected final int connectionTimeoutMilliseconds;
    /** The client connection manager; shut down in close() */
    protected final HttpClientConnectionManager connectionManager;
    /** The httpclient */
    protected final HttpClient httpClient;

    // ---- Per-fetch state: set up by beginFetch()/executeFetch(), cleared again by doneFetch() ----

    /** The method object */
    protected HttpRequestBase executeMethod = null;
    /** The start-fetch time, from System.currentTimeMillis(); -1 when no fetch is in progress */
    protected long startFetchTime = -1L;
    /** The error trace, if any */
    protected Throwable throwable = null;
    /** The current URL being fetched */
    protected String myUrl = null;
    /** The status code fetched, if any: FETCH_NOT_TRIED initially, then either the HTTP status or one of
    * the FETCH_* error codes stored by executeFetch(); doneFetch() sets it to -1 */
    protected int statusCode = FETCH_NOT_TRIED;
    /** The kind of fetch we are doing; non-null between beginFetch() and doneFetch() */
    protected String fetchType = null;
    /** The current bytes in the current fetch; zeroed by beginFetch() and advanced by logFetchCount() */
    protected long fetchCounter = 0L;

    /** The thread that is actually doing the work */
    protected ExecuteMethodThread methodThread = null;
    /** Set if thread has been started */
    protected boolean threadStarted = false;
   
    /** Constructor.
    */
    public ThrottledConnection(String serverName,
      IConnectionThrottler connectionThrottler,
      int connectionTimeoutMilliseconds, int connectionLimit,
      String proxyHost, int proxyPort, String proxyAuthDomain, String proxyAuthUsername, String proxyAuthPassword)
      throws ManifoldCFException
    {
      this.serverName = serverName;
      this.connectionThrottler = connectionThrottler;
      this.connectionTimeoutMilliseconds = connectionTimeoutMilliseconds;

      // Create the https scheme for this connection
      javax.net.ssl.SSLSocketFactory httpsSocketFactory = KeystoreManagerFactory.getTrustingSecureSocketFactory();;
      SSLConnectionSocketFactory myFactory = new SSLConnectionSocketFactory(new InterruptibleSocketFactory(httpsSocketFactory,connectionTimeoutMilliseconds),
        SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);

      connectionManager = new PoolingHttpClientConnectionManager();

      CredentialsProvider credentialsProvider = new BasicCredentialsProvider();

      RequestConfig.Builder requestBuilder = RequestConfig.custom()
          .setCircularRedirectsAllowed(true)
          .setSocketTimeout(connectionTimeoutMilliseconds)
          .setStaleConnectionCheckEnabled(true)
          .setExpectContinueEnabled(true)
          .setConnectTimeout(connectionTimeoutMilliseconds)
          .setConnectionRequestTimeout(connectionTimeoutMilliseconds);

      // If there's a proxy, set that too.
      if (proxyHost != null && proxyHost.length() > 0)
      {

        // Configure proxy authentication
        if (proxyAuthUsername != null && proxyAuthUsername.length() > 0)
        {
          if (proxyAuthPassword == null)
            proxyAuthPassword = "";
          if (proxyAuthDomain == null)
            proxyAuthDomain = "";

          credentialsProvider.setCredentials(
            new AuthScope(proxyHost, proxyPort),
            new NTCredentials(proxyAuthUsername, proxyAuthPassword, currentHost, proxyAuthDomain));
        }

        HttpHost proxy = new HttpHost(proxyHost, proxyPort);

        requestBuilder.setProxy(proxy);
      }

      httpClient = HttpClients.custom()
        .setConnectionManager(connectionManager)
        .setMaxConnTotal(1)
        .disableAutomaticRetries()
        .setDefaultRequestConfig(requestBuilder.build())
        .setDefaultSocketConfig(SocketConfig.custom()
          .setTcpNoDelay(true)
          .setSoTimeout(connectionTimeoutMilliseconds)
          .build())
        .setDefaultCredentialsProvider(credentialsProvider)
        .setSSLSocketFactory(myFactory)
        .setRequestExecutor(new HttpRequestExecutor(connectionTimeoutMilliseconds))
        .setRedirectStrategy(new DefaultRedirectStrategy())
        .build();

      registerGlobalHandle(connectionLimit);
      try
      {
        int result = connectionThrottler.waitConnectionAvailable();
        if (result != IConnectionThrottler.CONNECTION_FROM_CREATION)
          throw new IllegalStateException("Got back unexpected value from waitForAConnection() of "+result);
        fetchThrottler = connectionThrottler.getNewConnectionFetchThrottler();
      }
      catch (InterruptedException e)
      {
        throw new ManifoldCFException(e.getMessage(),ManifoldCFException.INTERRUPTED);
      }
    }

    /** Begin the fetch process.
    * @param fetchType is a short descriptive string describing the kind of fetch being requested.  This
    *        is used solely for logging purposes.
    */
    public void beginFetch(String fetchType)
      throws ManifoldCFException
    {
      this.fetchType = fetchType;
      fetchCounter = 0L;
      try
      {
        if (fetchThrottler.obtainFetchDocumentPermission() == false)
          throw new IllegalStateException("obtainFetchDocumentPermission() had unexpected return value");
      }
      catch (InterruptedException e)
      {
        throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
      }
      threadStarted = false;
    }

    /** Log the fetch of a number of bytes. */
    public void logFetchCount(int count)
    {
      fetchCounter += (long)count;
    }


    /** Execute the fetch and get the return code.  This method uses the
    * standard logging mechanism to keep track of the fetch attempt.  It also
    * signals the following three conditions: ServiceInterruption (if a dynamic
    * error occurs), OK, or a static error code (for a condition where retry is
    * not likely to be helpful).  The actual HTTP error code is NOT returned by
    * this method.
    * @param protocol is the protocol to use to perform the access, e.g. "http"
    * @param port is the port to use to perform the access, where -1 means "use the default"
    * @param urlPath is the path part of the url, e.g. "/robots.txt"
    * @param userAgent is the value of the userAgent header to use.
    * @param from is the value of the from header to use.
    * @param proxyHost is the proxy host, or null if none.
    * @param proxyPort is the proxy port, or -1 if none.
    * @param proxyAuthDomain is the proxy authentication domain, or null.
    * @param proxyAuthUsername is the proxy authentication user name, or null.
    * @param proxyAuthPassword is the proxy authentication password, or null.
    * @param lastETag is the requested lastETag header value.
    * @param lastModified is the requested lastModified header value.
    * @return the status code: success, static error, or dynamic error.
    */
    public int executeFetch(String protocol, int port, String urlPath, String userAgent, String from,
      String lastETag, String lastModified)
      throws ManifoldCFException, ServiceInterruption
    {

      StringBuilder sb = new StringBuilder(protocol);
      sb.append("://").append(serverName);
      if (port != -1)
        sb.append(":").append(Integer.toString(port));
      sb.append(urlPath);
      myUrl = sb.toString();

      // Create the get method
      executeMethod = new HttpGet(myUrl);
     
      startFetchTime = System.currentTimeMillis();

      // Set all appropriate headers
      executeMethod.setHeader(new BasicHeader("User-Agent",userAgent));
      executeMethod.setHeader(new BasicHeader("From",from));
      executeMethod.setHeader(new BasicHeader("Accept","*/*"));

      if (lastETag != null)
        executeMethod.setHeader(new BasicHeader("ETag",lastETag));
      if (lastModified != null)
        executeMethod.setHeader(new BasicHeader("Last-Modified",lastModified));
      // Create the execution thread.
      methodThread = new ExecuteMethodThread(this, fetchThrottler,
        httpClient, executeMethod);
      // Start the method thread, which will start the transaction
      try
      {
        methodThread.start();
        threadStarted = true;
        // We want to wait until at least the execution has fired, and then figure out where we
        // stand
        try
        {
          statusCode = methodThread.getResponseCode();
          long currentTime;
          switch (statusCode)
          {
          case HttpStatus.SC_OK:
            return STATUS_OK;
          case HttpStatus.SC_UNAUTHORIZED:
          case HttpStatus.SC_USE_PROXY:
            // Permanent errors that mean, "fetch not allowed"
            return STATUS_SITEERROR;
          case HttpStatus.SC_REQUEST_TIMEOUT:
          case HttpStatus.SC_GATEWAY_TIMEOUT:
          case HttpStatus.SC_SERVICE_UNAVAILABLE:
            // Temporary service interruption
            // May want to make the retry time a parameter someday
            currentTime = System.currentTimeMillis();
            throw new ServiceInterruption("Http response temporary error on '"+myUrl+"': "+Integer.toString(statusCode),
              null,currentTime + 60L * 60000L,currentTime + 1440L * 60000L,-1,false);
          case HttpStatus.SC_NOT_MODIFIED:
            return STATUS_NOCHANGE;
          case HttpStatus.SC_INTERNAL_SERVER_ERROR:
            // Fail for a while, but give up after 24 hours
            currentTime = System.currentTimeMillis();
            throw new ServiceInterruption("Http response internal server error on '"+myUrl+"': "+Integer.toString(statusCode),
              null,currentTime + 60L * 60000L,currentTime + 1440L * 60000L,-1,false);
          case HttpStatus.SC_GONE:
          case HttpStatus.SC_NOT_FOUND:
          case HttpStatus.SC_BAD_GATEWAY:
          case HttpStatus.SC_BAD_REQUEST:
          default:
            return STATUS_PAGEERROR;
          }
        }
        catch (InterruptedException e)
        {
          methodThread.interrupt();
          methodThread = null;
          threadStarted = false;
          throw e;
        }

      }
      catch (InterruptedException e)
      {
        // Drop the current connection on the floor, so it cannot be reused.
        executeMethod = null;
        throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
      }
      catch (java.net.MalformedURLException e)
      {
        throwable = new ManifoldCFException("Illegal URI: '"+myUrl+"'",e);
        statusCode = FETCH_BAD_URI;
        return STATUS_PAGEERROR;
      }
      catch (java.net.SocketTimeoutException e)
      {
        throwable = e;
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for IO for '"+myUrl+"': "+e.getMessage(), e, currentTime + 300000L,
          currentTime + 120L * 60000L,-1,false);
      }
      catch (ConnectTimeoutException e)
      {
        throwable = e;
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for connect for '"+myUrl+"': "+e.getMessage(), e, currentTime + 60L * 60000L,
          currentTime + 720L * 60000L,-1,false);
      }
      catch (InterruptedIOException e)
      {
        throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
      }
      catch (CircularRedirectException e)
      {
        throwable = e;
        statusCode = FETCH_CIRCULAR_REDIRECT;
        return STATUS_PAGEERROR;
      }
      catch (NoHttpResponseException e)
      {
        throwable = e;
        // Give up after 2 hours.
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for response for '"+myUrl+"'", e, currentTime + 15L * 60000L,
          currentTime + 120L * 60000L,-1,false);
      }
      catch (java.net.ConnectException e)
      {
        throwable = e;
        // Give up after 6 hours.
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for a connection for '"+myUrl+"'", e, currentTime + 1000000L,
          currentTime + 720L * 60000L,-1,false);
      }
      catch (java.net.NoRouteToHostException e)
      {
        // This exception means we know the IP address but can't get there.  That's either a firewall issue, or it's something transient
        // with the network.  Some degree of retry is probably wise.
        throwable = e;
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("No route to host for '"+myUrl+"'", e, currentTime + 1000000L,
          currentTime + 720L * 60000L,-1,false);
      }
      catch (HttpException e)
      {
        throwable = e;
        statusCode = FETCH_IO_ERROR;
        return STATUS_PAGEERROR;
      }
      catch (IOException e)
      {
        // Treat this as a bad url.  We don't know what happened, but it isn't something we are going to naively
        // retry on.
        throwable = e;
        statusCode = FETCH_IO_ERROR;
        return STATUS_PAGEERROR;
      }
      catch (Throwable e)
      {
        Logging.connectors.debug("RSS: Caught an unexpected exception: "+e.getMessage(),e);
        throwable = e;
        statusCode = FETCH_UNKNOWN_ERROR;
        return STATUS_PAGEERROR;
      }
    }

    /** Get the http response code.
    *@return the response code.  This is either an HTTP response code, or one of the codes above.
    */
    public int getResponseCode()
      throws ManifoldCFException, ServiceInterruption
    {
      return statusCode;
    }

    /** Get the response input stream.  It is the responsibility of the caller
    * to close this stream when done.
    */
    public InputStream getResponseBodyStream()
      throws ManifoldCFException, ServiceInterruption
    {
      if (executeMethod == null)
        throw new ManifoldCFException("Attempt to get an input stream when there is no method");
      if (methodThread == null || threadStarted == false)
        throw new ManifoldCFException("Attempt to get an input stream when no method thread");
      try
      {
        return methodThread.getSafeInputStream();
      }
      catch (InterruptedException e)
      {
        methodThread.interrupt();
        throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
      }
      catch (java.net.SocketTimeoutException e)
      {
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for IO for '"+myUrl+"': "+e.getMessage(), e, currentTime + 300000L,
          currentTime + 120L * 60000L,-1,false);
      }
      catch (ConnectTimeoutException e)
      {
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for connect for '"+myUrl+"': "+e.getMessage(), e, currentTime + 60L * 60000L,
          currentTime + 720L * 60000L,-1,false);
      }
      catch (InterruptedIOException e)
      {
        methodThread.interrupt();
        throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
      }
      catch (NoHttpResponseException e)
      {
        // Give up after 2 hours.
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for response for '"+myUrl+"'", e, currentTime + 15L * 60000L,
          currentTime + 120L * 60000L,-1,false);
      }
      catch (java.net.ConnectException e)
      {
        // Give up after 6 hours.
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for a stream connection for '"+myUrl+"'", e, currentTime + 1000000L,
          currentTime + 720L * 60000L,-1,false);
      }
      catch (java.net.NoRouteToHostException e)
      {
        // This exception means we know the IP address but can't get there.  That's either a firewall issue, or it's something transient
        // with the network.  Some degree of retry is probably wise.
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("No route to host for '"+myUrl+"'", e, currentTime + 1000000L,
          currentTime + 720L * 60000L,-1,false);
      }
      catch (HttpException e)
      {
        throw new ManifoldCFException("Http exception reading stream: "+e.getMessage(),e);
      }
      catch (IOException e)
      {
        throw new ManifoldCFException("I/O exception reading stream: "+e.getMessage(),e);
      }
    }

    /** Get a specified response header, if it exists.
    *@param headerName is the name of the header.
    *@return the header value, or null if it doesn't exist.
    */
    public String getResponseHeader(String headerName)
      throws ManifoldCFException, ServiceInterruption
    {
      if (executeMethod == null)
        throw new ManifoldCFException("Attempt to get a header when there is no method");
      if (methodThread == null || threadStarted == false)
        throw new ManifoldCFException("Attempt to get a header when no method thread");
      try
      {
        return methodThread.getFirstHeader(headerName);
      }
      catch (InterruptedException e)
      {
        methodThread.interrupt();
        throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
      }
      catch (java.net.SocketTimeoutException e)
      {
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for IO for '"+myUrl+"': "+e.getMessage(), e, currentTime + 300000L,
          currentTime + 120L * 60000L,-1,false);
      }
      catch (ConnectTimeoutException e)
      {
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for connect for '"+myUrl+"': "+e.getMessage(), e, currentTime + 60L * 60000L,
          currentTime + 720L * 60000L,-1,false);
      }
      catch (InterruptedIOException e)
      {
        methodThread.interrupt();
        throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
      }
      catch (NoHttpResponseException e)
      {
        // Give up after 2 hours.
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for response for '"+myUrl+"'", e, currentTime + 15L * 60000L,
          currentTime + 120L * 60000L,-1,false);
      }
      catch (java.net.ConnectException e)
      {
        // Give up after 6 hours.
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("Timed out waiting for a connection for '"+myUrl+"'", e, currentTime + 1000000L,
          currentTime + 720L * 60000L,-1,false);
      }
      catch (java.net.NoRouteToHostException e)
      {
        // This exception means we know the IP address but can't get there.  That's either a firewall issue, or it's something transient
        // with the network.  Some degree of retry is probably wise.
        long currentTime = System.currentTimeMillis();
        throw new ServiceInterruption("No route to host for '"+myUrl+"'", e, currentTime + 1000000L,
          currentTime + 720L * 60000L,-1,false);
      }
      catch (HttpException e)
      {
        throw new ManifoldCFException("Http exception reading response: "+e.getMessage(),e);
      }
      catch (IOException e)
      {
        throw new ManifoldCFException("I/O exception reading response: "+e.getMessage(),e);
      }
    }

    /** Done with the fetch.  Call this when the fetch has been completed.  A log entry will be generated
    * describing what was done.
    */
    public void doneFetch(IVersionActivity activities)
      throws ManifoldCFException
    {
     
      if (fetchType != null)
      {
        if (methodThread != null && threadStarted)
          methodThread.abort();
        long endTime = System.currentTimeMillis();

        activities.recordActivity(new Long(startFetchTime),RSSConnector.ACTIVITY_FETCH,
          new Long(fetchCounter),myUrl,Integer.toString(statusCode),(throwable==null)?null:throwable.getMessage(),null);

        Logging.connectors.info("RSS: FETCH "+fetchType+"|"+myUrl+"|"+new Long(startFetchTime).toString()+"+"+new Long(endTime-startFetchTime).toString()+"|"+
          Integer.toString(statusCode)+"|"+new Long(fetchCounter).toString()+"|"+((throwable==null)?"":(throwable.getClass().getName()+"| "+throwable.getMessage())));
        if (throwable != null)
        {
          if (Logging.connectors.isDebugEnabled())
            Logging.connectors.debug("RSS: Fetch exception for '"+myUrl+"'",throwable);
        }
       
        // Shut down (join) the connection thread, if any, and if it started
        if (methodThread != null)
        {
          if (threadStarted)
          {
            try
            {
              methodThread.finishUp();
            }
            catch (InterruptedException e)
            {
              throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
            }
            threadStarted = false;
          }
          methodThread = null;
        }
       
        executeMethod = null;
        throwable = null;
        startFetchTime = -1L;
        myUrl = null;
        statusCode = -1;
        fetchType = null;
      }
    }

    /** Close the connection.  Call this to end this server connection.
    */
    public void close()
      throws ManifoldCFException
    {
      // Clean up the connection pool.  This should do the necessary bookkeeping to release the one connection that's sitting there.
      connectionManager.shutdown();
      connectionThrottler.noteConnectionDestroyed();
      releaseGlobalHandle();
    }

  }

  /** This class throttles an input stream based on the specified byte rate parameters.  The
  * throttling takes place across all streams that are open to the server in question.
  */
  protected static class ThrottledInputstream extends InputStream
  {
    /** Throttled connection */
    protected final ThrottledConnection throttledConnection;
    /** Stream throttler */
    protected final IStreamThrottler streamThrottler;
    /** The stream we are wrapping. */
    protected final InputStream inputStream;

    /** Constructor.
    */
    public ThrottledInputstream(ThrottledConnection throttledConnection, IStreamThrottler streamThrottler, InputStream is)
    {
      this.throttledConnection = throttledConnection;
      this.streamThrottler = streamThrottler;
      this.inputStream = is;
    }

    /** Read a byte.
    */
    public int read()
      throws IOException
    {
      byte[] byteArray = new byte[1];
      int count = read(byteArray,0,1);
      if (count == -1)
        return count;
      return (int)byteArray[0];
    }

    /** Read lots of bytes.
    */
    public int read(byte[] b)
      throws IOException
    {
      return read(b,0,b.length);
    }

    /** Read lots of specific bytes.
    */
    public int read(byte[] b, int off, int len)
      throws IOException
    {
      int totalCount = 0;
      while (len > ThrottledFetcher.READ_CHUNK_LENGTH)
      {
        int amt = basicRead(b,off,ThrottledFetcher.READ_CHUNK_LENGTH,totalCount);
        if (amt == -1)
        {
          if (totalCount == 0)
            return amt;
          return totalCount;
        }
        totalCount += amt;
        off += amt;
        len -= amt;
      }
      if (len > 0)
      {
        int amt = basicRead(b,off,len,totalCount);
        if (amt == -1)
        {
          if (totalCount == 0)
            return amt;
          return totalCount;
        }
        return totalCount + amt;
      }
      return totalCount;
    }

    /** Basic read, which uses the server object to throttle activity.
    */
    protected int basicRead(byte[] b, int off, int len, int totalSoFar)
      throws IOException
    {
      try
      {
        if (streamThrottler.obtainReadPermission(len) == false)
          throw new IllegalStateException("Throttler shut down while still active");
        int amt = 0;
        try
        {
          amt = inputStream.read(b,off,len);
          return amt;
        }
        finally
        {
          if (amt == -1)
            streamThrottler.releaseReadPermission(len,0);
          else
          {
            streamThrottler.releaseReadPermission(len,amt);
            throttledConnection.logFetchCount(amt);
          }
        }
      }
      catch (InterruptedException e)
      {
        InterruptedIOException e2 = new InterruptedIOException("Interrupted");
        e2.bytesTransferred = totalSoFar;
        throw e2;
      }
    }

    /** Skip
    */
    public long skip(long n)
      throws IOException
    {
      // Not sure whether we should bother doing anything with this; it's not used.
      return inputStream.skip(n);
    }

    /** Get available.
    */
    public int available()
      throws IOException
    {
      return inputStream.available();
    }

    /** Mark.
    */
    public void mark(int readLimit)
    {
      inputStream.mark(readLimit);
    }

    /** Reset.
    */
    public void reset()
      throws IOException
    {
      inputStream.reset();
    }

    /** Check if mark is supported.
    */
    public boolean markSupported()
    {
      return inputStream.markSupported();
    }

    /** Close.
    */
    public void close()
      throws IOException
    {
      try
      {
        inputStream.close();
      }
      finally
      {
        streamThrottler.closeStream();
      }
    }

  }

  /** This thread does the actual socket communication with the server.
  * It's set up so that it can be abandoned at shutdown time.
  *
  * The way it works is as follows:
  * - it starts the transaction
  * - it receives the response, and saves that for the calling class to inspect
  * - it transfers the data part to an input stream provided to the calling class
  * - it shuts the connection down
  *
  * If there is an error, the sequence is aborted, and an exception is recorded
  * for the calling class to examine.
  *
  * The calling class basically accepts the sequence above.  It starts the
  * thread, and tries to get a response code.  If instead an exception is seen,
  * the exception is thrown up the stack.
  */
  protected static class ExecuteMethodThread extends Thread
  {
    /** The connection */
    protected final ThrottledConnection theConnection;
    /** The fetch throttler */
    protected final IFetchThrottler fetchThrottler;
    /** Client and method, all preconfigured */
    protected final HttpClient httpClient;
    protected final HttpRequestBase executeMethod;
   
    protected HttpResponse response = null;
    protected Throwable responseException = null;
    protected XThreadInputStream threadStream = null;
    protected InputStream bodyStream = null;
    protected boolean streamCreated = false;
    protected Throwable streamException = null;

    protected boolean abortThread = false;
   
    protected Throwable shutdownException = null;

    protected Throwable generalException = null;
   
    public ExecuteMethodThread(ThrottledConnection theConnection, IFetchThrottler fetchThrottler,
      HttpClient httpClient, HttpRequestBase executeMethod)
    {
      super();
      setDaemon(true);
      this.theConnection = theConnection;
      this.fetchThrottler = fetchThrottler;
      this.httpClient = httpClient;
      this.executeMethod = executeMethod;
    }

    public void run()
    {
      try
      {
        try
        {
          // Call the execute method appropriately
          synchronized (this)
          {
            if (!abortThread)
            {
              try
              {
                response = httpClient.execute(executeMethod);
              }
              catch (java.net.SocketTimeoutException e)
              {
                responseException = e;
              }
              catch (ConnectTimeoutException e)
              {
                responseException = e;
              }
              catch (InterruptedIOException e)
              {
                throw e;
              }
              catch (Throwable e)
              {
                responseException = e;
              }
              this.notifyAll();
            }
          }
           
          // Start the transfer of the content
          if (responseException == null)
          {
            synchronized (this)
            {
              if (!abortThread)
              {
                try
                {
                  bodyStream = response.getEntity().getContent();
                  if (bodyStream != null)
                  {
                    bodyStream = new ThrottledInputstream(theConnection,fetchThrottler.createFetchStream(),bodyStream);
                    threadStream = new XThreadInputStream(bodyStream);
                  }
                  streamCreated = true;
                }
                catch (java.net.SocketTimeoutException e)
                {
                  streamException = e;
                }
                catch (ConnectTimeoutException e)
                {
                  streamException = e;
                }
                catch (InterruptedIOException e)
                {
                  throw e;
                }
                catch (Throwable e)
                {
                  streamException = e;
                }
                this.notifyAll();
              }
            }
          }
         
          if (responseException == null && streamException == null)
          {
            if (threadStream != null)
            {
              // Stuff the content until we are done
              threadStream.stuffQueue();
            }
          }
         
        }
        finally
        {
          if (bodyStream != null)
          {
            try
            {
              bodyStream.close();
            }
            catch (IOException e)
            {
            }
            bodyStream = null;
          }
          synchronized (this)
          {
            try
            {
              executeMethod.abort();
            }
            catch (Throwable e)
            {
              shutdownException = e;
            }
            this.notifyAll();
          }
        }
      }
      catch (Throwable e)
      {
        // We catch exceptions here that should ONLY be InterruptedExceptions, as a result of the thread being aborted.
        this.generalException = e;
      }
    }

    public int getResponseCode()
      throws InterruptedException, IOException, HttpException
    {
      // Must wait until the response object is there
      while (true)
      {
        synchronized (this)
        {
          checkException(responseException);
          if (response != null)
            return response.getStatusLine().getStatusCode();
          wait();
        }
      }
    }
     
    public String getFirstHeader(String headerName)
      throws InterruptedException, IOException, HttpException
    {
      // Must wait for the response object to appear
      while (true)
      {
        synchronized (this)
        {
          checkException(responseException);
          if (response != null)
          {
            Header h = response.getFirstHeader(headerName);
            if (h == null)
              return null;
            return h.getValue();
          }
          wait();
        }
      }
    }
     
    public InputStream getSafeInputStream()
      throws InterruptedException, IOException, HttpException
    {
      // Must wait until stream is created, or until we note an exception was thrown.
      while (true)
      {
        synchronized (this)
        {
          if (responseException != null)
            throw new IllegalStateException("Check for response before getting stream");
          checkException(streamException);
          if (streamCreated)
            return threadStream;
          wait();
        }
      }
    }
   
    public void abort()
    {
      // This will be called during the finally
      // block in the case where all is well (and
      // the stream completed) and in the case where
      // there were exceptions.
      synchronized (this)
      {
        if (streamCreated)
        {
          if (threadStream != null)
            threadStream.abort();
        }
        abortThread = true;
      }
    }
   
    public void finishUp()
      throws InterruptedException
    {
      join();
    }
   
    protected synchronized void checkException(Throwable exception)
      throws IOException, HttpException
    {
      if (exception != null)
      {
        // Throw the current exception, but clear it, so no further throwing is possible on the same problem.
        Throwable e = exception;
        if (e instanceof IOException)
          throw (IOException)e;
        else if (e instanceof HttpException)
          throw (HttpException)e;
        else if (e instanceof RuntimeException)
          throw (RuntimeException)e;
        else if (e instanceof Error)
          throw (Error)e;
        else
          throw new RuntimeException("Unhandled exception of type: "+e.getClass().getName(),e);
      }
    }

  }


}
TOP

Related Classes of org.apache.manifoldcf.crawler.connectors.rss.ThrottledFetcher$ExecuteMethodThread

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact coftware#gmail.com.