Package org.commoncrawl.util.shared

Source Code of org.commoncrawl.util.shared.S3Downloader$Callback

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**/

package org.commoncrawl.util.shared;

import java.io.IOException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;
import java.util.TreeMap;
import java.util.Vector;
import java.util.concurrent.Semaphore;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.async.Timer;
import org.commoncrawl.io.internal.NIOBufferList;
import org.commoncrawl.io.internal.NIOHttpConnection;
import org.commoncrawl.io.internal.NIOHttpConnection.State;
import org.commoncrawl.io.shared.NIOHttpHeaders;

import com.google.common.collect.Lists;


/**
*
* Async S3Downloader Class
*  Uses CC NIO library to perform IO
*
* @author rana
*
*/
public class S3Downloader implements NIOHttpConnection.Listener {

  private static final Log LOG = LogFactory.getLog(S3Downloader.class);
 
  private static final int MAX_FAILURES_PER_ITEM = 5;
  private static final int DEFAULT_MAX_PARALLEL_STREAMS = 20;
  private static final int DEFAULT_MIN_HTTP_BUFFER_SIZE = 32 * 1024;
  private static final int DEFAULT_MAX_HTTP_BUFFER_SIZE = 32 * 1024;
 
  private String _s3BucketName;
  private String _s3AccessId;
  private String _s3SecretKey;
  private LinkedList<S3DownloadItem> _queuedItems   = new LinkedList<S3DownloadItem>();
  private LinkedList<NIOHttpConnection> _activeConnections      = new LinkedList<NIOHttpConnection>();
  private EventLoop _eventLoop = null;
  private boolean   _ownsEventLoop = false;
  private S3Utils.CallingFormat _callingFormat = S3Utils.CallingFormat.getSubdomainCallingFormat();
  private Callback _callback;
  private boolean _freezeDownloads = false;
  private int _lastItemId = 0;
  private int _maxParallelStreams = DEFAULT_MAX_PARALLEL_STREAMS;
  private BandwidthUtils.BandwidthHistory _downloaderStats = new BandwidthUtils.BandwidthHistory();
  private boolean _isRequesterPays = false;
 

  public static interface Callback {
    public boolean downloadStarting(int itemId,String itemKey,int contentLength);
    public boolean contentAvailable(NIOHttpConnection theConnection,int itemId,String itemKey,NIOBufferList contentBuffer);
    public void downloadFailed(int itemId,String itemKey,String errorCode);
    public void downloadComplete(int itemId,String itemKey);
  }
 
  public S3Downloader(String s3BucketName,String s3AccessId,String s3SecretKey, boolean isRequesterPays)throws IOException {
    _s3BucketName = s3BucketName;
    _s3AccessId = s3AccessId;
    _s3SecretKey = s3SecretKey;
    _isRequesterPays = isRequesterPays;
    _eventLoop = new EventLoop();
  }
 
  public EventLoop getEventLoop() {
    return _eventLoop;
  }
 
  public void setMaxParallelStreams(int maxStreams) {
    _maxParallelStreams = maxStreams;
  }

  public int getMaxParallelStreams() {
    return _maxParallelStreams;
  }

  public void initialize(Callback listener)throws IOException {
    initialize(listener,null);
  }
  public void initialize(Callback listener,EventLoop externalEventLoop) throws IOException {
   
    if (externalEventLoop == null) {
      _eventLoop = new EventLoop();
      _ownsEventLoop = true;
    }
    else {
      _eventLoop = externalEventLoop;
      _ownsEventLoop = false;
    }
   
    if (_callback != null) {
      throw new RuntimeException("Invalid State - start called on already active downloader");
    }
    if (listener == null) {
      throw new IOException("Null Listener is Invalid");
    }
    _callback = listener;
   
    if (_ownsEventLoop) {
      _eventLoop.start();
    }
  }
 
  public void shutdown() {

    if (_callback == null) {
      throw new RuntimeException("Invalid State - stop called on already inactive downloader");
    }
   
    _freezeDownloads = true;
   
    Thread eventThread = (_ownsEventLoop) ? _eventLoop.getEventThread() : null;
   
    final Semaphore shutdownSemaphore = new Semaphore(0);
   
    _eventLoop.setTimer(new Timer(1,false,new Timer.Callback() {

      // shutdown within the context of the async thread ...
      public void timerFired(Timer timer) {
     
        try {
         
          // fail any active connections
          for (NIOHttpConnection connection : Lists.newArrayList(_activeConnections)) {
            S3DownloadItem item = (S3DownloadItem) connection.getContext();
            if (item != null) {
              failDownload(item, NIOHttpConnection.ErrorType.UNKNOWN,connection, false);
            }
          }
         
          _activeConnections.clear();
         
          // next, fail all queued items
          for (S3DownloadItem item : _queuedItems) {
            failDownload(item, NIOHttpConnection.ErrorType.UNKNOWN,null, false);
          }
          _queuedItems.clear();
          _freezeDownloads = false;
          _callback = null;
         
          if (_ownsEventLoop) {
            //System.out.println("Stopping Event Loop");
            _eventLoop.stop();
          }
          _eventLoop = null;
          _ownsEventLoop = false;
        }
        finally {
          //System.out.println("Releasing Semaphore");
          shutdownSemaphore.release();
        }
      } 
    }));
    //System.out.println("Acquiring Shutdown Semaphore");
    shutdownSemaphore.acquireUninterruptibly();
    //System.out.println("Acquired Shutdown Semaphore");
   
    try {
     
      if (eventThread != null) {
        eventThread.join();
      }
    } catch (InterruptedException e) {
    }
  }
 
  public void waitForCompletion() {
    try {
      _eventLoop.getEventThread().join();
    } catch (InterruptedException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
  }
 
  public synchronized  int fetchItem(String itemKey) throws IOException {
    int itemId = -1;
    synchronized (_queuedItems) {
      itemId = ++_lastItemId;
      _queuedItems.addLast(new S3DownloadItem(itemKey,itemId));
    }
    _eventLoop.setTimer(new Timer(1,false,new Timer.Callback() {

      public void timerFired(Timer timer) {
        // executes in the context of the event loop thread ...
        downloadNextItem();
      }
     
     }));
   
    return itemId;
  }
 
  public synchronized  int fetchPartialItem(String itemKey,int rangeStart,int bytesToFetch) throws IOException {
    int itemId = -1;
    S3DownloadItem downloadItem = new S3DownloadItem(itemKey,itemId);
   
    downloadItem.setLastReadPos(rangeStart);
    downloadItem.setContentLength(downloadItem.getLastReadPos() + bytesToFetch);
   
    synchronized (_queuedItems) {
      itemId = ++_lastItemId;
      _queuedItems.addLast(downloadItem);
    }
    _eventLoop.setTimer(new Timer(1,false,new Timer.Callback() {

      public void timerFired(Timer timer) {
        // executes in the context of the event loop thread ...
        downloadNextItem();
      }
     
     }));
   
    return itemId;
  }

 
  private void downloadNextItem() {
    if (_activeConnections.size() < _maxParallelStreams) {
      S3DownloadItem item = null;
      synchronized (_queuedItems) {
        if (_queuedItems.size() != 0)
          item = _queuedItems.removeFirst();
      }
      if (item != null)
        downloadItem(item);
    }
  }
 
  private void downloadItem(S3DownloadItem item) {

    NIOHttpConnection connection = null;
    try {
      // construct the url for the item
      URL theURL = _callingFormat.getURL(false, S3Utils.DEFAULT_HOST, S3Utils.INSECURE_PORT, _s3BucketName, item.getKey(), null);
      connection = new NIOHttpConnection(theURL,_eventLoop.getSelector(),_eventLoop.getResolver(),null);
      connection.getContentBuffer().setMinBufferSize(DEFAULT_MIN_HTTP_BUFFER_SIZE);
      connection.getContentBuffer().setMaxBufferSize(DEFAULT_MAX_HTTP_BUFFER_SIZE);
      // set up the item as the context for the connection ...
      connection.setContext(item);
      // we don't want to populate default http headers ...
      connection.setPopulateDefaultHeaderItems(false);
      // get at headers object
      NIOHttpHeaders headers = connection.getRequestHeaders();
      // populate http request string
      headers.prepend("GET" + " " + theURL.getFile() +" "  + "HTTP/1.1", null);
      // populate host entry ...
      if (theURL.getPort() != -1 && theURL.getPort() != 80) {
        headers.set("Host",theURL.getHost() +":"+String.valueOf(theURL.getPort()));
      }
      else {
        headers.set("Host",theURL.getHost());
      }
      // create a tree map in parallel (to pass to canonicalization routine for s3 auth)
      Map amazonHeaders = new TreeMap();
     
      // add date ...
      String theDate = httpDate();
     
      headers.set("Date", theDate);
      // and set it in amazon headers ...
      addToAmazonHeader("Date", theDate, amazonHeaders);
      // add requester pays if specified ...
      if (_isRequesterPays) {
        headers.set("x-amz-request-payer", "requester");
        addToAmazonHeader("x-amz-request-payer", "requester",amazonHeaders);
      }

      String canonicalString =  S3Utils.makeCanonicalString("GET", _s3BucketName, item.getKey(), null,amazonHeaders );
      String encodedCanonical = S3Utils.encode(_s3SecretKey, canonicalString, false);
     
      // add auth string to headers ...
      headers.set("Authorization","AWS " + _s3AccessId + ":" + encodedCanonical);
     
      // figure out of this is a continuation ...
      if (item.isContinuation()) {
        // figure out where to start ...
        String rangeString = "bytes=" + item.getLastReadPos() + "-" + item.getContentLength();
        // set the range header ...
        headers.set("Range",rangeString);
        // and if etag is valid ...
        if (item.getLastKnownETag() != null) {
          headers.set("If-match",item.getLastKnownETag());
        }
      }
      // add cache control pragmas ...
      headers.set ("Connection", "close");
      headers.set("Cache-Control", "no-cache");
      headers.set("Pragma", "no-cache");
      headers.remove("Accept-Encoding");
      headers.set("Accept-Encoding","identity");
     
      // set up the listener relationship
      connection.setListener(this);
      // and open the  connection
      connection.open();
     
      // add it to list for tracking purposes
      _activeConnections.add(connection);
     
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      LOG.error("Failure Opening Connection For Key:" + item.getKey());

      synchronized (_queuedItems) {
        _queuedItems.addLast(item);
      }
      if (connection != null) {
        // null out listener
        connection.setListener(null);
        connection.setContext(null);
        // close connection
        connection.close();
      }
    }
  }
 
  private final void requeueDownloadItem(S3DownloadItem item) {
    synchronized (_queuedItems) {
      _queuedItems.addLast(item);
    }
  }
 
  private final void resetConnection(NIOHttpConnection theConnection) {
    theConnection.close();
    theConnection.setListener(null);
    theConnection.setContext(null);
    _activeConnections.remove(theConnection);
  }
  private final void failDownload(S3DownloadItem item,NIOHttpConnection.ErrorType errorType,NIOHttpConnection theConnection,boolean potentiallyRetry) {
   
    int resultCode = -1;
   
    if (theConnection != null) {
     
      NIOHttpHeaders headers = theConnection.getResponseHeaders();
      resultCode = NIOHttpConnection.getHttpResponseCode(headers);
     
      if (item != null) {
        item.incrementFailureCount(errorType,resultCode);
      }
     
      resetConnection(theConnection);
    }
   
    if (item != null) {
      if (potentiallyRetry && item.isDownloadRecoverable()) {
        requeueDownloadItem(item);
      }
      else {
        LOG.error("Download Failed for Item:" + item.getKey());
        if (_callback != null) {
          _callback.downloadFailed(item.getId(),item.getKey(),"Failure Reason:" + errorType.toString() + " ResultCode:" + resultCode);
        }
      }
    }
    if (!_freezeDownloads) {
      downloadNextItem();
    }
  }
 
  private final void completeDownload(S3DownloadItem item,NIOHttpConnection theConnection) {

    if (item != null) {
      if (_callback != null) {
        _callback.downloadComplete(item.getId(),item.getKey());
      }
    }
    resetConnection(theConnection);

    if (!_freezeDownloads) {
      downloadNextItem();
    }
   
   }
 
  // @Override
  public void HttpConnectionStateChanged(NIOHttpConnection theConnection,State oldState, State state) {
   
    if (oldState == State.PARSING_HEADERS && state == State.RECEIVING_CONTENT) {
      NIOHttpHeaders headers = theConnection.getResponseHeaders();
      LOG.info("*** S3 INCOMING HEADERS:" + headers.toString());
      LOG.info("Content Length From Header for:" + theConnection.getURL() + " is:" + headers.findValue("Content-Length"));
    }
   
    LOG.info("S3Download Connection:" + theConnection.getURL() +" Old State:" + oldState + " NewState:" + state);
    // get context
    S3DownloadItem item = (S3DownloadItem) theConnection.getContext();

    // if we started receiving content ...
    if (state == State.RECEIVING_CONTENT) {
     
      // this means we successfully parsed http header ...
      // pick up etag and content length for the item ...
      NIOHttpHeaders headers = theConnection.getResponseHeaders();
     
      int resultCode = NIOHttpConnection.getHttpResponseCode(headers);
     
      boolean continueDownloading = false;
      boolean isContinuation = false;
     
      if (item != null) {
        // if success
        if (resultCode >= 200 && resultCode <300) {
         
          continueDownloading = true;
         
          String etagValue = headers.findValue("ETag");
          String contentLengthValue = headers.findValue("Content-Length");
          String rangeValue = headers.findValue("Content-Range");
         
          if(etagValue != null && contentLengthValue != null) {
            try {
              // now check range value .. in case it is set ...
              if (rangeValue != null) {
                isContinuation = true;
                // replace the content length value with value embedded in range response ...
                contentLengthValue = rangeValue.substring(rangeValue.indexOf('/') + 1);
              }
             
              int contentLength = Integer.parseInt(contentLengthValue);
              if (contentLength != -1) {
               
                item.setLastKnownETagAndContentLength(etagValue, contentLength);
              }
            }
            catch (NumberFormatException e) {
              LOG.error(StringUtils.stringifyException(e));
              continueDownloading = false;
            }
          }
        }
      }
      if (continueDownloading) {
        if (!isContinuation) {
          if (_callback != null) {
            continueDownloading = _callback.downloadStarting(item.getId(),item.getKey(), item.getContentLength());
          }
        }
      }
      if (!continueDownloading) {
        failDownload(item,NIOHttpConnection.ErrorType.UNKNOWN,theConnection,true);
      }
    }
   
    if (state == State.ERROR) {
      failDownload(item, theConnection.getErrorType(), theConnection,true);
    }
   
    if (state == State.DONE) {
      completeDownload(item, theConnection);
    }
  }

  // @Override
  public void HttpContentAvailable(NIOHttpConnection theConnection,NIOBufferList contentBuffer) {

    //LOG.info("Content Available for Connection:" + theConnection.getURL() +" BytesAvailable:" + contentBuffer.available());
    // get context
    S3DownloadItem item = (S3DownloadItem) theConnection.getContext();

    if (item != null)  {
      // update item download stats ...
      item.getDownloadStats().update(contentBuffer.available());
      // and aggregate stats ...
      _downloaderStats.update(contentBuffer.available());
      // upgrade the item's content status
      item.setLastReadPos(item.getLastReadPos() + contentBuffer.available());
      // upgrade stats too ...
      item.incrementDownloadedBytesCounter(contentBuffer.available());
      // figure out if we are going to dump stats ...
      if (item.getDownloadedBytes() >= 100000) {
        BandwidthUtils.BandwidthStats stats = new BandwidthUtils.BandwidthStats();
        BandwidthUtils.BandwidthStats aggregateStats = new BandwidthUtils.BandwidthStats();
       
        item.getDownloadStats().calcSpeed(stats);
        _downloaderStats.calcSpeed(aggregateStats);
        // and dump them to the log for now ...
        /*
        LOG.info("Download Speed for Item:" + item.getKey() + " ID:("+ item.getId() +") " + stats.scaledBytesPerSecond + " " + stats.scaledBytesUnits +" " +
                      aggregateStats.scaledBytesPerSecond + " " + aggregateStats.scaledBytesUnits + "(" + _activeConnections.size() +")");
                      */
        // reset bytes download counter ...
        item.resetDownloadedBytes();
      }
     
     
      boolean continueDownload = true;
      // callback to listener
      if (_callback != null) {
        continueDownload = _callback.contentAvailable(theConnection,item.getId(),item.getKey(), contentBuffer);
      }
     
      if (!continueDownload) {
        LOG.info("Explicit Abort Received via contentAvailabe for Item:" + item.getKey());
        failDownload(item, NIOHttpConnection.ErrorType.UNKNOWN, theConnection, false);
      }
    }
    try {
      while (contentBuffer.read() != null) ;
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
    }
    // always reset content buffer to preserve memory ...
    contentBuffer.reset();
   
  }
 
  /**
   * Generate an rfc822 date for use in the Date HTTP header.
   */
  private static String httpDate() {
      final String DateFormat = "EEE, dd MMM yyyy HH:mm:ss ";
      SimpleDateFormat format = new SimpleDateFormat( DateFormat, Locale.US );
      format.setTimeZone( TimeZone.getTimeZone( "GMT" ) );
      return format.format( new Date() ) + "GMT";
  }
  private static void addToAmazonHeader(String key,String value,Map amazonHeaders) {
    List<String> list  = (List<String>) amazonHeaders.get(key);
    if (list == null) {
      list = new Vector<String>();
      amazonHeaders.put(key, list);
    }
    list.add(value);
  }

  private static class S3DownloadItem {
   
    public S3DownloadItem(String itemKey,int itemId) {
      _itemKey  = itemKey;
      _itemId = itemId;
    }

    public void downloadComplete() {
      //LOG.info("Download Complete for Item:" + _itemKey);
    }
   
    public void downloadFailed() {
      //LOG.info("Download Failed for Item:" + _itemKey);
    }
   
    public String getKey() { return _itemKey; }
    public int     getId() {  return _itemId; }
   
    public void setLastReadPos(int lastReadPos) {
      _lastReadPos= lastReadPos;
    }
    public int   getLastReadPos() {
      return _lastReadPos;
    }
   
    public int getFailureCount() {
      return _failureCount;
    }
   
    public BandwidthUtils.BandwidthHistory getDownloadStats() {
      return _downloadStats;
    }

   
    public boolean isDownloadRecoverable() {
      // we can recover from io exception or timeout type errors
      if (_lastErrorType == NIOHttpConnection.ErrorType.UNKNOWN || _lastErrorType == NIOHttpConnection.ErrorType.IOEXCEPTION || _lastErrorType == NIOHttpConnection.ErrorType.TIMEOUT) {
        // all 400 error codes are unrecoverable failures
        if (_lastKnownResultCode >= 400 && _lastKnownResultCode < 500) {
          return false;
        }
        return _failureCount < MAX_FAILURES_PER_ITEM;
      }
      // all other type of connection errors are unrecoverable ...
      return false;
    }
   
    public void incrementFailureCount(NIOHttpConnection.ErrorType errorType,int lastKnownResultCode) {
      _failureCount++;
      _lastErrorType = errorType;
      _lastKnownResultCode = lastKnownResultCode;
    }
   
    public String getLastKnownETag() {
      return _lastKnownETag;
    }
   
    public int getContentLength() {
      return _lastKnownContentLength;
    }
   
    public void setContentLength(int contentLength) {
      _lastKnownContentLength = contentLength;
    }

    public boolean isContinuation() {
      return (_lastReadPos != 0);
    }   
   
    public int getDownloadedBytes() {
      return _downloadedBytes;
    }
   
    public void resetDownloadedBytes() {
      _downloadedBytes = 0;
    }
   
    public void incrementDownloadedBytesCounter(int newlyReceivedByteCount) {
      _downloadedBytes += newlyReceivedByteCount;
    }
   
    public void setLastKnownETagAndContentLength(String etag,int contentLength) {
      _lastKnownETag = etag;
      _lastKnownContentLength = contentLength;
    }
   
    private String      _itemKey;
    private int         _itemId =0;
    private String      _lastKnownETag = null;
    private int          _lastKnownContentLength = -1;
    private short      _failureCount = 0;
    private int          _lastReadPos = 0;
    private NIOHttpConnection.ErrorType _lastErrorType=NIOHttpConnection.ErrorType.UNKNOWN;
    private int          _lastKnownResultCode=-1;
    private int          _downloadedBytes;
    private BandwidthUtils.BandwidthHistory     _downloadStats = new BandwidthUtils.BandwidthHistory();
  }
 
}
TOP

Related Classes of org.commoncrawl.util.shared.S3Downloader$Callback

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.