package com.atlantbh.hadoop.s3.io;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.S3ObjectSummary;
/**
* Abstract S3 input format. Lists the keys under a configured S3 bucket and
* key prefix and groups them into {@link S3InputSplit}s, either by a fixed
* number of mappers or by a fixed number of keys per mapper.
*
* @author samir
*
* @param <K>
* Mapper key class
* @param <V>
* Mapper value class
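*
* A minimal configuration sketch (the bucket name, prefix, and job setup are
* placeholders; S3ObjectSummaryInputFormat is the concrete subclass used in
* the main() method below):
*
* <pre>
* Configuration conf = job.getConfiguration();
* conf.set("s3.bucket.name", "my-bucket");   // required
* conf.set("s3.key.prefix", "logs/");        // optional key filter
* conf.setInt("s3.input.numOfKeys", 1000);   // keys per mapper
* job.setInputFormatClass(S3ObjectSummaryInputFormat.class);
* </pre>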
*/
public abstract class S3InputFormat<K, V> extends InputFormat<K, V> {
static final Logger LOG = LoggerFactory.getLogger(S3InputFormat.class);
/** Name of the S3 bucket to read keys from */
static final String S3_BUCKET_NAME = "s3.bucket.name";
/** Prefix used to filter the keys that are read from the bucket */
static final String S3_KEY_PREFIX = "s3.key.prefix";
/** Number of keys to fetch from S3 in a single request. Default value is 500 */
static final String S3_MAX_KEYS = "s3.max.keys";
/** Number of keys assigned to each mapper */
static final String S3_NUM_OF_KEYS_PER_MAPPER = "s3.input.numOfKeys";
/** Number of mappers used to split the input */
static final String S3_NUM_OF_MAPPERS = "s3.input.numOfMappers";
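/** Reader used to list object summaries from the S3 bucket */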
S3BucketReader s3Reader;
public S3InputFormat() throws IOException {
}
/**
* Returns list of {@link S3InputSplit}
*
* @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
*/
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
String bucketName = conf.get(S3_BUCKET_NAME);
String keyPrefix = conf.get(S3_KEY_PREFIX);
int maxKeys = conf.getInt(S3_MAX_KEYS, 500);
int numOfMappers = conf.getInt(S3_NUM_OF_MAPPERS, -1);
int numOfKeysPerMapper = conf.getInt(S3_NUM_OF_KEYS_PER_MAPPER, -1);
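// -1 is a sentinel meaning the corresponding property was not set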
boolean useMappers = true;
if (bucketName == null || bucketName.isEmpty()) {
throw new InvalidJobConfException("S3 bucket name cannot be empty");
}
if (numOfMappers == -1 && numOfKeysPerMapper == -1) {
LOG.warn("Non of {} and {} properties are not set. Defaulting to numOfMappers=1 to determine input splits",
S3_NUM_OF_KEYS_PER_MAPPER, S3_NUM_OF_MAPPERS);
numOfMappers = 1;
useMappers = true;
} else if (numOfMappers > -1 && numOfKeysPerMapper > -1) {
LOG.warn("Both {} and {} are set. Using numOfMappers value to determine input splits",
S3_NUM_OF_KEYS_PER_MAPPER, S3_NUM_OF_MAPPERS);
useMappers = true;
} else if (numOfMappers == -1) {
LOG.warn("Using {} value to determine input splits", S3_NUM_OF_KEYS_PER_MAPPER);
useMappers = false;
} else if (numOfKeysPerMapper == -1) {
LOG.warn("Using {} value to determine input splits", S3_NUM_OF_MAPPERS);
useMappers = true;
}
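// exactly one strategy is selected at this point: split by number of
// mappers or by number of keys per mapper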
s3Reader = new S3BucketReader(bucketName, keyPrefix, null, maxKeys);
List<InputSplit> splits = new ArrayList<InputSplit>();
if (useMappers) {
// not implemented
throw new IOException("Defining S3InputFormat with number of mappers is not implemented");
} else {
S3ObjectSummary startKey = null;
S3ObjectSummary endKey = null;
int batchSize = 0;
int numOfSplits = 0;
int numOfCalls = 0;
int maxKeyIDX = 0;
int currentKeyIDX = 0;
int nextKeyIDX = 0;
ObjectListing listing = null;
boolean isLastCall = true;
// split all keys starting with "keyPrefix" into splits of
// "numOfKeysPerMapper" keys
do {
// the first request is built from scratch; subsequent batches are
// fetched by continuing the previous listing
if (listing == null) {
listing = s3Reader.listObjects(bucketName, keyPrefix, maxKeys);
} else {
listing = s3Reader.listObjects(listing);
}
// Is this the last call to S3, i.e. the last batch of objects?
isLastCall = !listing.isTruncated();
// Size of the batch returned by the last call
batchSize = listing.getObjectSummaries().size();
// Absolute index of last key from batch
maxKeyIDX = numOfCalls * maxKeys + batchSize;
// Absolute indexes of current and next keys
currentKeyIDX = numOfSplits * numOfKeysPerMapper;
// if the next range would overrun the final batch, clamp it to the index of the last key
nextKeyIDX = (numOfSplits + 1) * numOfKeysPerMapper > maxKeyIDX && isLastCall ? maxKeyIDX : (numOfSplits + 1) * numOfKeysPerMapper;
// create one input split for each full range of keys available in the current batch
while (nextKeyIDX <= maxKeyIDX) {
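// the marker of this split is the last key of the previous split (null for the very first split)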
startKey = endKey;
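// map the absolute key index back to an index within the current batch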
endKey = listing.getObjectSummaries().get((nextKeyIDX - 1) % maxKeys);
// Create new input split
S3InputSplit split = new S3InputSplit();
split.setBucketName(bucketName);
split.setKeyPrefix(keyPrefix);
split.setMarker(startKey != null ? startKey.getKey() : null);
split.setLastKey(endKey.getKey());
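// number of keys covered by this split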
split.setSize(nextKeyIDX - currentKeyIDX);
splits.add(split);
numOfSplits++;
// Stop when last key was processed
if (nextKeyIDX == maxKeyIDX && isLastCall) {
break;
}
// Set next key index
currentKeyIDX = numOfSplits * numOfKeysPerMapper;
nextKeyIDX = (numOfSplits + 1) * numOfKeysPerMapper > maxKeyIDX && isLastCall ? maxKeyIDX : (numOfSplits + 1) * numOfKeysPerMapper;
}
numOfCalls++;
} while (!isLastCall);
}
LOG.info("Number of input splits={}", splits.size());
return splits;
}
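/**
* Simple command line smoke test: builds splits for a sample bucket and
* prefix, then prints every key/value pair read from each split. Note that
* instantiating JobContext and TaskAttemptContext directly like this only
* works against the pre-2.x mapreduce API, where both are still classes.
*/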
public static void main(String[] args) throws IOException, InterruptedException {
Configuration conf = new Configuration(true);
conf.set(S3_BUCKET_NAME, "hari_dev");
conf.set(S3_KEY_PREFIX, "users/299/avatar_f");
conf.setInt(S3_NUM_OF_KEYS_PER_MAPPER, 1);
S3InputFormat<Text, S3ObjectSummaryWritable> s3FileInput = new S3ObjectSummaryInputFormat();
List<InputSplit> splits = s3FileInput.getSplits(new JobContext(conf, new JobID()));
S3ObjectSummaryRecordReader reader = new S3ObjectSummaryRecordReader();
for (InputSplit inputSplit : splits) {
System.out.println(inputSplit.toString());
reader.initialize(inputSplit, new TaskAttemptContext(conf, new TaskAttemptID()));
int i = 0;
while (reader.nextKeyValue()) {
System.out.printf("\t%d Key=%s. Value=%s%n", i++, reader.getCurrentKey(), reader.getCurrentValue());
}
}
}
}