Source Code of cc.SimpleDistCp$SimpleDistCpMapper

package cc;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.MultithreadedMapRunner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.amazonaws.Request;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.handlers.AbstractRequestHandler;
import com.amazonaws.services.s3.AmazonS3Client;

public class SimpleDistCp extends Configured implements Tool {
 
  private static final String CC_HDFS_PATH = "cc.hdfs_path";
  private static final int MAX_RETRIES = 10;
 
  public static void main(String[] args) throws Exception {
    System.exit(ToolRunner.run(new SimpleDistCp(), args));
  }
   
  // Configures and submits a map-only job: each line of the manifest is an S3
  // object key, and every mapper thread copies that object into HDFS.
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      throw new RuntimeException("usage: SimpleDistCp <manifest_dir> <hdfs_output_path>");
    }
   
    String hdfsPath = args[1];
    if (!hdfsPath.endsWith("/")) {
      hdfsPath += "/";
    }
    getConf().set(CC_HDFS_PATH, hdfsPath);
   
    JobConf conf = new JobConf(getConf(), getClass());   
    conf.setJobName(getClass().getName());
   
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
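    // write block-compressed, gzip-encoded SequenceFile output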
    conf.set("mapred.output.compress", "true");
    conf.set("mapred.output.compression.type", "BLOCK");
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

    conf.set("mapred.map.tasks.speculative.execution", "false");

    // map-only job: the mappers write to HDFS as a side effect and emit nothing
    conf.setNumReduceTasks(0);
   
    conf.setMapperClass(SimpleDistCpMapper.class);
    // run several copies in parallel within each task
    conf.setMapRunnerClass(MultithreadedMapRunner.class);
   
    FileInputFormat.addInputPath(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    conf.setOutputFormat(SequenceFileOutputFormat.class);
   
    JobClient.runJob(conf);

    return 0;   
  }   
 
  private static class SimpleDistCpMapper extends MapReduceBase implements Mapper<LongWritable,Text,Text,Text> {
       
    private String ccHdfsPath, awsAccessKeyId, awsSecretAccessKey;
    private FileSystem filesystem;
   
    public void configure(JobConf job) {
      super.configure(job);
      ccHdfsPath = job.get(CC_HDFS_PATH);
      awsAccessKeyId = job.get("fs.s3n.awsAccessKeyId");
      awsSecretAccessKey = job.get("fs.s3n.awsSecretAccessKey");
      try {
        // use the job configuration so cluster-specific settings are honoured
        filesystem = FileSystem.get(job);
      } catch (IOException e) {
        // fail fast rather than leaving filesystem null for every map() call
        throw new RuntimeException("unable to open the default filesystem", e);
      }
    }
   
    // The record value is an S3 object key; the copy is retried up to
    // MAX_RETRIES times with a linear backoff before giving up.
    public void map(LongWritable k, Text s3key, OutputCollector<Text, Text> collector, Reporter reporter) throws IOException {
       
      int attempts = 0;     
      while (attempts < MAX_RETRIES) {
       
        InputStream s3object = null;
        FSDataOutputStream hdfsFile = null;
       
        try {
          String logmsg = Thread.currentThread().getName()+
                          " copying "+s3key.toString()+
                          " to "+hdfsPathForKey(s3key).toString()+
                          " attempts="+attempts;
          reporter.setStatus(logmsg);
          System.err.println(logmsg);
        
          AmazonS3Client s3Client = newS3Client();         
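          // stream the object straight from S3 into a newly created HDFS file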
          s3object = s3Client.getObject("commoncrawl-crawl-002", s3key.toString()).getObjectContent();                 
          hdfsFile = createHdfsFileFor(s3key);  
         
          copy(s3object, hdfsFile, reporter);                             
         
          s3object.close();
          hdfsFile.close();
          s3Client.shutdown();
         
          return;
        }
        catch (Exception e) {
          // clean up the partial copy; swallow close/delete failures so they
          // cannot escape the retry loop
          if (s3object != null) try { s3object.close(); } catch (IOException ignore) { }
          if (hdfsFile != null) try { hdfsFile.close(); } catch (IOException ignore) { }
          try { filesystem.delete(hdfsPathForKey(s3key), true); } catch (IOException ignore) { }

          // report and back off linearly before the next attempt
          e.printStackTrace();
          reporter.getCounter("exception", e.getClass().getName()).increment(1);
          attempts++;
          try { Thread.sleep(attempts * 1000L); } catch (InterruptedException ignore) { }
          reporter.progress();
        }
      }
      reporter.setStatus("failed to download "+s3key.toString());
      System.err.println("failed to download "+s3key.toString());
      reporter.getCounter("error","exceeded_max_attempts_to_dload").increment(1);
    }

    private FSDataOutputStream createHdfsFileFor(Text s3key) throws IOException {
      // create the target file with a replication factor of 2
      return filesystem.create(hdfsPathForKey(s3key), (short) 2);
    }

    private Path hdfsPathForKey(Text s3key) {
      return new Path(ccHdfsPath + s3key.toString());    
    }
   
    private AmazonS3Client newS3Client() {     
      AmazonS3Client s3Client = new AmazonS3Client(new BasicAWSCredentials(awsAccessKeyId, awsSecretAccessKey));
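      // the bucket is requester-pays, so every request must carry the
      // x-amz-request-payer header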
      s3Client.addRequestHandler(new AbstractRequestHandler() {
        public void beforeRequest(Request<?> request) {
          request.addHeader("x-amz-request-payer", "requester");
        }
      });     
      return s3Client;
    }
   
    private void copy(InputStream inputStream, OutputStream outputStream, Reporter reporter) throws IOException {
      final ReadableByteChannel input  = Channels.newChannel(inputStream);
      final WritableByteChannel output = Channels.newChannel(outputStream);
      ByteBuffer buffer = ByteBuffer.allocateDirect(64 * 1024);
      while (input.read(buffer) != -1) {
        buffer.flip();       // switch the buffer from filling to draining
        output.write(buffer);
        buffer.compact();    // keep any bytes the write did not consume
        reporter.progress(); // heartbeat so the task is not killed during long copies
      }
      // drain whatever is left in the buffer after EOF
      buffer.flip();
      while (buffer.hasRemaining()) {
        output.write(buffer);
      }
      inputStream.close();
      outputStream.close();
    }
   
  }
 
}
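
A minimal sketch of how the tool might be driven programmatically. The launcher class name, credentials, and paths below are hypothetical placeholders, not part of the original source:

    package cc;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.ToolRunner;

    // Hypothetical launcher for SimpleDistCp; adjust credentials and paths.
    public class SimpleDistCpLauncher {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.s3n.awsAccessKeyId", "YOUR_ACCESS_KEY");      // placeholder
        conf.set("fs.s3n.awsSecretAccessKey", "YOUR_SECRET_KEY");  // placeholder
        // first argument: directory of manifest files (one S3 key per line);
        // second argument: HDFS destination path (both values hypothetical)
        int rc = ToolRunner.run(conf, new SimpleDistCp(),
            new String[] { "/manifests", "/crawl-copy/" });
        System.exit(rc);
      }
    }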