Source Code of org.archive.wayback.resourceindex.indexer.hadoop.Driver$MapClass

package org.archive.wayback.resourceindex.indexer.hadoop;




import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.ReflectionUtils;
import org.archive.io.arc.ARCRecord;
import org.archive.mapred.ARCMapRunner;
import org.archive.mapred.ARCRecordMapper;
import org.archive.wayback.core.SearchResult;
import org.archive.wayback.resourcestore.ARCRecordToSearchResultAdapter;
import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter;


/**
 * Hadoop Driver for generation of alphabetically partitioned Wayback CDX 
 * files using the Hadoop framework. 
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class Driver {
  


  /**
   * Mapper which converts an ARCRecord into a CDX line.
   *
   * @author brad
   * @version $Date$, $Revision$
   */
  public static class MapClass extends MapReduceBase implements ARCRecordMapper {


    private Text outKey = new Text();
    private Text outValue = new Text("");
    private ARCRecordToSearchResultAdapter ARtoSR = 
      new ARCRecordToSearchResultAdapter();
    private SearchResultToCDXLineAdapter SRtoCDX =
      new SearchResultToCDXLineAdapter();
    
    public void map(WritableComparable key, Writable value,
        OutputCollector output, Reporter reporter) throws IOException {
      ObjectWritable ow = (ObjectWritable) value;
      ARCRecord rec = (ARCRecord) ow.get();
      String line;
      SearchResult result = ARtoSR.adapt(rec);
      if(result != null) {
        line = SRtoCDX.adapt(result);
        if(line != null) {


          outKey.set(line);
          output.collect(outKey, outValue);
        }
      }
    }
    public void onARCOpen() throws IOException {}
    public void onARCClose() throws IOException {}
  }


  static void printUsage() {
    System.out.println("[-m <maps>] <input> <output>");
    System.exit(1);
  }


  /**
   * The main driver for sort program.
   * Invoke this method to submit the map/reduce job.
   * @param args 
   * @throws IOException When there is communication problems with the 
   *                     job tracker.
   */
  public static void main(String[] args) throws IOException {
    Configuration defaults = new Configuration();


    JobConf jobConf = new JobConf(defaults, Driver.class);
    jobConf.setJobName("cdx1");


    jobConf.setMapRunnerClass(ARCMapRunner.class);
    
//    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(TextOutputFormat.class);


    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.set("mapred.partitioner.class",
        "org.archive.wayback.resourceindex.indexer.hadoop.AlphaPartitioner");


    jobConf.setMapperClass(MapClass.class);        
//    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);


    AlphaPartitioner part = (AlphaPartitioner)ReflectionUtils.newInstance(
        jobConf.getPartitionerClass(), jobConf);
    int num_reduces = part.getNumPartitions();
    
    
    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_maps = cluster.getTaskTrackers()
        * jobConf.getInt("test.sort.maps_per_host", 10);
    List <String>otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      try {
        if ("-m".equals(args[i])) {
          num_maps = Integer.parseInt(args[++i]);
        } else {
          otherArgs.add(args[i]);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of "
            + args[i]);
        printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from "
            + args[i - 1]);
        printUsage(); // exits
      }
    }


    jobConf.setNumMapTasks(num_maps);
    jobConf.setNumReduceTasks(num_reduces);


    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
      System.out.println("ERROR: Wrong number of parameters: "
          + otherArgs.size() + " instead of 2.");
      printUsage();
    }
    jobConf.setInputPath(new Path((String) otherArgs.get(0)));
    jobConf.setOutputPath(new Path((String) otherArgs.get(1)));


    // Uncomment to run locally in a single process
    //job_conf.set("mapred.job.tracker", "local");


    System.out.println("Running on " + cluster.getTaskTrackers()
        + " nodes to sort from " + jobConf.getInputPaths()[0]
        + " into " + jobConf.getOutputPath() + " with " + num_reduces
        + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took "
        + (end_time.getTime() - startTime.getTime()) / 1000
        + " seconds.");
  }
}
Source Code of org.archive.wayback.resourceindex.indexer.hadoop.Driver$MapClass

Related Classes of org.archive.wayback.resourceindex.indexer.hadoop.Driver$MapClass