/* Copyright (C) 2012 Intel Corporation.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* For more about this software visit:
* http://www.01.org/GraphBuilder
*/
package com.intel.hadoop.graphbuilder.idnormalize.mapreduce;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.log4j.Logger;

import com.intel.hadoop.graphbuilder.graph.Edge;
import com.intel.hadoop.graphbuilder.parser.FieldParser;
import com.intel.hadoop.graphbuilder.parser.GraphParser;

/**
 * This class partitions the input edge list by the hash of the source vertex.
*
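 * <p>A minimal usage sketch (the parsers below are illustrative placeholders;
 * any concrete {@link GraphParser} and {@link FieldParser} implementations
 * from this project will do):
 *
 * <pre>{@code
 * GraphParser graphparser = ...; // parses a full edge record line
 * FieldParser vidparser = ...;   // parses vertex id fields
 * FieldParser edataparser = ...; // parses the edge data field
 * SortEdgeMR job = new SortEdgeMR(256, graphparser, vidparser, edataparser);
 * job.run("input/edges", "output/sortededges");
 * }</pre>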
*/
public class SortEdgeMR {
  private static final Logger LOG = Logger.getLogger(SortEdgeMR.class);

/**
* This mapper class maps each edge into (h(edge.source), edge). The hash
* function depends on "numChunks" passed through the {@code JobConf}.
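   * Specifically, {@code h(v) = ((v.hashCode() % numChunks) + numChunks) % numChunks},
   * which keeps the result in the range [0, numChunks) even when the hash
   * code is negative.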
*
* @author Haijie Gu
*/
public static class SortEdgeMapper extends MapReduceBase implements
Mapper<LongWritable, Text, IntWritable, Text> {
    @Override
    public void configure(JobConf conf) {
      super.configure(conf);
      numChunks = conf.getInt("numChunks", 256);
      // The parser implementations are passed by class name through the
      // JobConf and instantiated reflectively.
      try {
        this.graphparser = (GraphParser) Class.forName(conf.get("GraphParser"))
            .newInstance();
        this.vidparser = (FieldParser) Class.forName(conf.get("VidParser"))
            .newInstance();
        this.edataparser = (FieldParser) Class.forName(conf.get("EdataParser"))
            .newInstance();
      } catch (Exception e) {
        // Fail fast: swallowing the error here would only resurface later as
        // a NullPointerException in map().
        LOG.error("Could not instantiate parsers from the JobConf settings", e);
        throw new RuntimeException(e);
      }
    }
@Override
public void map(LongWritable key, Text val,
OutputCollector<IntWritable, Text> out, Reporter arg3)
throws IOException {
      Edge e = graphparser.parseEdge(val.toString(), vidparser, edataparser);
      // Java's % operator can return a negative remainder when hashCode() is
      // negative; shift such values back into the range [0, numChunks).
      int hash = e.source().hashCode() % numChunks;
      if (hash < 0) {
        hash += numChunks;
      }
      out.collect(new IntWritable(hash), val);
}
private int numChunks;
private GraphParser graphparser;
private FieldParser vidparser;
private FieldParser edataparser;
}

  /**
   * This reducer class takes the (hashval, edge) pairs emitted by the mapper
   * and writes each edge out unchanged, so that all edges sharing a hash
   * value land in the same output partition.
   *
   * @author Haijie Gu
   */
public static class SortEdgeReducer extends MapReduceBase implements
Reducer<IntWritable, Text, IntWritable, Text> {
@Override
public void reduce(IntWritable key, Iterator<Text> iter,
OutputCollector<IntWritable, Text> out, Reporter reporter)
throws IOException {
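      // Emitting a null key makes TextOutputFormat write only the value, so
      // the output files contain bare edge lines without the hash key.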
while (iter.hasNext()) {
out.collect(null, iter.next());
}
}
}
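
  /**
   * @param numChunks   number of hash partitions to split the edges into
   * @param graphparser parser for a full edge record line
   * @param vidparser   parser for the vertex id fields
   * @param edataparser parser for the edge data field
   */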
public SortEdgeMR(int numChunks, GraphParser graphparser,
FieldParser vidparser, FieldParser edataparser) {
this.numChunks = numChunks;
this.graphparser = graphparser;
this.vidparser = vidparser;
this.edataparser = edataparser;
}
public void run(String inputpath, String outputpath) throws IOException {
    JobConf conf = new JobConf(SortEdgeMR.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
conf.setMapperClass(SortEdgeMapper.class);
conf.setReducerClass(SortEdgeReducer.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
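    // Hand the chunk count and parser class names to the map tasks through
    // the job configuration; SortEdgeMapper.configure() reads them back.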
conf.setInt("numChunks", numChunks);
conf.set("GraphParser", graphparser.getClass().getName());
conf.set("VidParser", vidparser.getClass().getName());
conf.set("EdataParser", edataparser.getClass().getName());
FileInputFormat.setInputPaths(conf, new Path(inputpath));
FileOutputFormat.setOutputPath(conf, new Path(outputpath));
LOG.info("==== Job: Partition the input edges by hash(sourceid) =========");
LOG.info("Input = " + inputpath);
LOG.info("Output = " + outputpath);
LOG.debug("numChunks = " + numChunks);
LOG.debug("GraphParser = " + graphparser.getClass().getName());
LOG.debug("VidParser = " + vidparser.getClass().getName());
LOG.debug("EdataParser = " + edataparser.getClass().getName());
LOG.info("===============================================================");
JobClient.runJob(conf);
LOG.info("=================== Done ====================================\n");
}
private int numChunks;
private GraphParser graphparser;
private FieldParser vidparser;
private FieldParser edataparser;
}