Source Code of org.apache.hadoop.mapred.SortValidator$RecordChecker$Reduce

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.mapred;

import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.fs.*;

/**
* A set of utilities to validate the <b>sort</b> of the map-reduce framework.
* This utility program has two main parts:
* 1. Checking the records' statistics
*   a) Validates the no. of bytes and records in the sort's input & output.
*   b) Validates the xor of the hashes of each key/value pair.
*   c) Ensures the same key/value pairs are present in both the input
*      and the output.
* 2. Checking individual records to ensure each record is present in both
*    the input and the output of the sort (expensive on large data-sets).
*   
* To run: bin/hadoop jar build/hadoop-examples.jar sortvalidate
*            [-m <i>maps</i>] [-r <i>reduces</i>] [-deep]
*            -sortInput <i>sort-in-dir</i> -sortOutput <i>sort-out-dir</i>
*
* @author Arun C Murthy
*/
public class SortValidator {

  static private final IntWritable sortInput = new IntWritable(1);
  static private final IntWritable sortOutput = new IntWritable(2);

  static void printUsage() {
    System.err.println("sortvalidate [-m <maps>] [-r <reduces>] [-deep] " +
    "-sortInput <sort-input-dir> -sortOutput <sort-output-dir>");
    System.exit(1);
  }

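  /**
   * Deduce whether the current map input split comes from the sort's input
   * or output directory, based on the 'map.input.file' property. The
   * sort-input directory is always added as the first input path, so a
   * file is tagged as sort-input iff its path starts with inputPaths[0].
   */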
  static private IntWritable deduceInputFile(JobConf job) {
    Path[] inputPaths = job.getInputPaths();
    String inputFile = null;
    try {
      inputFile = new URI(job.get("map.input.file")).getPath();
    } catch (URISyntaxException urise) {
      System.err.println("Caught: " + urise);
      System.exit(-1);
    }
   
    // value == one for sort-input; value == two for sort-output
    return (inputFile.startsWith(inputPaths[0].toString())) ?
            sortInput : sortOutput;
  }
 
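  /**
   * Concatenate the raw bytes of a key and a value into a single array;
   * RecordChecker uses this to treat each (key, value) pair as one
   * composite key.
   */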
  static private byte[] pair(BytesWritable a, BytesWritable b) {
    byte[] pairData = new byte[a.getSize() + b.getSize()];
    System.arraycopy(a.get(), 0, pairData, 0, a.getSize());
    System.arraycopy(b.get(), 0, pairData, a.getSize(), b.getSize());
    return pairData;
  }

  /**
   * A simple map-reduce job which verifies the consistency of the
   * MapReduce framework's sort by checking:
   * a) Records are sorted correctly
   * b) Keys are partitioned correctly
   * c) The input and output have the same no. of bytes and records.
   * d) The input and output have the correct 'checksum', obtained by
   *    xor'ing the hash of each record.
   *   
   * @author Arun C Murthy
   */
  public static class RecordStatsChecker {
   
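    /**
     * Aggregate statistics for one side of the sort: total bytes, total
     * records, and an xor-folded checksum, serialized compactly as
     * variable-length ints.
     */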
    public static class RecordStatsWritable implements Writable {
      private long bytes = 0;
      private long records = 0;
      private int checksum = 0;
     
      public RecordStatsWritable() {}
     
      public RecordStatsWritable(long bytes, long records, int checksum) {
        this.bytes = bytes;
        this.records = records;
        this.checksum = checksum;
      }
     
      public void write(DataOutput out) throws IOException {
        WritableUtils.writeVLong(out, bytes);
        WritableUtils.writeVLong(out, records);
        WritableUtils.writeVInt(out, checksum);
      }

      public void readFields(DataInput in) throws IOException {
        bytes = WritableUtils.readVLong(in);
        records = WritableUtils.readVLong(in);
        checksum = WritableUtils.readVInt(in);
      }
     
      public long getBytes() { return bytes; }
      public long getRecords() { return records; }
      public int getChecksum() { return checksum; }
    }
   
    public static class Map extends MapReduceBase implements Mapper {
      private IntWritable key = null;
      private BytesWritable prevKey = null;
      private Partitioner partitioner = null;
      private int partition = -1;
      private int noSortReducers = -1;

      public void configure(JobConf job) {
        // 'key' == sortInput for sort-input; key == sortOutput for sort-output
        key = deduceInputFile(job);
       
        if (key == sortOutput) {
          partitioner = new HashPartitioner();
         
          // Figure the 'current' partition and no. of reduces of the 'sort'
          try {
            URI inputURI = new URI(job.get("map.input.file"));
            String inputFile = inputURI.getPath();
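            // Sort output files are named 'part-NNNNN'; skip past the
            // five characters of "part-" to parse the partition number.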
            partition = Integer.parseInt(
                          inputFile.substring(inputFile.lastIndexOf("part") + 5));
            noSortReducers = job.getInt("sortvalidate.sort.reduce.tasks", -1);
          } catch (Exception e) {
            System.err.println("Caught: " + e);
            System.exit(-1);
          }
        }
      }
     
      public void map(WritableComparable key,
              Writable value,
              OutputCollector output,
              Reporter reporter) throws IOException {
        BytesWritable bwKey = (BytesWritable)key;
        BytesWritable bwValue = (BytesWritable)value;
       
        if (this.key == sortOutput) {
          // Check if keys are 'sorted' if this 
          // record is from sort's output
          if (prevKey == null) {
            prevKey = bwKey;
          } else {
            if (prevKey.compareTo(bwKey) > 0) {
              throw new IOException("The 'map-reduce' framework wrongly classifed"
                      + "(" + prevKey + ") > (" + bwKey + ")");
            }
            prevKey = bwKey;
          }

          // Check if the sorted output is 'partitioned' right
          int keyPartition =
            partitioner.getPartition(bwKey, bwValue, noSortReducers);
          if (partition != keyPartition) {
            throw new IOException("Paritions do not match! - '" + partition +
                    "' v/s '" + keyPartition + "'");
          }
        }

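        // Per-record checksum: hash the key and value bytes separately and
        // combine with xor. Xor is commutative and associative, so the fold
        // over all records is order-independent and must match between the
        // sort's input and output.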
        int keyValueChecksum =
            (WritableComparator.hashBytes(bwKey.get(), bwKey.getSize()) ^
             WritableComparator.hashBytes(bwValue.get(), bwValue.getSize()));

        // output (this.key, record-stats)
        output.collect(this.key, new RecordStatsWritable(
                (bwKey.getSize()+bwValue.getSize()), 1, keyValueChecksum));
      }
    }
   
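    /**
     * Sums the byte and record counts and xor-folds the checksums of all
     * stats collected under one tag (1 = sort-input, 2 = sort-output).
     * These operations are associative and commutative, so this class also
     * serves as the job's combiner.
     */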
    public static class Reduce extends MapReduceBase implements Reducer {
      public void reduce(WritableComparable key, Iterator values,
              OutputCollector output,
              Reporter reporter) throws IOException {
        long bytes = 0;
        long records = 0;
        int xor = 0;
        while (values.hasNext()) {
          RecordStatsWritable stats = ((RecordStatsWritable)values.next());
          bytes += stats.getBytes();
          records += stats.getRecords();
          xor ^= stats.getChecksum();
        }
       
        output.collect(key, new RecordStatsWritable(bytes, records, xor));
      }
    }
   
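    /**
     * Prevents input files from being split: each map must see a whole
     * 'part-NNNNN' file of the sort's output, since both the sorted-order
     * check and the partition check in the mapper are per-file.
     */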
    public static class NonSplitableSequenceFileInputFormat
    extends SequenceFileInputFormat {
      protected boolean isSplitable(FileSystem fs, Path filename) {
        return false;
      }
    }
   
    static void checkRecords(Configuration defaults,
            Path sortInput, Path sortOutput) throws IOException {
      FileSystem fs = FileSystem.get(defaults);
      JobConf jobConf = new JobConf(defaults, RecordStatsChecker.class);
      jobConf.setJobName("sortvalidate-recordstats-checker");

      int noSortReduceTasks = fs.listPaths(sortOutput).length;
      jobConf.setInt("sortvalidate.sort.reduce.tasks", noSortReduceTasks);

      jobConf.setInputFormat(NonSplitableSequenceFileInputFormat.class);
      jobConf.setOutputFormat(SequenceFileOutputFormat.class);
     
      jobConf.setOutputKeyClass(IntWritable.class);
      jobConf.setOutputValueClass(RecordStatsChecker.RecordStatsWritable.class);
     
      jobConf.setMapperClass(Map.class);
      jobConf.setCombinerClass(Reduce.class);
      jobConf.setReducerClass(Reduce.class);
     
      jobConf.setNumMapTasks(noSortReduceTasks);
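      // A single reduce task ensures the job emits exactly two records
      // (one per tag), whose statistics are compared below.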
      jobConf.setNumReduceTasks(1);

      jobConf.setInputPath(sortInput);
      jobConf.addInputPath(sortOutput);
      Path outputPath = new Path("/tmp/sortvalidate/recordstatschecker");
      if (fs.exists(outputPath)) {
        fs.delete(outputPath);
      }
      jobConf.setOutputPath(outputPath);
     
      // Uncomment to run locally in a single process
      //jobConf.set("mapred.job.tracker", "local");
     
      System.out.println("\nSortValidator.RecordStatsChecker: Validate sort " +
              "from " + jobConf.getInputPaths()[0] + ", " +
              jobConf.getInputPaths()[1] + " into " + jobConf.getOutputPath() +
              " with 1 reducer.");
      Date startTime = new Date();
      System.out.println("Job started: " + startTime);
      JobClient.runJob(jobConf);
      Date endTime = new Date();
      System.out.println("Job ended: " + endTime);
      System.out.println("The job took " +
              (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
     
      // Check to ensure that the statistics of the
      // framework's sort-input and sort-output match
      SequenceFile.Reader stats = new SequenceFile.Reader(fs,
              new Path(outputPath, "part-00000"), defaults);
      IntWritable k1 = new IntWritable();
      IntWritable k2 = new IntWritable();
      RecordStatsWritable v1 = new RecordStatsWritable();
      RecordStatsWritable v2 = new RecordStatsWritable();
      if (!stats.next(k1, v1)) {
        throw new IOException("Failed to read record #1 from reduce's output");
      }
      if (!stats.next(k2, v2)) {
        throw new IOException("Failed to read record #2 from reduce's output");
      }

      if ((v1.getBytes() != v2.getBytes()) || (v1.getRecords() != v2.getRecords()) ||
              v1.getChecksum() != v2.getChecksum()) {
        throw new IOException("Sort input/output statistics differ: (" +
                v1.getBytes() + ", " + v1.getRecords() + ", " + v1.getChecksum() + ") v/s (" +
                v2.getBytes() + ", " + v2.getRecords() + ", " + v2.getChecksum() + ")");
      }
    }

  }
 
  /**
   * A simple map-reduce task to check that the input and the output of
   * the framework's sort are consistent, by ensuring each record is
   * present in both the input and the output.
   *
   * @author Arun C Murthy
   */
  public static class RecordChecker {
   
    public static class Map extends MapReduceBase implements Mapper {
      private IntWritable value = null;
     
      public void configure(JobConf job) {
        // value == one for sort-input; value == two for sort-output
        value = deduceInputFile(job);
      }
     
      public void map(WritableComparable key,
              Writable value,
              OutputCollector output,
              Reporter reporter) throws IOException {
        // newKey = (key, value)
        BytesWritable keyValue =
          new BytesWritable(pair((BytesWritable)key, (BytesWritable)value));
   
        // output (newKey, value)
        output.collect(keyValue, this.value);
      }
    }
   
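    /**
     * For each distinct (key, value) pair, counts how often it was seen in
     * the sort's input ('ones') and output ('twos'); the sort is valid only
     * if the counts match exactly, which also handles duplicate records.
     */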
    public static class Reduce extends MapReduceBase implements Reducer {
      public void reduce(WritableComparable key, Iterator values,
              OutputCollector output,
              Reporter reporter) throws IOException {
        int ones = 0;
        int twos = 0;
        while (values.hasNext()) {
          IntWritable count = ((IntWritable) values.next());
          if (count.equals(sortInput)) {
            ++ones;
          } else if (count.equals(sortOutput)) {
            ++twos;
          } else {
            throw new IOException("Invalid 'value' of " + count.get() +
                    " for (key,value): " + key.toString());
          }
        }
       
        // Check to ensure there are equal no. of ones and twos
        if (ones != twos) {
          throw new IOException("Illegal ('one', 'two'): (" + ones + ", " + twos +
                  ") for (key, value): " + key.toString());
        }
      }
    }
   
    static void checkRecords(Configuration defaults, int noMaps, int noReduces,
            Path sortInput, Path sortOutput) throws IOException {
      JobConf jobConf = new JobConf(defaults, RecordChecker.class);
      jobConf.setJobName("sortvalidate-record-checker");
     
      jobConf.setInputFormat(SequenceFileInputFormat.class);
      jobConf.setOutputFormat(SequenceFileOutputFormat.class);
     
      jobConf.setOutputKeyClass(BytesWritable.class);
      jobConf.setOutputValueClass(IntWritable.class);
     
      jobConf.setMapperClass(Map.class);       
      jobConf.setReducerClass(Reduce.class);
     
      JobClient client = new JobClient(jobConf);
      ClusterStatus cluster = client.getClusterStatus();
      if (noMaps == -1) {
        noMaps = cluster.getTaskTrackers() *
                   jobConf.getInt("test.sortvalidate.maps_per_host", 10);
      }
      if (noReduces == -1) {
        noReduces = cluster.getTaskTrackers() *
                      jobConf.getInt("test.sortvalidate.reduces_per_host",
                              cluster.getMaxTasks());
      }
      jobConf.setNumMapTasks(noMaps);
      jobConf.setNumReduceTasks(noReduces);
     
      jobConf.setInputPath(sortInput);
      jobConf.addInputPath(sortOutput);
      Path outputPath = new Path("/tmp/sortvalidate/recordchecker");
      FileSystem fs = FileSystem.get(defaults);
      if (fs.exists(outputPath)) {
        fs.delete(outputPath);
      }
      jobConf.setOutputPath(outputPath);
     
      // Uncomment to run locally in a single process
      //jobConf.set("mapred.job.tracker", "local");
     
      System.out.println("\nSortValidator.RecordChecker: Running on " +
              cluster.getTaskTrackers() +
              " nodes to validate sort from " + jobConf.getInputPaths()[0] + ", " +
              jobConf.getInputPaths()[1] + " into " + jobConf.getOutputPath() +
              " with " + noReduces + " reduces.");
      Date startTime = new Date();
      System.out.println("Job started: " + startTime);
      JobClient.runJob(jobConf);
      Date endTime = new Date();
      System.out.println("Job ended: " + endTime);
      System.out.println("The job took " +
              (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
    }
  }

 
  /**
   * The main driver for the sort-validator program.
   * Invoke this method to submit the map/reduce job.
   * @throws IOException When there are communication problems with the
   *                     job tracker.
   */
  public static void main(String[] args) throws IOException {
    Configuration defaults = new Configuration();
   
    int noMaps = -1, noReduces = -1;
    Path sortInput = null, sortOutput = null;
    boolean deepTest = false;
    for(int i=0; i < args.length; ++i) {
      try {
        if ("-m".equals(args[i])) {
          noMaps = Integer.parseInt(args[++i]);
        } else if ("-r".equals(args[i])) {
          noReduces = Integer.parseInt(args[++i]);
        } else if ("-sortInput".equals(args[i])){
          sortInput = new Path(args[++i]);
        } else if ("-sortOutput".equals(args[i])){
          sortOutput = new Path(args[++i]);
        } else if ("-deep".equals(args[i])) {
          deepTest = true;
        } else {
          printUsage();
        }
      } catch (NumberFormatException except) {
        System.err.println("ERROR: Integer expected instead of " + args[i]);
        printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.err.println("ERROR: Required parameter missing from " +
                args[i-1]);
        printUsage(); // exits
      }
    }
   
    // Sanity check
    if (sortInput == null || sortOutput == null) {
      printUsage();
    }

    // Check if the records are consistent and sorted correctly
    RecordStatsChecker.checkRecords(defaults, sortInput, sortOutput);

    // Check if the same records are present in sort's inputs & outputs
    if (deepTest) {
      RecordChecker.checkRecords(defaults, noMaps, noReduces, sortInput,
            sortOutput);
    }
   
    System.out.println("\nSUCCESS! Validated the MapReduce framework's 'sort'" +
            " successfully.");
  }
 
}