/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URL;
import java.net.URLClassLoader;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.InMemoryFileSystem;
import org.apache.hadoop.fs.LocalDirAllocator;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableFactories;
import org.apache.hadoop.io.WritableFactory;
import org.apache.hadoop.metrics.MetricsContext;
import org.apache.hadoop.metrics.MetricsRecord;
import org.apache.hadoop.metrics.MetricsUtil;
import org.apache.hadoop.metrics.Updater;
import org.apache.hadoop.util.Progress;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import static org.apache.hadoop.mapred.Task.Counter.*;
/** A Reduce task. */
class ReduceTask extends Task {
static { // register a ctor
WritableFactories.setFactory
(ReduceTask.class,
new WritableFactory() {
public Writable newInstance() { return new ReduceTask(); }
});
}
private static final Log LOG = LogFactory.getLog(ReduceTask.class.getName());
private int numMaps;
private ReduceCopier reduceCopier;
{
getProgress().setStatus("reduce");
setPhase(TaskStatus.Phase.SHUFFLE); // phase to start with
}
private Progress copyPhase = getProgress().addPhase("copy");
private Progress sortPhase = getProgress().addPhase("sort");
private Progress reducePhase = getProgress().addPhase("reduce");
public ReduceTask() {
super();
}
public ReduceTask(String jobId, String jobFile, String tipId, String taskId,
int partition, int numMaps) {
super(jobId, jobFile, tipId, taskId, partition);
this.numMaps = numMaps;
}
public TaskRunner createRunner(TaskTracker tracker) throws IOException {
return new ReduceTaskRunner(this, tracker, this.conf);
}
public boolean isMapTask() {
return false;
}
public int getNumMaps() { return numMaps; }
/**
* Localize the given JobConf to be specific for this task.
*/
public void localizeConfiguration(JobConf conf) throws IOException {
super.localizeConfiguration(conf);
conf.setNumMapTasks(numMaps);
}
public void write(DataOutput out) throws IOException {
super.write(out);
out.writeInt(numMaps); // write the number of maps
}
public void readFields(DataInput in) throws IOException {
super.readFields(in);
numMaps = in.readInt();
}
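// Usage sketch (mirrors the reduce loop in run() below):
//   while (values.more()) {
//     reducer.reduce(values.getKey(), values, collector, reporter);
//     values.nextKey();
//   }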
/** Iterates values while keys match in sorted input. */
static class ValuesIterator implements Iterator {
private SequenceFile.Sorter.RawKeyValueIterator in; //input iterator
private WritableComparable key; // current key
private Writable value; // current value
private boolean hasNext; // more w/ this key
private boolean more; // more in file
private WritableComparator comparator;
private Class keyClass;
private Class valClass;
private Configuration conf;
private DataOutputBuffer valOut = new DataOutputBuffer();
private DataInputBuffer valIn = new DataInputBuffer();
private DataInputBuffer keyIn = new DataInputBuffer();
protected Reporter reporter;
public ValuesIterator (SequenceFile.Sorter.RawKeyValueIterator in,
WritableComparator comparator, Class keyClass,
Class valClass, Configuration conf,
Reporter reporter)
throws IOException {
this.in = in;
this.conf = conf;
this.comparator = comparator;
this.keyClass = keyClass;
this.valClass = valClass;
this.reporter = reporter;
getNext();
}
/// Iterator methods
public boolean hasNext() { return hasNext; }
public Object next() {
Object result = value; // save value
try {
getNext(); // move to next
} catch (IOException e) {
throw new RuntimeException(e);
}
reporter.progress();
return result; // return saved value
}
public void remove() { throw new UnsupportedOperationException("remove not implemented"); }
/// Auxiliary methods
/** Start processing next unique key. */
public void nextKey() {
while (hasNext) { next(); } // skip any unread
hasNext = more;
}
/** True iff more keys remain. */
public boolean more() { return more; }
/** The current key. */
public WritableComparable getKey() { return key; }
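/** Reads the next record: sets more iff the input had another
* record, and hasNext iff that record's key groups with the previous
* key under the grouping comparator (the first record always does). */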
private void getNext() throws IOException {
Writable lastKey = key; // save previous key
try {
key = (WritableComparable)ReflectionUtils.newInstance(keyClass, this.conf);
value = (Writable)ReflectionUtils.newInstance(valClass, this.conf);
} catch (Exception e) {
throw new RuntimeException(e);
}
more = in.next();
if (more) {
//de-serialize the raw key/value
keyIn.reset(in.getKey().getData(), in.getKey().getLength());
key.readFields(keyIn);
valOut.reset();
(in.getValue()).writeUncompressedBytes(valOut);
valIn.reset(valOut.getData(), valOut.getLength());
value.readFields(valIn);
if (lastKey == null) {
hasNext = true;
} else {
hasNext = (comparator.compare(key, lastKey) == 0);
}
} else {
hasNext = false;
}
}
}
private class ReduceValuesIterator extends ValuesIterator {
public ReduceValuesIterator (SequenceFile.Sorter.RawKeyValueIterator in,
WritableComparator comparator, Class keyClass,
Class valClass,
Configuration conf, Reporter reporter)
throws IOException {
super(in, comparator, keyClass, valClass, conf, reporter);
}
public void informReduceProgress() {
reducePhase.set(super.in.getProgress().get()); // update progress
reporter.progress();
}
public Object next() {
reporter.incrCounter(REDUCE_INPUT_RECORDS, 1);
return super.next();
}
}
@SuppressWarnings("unchecked")
public void run(JobConf job, final TaskUmbilicalProtocol umbilical)
throws IOException {
Reducer reducer = (Reducer)ReflectionUtils.newInstance(
job.getReducerClass(), job);
// start thread that will handle communication with parent
startCommunicationThread(umbilical);
FileSystem lfs = FileSystem.getLocal(job);
if (!job.get("mapred.job.tracker", "local").equals("local")) {
reduceCopier = new ReduceCopier(umbilical, job);
if (!reduceCopier.fetchOutputs()) {
throw new IOException(getTaskId() + ": The reduce copier failed");
}
}
copyPhase.complete(); // copy is already complete
// open a file to collect map output
// since we don't know how many map outputs got merged in memory, we have
// to check whether a given map output exists; if it does, we add it to
// the list of files to merge, otherwise we skip it.
List<Path> mapFilesList = new ArrayList<Path>();
for(int i=0; i < numMaps; i++) {
Path f;
try {
//catch and ignore DiskErrorException, since some map outputs will
//really be absent (inmem merge).
f = mapOutputFile.getInputFile(i, getTaskId());
} catch (DiskErrorException d) {
continue;
}
if (lfs.exists(f))
mapFilesList.add(f);
}
Path[] mapFiles = new Path[mapFilesList.size()];
mapFiles = mapFilesList.toArray(mapFiles);
Path tempDir = new Path(getTaskId());
SequenceFile.Sorter.RawKeyValueIterator rIter;
setPhase(TaskStatus.Phase.SORT);
final Reporter reporter = getReporter(umbilical);
// sort the input file
SequenceFile.Sorter sorter = new SequenceFile.Sorter(lfs,
job.getOutputKeyComparator(), job.getMapOutputValueClass(), job);
sorter.setProgressable(reporter);
rIter = sorter.merge(mapFiles, tempDir,
!conf.getKeepFailedTaskFiles()); // sort
sortPhase.complete(); // sort is complete
setPhase(TaskStatus.Phase.REDUCE);
// make output collector
String finalName = getOutputName(getPartition());
FileSystem fs = FileSystem.get(job);
final RecordWriter out =
job.getOutputFormat().getRecordWriter(fs, job, finalName, reporter);
OutputCollector collector = new OutputCollector() {
@SuppressWarnings("unchecked")
public void collect(WritableComparable key, Writable value)
throws IOException {
out.write(key, value);
reporter.incrCounter(REDUCE_OUTPUT_RECORDS, 1);
// indicate that progress update needs to be sent
reporter.progress();
}
};
// apply reduce function
try {
Class keyClass = job.getMapOutputKeyClass();
Class valClass = job.getMapOutputValueClass();
ReduceValuesIterator values = new ReduceValuesIterator(rIter,
job.getOutputValueGroupingComparator(), keyClass, valClass,
job, reporter);
values.informReduceProgress();
while (values.more()) {
reporter.incrCounter(REDUCE_INPUT_GROUPS, 1);
reducer.reduce(values.getKey(), values, collector, reporter);
values.nextKey();
values.informReduceProgress();
}
//Clean up: repeated in catch block below
reducer.close();
out.close(reporter);
//End of clean up.
} catch (IOException ioe) {
try {
reducer.close();
} catch (IOException ignored) {}
try {
out.close(reporter);
} catch (IOException ignored) {}
throw ioe;
}
done(umbilical);
}
class ReduceCopier implements MRConstants {
/** Reference to the umbilical object */
private TaskUmbilicalProtocol umbilical;
/** Number of ms before timing out a copy */
private static final int STALLED_COPY_TIMEOUT = 3 * 60 * 1000;
/**
* our reduce task instance
*/
private ReduceTask reduceTask;
/**
* the list of map outputs currently being copied
*/
private List<MapOutputLocation> scheduledCopies;
/**
* the results of dispatched copy attempts
*/
private List<CopyResult> copyResults;
/**
* the number of outputs to copy in parallel
*/
private int numCopiers;
/**
* the maximum amount of time (in seconds, beyond the fixed one-minute
* delay) to wait before re-contacting a host after a copy from it
* fails. The actual wait is (1 min + Random.nextInt(maxBackoff)) seconds.
*/
private int maxBackoff;
/**
* busy hosts from which copies are being backed off
* Map of host -> next contact time
*/
private Map<String, Long> penaltyBox;
/**
* the set of unique hosts from which we are copying
*/
private Set<String> uniqueHosts;
/**
* the last time we polled the job tracker
*/
private long lastPollTime;
/**
* A reference to the in memory file system for writing the map outputs to.
*/
private InMemoryFileSystem inMemFileSys;
/**
* A reference to the local file system for writing the map outputs to.
*/
private FileSystem localFileSys;
/**
* An instance of the sorter used for doing merge
*/
private SequenceFile.Sorter sorter;
/**
* A reference to the throwable object (if merge throws an exception)
*/
private volatile Throwable mergeThrowable;
/**
* A flag to indicate that merge is in progress
*/
private volatile boolean mergeInProgress = false;
/**
* Once mergeThreshold files accumulate in the ramfs, we merge/spill them to disk
*/
private int mergeThreshold = 500;
/**
* The threads for fetching the files.
*/
private MapOutputCopier[] copiers = null;
/**
* The object for metrics reporting.
*/
private ShuffleClientMetrics shuffleClientMetrics = null;
/**
* the minimum interval between tasktracker polls
*/
private static final long MIN_POLL_INTERVAL = 1000;
/**
* the number of map output locations to poll for at one time
*/
private int probe_sample_size = 100;
/**
* a list of map output locations for fetch retries
*/
private List<MapOutputLocation> retryFetches =
new ArrayList<MapOutputLocation>();
/**
* The set of required map outputs
*/
private Set <Integer> neededOutputs =
Collections.synchronizedSet(new TreeSet<Integer>());
/**
* The set of obsolete map taskids.
*/
private Set <String> obsoleteMapIds =
Collections.synchronizedSet(new TreeSet<String>());
private Random random = null;
/**
* the max size of the merge output from ramfs
*/
private long ramfsMergeOutputSize;
/**
* Maximum no. of fetch-retries per-map.
*/
private static final int MAX_FETCH_RETRIES_PER_MAP = 5;
/**
* Maximum no. of unique maps from which we failed to fetch map-outputs
* even after {@link #MAX_FETCH_RETRIES_PER_MAP} retries; after this the
* reduce task is failed.
*/
private static final int MAX_FAILED_UNIQUE_FETCHES = 5;
/**
* The maps from which we fail to fetch map-outputs
* even after {@link #MAX_FETCH_RETRIES_PER_MAP} retries.
*/
Set<Integer> fetchFailedMaps = new TreeSet<Integer>();
/**
* A map of taskId -> no. of failed fetches
*/
Map<String, Integer> mapTaskToFailedFetchesMap =
new HashMap<String, Integer>();
/**
* Gathers and reports the metrics specific to the shuffle. It reports
* the metrics for the shuffle client (the ReduceTask), and hence the
* name ShuffleClientMetrics.
*/
class ShuffleClientMetrics implements Updater {
private MetricsRecord shuffleMetrics = null;
private int numFailedFetches = 0;
private int numSuccessFetches = 0;
private long numBytes = 0;
private int numThreadsBusy = 0;
ShuffleClientMetrics(JobConf conf) {
MetricsContext metricsContext = MetricsUtil.getContext("mapred");
this.shuffleMetrics =
MetricsUtil.createRecord(metricsContext, "shuffleInput");
this.shuffleMetrics.setTag("user", conf.getUser());
this.shuffleMetrics.setTag("jobName", conf.getJobName());
this.shuffleMetrics.setTag("jobId", ReduceTask.this.getJobId());
this.shuffleMetrics.setTag("taskId", getTaskId());
this.shuffleMetrics.setTag("sessionId", conf.getSessionId());
metricsContext.registerUpdater(this);
}
public synchronized void inputBytes(long numBytes) {
this.numBytes += numBytes;
}
public synchronized void failedFetch() {
++numFailedFetches;
}
public synchronized void successFetch() {
++numSuccessFetches;
}
public synchronized void threadBusy() {
++numThreadsBusy;
}
public synchronized void threadFree() {
--numThreadsBusy;
}
public void doUpdates(MetricsContext unused) {
synchronized (this) {
shuffleMetrics.incrMetric("shuffle_input_bytes", numBytes);
shuffleMetrics.incrMetric("shuffle_failed_fetches",
numFailedFetches);
shuffleMetrics.incrMetric("shuffle_success_fetches",
numSuccessFetches);
if (numCopiers != 0) {
shuffleMetrics.setMetric("shuffle_fetchers_busy_percent",
100*((float)numThreadsBusy/numCopiers));
} else {
shuffleMetrics.setMetric("shuffle_fetchers_busy_percent", 0);
}
numBytes = 0;
numSuccessFetches = 0;
numFailedFetches = 0;
}
shuffleMetrics.update();
}
}
/** Represents the result of an attempt to copy a map output */
private class CopyResult {
// the map output location against which a copy attempt was made
private final MapOutputLocation loc;
// the size of the file copied, -1 if the transfer failed
private final long size;
//a flag signifying whether a copy result is obsolete
private static final int OBSOLETE = -2;
CopyResult(MapOutputLocation loc, long size) {
this.loc = loc;
this.size = size;
}
public int getMapId() { return loc.getMapId(); }
public boolean getSuccess() { return size >= 0; }
public boolean isObsolete() {
return size == OBSOLETE;
}
public long getSize() { return size; }
public String getHost() { return loc.getHost(); }
public MapOutputLocation getLocation() { return loc; }
}
private int extractMapIdFromPathName(Path pathname) {
//all paths end with map_<id>.out
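//e.g. a path ending in "map_31.out" yields 31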
String firstPathName = pathname.getName();
int beginIndex = firstPathName.lastIndexOf("map_");
int endIndex = firstPathName.lastIndexOf(".out");
return Integer.parseInt(firstPathName.substring(beginIndex +
"map_".length(), endIndex));
}
private int nextMapOutputCopierId = 0;
/** Copies map outputs as they become available */
private class MapOutputCopier extends Thread {
private MapOutputLocation currentLocation = null;
private int id = nextMapOutputCopierId++;
private Reporter reporter;
public MapOutputCopier(Reporter reporter) {
setName("MapOutputCopier " + reduceTask.getTaskId() + "." + id);
LOG.debug(getName() + " created");
this.reporter = reporter;
}
/**
* Fail the copy that is currently being fetched.
* @return true if a fetch was actually in progress
*/
public synchronized boolean fail() {
if (currentLocation != null) {
finish(-1);
return true;
} else {
return false;
}
}
/**
* Get the current map output location.
*/
public synchronized MapOutputLocation getLocation() {
return currentLocation;
}
private synchronized void start(MapOutputLocation loc) {
currentLocation = loc;
}
private synchronized void finish(long size) {
if (currentLocation != null) {
LOG.debug(getName() + " finishing " + currentLocation + " =" + size);
synchronized (copyResults) {
copyResults.add(new CopyResult(currentLocation, size));
copyResults.notify();
}
currentLocation = null;
}
}
/** Loop forever and fetch map outputs as they become available.
* The thread exits when it is interrupted by {@link ReduceTaskRunner}
*/
public void run() {
while (true) {
try {
MapOutputLocation loc = null;
long size = -1;
synchronized (scheduledCopies) {
while (scheduledCopies.isEmpty()) {
scheduledCopies.wait();
}
loc = scheduledCopies.remove(0);
}
try {
shuffleClientMetrics.threadBusy();
start(loc);
size = copyOutput(loc);
shuffleClientMetrics.successFetch();
} catch (IOException e) {
LOG.warn(reduceTask.getTaskId() + " copy failed: " +
loc.getMapTaskId() + " from " + loc.getHost());
LOG.warn(StringUtils.stringifyException(e));
shuffleClientMetrics.failedFetch();
// Reset
size = -1;
} finally {
shuffleClientMetrics.threadFree();
finish(size);
}
} catch (InterruptedException e) {
return; // ALL DONE
} catch (Throwable th) {
LOG.error("Map output copy failure: " +
StringUtils.stringifyException(th));
}
}
}
/** Copies a map output from a remote host, via HTTP.
* @param loc the map output location to be copied
* @return the number of bytes copied, or CopyResult.OBSOLETE if the
* output is no longer needed
* @throws IOException if there is an error copying the file
* @throws InterruptedException if the copier should give up
*/
private long copyOutput(MapOutputLocation loc
) throws IOException, InterruptedException {
// check if we still need to copy the output from this location
if (!neededOutputs.contains(loc.getMapId()) ||
obsoleteMapIds.contains(loc.getMapTaskId())) {
return CopyResult.OBSOLETE;
}
String reduceId = reduceTask.getTaskId();
LOG.info(reduceId + " Copying " + loc.getMapTaskId() +
" output from " + loc.getHost() + ".");
// a temp filename. If this file gets created in ramfs, we're fine,
// else, we will check the localFS to find a suitable final location
// for this path
Path filename = new Path("/" + reduceId + "/map_" +
loc.getMapId() + ".out");
// a working filename that will be unique to this attempt
Path tmpFilename = new Path(filename + "-" + id);
// this copies the map output file
tmpFilename = loc.getFile(inMemFileSys, localFileSys, shuffleClientMetrics,
tmpFilename, lDirAlloc,
conf, reduceTask.getPartition(),
STALLED_COPY_TIMEOUT, reporter);
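// the copy may have taken a while; the output may have become
// unnecessary in the meantime (e.g. already fetched via another
// location), so re-check before committing it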
if (!neededOutputs.contains(loc.getMapId())) {
if (tmpFilename != null) {
FileSystem fs = tmpFilename.getFileSystem(conf);
fs.delete(tmpFilename);
}
return CopyResult.OBSOLETE;
}
if (tmpFilename == null)
throw new IOException("File " + filename + "-" + id +
" not created");
long bytes = -1;
// lock the ReduceTask while we do the rename
synchronized (ReduceTask.this) {
// This file could have been created in the inmemory
// fs or the localfs. So need to get the filesystem owning the path.
FileSystem fs = tmpFilename.getFileSystem(conf);
if (!neededOutputs.contains(loc.getMapId())) {
fs.delete(tmpFilename);
return CopyResult.OBSOLETE;
}
bytes = fs.getLength(tmpFilename);
//resolve the final filename against the directory where the tmpFile
//got created
filename = new Path(tmpFilename.getParent(), filename.getName());
// if we can't rename the file, something is broken (and IOException
// will be thrown).
if (!fs.rename(tmpFilename, filename)) {
fs.delete(tmpFilename);
bytes = -1;
throw new IOException("failure to rename map output " +
tmpFilename);
}
LOG.info(reduceId + " done copying " + loc.getMapTaskId() +
" output from " + loc.getHost() + ".");
//Create a thread to do merges. Synchronize access/update to
//mergeInProgress
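// two triggers: ramfs usage has crossed MAX_INMEM_FILESYS_USE, or
// (if mergeThreshold > 0) the number of closed in-memory files has
// reached mergeThreshold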
if (!mergeInProgress &&
(inMemFileSys.getPercentUsed() >= MAX_INMEM_FILESYS_USE ||
(mergeThreshold > 0 &&
inMemFileSys.getNumFiles(MAP_OUTPUT_FILTER) >=
mergeThreshold))&&
mergeThrowable == null) {
LOG.info(reduceId + " InMemoryFileSystem " +
inMemFileSys.getUri().toString() +
" is " + inMemFileSys.getPercentUsed() +
" full. Triggering merge");
InMemFSMergeThread m = new InMemFSMergeThread(inMemFileSys,
(LocalFileSystem)localFileSys, sorter);
m.setName("Thread for merging in memory files");
m.setDaemon(true);
mergeInProgress = true;
m.start();
}
neededOutputs.remove(loc.getMapId());
}
return bytes;
}
}
private void configureClasspath(JobConf conf)
throws IOException {
// get the task and the current classloader which will become the parent
Task task = ReduceTask.this;
ClassLoader parent = conf.getClassLoader();
// get the work directory which holds the elements we are dynamically
// adding to the classpath
File workDir = new File(task.getJobFile()).getParentFile();
ArrayList<URL> urllist = new ArrayList<URL>();
// add the jars and directories to the classpath
String jar = conf.getJar();
if (jar != null) {
LocalDirAllocator lDirAlloc =
new LocalDirAllocator("mapred.local.dir");
File jobCacheDir = new File(lDirAlloc.getLocalPathToRead(
TaskTracker.getJobCacheSubdir()
+ Path.SEPARATOR + getJobId()
+ Path.SEPARATOR
+ "work", conf).toString());
File[] libs = new File(jobCacheDir, "lib").listFiles();
if (libs != null) {
for (int i = 0; i < libs.length; i++) {
urllist.add(libs[i].toURL());
}
}
urllist.add(new File(jobCacheDir, "classes").toURL());
urllist.add(jobCacheDir.toURL());
}
urllist.add(workDir.toURL());
// create a new classloader with the old classloader as its parent
// then set that classloader as the one used by the current jobconf
URL[] urls = urllist.toArray(new URL[urllist.size()]);
URLClassLoader loader = new URLClassLoader(urls, parent);
conf.setClassLoader(loader);
}
public ReduceCopier(TaskUmbilicalProtocol umbilical, JobConf conf)
throws IOException {
configureClasspath(conf);
this.shuffleClientMetrics = new ShuffleClientMetrics(conf);
this.umbilical = umbilical;
this.reduceTask = ReduceTask.this;
this.scheduledCopies = new ArrayList<MapOutputLocation>(100);
this.copyResults = new ArrayList<CopyResult>(100);
this.numCopiers = conf.getInt("mapred.reduce.parallel.copies", 5);
this.maxBackoff = conf.getInt("mapred.reduce.copy.backoff", 300);
this.mergeThreshold = conf.getInt("mapred.inmem.merge.threshold", 1000);
//we want to distinguish inmem fs instances for different reduces. Hence,
//append a unique string to the uri for the inmem fs name
URI uri = URI.create("ramfs://mapoutput" + reduceTask.hashCode());
inMemFileSys = (InMemoryFileSystem)FileSystem.get(uri, conf);
LOG.info(reduceTask.getTaskId() + " Created an InMemoryFileSystem, uri: "
+ uri);
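// upper bound on the size of a merge of the in-memory files; passed
// as a size hint when allocating the on-disk file that receives the
// merged output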
ramfsMergeOutputSize = (long)(MAX_INMEM_FILESYS_USE *
inMemFileSys.getFSSize());
localFileSys = FileSystem.getLocal(conf);
//create an instance of the sorter
sorter =
new SequenceFile.Sorter(inMemFileSys, conf.getOutputKeyComparator(),
conf.getMapOutputValueClass(), conf);
sorter.setProgressable(getReporter(umbilical));
// hosts -> next contact time
this.penaltyBox = new Hashtable<String, Long>();
// hostnames
this.uniqueHosts = new HashSet<String>();
this.lastPollTime = 0;
// Seed the random number generator with a reasonably globally unique seed
long randomSeed = System.nanoTime() +
(long)Math.pow(this.reduceTask.getPartition(),
(this.reduceTask.getPartition()%10)
);
this.random = new Random(randomSeed);
}
public boolean fetchOutputs() throws IOException {
final int numOutputs = reduceTask.getNumMaps();
List<MapOutputLocation> knownOutputs =
new ArrayList<MapOutputLocation>(numCopiers);
int numInFlight = 0, numCopied = 0;
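// schedule-ahead threshold: while fewer than 2x numCopiers copies are
// in flight (and plenty of outputs remain), go back for more
// locations instead of blocking on copy results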
int lowThreshold = numCopiers*2;
long bytesTransferred = 0;
DecimalFormat mbpsFormat = new DecimalFormat("0.00");
Random backoff = new Random();
final Progress copyPhase =
reduceTask.getProgress().phase();
//tweak the probe sample size (make it a function of numCopiers)
probe_sample_size = Math.max(numCopiers*5, 50);
for (int i = 0; i < numOutputs; i++) {
neededOutputs.add(i);
copyPhase.addPhase(); // add sub-phase per file
}
copiers = new MapOutputCopier[numCopiers];
Reporter reporter = getReporter(umbilical);
// start all the copying threads
for (int i=0; i < copiers.length; i++) {
copiers[i] = new MapOutputCopier(reporter);
copiers[i].start();
}
// start the clock for bandwidth measurement
long startTime = System.currentTimeMillis();
long currentTime = startTime;
IntWritable fromEventId = new IntWritable(0);
try {
// loop until we get all required outputs
while (!neededOutputs.isEmpty() && mergeThrowable == null) {
LOG.info(reduceTask.getTaskId() + " Need " + neededOutputs.size() +
" map output(s)");
try {
// Re-queue the locations whose fetches failed on the previous pass.
// If a failure was due to a lost map task, a fresh MapOutputLocation
// for the re-executed map will be picked up when we query the
// tasktracker below.
knownOutputs.addAll(retryFetches);
// The call to getMapCompletionEvents will update fromEventId to be
// used for the next call to getMapCompletionEvents
int currentNumKnownMaps = knownOutputs.size();
int currentNumObsoleteMapIds = obsoleteMapIds.size();
getMapCompletionEvents(fromEventId, knownOutputs);
LOG.info(reduceTask.getTaskId() + ": " +
"Got " + (knownOutputs.size()-currentNumKnownMaps) +
" new map-outputs & " +
(obsoleteMapIds.size()-currentNumObsoleteMapIds) +
" obsolete map-outputs from tasktracker and " +
retryFetches.size() + " map-outputs from previous failures"
);
// clear the "failed" fetches hashmap
retryFetches.clear();
}
catch (IOException ie) {
LOG.warn(reduceTask.getTaskId() +
" Problem locating map outputs: " +
StringUtils.stringifyException(ie));
}
// now walk through the cache and schedule what we can
int numKnown = knownOutputs.size(), numScheduled = 0;
int numSlow = 0, numDups = 0;
LOG.info(reduceTask.getTaskId() + " Got " + numKnown +
" known map output location(s); scheduling...");
synchronized (scheduledCopies) {
// Randomize the map output locations to prevent
// all reduce-tasks swamping the same tasktracker
Collections.shuffle(knownOutputs, this.random);
Iterator locIt = knownOutputs.iterator();
currentTime = System.currentTimeMillis();
while (locIt.hasNext()) {
MapOutputLocation loc = (MapOutputLocation)locIt.next();
// Do not schedule fetches from OBSOLETE maps
if (obsoleteMapIds.contains(loc.getMapTaskId())) {
locIt.remove();
continue;
}
Long penaltyEnd = penaltyBox.get(loc.getHost());
boolean penalized = false, duplicate = false;
if (penaltyEnd != null && currentTime < penaltyEnd.longValue()) {
penalized = true; numSlow++;
}
if (uniqueHosts.contains(loc.getHost())) {
duplicate = true; numDups++;
}
if (!penalized && !duplicate) {
uniqueHosts.add(loc.getHost());
scheduledCopies.add(loc);
locIt.remove(); // remove from knownOutputs
numInFlight++; numScheduled++;
}
}
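// wake up the copier threads waiting for work on scheduledCopies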
scheduledCopies.notifyAll();
}
LOG.info(reduceTask.getTaskId() + " Scheduled " + numScheduled +
" of " + numKnown + " known outputs (" + numSlow +
" slow hosts and " + numDups + " dup hosts)");
// if we have no copies in flight and we can't schedule anything
// new, just wait for a bit
try {
if (numInFlight == 0 && numScheduled == 0) {
// we should indicate progress as we don't want TT to think
// we're stuck and kill us
reporter.progress();
Thread.sleep(5000);
}
} catch (InterruptedException e) { } // IGNORE
while (numInFlight > 0 && mergeThrowable == null) {
LOG.debug(reduceTask.getTaskId() + " numInFlight = " +
numInFlight);
CopyResult cr = getCopyResult();
if (cr != null) {
if (cr.getSuccess()) { // a successful copy
numCopied++;
bytesTransferred += cr.getSize();
long secsSinceStart =
(System.currentTimeMillis()-startTime)/1000+1;
float mbs = ((float)bytesTransferred)/(1024*1024);
float transferRate = mbs/secsSinceStart;
copyPhase.startNextPhase();
copyPhase.setStatus("copy (" + numCopied + " of " + numOutputs
+ " at " +
mbpsFormat.format(transferRate) + " MB/s)");
// Note successful fetch for this mapId to invalidate
// (possibly) old fetch-failures
fetchFailedMaps.remove(cr.getLocation().getMapId());
} else if (cr.isObsolete()) {
//ignore
LOG.info(reduceTask.getTaskId() +
" Ignoring obsolete copy result for Map Task: " +
cr.getLocation().getMapTaskId() + " from host: " +
cr.getHost());
} else {
retryFetches.add(cr.getLocation());
// note the failed-fetch
String mapTaskId = cr.getLocation().getMapTaskId();
Integer mapId = cr.getLocation().getMapId();
Integer noFailedFetches =
mapTaskToFailedFetchesMap.get(mapTaskId);
noFailedFetches =
(noFailedFetches == null) ? 1 : (noFailedFetches + 1);
mapTaskToFailedFetchesMap.put(mapTaskId, noFailedFetches);
LOG.info("Task " + getTaskId() + ": Failed fetch #" +
noFailedFetches + " from " + mapTaskId);
// did the fetch fail too many times?
if ((noFailedFetches % MAX_FETCH_RETRIES_PER_MAP) == 0) {
synchronized (ReduceTask.this) {
taskStatus.addFetchFailedMap(mapTaskId);
LOG.info("Failed to fetch map-output from " + mapTaskId +
" even after MAX_FETCH_RETRIES_PER_MAP retries... "
+ " reporting to the JobTracker");
}
}
// note unique failed-fetch maps
if (noFailedFetches == MAX_FETCH_RETRIES_PER_MAP) {
fetchFailedMaps.add(mapId);
// did we have too many unique failed-fetch maps?
if (fetchFailedMaps.size() >= MAX_FAILED_UNIQUE_FETCHES) {
LOG.fatal("Shuffle failed with too many fetch failures! " +
"Killing task " + getTaskId() + ".");
umbilical.shuffleError(getTaskId(),
"Exceeded MAX_FAILED_UNIQUE_FETCHES;"
+ " bailing-out.");
}
}
// penalize the host: defer the next contact by 1 min plus up to
// maxBackoff seconds
currentTime = System.currentTimeMillis();
long nextContact = currentTime + 60 * 1000 +
backoff.nextInt(maxBackoff*1000);
penaltyBox.put(cr.getHost(), nextContact);
LOG.warn(reduceTask.getTaskId() + " adding host " +
cr.getHost() + " to penalty box, next contact in " +
((nextContact-currentTime)/1000) + " seconds");
// other outputs from the failed host may be present in the
// knownOutputs cache, purge them. This is important in case
// the failure is due to a lost tasktracker (causes many
// unnecessary backoffs). If not, we only take a small hit
// polling the tasktracker a few more times
Iterator locIt = knownOutputs.iterator();
while (locIt.hasNext()) {
MapOutputLocation loc = (MapOutputLocation)locIt.next();
if (cr.getHost().equals(loc.getHost())) {
retryFetches.add(loc);
locIt.remove();
}
}
}
uniqueHosts.remove(cr.getHost());
numInFlight--;
}
boolean busy = true;
// ensure we have enough to keep us busy
if (numInFlight < lowThreshold && (numOutputs-numCopied) >
probe_sample_size) {
busy = false;
}
//Check whether we have more CopyResults to process. If there are
//none, and we are not busy enough, break
synchronized (copyResults) {
if (copyResults.size() == 0 && !busy) {
break;
}
}
}
}
// all done, inform the copiers to exit
synchronized (copiers) {
synchronized (scheduledCopies) {
for (int i=0; i < copiers.length; i++) {
copiers[i].interrupt();
copiers[i] = null;
}
}
}
//Do a merge of in-memory files (if there are any)
if (mergeThrowable == null) {
try {
//wait for an ongoing merge (if it is in flight) to complete
while (mergeInProgress) {
Thread.sleep(200);
}
LOG.info(reduceTask.getTaskId() +
" Copying of all map outputs complete. " +
"Initiating the last merge on the remaining files in " +
inMemFileSys.getUri());
if (mergeThrowable != null) {
//this could happen if the merge that
//was in progress threw an exception
throw mergeThrowable;
}
//initiate merge
Path[] inMemClosedFiles = inMemFileSys.getFiles(MAP_OUTPUT_FILTER);
if (inMemClosedFiles.length == 0) {
LOG.info(reduceTask.getTaskId() + "Nothing to merge from " +
inMemFileSys.getUri());
return neededOutputs.isEmpty();
}
//name this output file the same as the first file in the current
//list of inmem files (that name is guaranteed to be absent on disk
//right now, so we don't overwrite a previously created spill). Also,
//we must create the output file now, since the inmem file is not
//guaranteed to exist after merge is called (empty sequence files are
//deleted as soon as they are seen in the merge method)
int mapId = extractMapIdFromPathName(inMemClosedFiles[0]);
Path outputPath = mapOutputFile.getInputFileForWrite(mapId,
reduceTask.getTaskId(), ramfsMergeOutputSize);
SequenceFile.Writer writer = sorter.cloneFileAttributes(
inMemFileSys.makeQualified(inMemClosedFiles[0]),
localFileSys.makeQualified(outputPath), null);
SequenceFile.Sorter.RawKeyValueIterator rIter = null;
try {
rIter = sorter.merge(inMemClosedFiles, true,
inMemClosedFiles.length,
new Path(reduceTask.getTaskId()));
} catch (Exception e) {
//make sure that we delete the ondisk file that we created earlier
//when we invoked cloneFileAttributes
writer.close();
localFileSys.delete(outputPath);
throw new IOException (StringUtils.stringifyException(e));
}
sorter.writeFile(rIter, writer);
writer.close();
LOG.info(reduceTask.getTaskId() +
" Merge of the " +inMemClosedFiles.length +
" files in InMemoryFileSystem complete." +
" Local file is " + outputPath);
} catch (Throwable t) {
LOG.warn(reduceTask.getTaskId() +
" Final merge of the inmemory files threw an exception: " +
StringUtils.stringifyException(t));
return false;
}
}
return mergeThrowable == null && neededOutputs.isEmpty();
} finally {
inMemFileSys.close();
}
}
private CopyResult getCopyResult() {
synchronized (copyResults) {
while (copyResults.isEmpty()) {
try {
copyResults.wait();
} catch (InterruptedException e) { }
}
if (copyResults.isEmpty()) {
return null;
} else {
return copyResults.remove(0);
}
}
}
/**
* Queries the {@link TaskTracker} for a set of map-completion events
* from a given event ID.
*
* @param fromEventId the first event ID we want to start from, this is
* modified by the call to this method
* @param knownOutputs the list to which the locations of newly
* SUCCEEDED maps are added
* @throws IOException
*/
private void getMapCompletionEvents(IntWritable fromEventId,
List<MapOutputLocation> knownOutputs)
throws IOException {
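// throttle the poll: make sure at least MIN_POLL_INTERVAL ms have
// elapsed since the last poll of the tasktracker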
long currentTime = System.currentTimeMillis();
long pollTime = lastPollTime + MIN_POLL_INTERVAL;
while (currentTime < pollTime) {
try {
Thread.sleep(pollTime-currentTime);
} catch (InterruptedException ie) { } // IGNORE
currentTime = System.currentTimeMillis();
}
TaskCompletionEvent events[] =
umbilical.getMapCompletionEvents(reduceTask.getJobId(),
fromEventId.get(), probe_sample_size);
// Note the last successful poll time-stamp
lastPollTime = currentTime;
// Update the last seen event ID
fromEventId.set(fromEventId.get() + events.length);
// Process the TaskCompletionEvents:
// 1. Save the SUCCEEDED maps in knownOutputs to fetch the outputs.
// 2. Save the OBSOLETE/FAILED/KILLED maps in obsoleteMapIds to stop
// fetching from those maps.
// 3. Remove TIPFAILED maps from neededOutputs since we don't need their
// outputs at all.
for (TaskCompletionEvent event : events) {
switch (event.getTaskStatus()) {
case SUCCEEDED:
{
URI u = URI.create(event.getTaskTrackerHttp());
String host = u.getHost();
int port = u.getPort();
String taskId = event.getTaskId();
int mId = event.idWithinJob();
knownOutputs.add(new MapOutputLocation(taskId, mId, host, port));
}
break;
case FAILED:
case KILLED:
case OBSOLETE:
{
obsoleteMapIds.add(event.getTaskId());
LOG.info("Ignoring obsolete output of " + event.getTaskStatus() +
" map-task: '" + event.getTaskId() + "'");
}
break;
case TIPFAILED:
{
neededOutputs.remove(event.idWithinJob());
LOG.info("Ignoring output of failed map TIP: '" +
event.getTaskId() + "'");
}
break;
}
}
}
private class InMemFSMergeThread extends Thread {
private InMemoryFileSystem inMemFileSys;
private LocalFileSystem localFileSys;
private SequenceFile.Sorter sorter;
public InMemFSMergeThread(InMemoryFileSystem inMemFileSys,
LocalFileSystem localFileSys, SequenceFile.Sorter sorter) {
this.inMemFileSys = inMemFileSys;
this.localFileSys = localFileSys;
this.sorter = sorter;
}
public void run() {
LOG.info(reduceTask.getTaskId() + " Thread started: " + getName());
try {
Path[] inMemClosedFiles;
//initiate merge
synchronized (ReduceTask.this) {
inMemClosedFiles = inMemFileSys.getFiles(MAP_OUTPUT_FILTER);
}
//Note that the above Path[] could be of length 0 if all copies are
//in flight. So we make sure that we have some 'closed' map
//output files to merge to get the benefit of in-memory merge
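//the threshold below is (roughly) the number of largest-allowed
//closed files it takes to fill the permitted fraction of the ramfs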
if (inMemClosedFiles.length >=
(int)(MAX_INMEM_FILESYS_USE/MAX_INMEM_FILESIZE_FRACTION)) {
//name this output file the same as the first file in the current
//list of inmem files (that name is guaranteed to be absent on disk
//right now, so we don't overwrite a previously created spill). Also,
//we must create the output file now, since the inmem file is not
//guaranteed to exist after merge is called (empty sequence files are
//deleted as soon as they are seen in the merge method)
//figure out the mapId
int mapId = extractMapIdFromPathName(inMemClosedFiles[0]);
Path outputPath = mapOutputFile.getInputFileForWrite(mapId,
reduceTask.getTaskId(), ramfsMergeOutputSize);
SequenceFile.Writer writer = sorter.cloneFileAttributes(
inMemFileSys.makeQualified(inMemClosedFiles[0]),
localFileSys.makeQualified(outputPath), null);
SequenceFile.Sorter.RawKeyValueIterator rIter;
try {
rIter = sorter.merge(inMemClosedFiles, true,
inMemClosedFiles.length, new Path(reduceTask.getTaskId()));
} catch (Exception e) {
//make sure that we delete the ondisk file that we created
//earlier when we invoked cloneFileAttributes
writer.close();
localFileSys.delete(outputPath);
throw new IOException (StringUtils.stringifyException(e));
}
sorter.writeFile(rIter, writer);
writer.close();
LOG.info(reduceTask.getTaskId() +
" Merge of the " +inMemClosedFiles.length +
" files in InMemoryFileSystem complete." +
" Local file is " + outputPath);
}
else {
LOG.info(reduceTask.getTaskId() + " Nothing to merge from " +
inMemFileSys.getUri());
}
} catch (Throwable t) {
LOG.warn(reduceTask.getTaskId() +
" Intermediate Merge of the inmemory files threw an exception: "
+ StringUtils.stringifyException(t));
ReduceCopier.this.mergeThrowable = t;
}
finally {
mergeInProgress = false;
}
}
}
final private PathFilter MAP_OUTPUT_FILTER = new PathFilter() {
public boolean accept(Path file) {
return file.toString().endsWith(".out");
}
};
}
}