/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tez.mapreduce.hadoop;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobSubmissionFiles;
import org.apache.hadoop.mapreduce.split.JobSplitWriter;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.yarn.ContainerLogAppender;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.tez.common.TezJobConfig;
import org.apache.tez.common.TezUtils;
import org.apache.tez.common.TezYARNUtils;
import org.apache.tez.common.security.TokenCache;
import org.apache.tez.dag.api.InputDescriptor;
import org.apache.tez.dag.api.OutputDescriptor;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.TezUncheckedException;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.dag.api.VertexLocationHint.TaskLocationHint;
import org.apache.tez.mapreduce.combine.MRCombiner;
import org.apache.tez.mapreduce.committer.MROutputCommitter;
import org.apache.tez.mapreduce.input.MRInputLegacy;
import org.apache.tez.mapreduce.output.MROutput;
import org.apache.tez.mapreduce.output.MROutputLegacy;
import org.apache.tez.mapreduce.partition.MRPartitioner;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitsProto;
import org.apache.tez.runtime.api.TezRootInputInitializer;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.protobuf.ByteString;
public class MRHelpers {
private static final Log LOG = LogFactory.getLog(MRHelpers.class);
static final int SPLIT_SERIALIZED_LENGTH_ESTIMATE = 40;
static final String JOB_SPLIT_RESOURCE_NAME = "job.split";
static final String JOB_SPLIT_METAINFO_RESOURCE_NAME =
"job.splitmetainfo";
/**
* Comparator that orders org.apache.hadoop.mapreduce.InputSplit by
* decreasing length, so that the largest splits come first.
*/
private static class InputSplitComparator
implements Comparator<org.apache.hadoop.mapreduce.InputSplit> {
@Override
public int compare(org.apache.hadoop.mapreduce.InputSplit o1,
org.apache.hadoop.mapreduce.InputSplit o2) {
try {
long len1 = o1.getLength();
long len2 = o2.getLength();
if (len1 < len2) {
return 1;
} else if (len1 == len2) {
return 0;
} else {
return -1;
}
} catch (IOException ie) {
throw new RuntimeException("exception in InputSplit compare", ie);
} catch (InterruptedException ie) {
throw new RuntimeException("exception in InputSplit compare", ie);
}
}
}
/**
* Comparator that orders org.apache.hadoop.mapred.InputSplit by
* decreasing length, so that the largest splits come first.
*/
private static class OldInputSplitComparator
implements Comparator<org.apache.hadoop.mapred.InputSplit> {
@Override
public int compare(org.apache.hadoop.mapred.InputSplit o1,
org.apache.hadoop.mapred.InputSplit o2) {
try {
long len1 = o1.getLength();
long len2 = o2.getLength();
if (len1 < len2) {
return 1;
} else if (len1 == len2) {
return 0;
} else {
return -1;
}
} catch (IOException ie) {
throw new RuntimeException("Problem getting input split size", ie);
}
}
}
@SuppressWarnings({ "rawtypes", "unchecked" })
@Private
public static org.apache.hadoop.mapreduce.InputSplit[] generateNewSplits(
JobContext jobContext, String inputFormatName, int numTasks)
throws ClassNotFoundException, IOException,
InterruptedException {
Configuration conf = jobContext.getConfiguration();
InputFormat<?, ?> inputFormat = ReflectionUtils.newInstance(
jobContext.getInputFormatClass(), conf);
InputFormat<?, ?> finalInputFormat = inputFormat;
if (inputFormatName != null && !inputFormatName.isEmpty()) {
if (!inputFormat.getClass().equals(
org.apache.hadoop.mapreduce.split.TezGroupedSplitsInputFormat.class)){
throw new TezUncheckedException(
"Expected " +
org.apache.hadoop.mapreduce.split.TezGroupedSplitsInputFormat.class.getName()
+ " in conf but got: " + inputFormat.getClass().getName());
}
try {
inputFormat = (org.apache.hadoop.mapreduce.InputFormat)
ReflectionUtils.newInstance(Class.forName(inputFormatName), conf);
} catch (ClassNotFoundException e) {
throw new TezUncheckedException(e);
}
org.apache.hadoop.mapreduce.split.TezGroupedSplitsInputFormat groupedFormat =
new org.apache.hadoop.mapreduce.split.TezGroupedSplitsInputFormat();
groupedFormat.setConf(conf);
groupedFormat.setInputFormat(inputFormat);
groupedFormat.setDesiredNumberOfSplits(numTasks);
finalInputFormat = groupedFormat;
}
List<org.apache.hadoop.mapreduce.InputSplit> array = finalInputFormat
.getSplits(jobContext);
org.apache.hadoop.mapreduce.InputSplit[] splits = array
.toArray(new org.apache.hadoop.mapreduce.InputSplit[array.size()]);
// sort the splits into order based on size, so that the biggest
// go first
Arrays.sort(splits, new InputSplitComparator());
return splits;
}
/**
* Generate new-api mapreduce InputFormat splits
* @param jobContext JobContext required by InputFormat
* @param inputSplitDir Directory in which to generate splits information
*
* @return InputSplitInfo containing the split files' information and the
* location hints for each generated split, used to determine the
* parallelism of the map stage.
*
* @throws IOException
* @throws InterruptedException
* @throws ClassNotFoundException
*/
private static InputSplitInfoDisk writeNewSplits(JobContext jobContext,
Path inputSplitDir) throws IOException, InterruptedException,
ClassNotFoundException {
org.apache.hadoop.mapreduce.InputSplit[] splits =
generateNewSplits(jobContext, null, 0);
Configuration conf = jobContext.getConfiguration();
JobSplitWriter.createSplitFiles(inputSplitDir, conf,
inputSplitDir.getFileSystem(conf), splits);
List<TaskLocationHint> locationHints =
new ArrayList<TaskLocationHint>(splits.length);
for (int i = 0; i < splits.length; ++i) {
locationHints.add(
new TaskLocationHint(new HashSet<String>(
Arrays.asList(splits[i].getLocations())), null));
}
return new InputSplitInfoDisk(
JobSubmissionFiles.getJobSplitFile(inputSplitDir),
JobSubmissionFiles.getJobSplitMetaFile(inputSplitDir),
splits.length, locationHints, jobContext.getCredentials());
}
@SuppressWarnings({ "rawtypes", "unchecked" })
@Private
public static org.apache.hadoop.mapred.InputSplit[] generateOldSplits(
JobConf jobConf, String inputFormatName, int numTasks) throws IOException {
org.apache.hadoop.mapred.InputFormat inputFormat = jobConf.getInputFormat();
org.apache.hadoop.mapred.InputFormat finalInputFormat = inputFormat;
if (inputFormatName != null && !inputFormatName.isEmpty()) {
if (!inputFormat.getClass().equals(
org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat.class)){
throw new TezUncheckedException(
"Expected " +
org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat.class.getName()
+ " in conf but got: " + inputFormat.getClass().getName());
}
try {
inputFormat = (org.apache.hadoop.mapred.InputFormat)
ReflectionUtils.newInstance(Class.forName(inputFormatName), jobConf);
} catch (ClassNotFoundException e) {
throw new TezUncheckedException(e);
}
org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat groupedFormat =
new org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat();
groupedFormat.setConf(jobConf);
groupedFormat.setInputFormat(inputFormat);
groupedFormat.setDesiredNumberOfSplits(numTasks);
finalInputFormat = groupedFormat;
}
org.apache.hadoop.mapred.InputSplit[] splits = finalInputFormat
.getSplits(jobConf, jobConf.getNumMapTasks());
// sort the splits into order based on size, so that the biggest
// go first
Arrays.sort(splits, new OldInputSplitComparator());
return splits;
}
/**
* Generate old-api mapred InputFormat splits
* @param jobConf JobConf required by InputFormat class
* @param inputSplitDir Directory in which to generate splits information
*
* @return InputSplitInfo containing the split files' information and the
* number of splits generated, used to determine the parallelism of
* the map stage.
*
* @throws IOException
*/
private static InputSplitInfoDisk writeOldSplits(JobConf jobConf,
Path inputSplitDir) throws IOException {
org.apache.hadoop.mapred.InputSplit[] splits =
generateOldSplits(jobConf, null, 0);
JobSplitWriter.createSplitFiles(inputSplitDir, jobConf,
inputSplitDir.getFileSystem(jobConf), splits);
List<TaskLocationHint> locationHints =
new ArrayList<TaskLocationHint>(splits.length);
for (int i = 0; i < splits.length; ++i) {
locationHints.add(
new TaskLocationHint(new HashSet<String>(
Arrays.asList(splits[i].getLocations())), null));
}
return new InputSplitInfoDisk(
JobSubmissionFiles.getJobSplitFile(inputSplitDir),
JobSubmissionFiles.getJobSplitMetaFile(inputSplitDir),
splits.length, locationHints, jobConf.getCredentials());
}
/**
* Helper API to generate splits.
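*
* <p>A minimal usage sketch; the input format, input directory and staging
* path below are hypothetical examples:
* <pre>{@code
* Configuration conf = new Configuration();
* conf.setBoolean("mapred.mapper.new-api", true);
* conf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR,
*     "org.apache.hadoop.mapreduce.lib.input.TextInputFormat");
* conf.set("mapreduce.input.fileinputformat.inputdir", "/user/example/in");
* InputSplitInfoDisk splitInfo = MRHelpers.generateInputSplits(conf,
*     new Path("hdfs://namenode:8020/tmp/example/splits"));
* int numMapTasks = splitInfo.getNumTasks();
* }</pre>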
* @param conf Configuration with all necessary information set to generate
* splits. The following are required at a minimum:
*
* - mapred.mapper.new-api: determines whether mapred.InputFormat or
* mapreduce.InputFormat is to be used
* - mapred.input.format.class or mapreduce.job.inputformat.class:
* determines the InputFormat class to be used
*
* In addition to this, all the configs needed by the InputFormat class also
* have to be set. For example, FileInputFormat needs the input directory
* paths to be set in the config.
*
* @param inputSplitsDir Directory in which the splits file and meta info file
* will be generated. job.split and job.splitmetainfo files in this directory
* will be overwritten. Should be a fully-qualified path.
*
* @return InputSplitInfo containing the split files' information and the
* number of splits generated, used to determine the parallelism of
* the map stage.
*
* @throws IOException
* @throws InterruptedException
* @throws ClassNotFoundException
*/
public static InputSplitInfoDisk generateInputSplits(Configuration conf,
Path inputSplitsDir) throws IOException, InterruptedException,
ClassNotFoundException {
Job job = Job.getInstance(conf);
JobConf jobConf = new JobConf(conf);
conf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);
if (jobConf.getUseNewMapper()) {
LOG.info("Generating new input splits"
+ ", splitsDir=" + inputSplitsDir.toString());
return writeNewSplits(job, inputSplitsDir);
} else {
LOG.info("Generating old input splits"
+ ", splitsDir=" + inputSplitsDir.toString());
return writeOldSplits(jobConf, inputSplitsDir);
}
}
/**
* Generates Input splits and stores them in memory as an
* {@link InputSplitInfoMem} instance, from which a serialized
* {@link MRSplitsProto} can be obtained.
*
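* <p>A minimal sketch; assumes conf already carries a complete InputFormat
* configuration:
* <pre>{@code
* InputSplitInfoMem splitInfo = MRHelpers.generateInputSplitsToMem(conf);
* MRSplitsProto splitsProto = splitInfo.getSplitsProto();
* int numTasks = splitInfo.getNumTasks();
* }</pre>
*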
* @param conf
* an instance of Configuration which is used to determine whether
* the mapred or mapreduce API is being used. This Configuration
* instance should also contain adequate information to be able to
* generate splits - like the InputFormat being used and related
* configuration.
* @return an instance of {@link InputSplitInfoMem} which supports a subset of
* the APIs defined on {@link InputSplitInfo}
* @throws IOException
* @throws ClassNotFoundException
* @throws InterruptedException
*/
public static InputSplitInfoMem generateInputSplitsToMem(Configuration conf)
throws IOException, ClassNotFoundException, InterruptedException {
InputSplitInfoMem splitInfoMem = null;
JobConf jobConf = new JobConf(conf);
if (jobConf.getUseNewMapper()) {
LOG.info("Generating mapreduce api input splits");
Job job = Job.getInstance(conf);
org.apache.hadoop.mapreduce.InputSplit[] splits =
generateNewSplits(job, null, 0);
splitInfoMem = new InputSplitInfoMem(splits, createTaskLocationHintsFromSplits(splits),
splits.length, job.getCredentials(), job.getConfiguration());
} else {
LOG.info("Generating mapred api input splits");
org.apache.hadoop.mapred.InputSplit[] splits =
generateOldSplits(jobConf, null, 0);
splitInfoMem = new InputSplitInfoMem(splits, createTaskLocationHintsFromSplits(splits),
splits.length, jobConf.getCredentials(), jobConf);
}
LOG.info("NumSplits: " + splitInfoMem.getNumTasks() + ", SerializedSize: "
+ splitInfoMem.getSplitsProto().getSerializedSize());
return splitInfoMem;
}
@Private
public static <T extends org.apache.hadoop.mapreduce.InputSplit> MRSplitProto createSplitProto(
T newSplit, SerializationFactory serializationFactory)
throws IOException, InterruptedException {
MRSplitProto.Builder builder = MRSplitProto
.newBuilder();
builder.setSplitClassName(newSplit.getClass().getName());
@SuppressWarnings("unchecked")
Serializer<T> serializer = serializationFactory
.getSerializer((Class<T>) newSplit.getClass());
ByteString.Output out = ByteString
.newOutput(SPLIT_SERIALIZED_LENGTH_ESTIMATE);
serializer.open(out);
serializer.serialize(newSplit);
// TODO MR Compat: Check against max block locations per split.
ByteString splitBs = out.toByteString();
builder.setSplitBytes(splitBs);
return builder.build();
}
@Private
public static MRSplitProto createSplitProto(
org.apache.hadoop.mapred.InputSplit oldSplit) throws IOException {
MRSplitProto.Builder builder = MRSplitProto.newBuilder();
builder.setSplitClassName(oldSplit.getClass().getName());
ByteString.Output os = ByteString
.newOutput(SPLIT_SERIALIZED_LENGTH_ESTIMATE);
oldSplit.write(new DataOutputStream(os));
ByteString splitBs = os.toByteString();
builder.setSplitBytes(splitBs);
return builder.build();
}
private static String getChildLogLevel(Configuration conf, boolean isMap) {
if (isMap) {
return conf.get(
MRJobConfig.MAP_LOG_LEVEL,
JobConf.DEFAULT_LOG_LEVEL.toString()
);
} else {
return conf.get(
MRJobConfig.REDUCE_LOG_LEVEL,
JobConf.DEFAULT_LOG_LEVEL.toString()
);
}
}
private static String getLog4jCmdLineProperties(Configuration conf,
boolean isMap) {
List<String> logProps = new ArrayList<String>(4);
addLog4jSystemProperties(getChildLogLevel(conf, isMap), logProps);
StringBuilder sb = new StringBuilder();
for (String str : logProps) {
sb.append(str).append(" ");
}
return sb.toString();
}
/**
* Add the JVM system properties necessary to configure
* {@link ContainerLogAppender}.
*
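* <p>A usage sketch when assembling a container JVM command line
* (hypothetical argument list):
* <pre>{@code
* List<String> vargs = new ArrayList<String>();
* vargs.add("-Xmx512m");
* MRHelpers.addLog4jSystemProperties("INFO", vargs);
* }</pre>
*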
* @param logLevel
* the desired log level (e.g. INFO/WARN/DEBUG)
* @param vargs
* the argument list to append to
*/
public static void addLog4jSystemProperties(String logLevel,
List<String> vargs) {
vargs.add("-Dlog4j.configuration="
+ TezConfiguration.TEZ_CONTAINER_LOG4J_PROPERTIES_FILE);
vargs.add("-D" + YarnConfiguration.YARN_APP_CONTAINER_LOG_DIR + "="
+ ApplicationConstants.LOG_DIR_EXPANSION_VAR);
vargs.add("-D" + TezConfiguration.TEZ_ROOT_LOGGER_NAME + "=" + logLevel
+ "," + TezConfiguration.TEZ_CONTAINER_LOGGER_NAME);
}
/**
* Generate JVM options to be used to launch map tasks
*
* Uses mapreduce.admin.map.child.java.opts, mapreduce.map.java.opts and
* mapreduce.map.log.level from config to generate the opts.
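*
* <p>A usage sketch; assumes the Vertex#setJavaOpts API from the
* contemporaneous Tez DAG API when configuring the map-stage vertex:
* <pre>{@code
* mapVertex.setJavaOpts(MRHelpers.getMapJavaOpts(conf));
* }</pre>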
*
* @param conf Configuration to be used to extract JVM opts specific info
* @return JAVA_OPTS string to be used in launching the JVM
*/
@SuppressWarnings("deprecation")
public static String getMapJavaOpts(Configuration conf) {
String adminOpts = conf.get(
MRJobConfig.MAPRED_MAP_ADMIN_JAVA_OPTS,
MRJobConfig.DEFAULT_MAPRED_ADMIN_JAVA_OPTS);
String userOpts = conf.get(
MRJobConfig.MAP_JAVA_OPTS,
conf.get(
JobConf.MAPRED_TASK_JAVA_OPTS,
JobConf.DEFAULT_MAPRED_TASK_JAVA_OPTS));
return adminOpts.trim() + " " + userOpts.trim() + " "
+ getLog4jCmdLineProperties(conf, true);
}
/**
* Generate JVM options to be used to launch reduce tasks
*
* Uses mapreduce.admin.reduce.child.java.opts, mapreduce.reduce.java.opts
* and mapreduce.reduce.log.level from config to generate the opts.
*
* @param conf Configuration to be used to extract JVM opts specific info
* @return JAVA_OPTS string to be used in launching the JVM
*/
@SuppressWarnings("deprecation")
public static String getReduceJavaOpts(Configuration conf) {
String adminOpts = conf.get(
MRJobConfig.MAPRED_REDUCE_ADMIN_JAVA_OPTS,
MRJobConfig.DEFAULT_MAPRED_ADMIN_JAVA_OPTS);
String userOpts = conf.get(
MRJobConfig.REDUCE_JAVA_OPTS,
conf.get(
JobConf.MAPRED_TASK_JAVA_OPTS,
JobConf.DEFAULT_MAPRED_TASK_JAVA_OPTS));
return adminOpts.trim() + " " + userOpts.trim() + " "
+ getLog4jCmdLineProperties(conf, false);
}
/**
* Sets up parameters which used to be set by the MR JobClient. This includes
* setting whether to use the new API or the old API. Note: must be called
* before generating InputSplits.
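*
* <p>A sketch of typical use before split generation (hypothetical setup):
* <pre>{@code
* Configuration conf = new Configuration();
* // set mapper/reducer, input/output formats, paths, etc. on conf
* MRHelpers.doJobClientMagic(conf);
* InputSplitInfoDisk splitInfo =
*     MRHelpers.generateInputSplits(conf, inputSplitsDir);
* }</pre>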
*
* @param conf
* configuration for the vertex.
*/
public static void doJobClientMagic(Configuration conf) throws IOException {
setUseNewAPI(conf);
// TODO Maybe add functionality to check output specifications - e.g. fail
// early if the output directory exists.
InetAddress ip = InetAddress.getLocalHost();
if (ip != null) {
String submitHostAddress = ip.getHostAddress();
String submitHostName = ip.getHostName();
conf.set(MRJobConfig.JOB_SUBMITHOST, submitHostName);
conf.set(MRJobConfig.JOB_SUBMITHOSTADDR, submitHostAddress);
}
// conf.set("hadoop.http.filter.initializers",
// "org.apache.hadoop.yarn.server.webproxy.amfilter.AmFilterInitializer");
// Skipping setting JOB_DIR - not used by AM.
// Maybe generate SHUFFLE secret. The AM uses the job token generated in
// the AM anyway.
// TODO eventually ACLs
conf.set(TezJobConfig.TEZ_RUNTIME_PARTITIONER_CLASS, MRPartitioner.class.getName());
boolean useNewApi = conf.getBoolean("mapred.mapper.new-api", false);
if (useNewApi) {
if (conf.get(MRJobConfig.COMBINE_CLASS_ATTR) != null) {
conf.set(TezJobConfig.TEZ_RUNTIME_COMBINER_CLASS, MRCombiner.class.getName());
}
} else {
if (conf.get("mapred.combiner.class") != null) {
conf.set(TezJobConfig.TEZ_RUNTIME_COMBINER_CLASS, MRCombiner.class.getName());
}
}
setWorkingDirectory(conf);
}
private static void setWorkingDirectory(Configuration conf) {
String name = conf.get(JobContext.WORKING_DIR);
if (name == null) {
try {
Path dir = FileSystem.get(conf).getWorkingDirectory();
conf.set(JobContext.WORKING_DIR, dir.toString());
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
/**
* Default to the new APIs unless they are explicitly set or the old mapper or
* reducer attributes are used.
*
* @throws IOException
* if the configuration is inconsistent
*/
private static void setUseNewAPI(Configuration conf) throws IOException {
int numReduces = conf.getInt(MRJobConfig.NUM_REDUCES, 1);
String oldMapperClass = "mapred.mapper.class";
String oldReduceClass = "mapred.reducer.class";
conf.setBooleanIfUnset("mapred.mapper.new-api",
conf.get(oldMapperClass) == null);
if (conf.getBoolean("mapred.mapper.new-api", false)) {
String mode = "new map API";
ensureNotSet(conf, "mapred.input.format.class", mode);
ensureNotSet(conf, oldMapperClass, mode);
if (numReduces != 0) {
ensureNotSet(conf, "mapred.partitioner.class", mode);
} else {
ensureNotSet(conf, "mapred.output.format.class", mode);
}
} else {
String mode = "map compatability";
ensureNotSet(conf, MRJobConfig.INPUT_FORMAT_CLASS_ATTR, mode);
ensureNotSet(conf, MRJobConfig.MAP_CLASS_ATTR, mode);
if (numReduces != 0) {
ensureNotSet(conf, MRJobConfig.PARTITIONER_CLASS_ATTR, mode);
} else {
ensureNotSet(conf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
}
}
if (numReduces != 0) {
conf.setBooleanIfUnset("mapred.reducer.new-api",
conf.get(oldReduceClass) == null);
if (conf.getBoolean("mapred.reducer.new-api", false)) {
String mode = "new reduce API";
ensureNotSet(conf, "mapred.output.format.class", mode);
ensureNotSet(conf, oldReduceClass, mode);
} else {
String mode = "reduce compatability";
ensureNotSet(conf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
ensureNotSet(conf, MRJobConfig.REDUCE_CLASS_ATTR, mode);
}
}
}
private static void ensureNotSet(Configuration conf, String attr, String msg)
throws IOException {
if (conf.get(attr) != null) {
throw new IOException(attr + " is incompatible with " + msg + " mode.");
}
}
@LimitedPrivate("Hive, Pig")
@Unstable
public static byte[] createUserPayloadFromConf(Configuration conf)
throws IOException {
return TezUtils.createUserPayloadFromConf(conf);
}
@LimitedPrivate("Hive, Pig")
public static ByteString createByteStringFromConf(Configuration conf)
throws IOException {
return TezUtils.createByteStringFromConf(conf);
}
@LimitedPrivate("Hive, Pig")
@Unstable
public static Configuration createConfFromUserPayload(byte[] bb)
throws IOException {
return TezUtils.createConfFromUserPayload(bb);
}
@LimitedPrivate("Hive, Pig")
public static Configuration createConfFromByteString(ByteString bs)
throws IOException {
return TezUtils.createConfFromByteString(bs);
}
public static byte[] createMRInputPayload(byte[] configurationBytes,
MRSplitsProto mrSplitsProto) throws IOException {
Preconditions.checkArgument(configurationBytes != null,
"Configuration bytes must be specified");
return createMRInputPayload(ByteString
.copyFrom(configurationBytes), mrSplitsProto, null);
}
public static byte[] createMRInputPayload(Configuration conf,
MRSplitsProto mrSplitsProto) throws IOException {
Preconditions
.checkArgument(conf != null, "Configuration must be specified");
return createMRInputPayload(createByteStringFromConf(conf),
mrSplitsProto, null);
}
/**
* Called to specify that grouping of input splits be performed by Tez.
* The conf serialized into configurationBytes should have the input format
* class configuration set to the TezGroupedSplitsInputFormat. The real
* input format class name should be passed as an argument to this method.
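*
* <p>A sketch; the class names below are illustrative, with TextInputFormat
* standing in for the application's real input format:
* <pre>{@code
* conf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR,
*     "org.apache.hadoop.mapreduce.split.TezGroupedSplitsInputFormat");
* byte[] payload = MRHelpers.createMRInputPayloadWithGrouping(
*     MRHelpers.createUserPayloadFromConf(conf),
*     "org.apache.hadoop.mapreduce.lib.input.TextInputFormat");
* }</pre>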
*/
public static byte[] createMRInputPayloadWithGrouping(byte[] configurationBytes,
String inputFormatName) throws IOException {
Preconditions.checkArgument(configurationBytes != null,
"Configuration bytes must be specified");
Preconditions.checkArgument(inputFormatName != null,
"InputFormat must be specified");
return createMRInputPayload(ByteString
.copyFrom(configurationBytes), null, inputFormatName);
}
/**
* Called to specify that grouping of input splits be performed by Tez.
* The conf should have the input format class configuration set to the
* TezGroupedSplitsInputFormat. The real input format class name should be
* passed as an argument to this method.
*/
public static byte[] createMRInputPayloadWithGrouping(Configuration conf,
String inputFormatName) throws IOException {
Preconditions
.checkArgument(conf != null, "Configuration must be specified");
Preconditions.checkArgument(inputFormatName != null,
"InputFormat must be specified");
return createMRInputPayload(createByteStringFromConf(conf),
null, inputFormatName);
}
private static byte[] createMRInputPayload(ByteString bytes,
MRSplitsProto mrSplitsProto, String inputFormatName) throws IOException {
MRInputUserPayloadProto.Builder userPayloadBuilder = MRInputUserPayloadProto
.newBuilder();
userPayloadBuilder.setConfigurationBytes(bytes);
if (mrSplitsProto != null) {
userPayloadBuilder.setSplits(mrSplitsProto);
}
if (inputFormatName != null) {
userPayloadBuilder.setInputFormatName(inputFormatName);
}
// TODO Should this be a ByteBuffer or a byte array ? A ByteBuffer would be
// more efficient.
return userPayloadBuilder.build().toByteArray();
}
public static MRInputUserPayloadProto parseMRInputPayload(byte[] bytes)
throws IOException {
return MRInputUserPayloadProto.parseFrom(bytes);
}
/**
* Update provided localResources collection with the required local
* resources needed by MapReduce tasks with respect to Input splits.
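*
* <p>A sketch of typical use after generating splits on disk; the splitInfo
* variable is assumed to come from generateInputSplits:
* <pre>{@code
* Map<String, LocalResource> localResources =
*     new HashMap<String, LocalResource>();
* MRHelpers.updateLocalResourcesForInputSplits(
*     FileSystem.get(conf), splitInfo, localResources);
* }</pre>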
*
* @param fs Filesystem instance to access status of splits related files
* @param inputSplitInfo Information on location of split files
* @param localResources LocalResources collection to be updated
* @throws IOException
*/
public static void updateLocalResourcesForInputSplits(
FileSystem fs,
InputSplitInfo inputSplitInfo,
Map<String, LocalResource> localResources) throws IOException {
if (localResources.containsKey(JOB_SPLIT_RESOURCE_NAME)) {
throw new RuntimeException("LocalResources already contains a"
+ " resource named " + JOB_SPLIT_RESOURCE_NAME);
}
if (localResources.containsKey(JOB_SPLIT_METAINFO_RESOURCE_NAME)) {
throw new RuntimeException("LocalResources already contains a"
+ " resource named " + JOB_SPLIT_METAINFO_RESOURCE_NAME);
}
FileStatus splitFileStatus =
fs.getFileStatus(inputSplitInfo.getSplitsFile());
FileStatus metaInfoFileStatus =
fs.getFileStatus(inputSplitInfo.getSplitsMetaInfoFile());
localResources.put(JOB_SPLIT_RESOURCE_NAME,
LocalResource.newInstance(
ConverterUtils.getYarnUrlFromPath(inputSplitInfo.getSplitsFile()),
LocalResourceType.FILE,
LocalResourceVisibility.APPLICATION,
splitFileStatus.getLen(), splitFileStatus.getModificationTime()));
localResources.put(JOB_SPLIT_METAINFO_RESOURCE_NAME,
LocalResource.newInstance(
ConverterUtils.getYarnUrlFromPath(
inputSplitInfo.getSplitsMetaInfoFile()),
LocalResourceType.FILE,
LocalResourceVisibility.APPLICATION,
metaInfoFileStatus.getLen(),
metaInfoFileStatus.getModificationTime()));
}
/**
* Extract the map task's container resource requirements from the
* provided configuration.
*
* Uses mapreduce.map.memory.mb and mapreduce.map.cpu.vcores from the
* provided configuration.
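*
* <p>For example, a sketch of sizing map containers:
* <pre>{@code
* conf.setInt(MRJobConfig.MAP_MEMORY_MB, 1024);
* conf.setInt(MRJobConfig.MAP_CPU_VCORES, 1);
* Resource mapResource = MRHelpers.getMapResource(conf);
* }</pre>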
*
* @param conf Configuration with MR specific settings used to extract
* information from
*
* @return Resource object used to define requirements for containers
* running Map tasks
*/
public static Resource getMapResource(Configuration conf) {
return Resource.newInstance(conf.getInt(
MRJobConfig.MAP_MEMORY_MB, MRJobConfig.DEFAULT_MAP_MEMORY_MB),
conf.getInt(MRJobConfig.MAP_CPU_VCORES,
MRJobConfig.DEFAULT_MAP_CPU_VCORES));
}
/**
* Extract the reduce task's container resource requirements from the
* provided configuration.
*
* Uses mapreduce.reduce.memory.mb and mapreduce.reduce.cpu.vcores from the
* provided configuration.
*
* @param conf Configuration with MR specific settings used to extract
* information from
*
* @return Resource object used to define requirements for containers
* running Reduce tasks
*/
public static Resource getReduceResource(Configuration conf) {
return Resource.newInstance(conf.getInt(
MRJobConfig.REDUCE_MEMORY_MB, MRJobConfig.DEFAULT_REDUCE_MEMORY_MB),
conf.getInt(MRJobConfig.REDUCE_CPU_VCORES,
MRJobConfig.DEFAULT_REDUCE_CPU_VCORES));
}
/**
* Setup classpath and other environment variables
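*
* <p>A minimal sketch (hypothetical environment map):
* <pre>{@code
* Map<String, String> environment = new HashMap<String, String>();
* MRHelpers.updateEnvironmentForMRTasks(conf, environment, true); // map task
* }</pre>
*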
* @param conf Configuration to retrieve settings from
* @param environment Environment to update
* @param isMap Whether task is a map or reduce task
*/
public static void updateEnvironmentForMRTasks(Configuration conf,
Map<String, String> environment, boolean isMap) {
// Shell
environment.put(Environment.SHELL.name(), conf.get(
MRJobConfig.MAPRED_ADMIN_USER_SHELL, MRJobConfig.DEFAULT_SHELL));
// Add pwd to LD_LIBRARY_PATH, add this before adding anything else
TezYARNUtils.addToEnvironment(environment, Environment.LD_LIBRARY_PATH.name(),
Environment.PWD.$(), File.pathSeparator);
// Add the env variables passed by the admin
TezYARNUtils.setEnvFromInputString(environment, conf.get(
MRJobConfig.MAPRED_ADMIN_USER_ENV,
MRJobConfig.DEFAULT_MAPRED_ADMIN_USER_ENV),
File.pathSeparator);
// Add the env variables passed by the user
String mapredChildEnv = (isMap ?
conf.get(MRJobConfig.MAP_ENV, "")
: conf.get(MRJobConfig.REDUCE_ENV, ""));
TezYARNUtils.setEnvFromInputString(environment, mapredChildEnv, File.pathSeparator);
// Set logging level in the environment.
environment.put(
"HADOOP_ROOT_LOGGER",
getChildLogLevel(conf, isMap) + ",CLA");
}
private static Configuration getBaseJobConf(Configuration conf) {
if (conf != null) {
return new JobConf(conf);
} else {
return new JobConf();
}
}
/**
* Get a default JobConf-based configuration, initialized from the provided conf
* @param conf Conf to initialize JobConf with.
* @return Base configuration for MR-based jobs
*/
public static Configuration getBaseMRConfiguration(Configuration conf) {
return getBaseJobConf(conf);
}
/**
* Get a default JobConf-based configuration
* @return Base configuration for MR-based jobs
*/
public static Configuration getBaseMRConfiguration() {
return getBaseJobConf(null);
}
/**
* Setup environment for the AM based on MR-based configuration
* @param conf Configuration from which to extract information
* @param environment Environment map to update
*/
public static void updateEnvironmentForMRAM(Configuration conf, Map<String, String> environment) {
TezYARNUtils.setEnvFromInputString(environment, conf.get(MRJobConfig.MR_AM_ADMIN_USER_ENV),
File.pathSeparator);
TezYARNUtils.setEnvFromInputString(environment, conf.get(MRJobConfig.MR_AM_ENV),
File.pathSeparator);
}
/**
* Extract Java Opts for the AM based on MR-based configuration
* @param conf Configuration from which to extract information
* @return Java opts for the AM
*/
public static String getMRAMJavaOpts(Configuration conf) {
// Admin opts
String mrAppMasterAdminOptions = conf.get(
MRJobConfig.MR_AM_ADMIN_COMMAND_OPTS,
MRJobConfig.DEFAULT_MR_AM_ADMIN_COMMAND_OPTS);
// Add AM user command opts
String mrAppMasterUserOptions = conf.get(MRJobConfig.MR_AM_COMMAND_OPTS,
MRJobConfig.DEFAULT_MR_AM_COMMAND_OPTS);
return mrAppMasterAdminOptions.trim()
+ " " + mrAppMasterUserOptions.trim();
}
/**
* Convenience method to add an MR Input to the specified vertex. The name of
* the Input is "MRInput".
* <p>
* This should only be called for one vertex in a DAG.
*
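* <p>A sketch; the initializer below is a placeholder, substitute the
* application's real TezRootInputInitializer implementation:
* <pre>{@code
* byte[] payload = MRHelpers.createMRInputPayload(conf, null);
* MRHelpers.addMRInput(mapVertex, payload, MyInputInitializer.class);
* }</pre>
*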
* @param vertex the vertex to which the Input is added
* @param userPayload the serialized user payload for the Input
* @param initClazz class used to initialize the Input in the AM
*/
public static void addMRInput(Vertex vertex, byte[] userPayload,
Class<? extends TezRootInputInitializer> initClazz) {
InputDescriptor id = new InputDescriptor(MRInputLegacy.class.getName())
.setUserPayload(userPayload);
vertex.addInput("MRInput", id, initClazz);
}
/**
* Convenience method to add an MR Output to the specified vertex. The name of
* the Output is "MROutput".
* <p>
* This should only be called for one vertex in a DAG.
*
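* <p>A sketch; assumes the Output is configured entirely through the conf
* serialized into the payload:
* <pre>{@code
* byte[] payload = MRHelpers.createUserPayloadFromConf(conf);
* MRHelpers.addMROutput(finalVertex, payload);
* }</pre>
*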
* @param vertex the vertex to which the Output is added
* @param userPayload the serialized user payload for the Output
*/
public static void addMROutput(Vertex vertex, byte[] userPayload) {
OutputDescriptor od = new OutputDescriptor(MROutput.class.getName())
.setUserPayload(userPayload);
vertex.addOutput("MROutput", od, MROutputCommitter.class);
}
@Private
public static void addMROutputLegacy(Vertex vertex, byte[] userPayload) {
OutputDescriptor od = new OutputDescriptor(MROutputLegacy.class.getName())
.setUserPayload(userPayload);
vertex.addOutput("MROutput", od, MROutputCommitter.class);
}
@SuppressWarnings("unchecked")
public static InputSplit createOldFormatSplitFromUserPayload(
MRSplitProto splitProto, SerializationFactory serializationFactory)
throws IOException {
// This may not need to use serialization factory, since OldFormat
// always uses Writable to write splits.
Preconditions.checkNotNull(splitProto, "splitProto cannot be null");
String className = splitProto.getSplitClassName();
Class<InputSplit> clazz;
try {
clazz = (Class<InputSplit>) Class.forName(className);
} catch (ClassNotFoundException e) {
throw new IOException("Failed to load InputSplit class: [" + className + "]", e);
}
Deserializer<InputSplit> deserializer = serializationFactory
.getDeserializer(clazz);
deserializer.open(splitProto.getSplitBytes().newInput());
InputSplit inputSplit = deserializer.deserialize(null);
deserializer.close();
return inputSplit;
}
@SuppressWarnings("unchecked")
public static org.apache.hadoop.mapreduce.InputSplit createNewFormatSplitFromUserPayload(
MRSplitProto splitProto, SerializationFactory serializationFactory)
throws IOException {
Preconditions.checkNotNull(splitProto, "splitProto must be specified");
String className = splitProto.getSplitClassName();
Class<org.apache.hadoop.mapreduce.InputSplit> clazz;
try {
clazz = (Class<org.apache.hadoop.mapreduce.InputSplit>) Class
.forName(className);
} catch (ClassNotFoundException e) {
throw new IOException("Failed to load InputSplit class: [" + className + "]", e);
}
Deserializer<org.apache.hadoop.mapreduce.InputSplit> deserializer = serializationFactory
.getDeserializer(clazz);
deserializer.open(splitProto.getSplitBytes().newInput());
org.apache.hadoop.mapreduce.InputSplit inputSplit = deserializer
.deserialize(null);
deserializer.close();
return inputSplit;
}
private static List<TaskLocationHint> createTaskLocationHintsFromSplits(
org.apache.hadoop.mapreduce.InputSplit[] newFormatSplits) {
Iterable<TaskLocationHint> iterable = Iterables.transform(Arrays.asList(newFormatSplits),
new Function<org.apache.hadoop.mapreduce.InputSplit, TaskLocationHint>() {
@Override
public TaskLocationHint apply(org.apache.hadoop.mapreduce.InputSplit input) {
try {
return new TaskLocationHint(new HashSet<String>(Arrays.asList(input.getLocations())),
null);
} catch (IOException e) {
throw new RuntimeException(e);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
});
return Lists.newArrayList(iterable);
}
private static List<TaskLocationHint> createTaskLocationHintsFromSplits(
org.apache.hadoop.mapred.InputSplit[] oldFormatSplits) {
Iterable<TaskLocationHint> iterable = Iterables.transform(Arrays.asList(oldFormatSplits),
new Function<org.apache.hadoop.mapred.InputSplit, TaskLocationHint>() {
@Override
public TaskLocationHint apply(org.apache.hadoop.mapred.InputSplit input) {
try {
return new TaskLocationHint(new HashSet<String>(Arrays.asList(input.getLocations())),
null);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
});
return Lists.newArrayList(iterable);
}
/**
* Merge tokens from a configured binary file into the provided Credentials object.
* Uses the "mapreduce.job.credentials.binary" property to find the location of the token file.
* @param creds Credentials object to add new tokens to
* @param conf Configuration containing the location of the token file.
*
* TezClient reads credentials from the property TezJobConfig.TEZ_CREDENTIALS_PATH. This method
* is not required if that property is set.
*
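* <p>A sketch:
* <pre>{@code
* Credentials creds = new Credentials();
* conf.set(MRJobConfig.MAPREDUCE_JOB_CREDENTIALS_BINARY,
*     "/path/to/tokens.bin");
* MRHelpers.mergeMRBinaryTokens(creds, conf);
* }</pre>
*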
* @throws IOException
*/
public static void mergeMRBinaryTokens(Credentials creds,
Configuration conf) throws IOException {
String tokenFilePath = conf.get(MRJobConfig.MAPREDUCE_JOB_CREDENTIALS_BINARY);
if (tokenFilePath == null || tokenFilePath.isEmpty()) {
return;
}
TokenCache.mergeBinaryTokens(creds, conf, tokenFilePath);
}
}