Package org.apache.mahout.common

Source Code of org.apache.mahout.common.AbstractJob

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.mahout.common;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;

import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.lucene.AnalyzerUtils;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
* <p>Superclass of many Mahout Hadoop "jobs". A job drives configuration and launch of one or
* more maps and reduces in order to accomplish some task.</p>
*
* <p>Command line arguments available to all subclasses are:</p>
*
* <ul>
<li>--tempDir (path): Specifies a directory where the job may place temp files
*   (default "temp")</li>
<li>--help: Show help message</li>
* </ul>
*
* <p>In addition, note some key command line parameters that are parsed by Hadoop, which jobs
* may need to set:</p>
*
* <ul>
<li>-Dmapred.job.name=(name): Sets the Hadoop task names. It will be suffixed by
*    the mapper and reducer class names</li>
<li>-Dmapred.output.compress={true,false}: Compress final output (default true)</li>
<li>-Dmapred.input.dir=(path): input file, or directory containing input files (required)</li>
<li>-Dmapred.output.dir=(path): path to write output files (required)</li>
* </ul>
*
* <p>Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other
* arguments.</p>
*/
public abstract class AbstractJob extends Configured implements Tool {

  private static final Logger log = LoggerFactory.getLogger(AbstractJob.class);

  /** option used to specify the input path */
  private Option inputOption;

  /** option used to specify the output path */
  private Option outputOption;

  /** input path, populated by {@link #parseArguments(String[])} */
  protected Path inputPath;
  protected File inputFile; //the input represented as a file

  /** output path, populated by {@link #parseArguments(String[])} */
  protected Path outputPath;
  protected File outputFile; //the output represented as a file

  /** temp path, populated by {@link #parseArguments(String[])} */
  protected Path tempPath;

  protected Map<String, List<String>> argMap;

  /** internal list of options that have been added */
  private final List<Option> options;
  private Group group;

  protected AbstractJob() {
    options = Lists.newLinkedList();
  }

  /** Returns the input path established by a call to {@link #parseArguments(String[])}.
   *  The source of the path may be an input option added using {@link #addInputOption()}
   *  or it may be the value of the {@code mapred.input.dir} configuration
   *  property.
   */
  protected Path getInputPath() {
    return inputPath;
  }

  /** Returns the output path established by a call to {@link #parseArguments(String[])}.
   *  The source of the path may be an output option added using {@link #addOutputOption()}
   *  or it may be the value of the {@code mapred.input.dir} configuration
   *  property.
   */
  protected Path getOutputPath() {
    return outputPath;
  }

  protected Path getOutputPath(String path) {
    return new Path(outputPath, path);
  }

  protected File getInputFile() {
    return inputFile;
  }

  protected File getOutputFile() {
    return outputFile;
  }


  protected Path getTempPath() {
    return tempPath;
  }

  protected Path getTempPath(String directory) {
    return new Path(tempPath, directory);
  }
 
  @Override
  public Configuration getConf() {
    Configuration result = super.getConf();
    if (result == null) {
      return new Configuration();
    }
    return result;
  }

  /** Add an option with no argument whose presence can be checked for using
   *  {@code containsKey} method on the map returned by {@link #parseArguments(String[])};
   */
  protected void addFlag(String name, String shortName, String description) {
    options.add(buildOption(name, shortName, description, false, false, null));
  }

  /** Add an option to the the set of options this job will parse when
   *  {@link #parseArguments(String[])} is called. This options has an argument
   *  with null as its default value.
   */
  protected void addOption(String name, String shortName, String description) {
    options.add(buildOption(name, shortName, description, true, false, null));
  }

  /** Add an option to the the set of options this job will parse when
   *  {@link #parseArguments(String[])} is called.
   *
   * @param required if true the {@link #parseArguments(String[])} will throw
   *    fail with an error and usage message if this option is not specified
   *    on the command line.
   */
  protected void addOption(String name, String shortName, String description, boolean required) {
    options.add(buildOption(name, shortName, description, true, required, null));
  }

  /** Add an option to the the set of options this job will parse when
   *  {@link #parseArguments(String[])} is called. If this option is not
   *  specified on the command line the default value will be
   *  used.
   *
   * @param defaultValue the default argument value if this argument is not
   *   found on the command-line. null is allowed.
   */
  protected void addOption(String name, String shortName, String description, String defaultValue) {
    options.add(buildOption(name, shortName, description, true, false, defaultValue));
  }

  /** Add an arbitrary option to the set of options this job will parse when
   *  {@link #parseArguments(String[])} is called. If this option has no
   *  argument, use {@code containsKey} on the map returned by
   *  {@code parseArguments} to check for its presence. Otherwise, the
   *  string value of the option will be placed in the map using a key
   *  equal to this options long name preceded by '--'.
   * @return the option added.
   */
  protected Option addOption(Option option) {
    options.add(option);
    return option;
  }

  protected Group getGroup() {
    return group;
  }

  /** Add the default input directory option, '-i' which takes a directory
   *  name as an argument. When {@link #parseArguments(String[])} is
   *  called, the inputPath will be set based upon the value for this option.
   *  If this method is called, the input is required.
   */
  protected void addInputOption() {
    this.inputOption = addOption(DefaultOptionCreator.inputOption().create());
  }

  /** Add the default output directory option, '-o' which takes a directory
   *  name as an argument. When {@link #parseArguments(String[])} is
   *  called, the outputPath will be set based upon the value for this option.
   *  If this method is called, the output is required.
   */
  protected void addOutputOption() {
    this.outputOption = addOption(DefaultOptionCreator.outputOption().create());
  }

  /** Build an option with the given parameters. Name and description are
   *  required.
   *
   * @param name the long name of the option prefixed with '--' on the command-line
   * @param shortName the short name of the option, prefixed with '-' on the command-line
   * @param description description of the option displayed in help method
   * @param hasArg true if the option has an argument.
   * @param required true if the option is required.
   * @param defaultValue default argument value, can be null.
   * @return the option.
   */
  protected static Option buildOption(String name,
                                      String shortName,
                                      String description,
                                      boolean hasArg,
                                      boolean required,
                                      String defaultValue) {

    return buildOption(name, shortName, description, hasArg, 1, 1, required, defaultValue);
  }

  protected static Option buildOption(String name,
                                      String shortName,
                                      String description,
                                      boolean hasArg, int min, int max,
                                      boolean required,
                                      String defaultValue) {

    DefaultOptionBuilder optBuilder = new DefaultOptionBuilder().withLongName(name).withDescription(description)
        .withRequired(required);

    if (shortName != null) {
      optBuilder.withShortName(shortName);
    }

    if (hasArg) {
      ArgumentBuilder argBuilder = new ArgumentBuilder().withName(name).withMinimum(min).withMaximum(max);

      if (defaultValue != null) {
        argBuilder = argBuilder.withDefault(defaultValue);
      }

      optBuilder.withArgument(argBuilder.create());
    }

    return optBuilder.create();
  }

  /**
   * @param name The name of the option
   * @return the {@link org.apache.commons.cli2.Option} with the name, else null
   */
  protected Option getCLIOption(String name) {
    for (Option option : options) {
      if (option.getPreferredName().equals(name)) {
        return option;
      }
    }
    return null;
  }

  /** Parse the arguments specified based on the options defined using the
   *  various {@code addOption} methods. If -h is specified or an
   *  exception is encountered print help and return null. Has the
   *  side effect of setting inputPath and outputPath
   *  if {@code addInputOption} or {@code addOutputOption}
   *  or {@code mapred.input.dir} or {@code mapred.output.dir}
   *  are present in the Configuration.
   *
   * @return a {@code Map<String,String>} containing options and their argument values.
   *  The presence of a flag can be tested using {@code containsKey}, while
   *  argument values can be retrieved using {@code get(optionName)}. The
   *  names used for keys are the option name parameter prefixed by '--'.
   *
   * @see #parseArguments(String[], boolean, boolean)  -- passes in false, false for the optional args.
   */
  public Map<String, List<String>> parseArguments(String[] args) throws IOException {
    return parseArguments(args, false, false);
  }

  /**
   *
   * @param args  The args to parse
   * @param inputOptional if false, then the input option, if set, need not be present.  If true and input is an option
   *                      and there is no input, then throw an error
   * @param outputOptional if false, then the output option, if set, need not be present.  If true and output is an
   *                       option and there is no output, then throw an error
   * @return the args parsed into a map.
   */
  public Map<String, List<String>> parseArguments(String[] args, boolean inputOptional, boolean outputOptional)
    throws IOException {
    Option helpOpt = addOption(DefaultOptionCreator.helpOption());
    addOption("tempDir", null, "Intermediate output directory", "temp");
    addOption("startPhase", null, "First phase to run", "0");
    addOption("endPhase", null, "Last phase to run", String.valueOf(Integer.MAX_VALUE));

    GroupBuilder gBuilder = new GroupBuilder().withName("Job-Specific Options:");

    for (Option opt : options) {
      gBuilder = gBuilder.withOption(opt);
    }

    group = gBuilder.create();

    CommandLine cmdLine;
    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      parser.setHelpOption(helpOpt);
      cmdLine = parser.parse(args);

    } catch (OptionException e) {
      log.error(e.getMessage());
      CommandLineUtil.printHelpWithGenericOptions(group, e);
      return null;
    }

    if (cmdLine.hasOption(helpOpt)) {
      CommandLineUtil.printHelpWithGenericOptions(group);
      return null;
    }

    try {
      parseDirectories(cmdLine, inputOptional, outputOptional);
    } catch (IllegalArgumentException e) {
      log.error(e.getMessage());
      CommandLineUtil.printHelpWithGenericOptions(group);
      return null;
    }

    argMap = new TreeMap<String, List<String>>();
    maybePut(argMap, cmdLine, this.options.toArray(new Option[this.options.size()]));

    this.tempPath = new Path(getOption("tempDir"));

    if (!hasOption("quiet")) {
      log.info("Command line arguments: {}", argMap);
    }
    return argMap;
  }
 
  /**
   * Build the option key (--name) from the option name
   */
  public static String keyFor(String optionName) {
    return "--" + optionName;
  }

  /**
   * @return the requested option, or null if it has not been specified
   */
  public String getOption(String optionName) {
    List<String> list = argMap.get(keyFor(optionName));
    if (list != null && !list.isEmpty()) {
      return list.get(0);
    }
    return null;
  }

  /**
   * Get the option, else the default
   * @param optionName The name of the option to look up, without the --
   * @param defaultVal The default value.
   * @return The requested option, else the default value if it doesn't exist
   */
  public String getOption(String optionName, String defaultVal) {
    String res = getOption(optionName);
    if (res == null) {
      res = defaultVal;
    }
    return res;
  }

  public int getInt(String optionName) {
    return Integer.parseInt(getOption(optionName));
  }

  public int getInt(String optionName, int defaultVal) {
    return Integer.parseInt(getOption(optionName, String.valueOf(defaultVal)));
  }

  public float getFloat(String optionName) {
    return Float.parseFloat(getOption(optionName));
  }

  public float getFloat(String optionName, float defaultVal) {
    return Float.parseFloat(getOption(optionName, String.valueOf(defaultVal)));
  }

  /**
   * Options can occur multiple times, so return the list
   * @param optionName The unadorned (no "--" prefixing it) option name
   * @return The values, else null.  If the option is present, but has no values, then the result will be an
   * empty list (Collections.emptyList())
   */
  public List<String> getOptions(String optionName) {
    return argMap.get(keyFor(optionName));
  }

  /**
   * @return if the requested option has been specified
   */
  public boolean hasOption(String optionName) {
    return argMap.containsKey(keyFor(optionName));
  }


  /**
   * Get the cardinality of the input vectors
   *
   * @param matrix
   * @return the cardinality of the vector
   */
  public int getDimensions(Path matrix) throws IOException {

    SequenceFile.Reader reader = null;
    try {
      reader = new SequenceFile.Reader(FileSystem.get(getConf()), matrix, getConf());

      Writable row = ClassUtils.instantiateAs(reader.getKeyClass().asSubclass(Writable.class), Writable.class);

      Preconditions.checkArgument(reader.getValueClass().equals(VectorWritable.class),
          "value type of sequencefile must be a VectorWritable");

      VectorWritable vectorWritable = new VectorWritable();
      boolean hasAtLeastOneRow = reader.next(row, vectorWritable);
      Preconditions.checkState(hasAtLeastOneRow, "matrix must have at least one row");

      return vectorWritable.get().size();

    } finally {
      Closeables.close(reader, true);
    }
  }

  /**
   * Obtain input and output directories from command-line options or hadoop
   *  properties. If {@code addInputOption} or {@code addOutputOption}
   *  has been called, this method will throw an {@code OptionException} if
   *  no source (command-line or property) for that value is present.
   *  Otherwise, {@code inputPath} or {@code outputPath} will be
   *  non-null only if specified as a hadoop property. Command-line options
   *  take precedence over hadoop properties.
   *
   * @throws IllegalArgumentException if either inputOption is present,
   *   and neither {@code --input} nor {@code -Dmapred.input dir} are
   *   specified or outputOption is present and neither {@code --output}
   *   nor {@code -Dmapred.output.dir} are specified.
   */
  protected void parseDirectories(CommandLine cmdLine, boolean inputOptional, boolean outputOptional) {

    Configuration conf = getConf();

    if (inputOption != null && cmdLine.hasOption(inputOption)) {
      this.inputPath = new Path(cmdLine.getValue(inputOption).toString());
      this.inputFile = new File(cmdLine.getValue(inputOption).toString());
    }
    if (inputPath == null && conf.get("mapred.input.dir") != null) {
      this.inputPath = new Path(conf.get("mapred.input.dir"));
    }

    if (outputOption != null && cmdLine.hasOption(outputOption)) {
      this.outputPath = new Path(cmdLine.getValue(outputOption).toString());
      this.outputFile = new File(cmdLine.getValue(outputOption).toString());
    }
    if (outputPath == null && conf.get("mapred.output.dir") != null) {
      this.outputPath = new Path(conf.get("mapred.output.dir"));
    }

    Preconditions.checkArgument(inputOptional || inputOption == null || inputPath != null,
        "No input specified or -Dmapred.input.dir must be provided to specify input directory");
    Preconditions.checkArgument(outputOptional || outputOption == null || outputPath != null,
        "No output specified:  or -Dmapred.output.dir must be provided to specify output directory");
  }

  protected static void maybePut(Map<String, List<String>> args, CommandLine cmdLine, Option... opt) {
    for (Option o : opt) {

      // the option appeared on the command-line, or it has a value
      // (which is likely a default value).
      if (cmdLine.hasOption(o) || cmdLine.getValue(o) != null
          || (cmdLine.getValues(o) != null && !cmdLine.getValues(o).isEmpty())) {

        // nulls are ok, for cases where options are simple flags.
        List<?> vo = cmdLine.getValues(o);
        if (vo != null && !vo.isEmpty()) {
          List<String> vals = Lists.newArrayList();
          for (Object o1 : vo) {
            vals.add(o1.toString());
          }
          args.put(o.getPreferredName(), vals);
        } else {
          args.put(o.getPreferredName(), null);
        }
      }
    }
  }

  /**
   *
   * @param args The input argument map
   * @param optName The adorned (including "--") option name
   * @return The first value in the match, else null
   */
  public static String getOption(Map<String, List<String>> args, String optName) {
    List<String> res = args.get(optName);
    if (res != null && !res.isEmpty()) {
      return res.get(0);
    }
    return null;
  }


  protected static boolean shouldRunNextPhase(Map<String, List<String>> args, AtomicInteger currentPhase) {
    int phase = currentPhase.getAndIncrement();
    String startPhase = getOption(args, "--startPhase");
    String endPhase = getOption(args, "--endPhase");
    boolean phaseSkipped = (startPhase != null && phase < Integer.parseInt(startPhase))
        || (endPhase != null && phase > Integer.parseInt(endPhase));
    if (phaseSkipped) {
      log.info("Skipping phase {}", phase);
    }
    return !phaseSkipped;
  }

  protected Job prepareJob(Path inputPath,
                           Path outputPath,
                           Class<? extends InputFormat> inputFormat,
                           Class<? extends Mapper> mapper,
                           Class<? extends Writable> mapperKey,
                           Class<? extends Writable> mapperValue,
                           Class<? extends OutputFormat> outputFormat) throws IOException {
    return prepareJob(inputPath, outputPath, inputFormat, mapper, mapperKey, mapperValue, outputFormat, null);

  }
  protected Job prepareJob(Path inputPath,
                           Path outputPath,
                           Class<? extends InputFormat> inputFormat,
                           Class<? extends Mapper> mapper,
                           Class<? extends Writable> mapperKey,
                           Class<? extends Writable> mapperValue,
                           Class<? extends OutputFormat> outputFormat,
                           String jobname) throws IOException {

    Job job = HadoopUtil.prepareJob(inputPath, outputPath,
            inputFormat, mapper, mapperKey, mapperValue, outputFormat, getConf());

    String name =
        jobname != null ? jobname : HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class);

    job.setJobName(name);
    return job;

  }

  protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends Mapper> mapper,
      Class<? extends Writable> mapperKey, Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
      Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue) throws IOException {
    return prepareJob(inputPath, outputPath, SequenceFileInputFormat.class, mapper, mapperKey, mapperValue, reducer,
        reducerKey, reducerValue, SequenceFileOutputFormat.class);
  }

  protected Job prepareJob(Path inputPath,
                           Path outputPath,
                           Class<? extends InputFormat> inputFormat,
                           Class<? extends Mapper> mapper,
                           Class<? extends Writable> mapperKey,
                           Class<? extends Writable> mapperValue,
                           Class<? extends Reducer> reducer,
                           Class<? extends Writable> reducerKey,
                           Class<? extends Writable> reducerValue,
                           Class<? extends OutputFormat> outputFormat) throws IOException {
    Job job = HadoopUtil.prepareJob(inputPath, outputPath,
            inputFormat, mapper, mapperKey, mapperValue, reducer, reducerKey, reducerValue, outputFormat, getConf());
    job.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class));
    return job;
  }

  /**
   * necessary to make this job (having a combined input path) work on Amazon S3, hopefully this is
   * obsolete when MultipleInputs is available again
   */
  public static void setS3SafeCombinedInputPath(Job job, Path referencePath, Path inputPathOne, Path inputPathTwo)
    throws IOException {
    FileSystem fs = FileSystem.get(referencePath.toUri(), job.getConfiguration());
    FileInputFormat.setInputPaths(job, inputPathOne.makeQualified(fs), inputPathTwo.makeQualified(fs));
  }

  protected Class<? extends Analyzer> getAnalyzerClassFromOption() throws ClassNotFoundException {
    Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
    if (hasOption(DefaultOptionCreator.ANALYZER_NAME_OPTION)) {
      String className = getOption(DefaultOptionCreator.ANALYZER_NAME_OPTION);
      analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
      // try instantiating it, b/c there isn't any point in setting it if
      // you can't instantiate it
      //ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
      AnalyzerUtils.createAnalyzer(analyzerClass);
    }
    return analyzerClass;
  }
 
  /**
   * Overrides the base implementation to install the Oozie action configuration resource
   * into the provided Configuration object; note that ToolRunner calls setConf on the Tool
   * before it invokes run.
   */
  @Override
  public void setConf(Configuration conf) {
    super.setConf(conf);
     
    // If running in an Oozie workflow as a Java action, need to add the
    // Configuration resource provided by Oozie to this job's config.
    String oozieActionConfXml = System.getProperty("oozie.action.conf.xml");
    if (oozieActionConfXml != null && conf != null) {
      conf.addResource(new Path("file:///", oozieActionConfXml));
      log.info("Added Oozie action Configuration resource {} to the Hadoop Configuration", oozieActionConfXml);
    }     
  }
}
TOP

Related Classes of org.apache.mahout.common.AbstractJob

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.