Package com.datasalt.pangool.tuplemr

Source Code of com.datasalt.pangool.tuplemr.MapOnlyJobBuilder

/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datasalt.pangool.tuplemr;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import com.datasalt.pangool.utils.InstancesDistributor;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.tuplemr.MultipleInputsInterface.Input;
import com.datasalt.pangool.tuplemr.NamedOutputsInterface.Output;
import com.datasalt.pangool.tuplemr.mapred.MapOnlyMapper;
import com.datasalt.pangool.tuplemr.mapred.lib.output.ProxyOutputFormat;
import com.datasalt.pangool.tuplemr.mapred.lib.output.TupleOutputFormat;

/**
* The MapOnlyJobBuilder is a simple Pangool primitive that executes map-only Jobs. You must implement
* {@link MapOnlyMapper} for using it.
*/
@SuppressWarnings("rawtypes")
public class MapOnlyJobBuilder {

  private Configuration conf;

  private Class<?> jarByClass;
  private Class<?> outputKeyClass;
  private Class<?> outputValueClass;
  private OutputFormat outputFormat;

  private Path outputPath;

  private MultipleInputsInterface multipleInputs;
  private NamedOutputsInterface namedOutputs;

  private MapOnlyMapper mapOnlyMapper;
  private String jobName = null;

  private List<String> instanceFilesCreated = new ArrayList<String>();
 
  public MapOnlyJobBuilder setJarByClass(Class<?> jarByClass) {
    this.jarByClass = jarByClass;
    return this;
  }

  /**
   * Deprecated. Use {@link #addInput(org.apache.hadoop.fs.Path, org.apache.hadoop.mapreduce.InputFormat, com.datasalt.pangool.tuplemr.mapred.MapOnlyMapper)}
   * instead.
   */
  @Deprecated
  public MapOnlyJobBuilder addInput(Path path, InputFormat inputFormat) {
    multipleInputs.getMultiInputs().add(new Input(path, inputFormat, null));
    return this;
  }

  public MapOnlyJobBuilder addInput(Path path, InputFormat inputFormat, MapOnlyMapper processor) {
    multipleInputs.getMultiInputs().add(new Input(path, inputFormat, processor));
    return this;
  }

  public void addNamedOutput(String namedOutput, OutputFormat outputFormat, Class keyClass,
                             Class valueClass) throws TupleMRException {
    addNamedOutput(namedOutput, outputFormat, keyClass, valueClass, null);
  }

  public void addNamedOutput(String namedOutput, OutputFormat outputFormat, Class keyClass,
                             Class valueClass, Map<String, String> specificContext) throws TupleMRException {
    namedOutputs.add(new Output(namedOutput, outputFormat, keyClass, valueClass, specificContext));
  }

  public void addNamedTupleOutput(String namedOutput, Schema outputSchema) throws TupleMRException {
    Output output = new Output(namedOutput, new TupleOutputFormat(outputSchema),
        ITuple.class, NullWritable.class, null);
    namedOutputs.add(output);
  }

  public MapOnlyJobBuilder setTupleOutput(Path outputPath, Schema schema) {
    this.outputPath = outputPath;
    this.outputFormat = new TupleOutputFormat(schema);
    this.outputKeyClass = ITuple.class;
    this.outputValueClass = NullWritable.class;
    return this;
  }

  public MapOnlyJobBuilder setOutput(Path outputPath, OutputFormat outputFormat,
                                     Class<?> outputKeyClass, Class<?> outputValueClass) {
    this.outputFormat = outputFormat;
    this.outputKeyClass = outputKeyClass;
    this.outputValueClass = outputValueClass;
    this.outputPath = outputPath;
    return this;
  }

  /**
   * Deprecated. Use {@link #addInput(org.apache.hadoop.fs.Path, org.apache.hadoop.mapreduce.InputFormat, com.datasalt.pangool.tuplemr.mapred.MapOnlyMapper)}
   * instead.
   */
  @Deprecated
  public MapOnlyJobBuilder setMapper(MapOnlyMapper mapOnlyMapper) {
    this.mapOnlyMapper = mapOnlyMapper;
    return this;
  }

  public MapOnlyJobBuilder(Configuration conf) {
    this(conf, null);
  }

  public MapOnlyJobBuilder(Configuration conf, String jobName) {
    this.conf = conf;
    this.jobName = jobName;
    this.multipleInputs = new MultipleInputsInterface(conf);
    this.namedOutputs = new NamedOutputsInterface(conf);
  }

  /**
   * Run this method after running your Job for instance files to be properly cleaned.
   * @throws IOException
   */
  public void cleanUpInstanceFiles() throws IOException {
    for(String instanceFile: instanceFilesCreated) {
      InstancesDistributor.removeFromCache(conf, instanceFile);
    }
  }
 
  public Job createJob() throws IOException, TupleMRException, URISyntaxException {
    // perform a deep copy of the configuration
    this.conf = new Configuration(this.conf);

    String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat";
    try {
      InstancesDistributor.distribute(outputFormat, uniqueName, conf);
      instanceFilesCreated.add(uniqueName);
    } catch (URISyntaxException e1) {
      throw new TupleMRException(e1);
    }
   
    Job job;
    if (jobName == null) {
      job = new Job(conf);
    } else {
      job = new Job(conf, jobName);
    }
    job.setNumReduceTasks(0);
   
    job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName);
    job.setOutputFormatClass(ProxyOutputFormat.class);

    if (outputKeyClass == null) {
      throw new TupleMRException("Output spec must be defined, use setOutput()");
    }
    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);
    FileOutputFormat.setOutputPath(job, outputPath);

    Input lastInput = null;

    for (Input input : multipleInputs.getMultiInputs()) {
      if (input.inputProcessor == null) {
        input.inputProcessor = mapOnlyMapper;
        if (input.inputProcessor == null) {
          throw new TupleMRException("Either mapOnlyMapper property or full Input spec must be set.");
        }
      }
      lastInput = input;
    }

    if (lastInput == null) {
      throw new TupleMRException("At least one input must be specified");
    }
    job.setJarByClass((jarByClass != null) ? jarByClass : lastInput.inputProcessor.getClass());

    instanceFilesCreated.addAll(multipleInputs.configureJob(job));
    instanceFilesCreated.addAll(namedOutputs.configureJob(job));

    return job;
  }
}
TOP

Related Classes of com.datasalt.pangool.tuplemr.MapOnlyJobBuilder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.