Source Code of org.apache.blur.mapreduce.lib.CsvBlurDriver

package org.apache.blur.mapreduce.lib;

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashSet;
import java.util.Set;

import org.apache.blur.thrift.BlurClient;
import org.apache.blur.thrift.generated.Blur.Iface;
import org.apache.blur.thrift.generated.TableDescriptor;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import com.google.common.base.Splitter;

@SuppressWarnings("static-access")
public class CsvBlurDriver {

  public static final String CSVLOADER = "csvloader";
  public static final String MAPRED_COMPRESS_MAP_OUTPUT = "mapred.compress.map.output";
  public static final String MAPRED_MAP_OUTPUT_COMPRESSION_CODEC = "mapred.map.output.compression.codec";
  public static final int DEFAULT_WIDTH = 100;
  public static final String HEADER = "The \"" + CSVLOADER +
      "\" command is used to load delimited files into a Blur table.\nThe required options are \"-c\", \"-t\", \"-d\". The " +
      "standard format for the contents of a file is:\"rowid,recordid,family,col1,col2,...\". However there are " +
      "several options; for example, the rowid and recordid can be generated based on the data in the record via the " +
      "\"-A\" and \"-a\" options. The family can be assigned based on the path via the \"-I\" option. The column " +
      "name order can be mapped via the \"-d\" option. Also you can set the input " +
      "format to sequence files via the \"-S\" option, or leave the default of text files.";

  enum COMPRESSION {
    SNAPPY(SnappyCodec.class), GZIP(GzipCodec.class), BZIP(BZip2Codec.class), DEFAULT(DefaultCodec.class);

    private final String className;

    private COMPRESSION(Class<? extends CompressionCodec> clazz) {
      className = clazz.getName();
    }

    public String getClassName() {
      return className;
    }
  }
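
  // How the "-p" values resolve (see the "-p" handling in setupJob); the class names shown are
  // the standard Hadoop codec classes imported above:
  //
  //   COMPRESSION.SNAPPY.getClassName() -> "org.apache.hadoop.io.compress.SnappyCodec"
  //   COMPRESSION.GZIP.getClassName()   -> "org.apache.hadoop.io.compress.GzipCodec"
  //   Any other "-p" value is passed through unchanged as a codec class name.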

  interface ControllerPool {
    Iface getClient(String controllerConnectionStr);
  }

  public static void main(String... args) throws Exception {
    Configuration configuration = new Configuration();
    String[] otherArgs = new GenericOptionsParser(configuration, args).getRemainingArgs();
    Job job = setupJob(configuration, new ControllerPool() {
      @Override
      public Iface getClient(String controllerConnectionStr) {
        return BlurClient.getClient(controllerConnectionStr);
      }
    }, otherArgs);
    if (job == null) {
      System.exit(1);
    }

    boolean waitForCompletion = job.waitForCompletion(true);
    System.exit(waitForCompletion ? 0 : 1);
  }

  public static Job setupJob(Configuration configuration, ControllerPool controllerPool, String... otherArgs)
      throws Exception {
    CommandLine cmd = parse(otherArgs);
    if (cmd == null) {
      return null;
    }

    final String controllerConnectionStr = cmd.getOptionValue("c");
    final String tableName = cmd.getOptionValue("t");

    final Iface client = controllerPool.getClient(controllerConnectionStr);
    TableDescriptor tableDescriptor = client.describe(tableName);

    Job job = new Job(configuration, "Blur indexer [" + tableName + "]");
    job.setJarByClass(CsvBlurDriver.class);
    job.setMapperClass(CsvBlurMapper.class);

    if (cmd.hasOption("p")) {
      job.getConfiguration().set(MAPRED_COMPRESS_MAP_OUTPUT, "true");
      String codecStr = cmd.getOptionValue("p");
      COMPRESSION compression;
      try {
        compression = COMPRESSION.valueOf(codecStr.trim().toUpperCase());
      } catch (IllegalArgumentException e) {
        compression = null;
      }
      if (compression == null) {
        job.getConfiguration().set(MAPRED_MAP_OUTPUT_COMPRESSION_CODEC, codecStr.trim());
      } else {
        job.getConfiguration().set(MAPRED_MAP_OUTPUT_COMPRESSION_CODEC, compression.getClassName());
      }
    }
    if (cmd.hasOption("a")) {
      CsvBlurMapper.setAutoGenerateRecordIdAsHashOfData(job, true);
    }
    if (cmd.hasOption("A")) {
      CsvBlurMapper.setAutoGenerateRowIdAsHashOfData(job, true);
    }
    if (cmd.hasOption("S")) {
      job.setInputFormatClass(SequenceFileInputFormat.class);
    } else {
      job.setInputFormatClass(TextInputFormat.class);
    }

    if (cmd.hasOption("C")) {
      if (cmd.hasOption("S")) {
        String[] optionValues = cmd.getOptionValues("C");
        job.setInputFormatClass(CsvBlurCombineSequenceFileInputFormat.class);
        CombineFileInputFormat.setMinInputSplitSize(job, Long.parseLong(optionValues[0]));
        CombineFileInputFormat.setMaxInputSplitSize(job, Long.parseLong(optionValues[1]));
      } else {
        System.err.println("'C' can only be used with option 'S'");
        return null;
      }
    }
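
    // For example (hypothetical sizes): "-S -C 1000000000 2500000000" combines sequence file
    // splits so that each mapper receives roughly between 1 GB and 2.5 GB of input.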

    if (cmd.hasOption("i")) {
      for (String input : cmd.getOptionValues("i")) {
        Path path = new Path(input);
        Set<Path> pathSet = recurisvelyGetPathesContainingFiles(path, job.getConfiguration());
        if (pathSet.isEmpty()) {
          FileInputFormat.addInputPath(job, path);
        } else {
          for (Path p : pathSet) {
            FileInputFormat.addInputPath(job, p);
          }
        }
      }
    }
    // processing the 'I' option
    if (cmd.hasOption("I")) {
      Option[] options = cmd.getOptions();
      for (Option option : options) {
        if (option.getOpt().equals("I")) {
          String[] values = option.getValues();
          if (values.length < 2) {
            System.err.println("'I' parameter missing minimum args of (family path*)");
            return null;
          }
          for (String p : getSubArray(values, 1)) {
            Path path = new Path(p);
            CsvBlurMapper.addFamilyPath(job, values[0], path);
            FileInputFormat.addInputPath(job, path);
          }
        }
      }
    }

    if (cmd.hasOption("s")) {
      CsvBlurMapper.setSeparator(job, StringEscapeUtils.unescapeJava(cmd.getOptionValue("s")));
    }
    if (cmd.hasOption("o")) {
      BlurOutputFormat.setOptimizeInFlight(job, false);
    }
    if (cmd.hasOption("l")) {
      BlurOutputFormat.setIndexLocally(job, false);
    }
    if (cmd.hasOption("b")) {
      int maxDocumentBufferSize = Integer.parseInt(cmd.getOptionValue("b"));
      BlurOutputFormat.setMaxDocumentBufferSize(job, maxDocumentBufferSize);
    }
    if (cmd.hasOption("r")) {
      int reducerMultiplier = Integer.parseInt(cmd.getOptionValue("r"));
      BlurOutputFormat.setReducerMultiplier(job, reducerMultiplier);
    }
    // processing the 'd' option
    Option[] options = cmd.getOptions();
    for (Option option : options) {
      if (option.getOpt().equals("d")) {
        String[] values = option.getValues();
        if (values.length < 2) {
          System.err.println("'d' parameter missing minimum args of (family columname*)");
          return null;
        }
        CsvBlurMapper.addColumns(job, values[0], getSubArray(values, 1));
      }
    }
    BlurOutputFormat.setupJob(job, tableDescriptor);
    BlurMapReduceUtil.addDependencyJars(job.getConfiguration(), Splitter.class);
    return job;
  }
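
  // A minimal programmatic sketch of calling setupJob directly (it mirrors main above); the
  // connection string, table, column mapping, and input path are hypothetical:
  //
  //   Configuration conf = new Configuration();
  //   Job job = CsvBlurDriver.setupJob(conf, new ControllerPool() {
  //     @Override
  //     public Iface getClient(String controllerConnectionStr) {
  //       return BlurClient.getClient(controllerConnectionStr);
  //     }
  //   }, "-c", "controller1:40010", "-t", "table1", "-d", "fam0", "col1", "-i", "hdfs://namenode/input/in1");
  //   if (job != null) {
  //     job.waitForCompletion(true);
  //   }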

  private static String[] getSubArray(String[] array, int starting) {
    String[] result = new String[array.length - starting];
    System.arraycopy(array, starting, result, 0, result.length);
    return result;
  }

  private static Set<Path> recurisvelyGetPathesContainingFiles(Path path, Configuration configuration)
      throws IOException {
    Set<Path> pathSet = new HashSet<Path>();
    FileSystem fileSystem = path.getFileSystem(configuration);
    FileStatus[] listStatus = fileSystem.listStatus(path);
    for (FileStatus status : listStatus) {
      if (status.isDir()) {
        pathSet.addAll(recursivelyGetPathsContainingFiles(status.getPath(), configuration));
      } else {
        pathSet.add(status.getPath().getParent());
      }
    }
    return pathSet;
  }
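
  // For example (hypothetical layout): given /input/a/part-00000 and /input/b/c/part-00000, the
  // method above returns {/input/a, /input/b/c}, i.e. every directory that directly contains a file.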

  private static CommandLine parse(String... otherArgs) throws ParseException {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("controller*").hasArgs().isRequired(true)
        .withDescription("* Thrift controller connection string. (host1:40010 host2:40010 ...)").create("c"));
    options.addOption(OptionBuilder.withArgName("tablename").hasArg().isRequired(true)
        .withDescription("* Blur table name.").create("t"));
    options.addOption(OptionBuilder.withArgName("family column*").hasArgs().isRequired(true)
        .withDescription("* Define the mapping of fields in the CSV file to column names. (family col1 col2 col3 ...)")
        .create("d"));
    options.addOption(OptionBuilder
        .withArgName("delimiter")
        .hasArg()
        .withDescription(
            "The file delimiter to be used. (default value ',')  NOTE: For special "
                + "charactors like the default hadoop separator of ASCII value 1, you can use standard "
                + "java escaping (\\u0001)").create("s"));
    options.addOption(OptionBuilder.withArgName("path*").hasArg()
        .withDescription("The directory to index, the family name is assumed to BE present in the file contents. (hdfs://namenode/input/in1)").create("i"));
    options.addOption(OptionBuilder.withArgName("family path*").hasArgs()
        .withDescription("The directory to index with a family name, the family name is assumed to NOT be present in the file contents. (family hdfs://namenode/input/in1)").create("I"));
    options
        .addOption(OptionBuilder
            .withArgName("auto generate record ids")
            .withDescription(
                "No Record Ids - Automatically generate record ids for each record based on a MD5 has of the data within the record.")
            .create("a"));
    options
        .addOption(OptionBuilder
            .withArgName("auto generate row ids")
            .withDescription(
                "No Row Ids - Automatically generate row ids for each record based on a MD5 has of the data within the record.")
            .create("A"));
    options.addOption(OptionBuilder.withArgName("disable optimize indexes during copy")
        .withDescription("Disable optimize indexes during copy, this has very little overhead. (enabled by default)")
        .create("o"));
    options.addOption(OptionBuilder
        .withArgName("disable index locally")
        .withDescription(
            "Disable the use storage local on the server that is running the reducing "
                + "task and copy to Blur table once complete. (enabled by default)").create("l"));
    options.addOption(OptionBuilder.withArgName("sequence files inputs")
        .withDescription("The input files are sequence files.").create("S"));
    options.addOption(OptionBuilder
        .withArgName("size")
        .hasArg()
        .withDescription(
            "The maximum number of Lucene documents to buffer in the reducer for a single "
                + "row before spilling over to disk. (default 1000)").create("b"));
    options.addOption(OptionBuilder
        .withArgName("multiplier")
        .hasArg()
        .withDescription(
            "The reducer multipler allows for an increase in the number of reducers per "
                + "shard in the given table.  For example if the table has 128 shards and the "
                + "reducer multiplier is 4 the total number of reducers will be 512, 4 reducers "
                + "per shard. (default 1)").create("r"));
    options.addOption(OptionBuilder
        .withArgName("minimum maximum")
        .hasArgs(2)
        .withDescription(
            "Enables a combine file input to help deal with many small files as the input. Provide "
                + "the minimum and maximum size per mapper.  For a minimum of 1GB and a maximum of "
                + "2.5GB: (1000000000 2500000000)").create("C"));
    options.addOption(OptionBuilder
        .withArgName("codec")
        .hasArgs(1)
        .withDescription(
            "Sets the compression codec for the map compress output setting. (SNAPPY,GZIP,BZIP,DEFAULT, or classname)")
        .create("p"));

    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;
    try {
      cmd = parser.parse(options, otherArgs);
    } catch (ParseException e) {
      System.err.println(e.getMessage());
      HelpFormatter formatter = new HelpFormatter();
      PrintWriter pw = new PrintWriter(System.err, true);
      formatter.printHelp(pw, DEFAULT_WIDTH, CSVLOADER, HEADER, options, HelpFormatter.DEFAULT_LEFT_PAD,
          HelpFormatter.DEFAULT_DESC_PAD, null, false);
      return null;
    }

    if (!(cmd.hasOption("I") || cmd.hasOption("i"))) {
      System.err.println("Missing input directory, see options 'i' and 'I'.");
      HelpFormatter formatter = new HelpFormatter();
      PrintWriter pw = new PrintWriter(System.err, true);
      formatter.printHelp(pw, DEFAULT_WIDTH, CSVLOADER, HEADER, options, HelpFormatter.DEFAULT_LEFT_PAD,
          HelpFormatter.DEFAULT_DESC_PAD, null, false);
      return null;
    }
    return cmd;
  }

  public static class CsvBlurCombineSequenceFileInputFormat extends CombineFileInputFormat<Writable, Text> {

    @Override
    public RecordReader<Writable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException {
      return new SequenceFileRecordReader<Writable, Text>();
    }

  }
}