package org.apache.blur.mapreduce.lib;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashSet;
import java.util.Set;
import org.apache.blur.thrift.BlurClient;
import org.apache.blur.thrift.generated.Blur.Iface;
import org.apache.blur.thrift.generated.TableDescriptor;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReaderWrapper;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import com.google.common.base.Splitter;
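/**
 * MapReduce driver that bulk loads delimited (CSV) data into a Blur table. The
 * required options are "-c", "-t", "-d", and one of "-i" or "-I". A sketch of
 * an invocation (the jar name, controller address, table, columns, and input
 * path below are illustrative, not fixed values):
 *
 * <pre>
 * hadoop jar blur-mapred-*.jar org.apache.blur.mapreduce.lib.CsvBlurDriver \
 *     -c controller1:40010 -t table1 \
 *     -d fam0 col1 col2 \
 *     -i hdfs://namenode/input/in1
 * </pre>
 */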
@SuppressWarnings("static-access")
public class CsvBlurDriver {
public static final String CSVLOADER = "csvloader";
public static final String MAPRED_COMPRESS_MAP_OUTPUT = "mapred.compress.map.output";
public static final String MAPRED_MAP_OUTPUT_COMPRESSION_CODEC = "mapred.map.output.compression.codec";
public static final int DEFAULT_WIDTH = 100;
public static final String HEADER = "The \"" + CSVLOADER +
"\" command is used to load delimited data into a Blur table.\nThe required options are \"-c\", \"-t\", \"-d\". The " +
"standard format for the contents of a file is: \"rowid,recordid,family,col1,col2,...\". However there are " +
"several options; for example, the rowid and recordid can be generated based on the data in the record via the " +
"\"-A\" and \"-a\" options. The family can be assigned based on the path via the \"-I\" option. The column " +
"name order can be mapped via the \"-d\" option. You can also set the input " +
"format to sequence files via the \"-S\" option, or leave the default of text files.";
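// Friendly codec names accepted by the '-p' option, mapped to Hadoop codec class names.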
enum COMPRESSION {
SNAPPY(SnappyCodec.class), GZIP(GzipCodec.class), BZIP(BZip2Codec.class), DEFAULT(DefaultCodec.class);
private final String className;
private COMPRESSION(Class<? extends CompressionCodec> clazz) {
className = clazz.getName();
}
public String getClassName() {
return className;
}
}
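// Indirection for obtaining a Blur Thrift client, letting callers of setupJob supply
// their own connection handling.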
interface ControllerPool {
Iface getClient(String controllerConnectionStr);
}
public static void main(String... args) throws Exception {
Configuration configuration = new Configuration();
String[] otherArgs = new GenericOptionsParser(configuration, args).getRemainingArgs();
Job job = setupJob(configuration, new ControllerPool() {
@Override
public Iface getClient(String controllerConnectionStr) {
return BlurClient.getClient(controllerConnectionStr);
}
}, otherArgs);
if (job == null) {
System.exit(1);
}
boolean waitForCompletion = job.waitForCompletion(true);
System.exit(waitForCompletion ? 0 : 1);
}
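/**
 * Configures and returns the indexing job described by the given command line
 * arguments, or null when the arguments are missing or inconsistent (for
 * example, '-C' without '-S').
 */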
public static Job setupJob(Configuration configuration, ControllerPool controllerPool, String... otherArgs)
throws Exception {
CommandLine cmd = parse(otherArgs);
if (cmd == null) {
return null;
}
final String controllerConnectionStr = cmd.getOptionValue("c");
final String tableName = cmd.getOptionValue("t");
final Iface client = controllerPool.getClient(controllerConnectionStr);
TableDescriptor tableDescriptor = client.describe(tableName);
Job job = new Job(configuration, "Blur indexer [" + tableName + "]");
job.setJarByClass(CsvBlurDriver.class);
job.setMapperClass(CsvBlurMapper.class);
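// Optional map output compression ('-p'): known codec names map to Hadoop codec
// classes; any other value is passed through as a codec class name.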
if (cmd.hasOption("p")) {
job.getConfiguration().set(MAPRED_COMPRESS_MAP_OUTPUT, "true");
String codecStr = cmd.getOptionValue("p");
COMPRESSION compression;
try {
compression = COMPRESSION.valueOf(codecStr.trim().toUpperCase());
} catch (IllegalArgumentException e) {
compression = null;
}
if (compression == null) {
job.getConfiguration().set(MAPRED_MAP_OUTPUT_COMPRESSION_CODEC, codecStr.trim());
} else {
job.getConfiguration().set(MAPRED_MAP_OUTPUT_COMPRESSION_CODEC, compression.getClassName());
}
}
if (cmd.hasOption("a")) {
CsvBlurMapper.setAutoGenerateRecordIdAsHashOfData(job, true);
}
if (cmd.hasOption("A")) {
CsvBlurMapper.setAutoGenerateRowIdAsHashOfData(job, true);
}
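// Choose the input format: sequence files with '-S', plain text otherwise.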
if (cmd.hasOption("S")) {
job.setInputFormatClass(SequenceFileInputFormat.class);
} else {
job.setInputFormatClass(TextInputFormat.class);
}
if (cmd.hasOption("C")) {
if (cmd.hasOption("S")) {
String[] optionValues = cmd.getOptionValues("C");
job.setInputFormatClass(CsvBlurCombineSequenceFileInputFormat.class);
CombineFileInputFormat.setMinInputSplitSize(job, Long.parseLong(optionValues[0]));
CombineFileInputFormat.setMaxInputSplitSize(job, Long.parseLong(optionValues[1]));
} else {
System.err.println("'C' can only be used with option 'S'");
return null;
}
}
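// processing the 'i' option: expand each input directory so that every leaf
// directory directly containing files is added as an input path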
if (cmd.hasOption("i")) {
for (String input : cmd.getOptionValues("i")) {
Path path = new Path(input);
Set<Path> pathSet = recursivelyGetPathsContainingFiles(path, job.getConfiguration());
if (pathSet.isEmpty()) {
FileInputFormat.addInputPath(job, path);
} else {
for (Path p : pathSet) {
FileInputFormat.addInputPath(job, p);
}
}
}
}
// processing the 'I' option
if (cmd.hasOption("I")) {
Option[] options = cmd.getOptions();
for (Option option : options) {
if (option.getOpt().equals("I")) {
String[] values = option.getValues();
if (values.length < 2) {
System.err.println("'I' parameter missing minimum args of (family path*)");
return null;
}
for (String p : getSubArray(values, 1)) {
Path path = new Path(p);
CsvBlurMapper.addFamilyPath(job, values[0], path);
FileInputFormat.addInputPath(job, path);
}
}
}
}
if (cmd.hasOption("s")) {
CsvBlurMapper.setSeparator(job, StringEscapeUtils.unescapeJava(cmd.getOptionValue("s")));
}
if (cmd.hasOption("o")) {
BlurOutputFormat.setOptimizeInFlight(job, false);
}
if (cmd.hasOption("l")) {
BlurOutputFormat.setIndexLocally(job, false);
}
if (cmd.hasOption("b")) {
int maxDocumentBufferSize = Integer.parseInt(cmd.getOptionValue("b"));
BlurOutputFormat.setMaxDocumentBufferSize(job, maxDocumentBufferSize);
}
if (cmd.hasOption("r")) {
int reducerMultiplier = Integer.parseInt(cmd.getOptionValue("r"));
BlurOutputFormat.setReducerMultiplier(job, reducerMultiplier);
}
// processing the 'd' option
Option[] options = cmd.getOptions();
for (Option option : options) {
if (option.getOpt().equals("d")) {
String[] values = option.getValues();
if (values.length < 2) {
System.err.println("'d' parameter missing minimum args of (family columname*)");
return null;
}
CsvBlurMapper.addColumns(job, values[0], getSubArray(values, 1));
}
}
BlurOutputFormat.setupJob(job, tableDescriptor);
BlurMapReduceUtil.addDependencyJars(job.getConfiguration(), Splitter.class);
return job;
}
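// Returns a copy of the array starting at the given index.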
private static String[] getSubArray(String[] array, int starting) {
String[] result = new String[array.length - starting];
System.arraycopy(array, starting, result, 0, result.length);
return result;
}
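// Walks the directory tree below the given path and collects every directory that
// directly contains at least one file.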
private static Set<Path> recursivelyGetPathsContainingFiles(Path path, Configuration configuration)
throws IOException {
Set<Path> pathSet = new HashSet<Path>();
FileSystem fileSystem = path.getFileSystem(configuration);
FileStatus[] listStatus = fileSystem.listStatus(path);
for (FileStatus status : listStatus) {
if (status.isDir()) {
pathSet.addAll(recursivelyGetPathsContainingFiles(status.getPath(), configuration));
} else {
pathSet.add(status.getPath().getParent());
}
}
return pathSet;
}
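// Defines the command line options and parses the arguments, printing usage and
// returning null on any error.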
private static CommandLine parse(String... otherArgs) throws ParseException {
Options options = new Options();
options.addOption(OptionBuilder.withArgName("controller*").hasArgs().isRequired(true)
.withDescription("* Thrift controller connection string. (host1:40010 host2:40010 ...)").create("c"));
options.addOption(OptionBuilder.withArgName("tablename").hasArg().isRequired(true)
.withDescription("* Blur table name.").create("t"));
options.addOption(OptionBuilder.withArgName("family column*").hasArgs().isRequired(true)
.withDescription("* Define the mapping of fields in the CSV file to column names. (family col1 col2 col3 ...)")
.create("d"));
options.addOption(OptionBuilder
.withArgName("delimiter")
.hasArg()
.withDescription(
"The file delimiter to be used. (default value ',') NOTE: For special "
+ "charactors like the default hadoop separator of ASCII value 1, you can use standard "
+ "java escaping (\\u0001)").create("s"));
options.addOption(OptionBuilder.withArgName("path*").hasArg()
.withDescription("The directory to index, the family name is assumed to BE present in the file contents. (hdfs://namenode/input/in1)").create("i"));
options.addOption(OptionBuilder.withArgName("family path*").hasArgs()
.withDescription("The directory to index with a family name, the family name is assumed to NOT be present in the file contents. (family hdfs://namenode/input/in1)").create("I"));
options
.addOption(OptionBuilder
.withArgName("auto generate record ids")
.withDescription(
"No Record Ids - Automatically generate record ids for each record based on a MD5 has of the data within the record.")
.create("a"));
options
.addOption(OptionBuilder
.withArgName("auto generate row ids")
.withDescription(
"No Row Ids - Automatically generate row ids for each record based on a MD5 has of the data within the record.")
.create("A"));
options.addOption(OptionBuilder.withArgName("disable optimize indexes during copy")
.withDescription("Disable optimize indexes during copy, this has very little overhead. (enabled by default)")
.create("o"));
options.addOption(OptionBuilder
.withArgName("disable index locally")
.withDescription(
"Disable the use storage local on the server that is running the reducing "
+ "task and copy to Blur table once complete. (enabled by default)").create("l"));
options.addOption(OptionBuilder.withArgName("sequence files inputs")
.withDescription("The input files are sequence files.").create("S"));
options.addOption(OptionBuilder
.withArgName("size")
.hasArg()
.withDescription(
"The maximum number of Lucene documents to buffer in the reducer for a single "
+ "row before spilling over to disk. (default 1000)").create("b"));
options.addOption(OptionBuilder
.withArgName("multiplier")
.hasArg()
.withDescription(
"The reducer multipler allows for an increase in the number of reducers per "
+ "shard in the given table. For example if the table has 128 shards and the "
+ "reducer multiplier is 4 the total number of reducers will be 512, 4 reducers "
+ "per shard. (default 1)").create("r"));
options.addOption(OptionBuilder
.withArgName("minimum maximum")
.hasArgs(2)
.withDescription(
"Enables a combine file input to help deal with many small files as the input. Provide "
+ "the minimum and maximum size per mapper. For a minimum of 1GB and a maximum of "
+ "2.5GB: (1000000000 2500000000)").create("C"));
options.addOption(OptionBuilder
.withArgName("codec")
.hasArgs(1)
.withDescription(
"Sets the compression codec for the map compress output setting. (SNAPPY,GZIP,BZIP,DEFAULT, or classname)")
.create("p"));
CommandLineParser parser = new PosixParser();
CommandLine cmd = null;
try {
cmd = parser.parse(options, otherArgs);
} catch (ParseException e) {
System.err.println(e.getMessage());
HelpFormatter formatter = new HelpFormatter();
PrintWriter pw = new PrintWriter(System.err, true);
formatter.printHelp(pw, DEFAULT_WIDTH, CSVLOADER, HEADER, options, HelpFormatter.DEFAULT_LEFT_PAD,
HelpFormatter.DEFAULT_DESC_PAD, null, false);
return null;
}
if (!(cmd.hasOption("I") || cmd.hasOption("i"))) {
System.err.println("Missing input directory, see options 'i' and 'I'.");
HelpFormatter formatter = new HelpFormatter();
PrintWriter pw = new PrintWriter(System.err, true);
formatter.printHelp(pw, DEFAULT_WIDTH, CSVLOADER, HEADER, options, HelpFormatter.DEFAULT_LEFT_PAD,
HelpFormatter.DEFAULT_DESC_PAD, null, false);
return null;
}
return cmd;
}
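// Combine-file input used with the '-C' option: each file inside a combined split is
// read by its own sequence file reader (assumes the Hadoop 2
// CombineFileRecordReaderWrapper API is available).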
public static class CsvBlurCombineSequenceFileInputFormat extends CombineFileInputFormat<Writable, Text> {
@Override
public RecordReader<Writable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
throws IOException {
// A plain SequenceFileRecordReader cannot read a CombineFileSplit.
return new CombineFileRecordReader<Writable, Text>((CombineFileSplit) split, context,
SequenceFileRecordReaderWrapper.class);
}
public static class SequenceFileRecordReaderWrapper extends CombineFileRecordReaderWrapper<Writable, Text> {
public SequenceFileRecordReaderWrapper(CombineFileSplit split, TaskAttemptContext context, Integer idx)
throws IOException, InterruptedException {
super(new SequenceFileInputFormat<Writable, Text>(), split, context, idx);
}
}
}
}