/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.crunch;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Map;
import net.sourceforge.argparse4j.ArgumentParsers;
import net.sourceforge.argparse4j.impl.Arguments;
import net.sourceforge.argparse4j.impl.action.HelpArgumentAction;
import net.sourceforge.argparse4j.impl.choice.RangeArgumentChoice;
import net.sourceforge.argparse4j.impl.type.FileArgumentType;
import net.sourceforge.argparse4j.inf.Argument;
import net.sourceforge.argparse4j.inf.ArgumentGroup;
import net.sourceforge.argparse4j.inf.ArgumentParser;
import net.sourceforge.argparse4j.inf.ArgumentParserException;
import net.sourceforge.argparse4j.inf.Namespace;
import org.apache.avro.Schema;
import org.apache.crunch.types.avro.AvroInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.log4j.PropertyConfigurator;
import org.apache.solr.crunch.CrunchIndexerToolOptions.PipelineType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import parquet.avro.AvroParquetInputFormat;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
/**
* See http://argparse4j.sourceforge.net and for details see http://argparse4j.sourceforge.net/usage.html
*/
final class CrunchIndexerToolArgumentParser {
private static final Logger LOG = LoggerFactory.getLogger(CrunchIndexerToolArgumentParser.class);
private static final Map<String, String> INPUT_FORMAT_SUBSTITUTIONS = ImmutableMap.of(
"text", TextInputFormat.class.getName(),
"avro", AvroInputFormat.class.getName(),
"avroParquet", AvroParquetInputFormat.class.getName()
);
/**
* Parses the given command line arguments.
*
* @return exitCode null indicates the caller shall proceed with processing,
* non-null indicates the caller shall exit the program with the
* given exit status code.
*/
public Integer parseArgs(String[] args, Configuration conf, CrunchIndexerToolOptions opts) {
assert args != null;
assert conf != null;
assert opts != null;
if (args.length == 0) {
args = new String[] { "--help" };
}
final String descriptionHead = "Spark or MapReduce ETL";
ArgumentParser parser = ArgumentParsers
.newArgumentParser("", false)
.defaultHelp(true)
.description(
descriptionHead + " batch job that pipes data from (splittable or non-splittable) HDFS files "
+ "into Apache Solr, and along the way runs the "
+ "data through a Morphline for extraction and transformation. The program is designed for "
+ "flexible, scalable and fault-tolerant batch ETL pipeline jobs. It is implemented as an Apache Crunch pipeline "
+ "and as such can run on either the Apache Hadoop MapReduce or Apache Spark execution engine.\n"
+ "\n"
+ "The program proceeds in several consecutive phases, as follows: "
+ "\n\n"
+ "1) Randomization phase: This (parallel) phase randomizes the list of HDFS input files in order to spread "
+ "ingestion load more evenly among the mapper tasks of the subsequent phase. This phase is only executed for "
+ "non-splittables files, and skipped otherwise."
+ "\n\n"
+ "2) Extraction phase: This (parallel) phase emits a series of HDFS file input streams (for non-splittable files) "
+ "or a series of input data records (for splittable files). "
+ "\n\n"
+ "3) Morphline phase: This (parallel) phase receives the items of the previous "
+ "phase, and uses a Morphline to extract the relevant content, transform it and load zero or more documents "
+ "into Solr. The ETL functionality is flexible and customizable using chains of arbitrary "
+ "morphline commands that pipe records from one transformation command to another. Commands to parse and "
+ "transform a set of standard data formats such as Avro, Parquet, CSV, Text, HTML, XML, PDF, MS-Office, etc. "
+ "are provided out of the box, and additional custom commands and parsers for additional file or data formats "
+ "can be added as custom morphline commands. Any kind of data format can be "
+ "processed and any kind output format can be generated by any custom Morphline ETL logic. Also, this phase "
+ "can be used to send data directly to a live SolrCloud cluster (via the loadSolr morphline command)."
+ "\n\n"
+ "The program is implemented as a Crunch pipeline and as such Crunch optimizes the logical phases mentioned "
+ "above into an efficient physical execution plan that runs a single mapper-only job, "
+ "or as the corresponding Spark equivalent."
+ "\n\n"
+ "Fault Tolerance: Task attempts are retried on failure per the standard MapReduce or Spark "
+ "semantics. If the whole job fails you can retry simply by rerunning the program again "
+ "using the same arguments."
);
ArgumentGroup indexerArgGroup = parser.addArgumentGroup("CrunchIndexerOptions");
// trailing positional arguments
Argument inputFilesArg = indexerArgGroup.addArgument("input-files")
.metavar("HDFS_URI")
.type(new PathArgumentType(conf).verifyExists().verifyCanRead())
.nargs("*")
.setDefault()
.help("HDFS URI of file or directory tree to ingest.");
Argument inputFileListArg = indexerArgGroup.addArgument("--input-file-list", "--input-list")
.action(Arguments.append())
.metavar("URI")
.type(new PathArgumentType(conf).acceptSystemIn().verifyExists().verifyCanRead())
.help("Local URI or HDFS URI of a UTF-8 encoded file containing a list of HDFS URIs to ingest, " +
"one URI per line in the file. If '-' is specified, URIs are read from the standard input. " +
"Multiple --input-file-list arguments can be specified.");
Argument inputFormatArg = indexerArgGroup.addArgument("--input-file-format")
.metavar("FQCN")
.type(String.class)
.help("The Hadoop FileInputFormat to use for extracting data from splittable HDFS files. Can be a "
+ "fully qualified Java class name or one of ['text', 'avro', 'avroParquet']. If this option "
+ "is present the extraction phase will emit a series of input data records rather than a series "
+ "of HDFS file input streams.");
Argument inputFileProjectionSchemaArg = indexerArgGroup.addArgument("--input-file-projection-schema")
.metavar("FILE")
.type(new FileArgumentType().verifyExists().verifyIsFile().verifyCanRead())
.help("Relative or absolute path to an Avro schema file on the local file system. This will be used "
+ "as the projection schema for Parquet input files.");
Argument inputFileReaderSchemaArg = indexerArgGroup.addArgument("--input-file-reader-schema")
.metavar("FILE")
.type(new FileArgumentType().verifyExists().verifyIsFile().verifyCanRead())
.help("Relative or absolute path to an Avro schema file on the local file system. This will be used "
+ "as the reader schema for Avro or Parquet input files. "
+ "Example: src/test/resources/test-documents/strings.avsc");
Argument morphlineFileArg = indexerArgGroup.addArgument("--morphline-file")
.metavar("FILE")
.type(new FileArgumentType().verifyExists().verifyIsFile().verifyCanRead())
.required(true)
.help("Relative or absolute path to a local config file that contains one or more morphlines. "
+ "The file must be UTF-8 encoded. It will be uploaded to each remote task. "
+ "Example: /path/to/morphline.conf");
Argument morphlineIdArg = indexerArgGroup.addArgument("--morphline-id")
.metavar("STRING")
.type(String.class)
.help("The identifier of the morphline that shall be executed within the morphline config file "
+ "specified by --morphline-file. If the --morphline-id option is omitted the first (i.e. "
+ "top-most) morphline within the config file is used. Example: morphline1");
Argument pipelineTypeArg = indexerArgGroup.addArgument("--pipeline-type")
.metavar("STRING")
.type(PipelineType.class)
.setDefault(PipelineType.mapreduce)
.help("The engine to use for executing the job. Can be 'mapreduce' or 'spark'.");
ArgumentGroup miscArgGroup = indexerArgGroup; //parser.addArgumentGroup("Misc arguments");
miscArgGroup.addArgument("--xhelp", "--help", "-help")
.help("Show this help message and exit")
.action(new HelpArgumentAction() {
@Override
public void run(ArgumentParser parser, Argument arg, Map<String, Object> attrs, String flag, Object value) throws ArgumentParserException {
StringWriter strWriter = new StringWriter();
parser.printHelp(new PrintWriter(strWriter, true));
String help = strWriter.toString();
int i = help.indexOf(descriptionHead);
String description = help.substring(i).trim();
String usage = help.substring("usage: ".length(), i).trim();
System.out.println(
"MapReduceUsage: export HADOOP_CLASSPATH=$myDependencyJarPaths; hadoop jar $myDriverJar \n" + CrunchIndexerTool.class.getName()
+ " --libjars $myDependencyJarFiles [MapReduceGenericOptions]...\n"
+ " " + usage + "\n"
+ "\n"
+ "SparkUsage: spark-submit [SparkGenericOptions]... "
+ "--master local|yarn --deploy-mode client|cluster\n"
+ "--jars $myDependencyJarFiles --class " + CrunchIndexerTool.class.getName() + " $myDriverJar\n"
+ " " + usage + "\n"
+ "\n"
+ description + "\n"
+ "\n"
+ "SparkGenericOptions: To print all options run 'spark-submit --help'\n"
+ "\n"
+ "MapReduceGenericOptions: " + ToolRunnerHelpFormatter.getGenericCommandUsage()
);
System.out.println(
"Examples: \n\n"
+ "# Prepare - Copy input files into HDFS:\n"
+ "hadoop fs -copyFromLocal src/test/resources/test-documents/hello1.txt hdfs:/user/systest/input/\n"
+ "\n"
+ "# Prepare variables for convenient reuse:\n"
+ "export myDriverJarDir=target # for build from git\n"
+ "export myDriverJarDir=/opt/cloudera/parcels/CDH/lib/solr/contrib/crunch # for CDH with parcels\n"
+ "export myDriverJarDir=/usr/lib/solr/contrib/crunch # for CDH with packages\n"
+ "export myDependencyJarDir=target/lib # for build from git\n"
+ "export myDependencyJarDir=/opt/cloudera/parcels/CDH/lib/search/lib/search-crunch # for CDH with parcels\n"
+ "export myDependencyJarDir=/usr/lib/search/lib/search-crunch # for CDH with packages\n"
+ "export myDriverJar=$(find $myDriverJarDir -maxdepth 1 -name '*.jar' ! -name '*-job.jar' ! -name '*-sources.jar')\n"
+ "export myDependencyJarFiles=$(find $myDependencyJarDir -name '*.jar' | sort | tr '\\n' ',' | head -c -1)\n"
+ "export myDependencyJarPaths=$(find $myDependencyJarDir -name '*.jar' | sort | tr '\\n' ':' | head -c -1)\n"
+ "\n"
+ "# MapReduce on Yarn - Ingest text file line by line into Solr:\n"
+ "export HADOOP_CLASSPATH=$myDependencyJarPaths; hadoop \\\n"
+ " --config /etc/hadoop/conf.cloudera.YARN-1 \\\n"
+ " jar $myDriverJar " + CrunchIndexerTool.class.getName() + " \\\n"
+ " --libjars $myDependencyJarFiles \\\n"
+ " -D 'mapred.child.java.opts=-Xmx500m' \\\n"
+ " -D morphlineVariable.ZK_HOST=$(hostname):2181/solr \\\n"
+ " --files src/test/resources/test-documents/string.avsc \\\n"
+ " --morphline-file src/test/resources/test-morphlines/loadSolrLine.conf \\\n"
+ " --pipeline-type mapreduce \\\n"
+ " --chatty \\\n"
+ " --log4j src/test/resources/log4j.properties \\\n"
+ " /user/systest/input/hello1.txt\n"
+ "\n"
+ "# Spark in Local Mode (for rapid prototyping) - Ingest into Solr:\n"
+ "spark-submit \\\n"
+ " --master local \\\n"
+ " --deploy-mode client \\\n"
+ " --jars $myDependencyJarFiles \\\n"
+ " --executor-memory 500M \\\n"
+ " # --driver-library-path /opt/cloudera/parcels/CDH/lib/hadoop/lib/native # for Snappy on CDH with parcels\\\n"
+ " # --driver-library-path /usr/lib/hadoop/lib/native # for Snappy on CDH with packages \\\n"
+ " --class " + CrunchIndexerTool.class.getName() + " \\\n"
+ " $myDriverJar \\\n"
+ " -D morphlineVariable.ZK_HOST=$(hostname):2181/solr \\\n"
+ " --morphline-file src/test/resources/test-morphlines/loadSolrLine.conf \\\n"
+ " --pipeline-type spark \\\n"
+ " --chatty \\\n"
+ " --log4j src/test/resources/log4j.properties \\\n"
+ " /user/systest/input/hello1.txt\n"
+ "\n"
+ "# Spark on Yarn in Client Mode (for testing) - Ingest into Solr:\n"
+ "Same as above, except replace '--master local' with '--master yarn'\n"
+ "\n"
+ "# View the yarn executor log files (there is no GUI yet):\n"
+ "yarn logs --applicationId $application_XYZ\n"
+ "\n"
+ "# Spark on Yarn in Cluster Mode (for production) - Ingest into Solr:\n"
+ "spark-submit \\\n"
+ " --master yarn \\\n"
+ " --deploy-mode cluster \\\n"
+ " --jars $myDependencyJarFiles \\\n"
+ " --executor-memory 500M \\\n"
+ " --class " + CrunchIndexerTool.class.getName() + " \\\n"
+ " --files src/test/resources/log4j.properties,src/test/resources/test-morphlines/loadSolrLine.conf \\\n"
+ " $myDriverJar \\\n"
+ " -D hadoop.tmp.dir=/tmp \\\n"
+ " -D morphlineVariable.ZK_HOST=$(hostname):2181/solr \\\n"
+ " --morphline-file loadSolrLine.conf \\\n"
+ " --pipeline-type spark \\\n"
+ " --chatty \\\n"
+ " --log4j log4j.properties \\\n"
+ " /user/systest/input/hello1.txt\n"
);
throw new FoundHelpArgument(); // Trick to prevent processing of any remaining arguments
}
});
Argument mappersArg = miscArgGroup.addArgument("--mappers")
.metavar("INTEGER")
.type(Integer.class)
.choices(new RangeArgumentChoice(-1, Integer.MAX_VALUE)) // TODO: also support X% syntax where X is an integer
.setDefault(-1)
.help("Tuning knob that indicates the maximum number of MR mapper tasks to use. -1 indicates use all map slots " +
"available on the cluster. This parameter only applies to non-splittable input files");
Argument dryRunArg = miscArgGroup.addArgument("--dry-run")
.action(Arguments.storeTrue())
.help("Run the pipeline but print documents to stdout instead of loading them into Solr. " +
"This can be used for quicker turnaround during early trial & debug sessions.");
Argument log4jConfigFileArg = miscArgGroup.addArgument("--log4j")
.metavar("FILE")
.type(new FileArgumentType().verifyExists().verifyIsFile().verifyCanRead())
.help("Relative or absolute path to a log4j.properties config file on the local file system. This file " +
"will be uploaded to each remote task. Example: /path/to/log4j.properties");
Argument verboseArg = miscArgGroup.addArgument("--chatty")
.action(Arguments.storeTrue())
.help("Turn on verbose output.");
Namespace ns;
try {
ns = parser.parseArgs(args);
} catch (FoundHelpArgument e) {
return 0;
} catch (ArgumentParserException e) {
parser.handleError(e);
return 1;
}
opts.log4jConfigFile = (File) ns.get(log4jConfigFileArg.getDest());
if (opts.log4jConfigFile != null) {
PropertyConfigurator.configure(opts.log4jConfigFile.getPath());
}
LOG.debug("Parsed command line args: {}", ns);
opts.inputFileLists = getList(ns, inputFileListArg);
opts.inputFiles = ns.get(inputFilesArg.getDest());
opts.mappers = (Integer) ns.get(mappersArg.getDest());
opts.morphlineFile = ns.get(morphlineFileArg.getDest());
opts.morphlineId = ns.get(morphlineIdArg.getDest());
opts.pipelineType = ns.get(pipelineTypeArg.getDest());
opts.isDryRun = (Boolean) ns.get(dryRunArg.getDest());
opts.isVerbose = (Boolean) ns.get(verboseArg.getDest());
try {
opts.inputFileReaderSchema = parseSchema((File)ns.get(inputFileReaderSchemaArg.getDest()), parser);
opts.inputFileProjectionSchema = parseSchema((File)ns.get(inputFileProjectionSchemaArg.getDest()), parser);
opts.inputFileFormat = getClass(inputFormatArg, ns, FileInputFormat.class, parser, INPUT_FORMAT_SUBSTITUTIONS);
String sparkMaster = System.getProperty("spark.master");
if (opts.pipelineType == PipelineType.spark) {
if (sparkMaster == null) {
throw new ArgumentParserException("--pipeline-type=" + PipelineType.spark + " must not run as a MapReduce job", parser);
}
} else if (opts.pipelineType == PipelineType.mapreduce) {
if (sparkMaster != null) {
throw new ArgumentParserException("--pipeline-type=" + PipelineType.mapreduce + " must not run as a Spark job", parser);
}
}
} catch (ArgumentParserException e) {
parser.handleError(e);
return 1;
}
return null;
}
private <T> T getList(Namespace ns, Argument arg) {
T list = ns.get(arg.getDest());
if (list == null) {
list = (T) new ArrayList();
}
return list;
}
private Schema parseSchema(File file, ArgumentParser parser) throws ArgumentParserException {
if (file == null) {
return null;
}
try {
return new Schema.Parser().parse(file);
} catch (IOException e) {
throw new ArgumentParserException(e, parser);
}
}
private Class getClass(Argument arg, Namespace ns, Class baseClazz, ArgumentParser parser,
Map<String, String> substitutions) throws ArgumentParserException {
Class clazz = null;
String className = ns.getString(arg.getDest());
if (substitutions != null && substitutions.containsKey(className)) {
className = substitutions.get(className);
Preconditions.checkNotNull(className);
}
if (className != null) {
try {
clazz = Class.forName(className);
} catch (Exception e) {
throw new RuntimeException(e);
}
if (!baseClazz.isAssignableFrom(clazz)) {
throw new ArgumentParserException("--" + arg.getDest().replace('_', '-') + " " + className
+ " must be an instance of class " + baseClazz.getName(), parser);
}
}
return clazz;
}
/** Marker trick to prevent processing of any remaining arguments once --help option has been parsed */
private static final class FoundHelpArgument extends RuntimeException {
}
}