+ "Fault Tolerance: Task attempts are retried on failure per the standard MapReduce or Spark "
+ "semantics. If the whole job fails you can retry simply by rerunning the program again "
+ "using the same arguments."
);
ArgumentGroup indexerArgGroup = parser.addArgumentGroup("CrunchIndexerOptions");
// trailing positional arguments
Argument inputFilesArg = indexerArgGroup.addArgument("input-files")
.metavar("HDFS_URI")
.type(new PathArgumentType(conf).verifyExists().verifyCanRead())
.nargs("*")
.setDefault()
.help("HDFS URI of file or directory tree to ingest.");
Argument inputFileListArg = indexerArgGroup.addArgument("--input-file-list", "--input-list")
.action(Arguments.append())
.metavar("URI")
.type(new PathArgumentType(conf).acceptSystemIn().verifyExists().verifyCanRead())
.help("Local URI or HDFS URI of a UTF-8 encoded file containing a list of HDFS URIs to ingest, " +
"one URI per line in the file. If '-' is specified, URIs are read from the standard input. " +
"Multiple --input-file-list arguments can be specified.");
Argument inputFormatArg = indexerArgGroup.addArgument("--input-file-format")
.metavar("FQCN")
.type(String.class)
.help("The Hadoop FileInputFormat to use for extracting data from splittable HDFS files. Can be a "
+ "fully qualified Java class name or one of ['text', 'avro', 'avroParquet']. If this option "
+ "is present the extraction phase will emit a series of input data records rather than a series "
+ "of HDFS file input streams.");
Argument inputFileProjectionSchemaArg = indexerArgGroup.addArgument("--input-file-projection-schema")
.metavar("FILE")
.type(new FileArgumentType().verifyExists().verifyIsFile().verifyCanRead())
.help("Relative or absolute path to an Avro schema file on the local file system. This will be used "
+ "as the projection schema for Parquet input files.");
Argument inputFileReaderSchemaArg = indexerArgGroup.addArgument("--input-file-reader-schema")
.metavar("FILE")
.type(new FileArgumentType().verifyExists().verifyIsFile().verifyCanRead())
.help("Relative or absolute path to an Avro schema file on the local file system. This will be used "
+ "as the reader schema for Avro or Parquet input files. "
+ "Example: src/test/resources/test-documents/strings.avsc");
Argument morphlineFileArg = indexerArgGroup.addArgument("--morphline-file")
.metavar("FILE")
.type(new FileArgumentType().verifyExists().verifyIsFile().verifyCanRead())
.required(true)
.help("Relative or absolute path to a local config file that contains one or more morphlines. "
+ "The file must be UTF-8 encoded. It will be uploaded to each remote task. "
+ "Example: /path/to/morphline.conf");
Argument morphlineIdArg = indexerArgGroup.addArgument("--morphline-id")
.metavar("STRING")
.type(String.class)
.help("The identifier of the morphline that shall be executed within the morphline config file "
+ "specified by --morphline-file. If the --morphline-id option is omitted the first (i.e. "
+ "top-most) morphline within the config file is used. Example: morphline1");
Argument pipelineTypeArg = indexerArgGroup.addArgument("--pipeline-type")
.metavar("STRING")
.type(PipelineType.class)
.setDefault(PipelineType.mapreduce)
.help("The engine to use for executing the job. Can be 'mapreduce' or 'spark'.");
ArgumentGroup miscArgGroup = indexerArgGroup; //parser.addArgumentGroup("Misc arguments");
miscArgGroup.addArgument("--xhelp", "--help", "-help")
.help("Show this help message and exit")
.action(new HelpArgumentAction() {
@Override
public void run(ArgumentParser parser, Argument arg, Map<String, Object> attrs, String flag, Object value) throws ArgumentParserException {
StringWriter strWriter = new StringWriter();
parser.printHelp(new PrintWriter(strWriter, true));
String help = strWriter.toString();
int i = help.indexOf(descriptionHead);
String description = help.substring(i).trim();
String usage = help.substring("usage: ".length(), i).trim();
System.out.println(
"MapReduceUsage: export HADOOP_CLASSPATH=$myDependencyJarPaths; hadoop jar $myDriverJar \n" + CrunchIndexerTool.class.getName()
+ " --libjars $myDependencyJarFiles [MapReduceGenericOptions]...\n"
+ " " + usage + "\n"
+ "\n"
+ "SparkUsage: spark-submit [SparkGenericOptions]... "
+ "--master local|yarn --deploy-mode client|cluster\n"
+ "--jars $myDependencyJarFiles --class " + CrunchIndexerTool.class.getName() + " $myDriverJar\n"
+ " " + usage + "\n"
+ "\n"
+ description + "\n"
+ "\n"
+ "SparkGenericOptions: To print all options run 'spark-submit --help'\n"
+ "\n"
+ "MapReduceGenericOptions: " + ToolRunnerHelpFormatter.getGenericCommandUsage()
);
System.out.println(
"Examples: \n\n"
+ "# Prepare - Copy input files into HDFS:\n"
+ "hadoop fs -copyFromLocal src/test/resources/test-documents/hello1.txt hdfs:/user/systest/input/\n"
+ "\n"
+ "# Prepare variables for convenient reuse:\n"
+ "export myDriverJarDir=target # for build from git\n"
+ "export myDriverJarDir=/opt/cloudera/parcels/CDH/lib/solr/contrib/crunch # for CDH with parcels\n"
+ "export myDriverJarDir=/usr/lib/solr/contrib/crunch # for CDH with packages\n"
+ "export myDependencyJarDir=target/lib # for build from git\n"
+ "export myDependencyJarDir=/opt/cloudera/parcels/CDH/lib/search/lib/search-crunch # for CDH with parcels\n"
+ "export myDependencyJarDir=/usr/lib/search/lib/search-crunch # for CDH with packages\n"
+ "export myDriverJar=$(find $myDriverJarDir -maxdepth 1 -name '*.jar' ! -name '*-job.jar' ! -name '*-sources.jar')\n"
+ "export myDependencyJarFiles=$(find $myDependencyJarDir -name '*.jar' | sort | tr '\\n' ',' | head -c -1)\n"
+ "export myDependencyJarPaths=$(find $myDependencyJarDir -name '*.jar' | sort | tr '\\n' ':' | head -c -1)\n"
+ "\n"
+ "# MapReduce on Yarn - Ingest text file line by line into Solr:\n"
+ "export HADOOP_CLASSPATH=$myDependencyJarPaths; hadoop \\\n"
+ " --config /etc/hadoop/conf.cloudera.YARN-1 \\\n"
+ " jar $myDriverJar " + CrunchIndexerTool.class.getName() + " \\\n"
+ " --libjars $myDependencyJarFiles \\\n"
+ " -D 'mapred.child.java.opts=-Xmx500m' \\\n"
+ " -D morphlineVariable.ZK_HOST=$(hostname):2181/solr \\\n"
+ " --files src/test/resources/test-documents/string.avsc \\\n"
+ " --morphline-file src/test/resources/test-morphlines/loadSolrLine.conf \\\n"
+ " --pipeline-type mapreduce \\\n"
+ " --chatty \\\n"
+ " --log4j src/test/resources/log4j.properties \\\n"
+ " /user/systest/input/hello1.txt\n"
+ "\n"
+ "# Spark in Local Mode (for rapid prototyping) - Ingest into Solr:\n"
+ "spark-submit \\\n"
+ " --master local \\\n"
+ " --deploy-mode client \\\n"
+ " --jars $myDependencyJarFiles \\\n"
+ " --executor-memory 500M \\\n"
+ " # --driver-library-path /opt/cloudera/parcels/CDH/lib/hadoop/lib/native # for Snappy on CDH with parcels\\\n"
+ " # --driver-library-path /usr/lib/hadoop/lib/native # for Snappy on CDH with packages \\\n"
+ " --class " + CrunchIndexerTool.class.getName() + " \\\n"
+ " $myDriverJar \\\n"
+ " -D morphlineVariable.ZK_HOST=$(hostname):2181/solr \\\n"
+ " --morphline-file src/test/resources/test-morphlines/loadSolrLine.conf \\\n"
+ " --pipeline-type spark \\\n"
+ " --chatty \\\n"
+ " --log4j src/test/resources/log4j.properties \\\n"
+ " /user/systest/input/hello1.txt\n"
+ "\n"
+ "# Spark on Yarn in Client Mode (for testing) - Ingest into Solr:\n"
+ "Same as above, except replace '--master local' with '--master yarn'\n"
+ "\n"
+ "# View the yarn executor log files (there is no GUI yet):\n"
+ "yarn logs --applicationId $application_XYZ\n"
+ "\n"
+ "# Spark on Yarn in Cluster Mode (for production) - Ingest into Solr:\n"
+ "spark-submit \\\n"
+ " --master yarn \\\n"
+ " --deploy-mode cluster \\\n"
+ " --jars $myDependencyJarFiles \\\n"
+ " --executor-memory 500M \\\n"
+ " --class " + CrunchIndexerTool.class.getName() + " \\\n"
+ " --files src/test/resources/log4j.properties,src/test/resources/test-morphlines/loadSolrLine.conf \\\n"
+ " $myDriverJar \\\n"
+ " -D hadoop.tmp.dir=/tmp \\\n"
+ " -D morphlineVariable.ZK_HOST=$(hostname):2181/solr \\\n"
+ " --morphline-file loadSolrLine.conf \\\n"
+ " --pipeline-type spark \\\n"
+ " --chatty \\\n"
+ " --log4j log4j.properties \\\n"
+ " /user/systest/input/hello1.txt\n"
);
throw new FoundHelpArgument(); // Trick to prevent processing of any remaining arguments
}
});
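// The custom HelpArgumentAction above replaces argparse4j's auto-generated usage header with
// MapReduce- and Spark-specific invocation templates while reusing the generated option
// descriptions, then aborts further argument processing via FoundHelpArgument.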
Argument mappersArg = miscArgGroup.addArgument("--mappers")
.metavar("INTEGER")
.type(Integer.class)
.choices(new RangeArgumentChoice(-1, Integer.MAX_VALUE)) // TODO: also support X% syntax where X is an integer
.setDefault(-1)
.help("Tuning knob that indicates the maximum number of MR mapper tasks to use. -1 indicates use all map slots " +
"available on the cluster. This parameter only applies to non-splittable input files");
Argument dryRunArg = miscArgGroup.addArgument("--dry-run")
.action(Arguments.storeTrue())
.help("Run the pipeline but print documents to stdout instead of loading them into Solr. " +
"This can be used for quicker turnaround during early trial & debug sessions.");
Argument log4jConfigFileArg = miscArgGroup.addArgument("--log4j")
.metavar("FILE")
.type(new FileArgumentType().verifyExists().verifyIsFile().verifyCanRead())
.help("Relative or absolute path to a log4j.properties config file on the local file system. This file " +
"will be uploaded to each remote task. Example: /path/to/log4j.properties");
Argument verboseArg = miscArgGroup.addArgument("--chatty")
.action(Arguments.storeTrue())
.help("Turn on verbose output.");
Namespace ns;
try {