public int run(String[] args) throws Exception {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
Option inputDirOpt = DefaultOptionCreator.inputOption().create();
Option outputDirOpt = DefaultOptionCreator.outputOption().create();
Option minSupportOpt = obuilder.withLongName("minSupport").withArgument(
abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription(
"(Optional) Minimum Support. Default Value: 2").withShortName("s").create();
Option analyzerNameOpt = obuilder.withLongName("analyzerName").withArgument(
abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create()).withDescription(
"The class name of the analyzer").withShortName("a").create();
Option chunkSizeOpt = obuilder.withLongName("chunkSize").withArgument(
abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).withDescription(
"The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();
Option weightOpt = obuilder.withLongName("weight").withRequired(false).withArgument(
abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
"The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();
Option minDFOpt = obuilder.withLongName("minDF").withRequired(false).withArgument(
abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
"The minimum document frequency. Default is 1").withShortName("md").create();
Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
"The max percentage of docs for the DF. Can be used to remove really high frequency terms."
+ " Expressed as an integer between 0 and 100. Default is 99. If maxDFSigma is also set, it will override this value.").withShortName("x").create();
Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false).withArgument(
abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create()).withDescription(
"What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors." +
" Can be used to remove really high frequency terms."
+ " Expressed as a double value. Good value to be specified is 3.0. In case the value is less then 0 no vectors " +
"will be filtered out. Default is -1.0. Overrides maxDFPercent").withShortName("xs").create();
Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false).withArgument(
abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()).withDescription(
"(Optional)The minimum Log Likelihood Ratio(Float) Default is " + LLRReducer.DEFAULT_MIN_LLR)
.withShortName("ml").create();
Option numReduceTasksOpt = obuilder.withLongName("numReducers").withArgument(
abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create()).withDescription(
"(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr").create();
Option powerOpt = obuilder.withLongName("norm").withRequired(false).withArgument(
abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
"The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm. "
+ "Must be greater or equal to 0. The default is not to normalize").withShortName("n").create();
Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
.withDescription(
"(Optional) Whether output vectors should be logNormalize. If set true else false")
.withShortName("lnorm").create();
Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false).withArgument(
abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
.withDescription(
"(Optional) The maximum size of ngrams to create"
+ " (2 = bigrams, 3 = trigrams, etc) Default Value:1").withShortName("ng").create();
Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
.withDescription(
"(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
.withShortName("seq").create();
Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
.withDescription(
"(Optional) Whether output vectors should be NamedVectors. If set true else false")
.withShortName("nv").create();
Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false).withDescription(
"If set, overwrite the output directory").withShortName("ow").create();
Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
.create();
Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
.withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
.withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)