.create();
try {
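// Parse the arguments against the option group defined above; show usage and exit if help was requested.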
Parser parser = new Parser();
parser.setGroup(group);
parser.setHelpOption(helpOpt);
CommandLine cmdLine = parser.parse(args);
if (cmdLine.hasOption(helpOpt)) {
CommandLineUtil.printHelp(group);
return -1;
}
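// Required input and output locations.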
Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));
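// Dictionary chunk size in MB (default 100), used to split the dictionary across the partial-vector passes.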
int chunkSize = 100;
if (cmdLine.hasOption(chunkSizeOpt)) {
chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
}
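// Minimum number of occurrences a term must have to be kept in the dictionary (default 2).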
int minSupport = 2;
if (cmdLine.hasOption(minSupportOpt)) {
String minSupportString = (String) cmdLine.getValue(minSupportOpt);
minSupport = Integer.parseInt(minSupportString);
}
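// Maximum n-gram size (default 1); values greater than 1 turn on collocation-based n-gram generation.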
int maxNGramSize = 1;
if (cmdLine.hasOption(maxNGramSizeOpt)) {
try {
maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
} catch (NumberFormatException ex) {
log.warn("Could not parse n-gram size option: {}", cmdLine.getValue(maxNGramSizeOpt));
}
}
log.info("Maximum n-gram size is: {}", maxNGramSize);
if (cmdLine.hasOption(overwriteOutput)) {
HadoopUtil.delete(getConf(), outputDir);
}
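// Minimum log-likelihood ratio used to filter generated collocations (only meaningful when n-grams are enabled).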
float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
if (cmdLine.hasOption(minLLROpt)) {
minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
}
log.info("Minimum LLR value: {}", minLLRValue);
int reduceTasks = 1;
if (cmdLine.hasOption(numReduceTasksOpt)) {
reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
}
log.info("Number of reduce tasks: {}", reduceTasks);
Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
if (cmdLine.hasOption(analyzerNameOpt)) {
String className = cmdLine.getValue(analyzerNameOpt).toString();
analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
// Try instantiating the analyzer now; there is no point in setting it
// if it cannot be instantiated.
ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
}
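// Term weighting scheme: "tf" for raw term frequency, or "tfidf" (the default) to also run the IDF pass.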
boolean processIdf;
if (cmdLine.hasOption(weightOpt)) {
String wString = cmdLine.getValue(weightOpt).toString();
if ("tf".equalsIgnoreCase(wString)) {
processIdf = false;
} else if ("tfidf".equalsIgnoreCase(wString)) {
processIdf = true;
} else {
throw new OptionException(weightOpt);
}
} else {
processIdf = true;
}
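// Document-frequency pruning: minimum DF count and maximum DF as a percentage of all documents.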
int minDf = 1;
if (cmdLine.hasOption(minDFOpt)) {
minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
}
int maxDFPercent = 99;
if (cmdLine.hasOption(maxDFPercentOpt)) {
maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
}
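// If >= 0, enables standard-deviation-based document-frequency pruning, taking precedence over the percentage cap.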
double maxDFSigma = -1.0;
if (cmdLine.hasOption(maxDFSigmaOpt)) {
maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
}
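// Optional Lp normalization power; "INF" selects the infinity norm, and the default is no normalization.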
float norm = PartialVectorMerger.NO_NORMALIZING;
if (cmdLine.hasOption(powerOpt)) {
String power = cmdLine.getValue(powerOpt).toString();
if ("INF".equals(power)) {
norm = Float.POSITIVE_INFINITY;
} else {
norm = Float.parseFloat(power);
}
}
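// Whether to log-normalize the vector weights.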
boolean logNormalize = cmdLine.hasOption(logNormalizeOpt);
Configuration conf = getConf();
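// Tokenize the input documents with the chosen Analyzer into an intermediate folder under the output directory.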
Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
// TODO: move this into DictionaryVectorizer, and then fold SparseVectorsFrom with EncodedVectorsFrom to have one framework for all of this.
DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);
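// Output format flags: sequential-access vectors and/or named vectors.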
boolean sequentialAccessOutput = cmdLine.hasOption(sequentialAccessVectorOpt);
boolean namedVectors = cmdLine.hasOption(namedVectorOpt);
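// Enable sigma pruning only when a non-negative maxDFSigma was supplied, and stage TF vectors in a separate folder if they will be pruned.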
boolean shouldPrune = maxDFSigma >= 0.0;
String tfDirName = shouldPrune
    ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
    : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;