.withOption(dictOutOpt).withOption(powerOpt).withOption(outWriterOpt).withOption(maxDFPercentOpt)
.withOption(weightOpt).withOption(minDFOpt).create();
try {
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
if (cmdLine.hasOption(helpOpt)) {
CommandLineUtil.printHelp(group);
return;
}
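// TODO: "Springify" all this -- presumably: replace the hand-wired option handling below with Spring-style configuration.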
if (cmdLine.hasOption(inputOpt)) { // Lucene case
File file = new File(cmdLine.getValue(inputOpt).toString());
if (file.exists() && file.isDirectory()) {
long maxDocs = Long.MAX_VALUE;
if (cmdLine.hasOption(maxOpt)) {
maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
}
if (maxDocs < 0) {
throw new IllegalArgumentException("maxDocs must be >= 0");
}
Directory dir = FSDirectory.open(file);
IndexReader reader = IndexReader.open(dir, true);
Weight weight;
if (cmdLine.hasOption(weightOpt)) {
String wString = cmdLine.getValue(weightOpt).toString();
if (wString.equalsIgnoreCase("tf")) {
weight = new TF();
} else if (wString.equalsIgnoreCase("tfidf")) {
weight = new TFIDF();
} else {
throw new OptionException(weightOpt);
}
} else {
weight = new TFIDF();
}
String field = cmdLine.getValue(fieldOpt).toString();
int minDf = 1;
if (cmdLine.hasOption(minDFOpt)) {
minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
}
int maxDFPercent = 99;
if (cmdLine.hasOption(maxDFPercentOpt)) {
maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
}
TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
double norm = LuceneIterable.NO_NORMALIZING;
if (cmdLine.hasOption(powerOpt)) {
String power = cmdLine.getValue(powerOpt).toString();
if (power.equals("INF")) {
norm = Double.POSITIVE_INFINITY;
} else {
norm = Double.parseDouble(power);
}
}
String idField = null;
if (cmdLine.hasOption(idFieldOpt)) {
idField = cmdLine.getValue(idFieldOpt).toString();
}
LuceneIterable iterable;
if (norm == LuceneIterable.NO_NORMALIZING) {
iterable = new LuceneIterable(reader, idField, field, mapper, LuceneIterable.NO_NORMALIZING);
} else {
iterable = new LuceneIterable(reader, idField, field, mapper, norm);
}
String outFile = cmdLine.getValue(outputOpt).toString();
log.info("Output File: {}", outFile);
VectorWriter vectorWriter;
if (cmdLine.hasOption(outWriterOpt)) {
String outWriter = cmdLine.getValue(outWriterOpt).toString();
if (outWriter.equals("file")) {
BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
vectorWriter = new JWriterVectorWriter(writer);
} else {
vectorWriter = getSeqFileWriter(outFile);
}
} else {
vectorWriter = getSeqFileWriter(outFile);
}
long numDocs = vectorWriter.write(iterable, maxDocs);
vectorWriter.close();
log.info("Wrote: {} vectors", numDocs);
String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString()
: "\t";
File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
log.info("Dictionary Output file: {}", dictOutFile);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, delimiter, field);
tiWriter.write(termInfo);