weight = new TFIDF();
} else {
throw new IllegalArgumentException("Weight type " + weightType + " is not supported");
}
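// Scan the index once and cache term -> (document frequency, index) mappings,
// dropping terms that fall outside the [minDf, maxDFPercent] bounds.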
TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
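// Lazily streams each indexed document as a weighted Vector; norm is either a
// p-norm value or the NO_NORMALIZING sentinel and is passed through unchanged.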
LuceneIterable iterable = new LuceneIterable(reader, idField, field, termInfo, weight, norm,
maxPercentErrorDocs);
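// Write the vectors to a SequenceFile, stopping after maxDocs when a limit is set.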
log.info("Output File: {}", outFile);
VectorWriter vectorWriter = getSeqFileWriter(outFile);
try {
long numDocs = vectorWriter.write(iterable, maxDocs);
log.info("Wrote: {} vectors", numDocs);
} finally {
Closeables.close(vectorWriter, false);
}
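// Write the term dictionary as delimited text, one term entry per line.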
File dictOutFile = new File(dictOut);
log.info("Dictionary Output file: {}", dictOutFile);
Writer writer = Files.newWriter(dictOutFile, Charsets.UTF_8);
DelimitedTermInfoWriter tiWriter = new DelimitedTermInfoWriter(writer, delimiter, field);
try {
tiWriter.write(termInfo);
} finally {
Closeables.close(tiWriter, false);
}
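// Optionally also emit the dictionary as a SequenceFile of (term, index) pairs
// so downstream Hadoop jobs can read it directly.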
if (!"".equals(seqDictOut)) {
log.info("SequenceFile Dictionary Output file: {}", seqDictOut);
Path path = new Path(seqDictOut);
Configuration conf = new Configuration();
FileSystem fs = path.getFileSystem(conf);
SequenceFile.Writer seqWriter = null;
try {
seqWriter = SequenceFile.createWriter(fs, conf, path, Text.class, IntWritable.class);
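// Reuse a single key/value pair across appends; the writer serializes their
// contents on each call, so no per-entry allocation is needed.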
Text term = new Text();
IntWritable termIndex = new IntWritable();
Iterator<TermEntry> termEntries = termInfo.getAllEntries();
while (termEntries.hasNext()) {
TermEntry termEntry = termEntries.next();
term.set(termEntry.getTerm());
termIndex.set(termEntry.getTermIdx());
seqWriter.append(term, termIndex);