final Path temp = job.getLocalPath("index/_"
    + Integer.toString(new Random().nextInt()));
fs.delete(perm, true); // delete old, if any
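// analyzers are resolved per document at write time, keyed on the
// document's "lang" field (see write() below)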
final AnalyzerFactory factory = new AnalyzerFactory(job);
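// startLocalOutput yields a local scratch path to build the index in;
// the finished index is expected to be promoted to the permanent
// path (perm) when the writer is closed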
final IndexWriter writer = // build locally first
    new IndexWriter(
        FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
        new NutchDocumentAnalyzer(job), true,
        new MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH));
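// apply index-tuning knobs from the job configuration (indexer.* properties)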
writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs",
    Integer.MAX_VALUE));
writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
writer.setInfoStream(LogUtil.getInfoStream(LOG));
writer.setUseCompoundFile(false);
writer.setSimilarity(new NutchSimilarity());
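// the returned RecordWriter unwraps each LuceneDocumentWrapper and adds
// the document to the local index with a language-specific analyzer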
return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() {
  boolean closed;

  public void write(WritableComparable key, LuceneDocumentWrapper value)
      throws IOException { // unwrap & index doc
    Document doc = value.get();
    NutchAnalyzer analyzer = factory.get(doc.get("lang"));
    if (LOG.isInfoEnabled()) {
      LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]"
          + " with analyzer " + analyzer);
    }
    writer.addDocument(doc, analyzer);