if (!fs.exists(p)) {
sLogger.info("index directory doesn't exist, creating...");
fs.mkdirs(p);
}
RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

// Look for the docno mapping, which maps from docid (String) to docno
// (sequentially-numbered integer). If it doesn't exist, create it.
Path mappingFile = env.getDocnoMappingData();
Path mappingDir = env.getDocnoMappingDirectory();

// Test for the mapping file itself. Testing p here would be wrong: p is
// created earlier whenever it is missing, so that check could never fire
// and the mapping would never be generated.
if (!fs.exists(mappingFile)) {
  sLogger.info("docno-mapping.dat doesn't exist, creating...");

  // Arguments handed to NumberTextDocuments, in order: collection,
  // mapping directory, mapping file, number of mappers.
  String[] args = new String[] {
      collection,
      mappingDir.toString(),
      mappingFile.toString(),
      Integer.toString(numMappers)
  };

  NumberTextDocuments tool = new NumberTextDocuments();
  tool.setConf(conf);
  tool.run(args);

  // Recursively remove the mapping directory once the tool has finished.
  // NOTE(review): presumably it only holds intermediate job output and
  // the mapping file lives elsewhere -- confirm against NumberTextDocuments.
  fs.delete(mappingDir, true);
}
// Now we're ready to start the preprocessing pipeline... set
// appropriate properties.
conf.setInt("Ivory.NumMapTasks", numMappers);
conf.setInt("Ivory.NumReduceTasks", numReducers);

// Collection description: where it lives and how it is read and tokenized.
conf.set("Ivory.CollectionName", "TextCollection");
conf.set("Ivory.CollectionPath", collection);
conf.set("Ivory.IndexPath", indexRootPath);
conf.set("Ivory.InputFormat", "edu.umd.cloud9.collection.line.TextDocumentInputFormat");
conf.set("Ivory.Tokenizer", "ivory.tokenize.GalagoTokenizer");

// Docno mapping: implementation class and the location of its data file.
// This key must be set exactly once; a later set of the same key to ""
// would replace the path and leave the mapping file location empty.
conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.line.TextDocnoMapping");
conf.set("Ivory.DocnoMappingFile", env.getDocnoMappingData().toString());
conf.setInt("Ivory.DocnoOffset", 0); // docnos start at 1

// Document-frequency cutoffs for terms.
conf.setInt("Ivory.MinDf", 2); // toss away singleton terms
conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);