// Make sure the index root directory exists before anything is written under it.
Path p = new Path(indexRootPath);
boolean indexRootMissing = !fs.exists(p);
if (indexRootMissing) {
  LOG.info("Index path doesn't exist, creating...");
  fs.mkdirs(p);
}
// Handle used below to locate and read/write index metadata under the index root.
RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);

// Step 1: docno mapping. Built from the raw collection only when the mapping
// file is not already present, so an earlier run's output is reused.
Path mappingFile = env.getDocnoMappingData();
if (fs.exists(mappingFile)) {
  LOG.info("Docno mapping already exists at: " + mappingFile);
} else {
  LOG.info(mappingFile + " doesn't exist, creating...");

  // Scratch directory handed to the mapping tool; removed again once it returns.
  String docnoScratchDir = indexRootPath + "/wiki-docid-tmp";
  String[] docnoArgs = {
      "-input=" + rawCollection,
      "-output_path=" + docnoScratchDir,
      "-output_file=" + mappingFile.toString(),
      "-wiki_language=" + collectionLang
  };
  LOG.info("Running BuildWikipediaDocnoMapping with args " + Arrays.toString(docnoArgs));

  BuildWikipediaDocnoMapping docnoTool = new BuildWikipediaDocnoMapping();
  docnoTool.setConf(conf);
  // NOTE(review): the tool's return value is not inspected here.
  docnoTool.run(docnoArgs);

  // Recursive delete of the scratch directory.
  fs.delete(new Path(docnoScratchDir), true);
}
// Step 2: repack the raw Wikipedia collection into block-compressed sequence
// files. The presence of "part-00000" under seqCollection is taken as the sign
// that a previous run already produced the repacked collection.
Path firstRepackedPart = new Path(seqCollection + "/part-00000");
if (fs.exists(firstRepackedPart)) {
  LOG.info("Repacked collection already exists at: " + seqCollection);
} else {
  LOG.info(seqCollection + " doesn't exist, creating...");

  String[] repackArgs = {
      "-input=" + rawCollection,
      "-output=" + seqCollection,
      "-mapping_file=" + mappingFile.toString(),
      "-compression_type=block",
      "-wiki_language=" + collectionLang
  };
  LOG.info("Running RepackWikipedia with args " + Arrays.toString(repackArgs));

  RepackWikipedia repackTool = new RepackWikipedia();
  repackTool.setConf(conf);
  // NOTE(review): the tool's return value is not inspected here.
  repackTool.run(repackArgs);
}
conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
conf.setInt(Constants.NumMapTasks, numMappers);
conf.setInt(Constants.NumReduceTasks, numReducers);
conf.set(Constants.CollectionPath, seqCollection);
conf.set(Constants.IndexPath, indexRootPath);
conf.set(Constants.InputFormat, SequenceFileInputFormat.class.getCanonicalName());
conf.set(Constants.DocnoMappingClass, WikipediaDocnoMapping.class.getCanonicalName());
conf.set(Constants.Tokenizer, tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer"
conf.setInt(Constants.MinDf, MinDF);
conf.setInt(Constants.MaxDf, Integer.MAX_VALUE);
conf.setInt(Constants.DocnoOffset, 0); // docnos start at 1
conf.setInt(Constants.TermIndexWindow, TermIndexWindow);
// Step 3: build term doc vectors from the document collection. Per the original
// note, terms that are not included in Ivory.SrcVocab are filtered out.
//
// startTime is reset before each job to time that job alone;
// preprocessStartTime is kept to time the whole preprocessing phase.
long startTime = System.currentTimeMillis();
long preprocessStartTime = System.currentTimeMillis();
LOG.info("Building term doc vectors...");
int exitCode = new BuildTermDocVectors(conf).run();
// A negative exit code is treated as failure and aborts the pipeline.
if (exitCode < 0) {
  LOG.info("Error: BuildTermDocVectors. Terminating...");
  return -1;
}
LOG.info("Job BuildTermDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
// Step 4: get CF and DF counts.
startTime = System.currentTimeMillis();
LOG.info("Counting terms...");
exitCode = new ComputeGlobalTermStatistics(conf).run();
// Check the job's outcome BEFORE touching its output: a negative exit code is
// treated as failure, and the collection term count is only read and logged
// once the job is known to have succeeded.
if (exitCode < 0) {
  LOG.info("Error: ComputeGlobalTermStatistics. Terminating...");
  return -1;
}
LOG.info("TermCount = " + env.readCollectionTermCount());
LOG.info("Job ComputeGlobalTermStatistics finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
// Step 5: build the dictionary, i.e. a map from terms to sequentially generated
// integer term ids.
startTime = System.currentTimeMillis();
LOG.info("Building term-to-integer id mapping...");
exitCode = new BuildDictionary(conf).run();
// A negative exit code is treated as failure and aborts the pipeline.
if (exitCode < 0) {
  LOG.info("Error: BuildDictionary. Terminating...");
  return -1;
}
LOG.info("Job BuildDictionary finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
// Step 6: compute term weights and output weighted term doc vectors.
LOG.info("Building weighted term doc vectors...");
startTime = System.currentTimeMillis();

// Scoring settings read by the job launched below.
conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
conf.setBoolean("Ivory.Normalize", IsNormalized);
conf.setInt("Ivory.MinNumTerms",MinNumTermsPerArticle);

// In CROSS_LINGUAL_F mode the term doc vectors are translated into English;
// in every other mode they are weighted directly.
exitCode = (mode == CROSS_LINGUAL_F)
    ? new BuildTranslatedTermDocVectors(conf).run()
    : new BuildWeightedTermDocVectors(conf).run();

// A negative exit code is treated as failure and aborts the pipeline.
if (exitCode < 0) {
  LOG.info("Error: BuildTranslated/WeightedTermDocVectors. Terminating...");
  return -1;
}
LOG.info("Job BuildTranslated/WeightedTermDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
// Step 7: normalize (optional) and convert weighted term doc vectors into int
// doc vectors for efficiency.
startTime = System.currentTimeMillis();
LOG.info("Building weighted integer doc vectors...");
conf.setBoolean("Ivory.Normalize", IsNormalized);
if (mode == MONO_LINGUAL) {
  // Two jobs run back to back. Each exit code is checked on its own so that the
  // second job is never started after the first one has failed.
  exitCode = new BuildIntDocVectors(conf).run();
  if (exitCode < 0) {
    LOG.info("Error: BuildIntDocVectors. Terminating...");
    return -1;
  }
  exitCode = new BuildWeightedIntDocVectors(conf).run();
  if (exitCode < 0) {
    LOG.info("Error: BuildWeightedIntDocVectors. Terminating...");
    return -1;
  }
  LOG.info("Job BuildWeightedIntDocVectors finished in "+(System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
} else {
  BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool =
      new BuildTargetLangWeightedIntDocVectors(conf);
  // Unlike the other jobs, this one's return value is used as the number of
  // documents it output, not as an exit code.
  int finalNumDocs = weightedIntVectorsTool.run();
  LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in " +
      (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
  if (finalNumDocs <= 0) {
    LOG.info("No document output! Terminating...");
    return -1;
  }
  LOG.info("Changed doc count: " + env.readCollectionDocumentCount() +" => " + finalNumDocs);
  env.writeCollectionDocumentCount(finalNumDocs);

  // Set the collection term count to the size of the target vocab, since all
  // docs are translated into that vocab. Per the original note, this property
  // is read by WriteRandomVectors via RunComputeSignatures.
  String finalVocabPath = conf.get("Ivory.FinalVocab");
  Vocab engVocabH;
  try {
    engVocabH = HadoopAlign.loadVocab(new Path(finalVocabPath), conf);
  } catch (IOException e) {
    // Without the vocab the term count cannot be updated; fail the same way the
    // other steps do instead of continuing with a null vocab.
    LOG.error("Error: could not load vocab from " + finalVocabPath + ". Terminating...", e);
    return -1;
  }
  if (engVocabH == null) {
    LOG.error("Error: no vocab loaded from " + finalVocabPath + ". Terminating...");
    return -1;
  }
  LOG.info("Changed term count: " + env.readCollectionTermCount() + " => " + engVocabH.size());
  env.writeCollectionTermCount(engVocabH.size());
}
LOG.info("Preprocessing job finished in " + (System.currentTimeMillis() - preprocessStartTime) / 1000.0 + " seconds");
return 0;