Path p = new Path(indexRootPath);
if (!fs.exists(p)) {
LOG.info("Index path doesn't exist, creating...");
fs.mkdirs(p);
}
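// Set up the retrieval environment rooted at the index directory.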
RetrievalEnvironment env = new RetrievalEnvironment(indexRootPath, fs);
// Build docno mapping from raw collection
Path mappingFile = env.getDocnoMappingData();
if (!fs.exists(mappingFile)) {
LOG.info(mappingFile + " doesn't exist, creating...");
String[] arr = new String[] { rawCollection, indexRootPath + "/wiki-docid-tmp",
mappingFile.toString(), Integer.toString(numMappers) };
BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
tool.setConf(conf);
tool.run(arr);
fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
} else {
LOG.info(mappingFile + " already exists");
}
// Repack Wikipedia into sequential compressed block
p = new Path(seqCollection);
if (!fs.exists(p)) {
LOG.info(seqCollection + " doesn't exist, creating...");
String[] arr = new String[] { rawCollection, seqCollection, mappingFile.toString(), "block"};
RepackWikipedia tool = new RepackWikipedia();
tool.setConf(conf);
tool.run(arr);
}
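// Common job configuration shared by the preprocessing steps below.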
conf.set("Ivory.CollectionName", "Wikipedia-"+collectionLang);
conf.setInt("Ivory.NumMapTasks", numMappers);
conf.setInt("Ivory.NumReduceTasks", numReducers);
conf.set("Ivory.CollectionPath", seqCollection);
conf.set("Ivory.IndexPath", indexRootPath);
conf.set("Ivory.InputFormat", "org.apache.hadoop.mapred.SequenceFileInputFormat");
conf.set("Ivory.DocnoMappingClass", "edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMapping");
conf.set("Ivory.Tokenizer", tokenizerClass); //"ivory.tokenize.OpenNLPTokenizer"
conf.setInt("Ivory.MinDf", MinDF);
conf.setInt("Ivory.MaxDf", Integer.MAX_VALUE);
// Builds term doc vectors from document collection, and filters the terms that are not included in Ivory.SrcVocab
long startTime = System.currentTimeMillis();
long preprocessStartTime = System.currentTimeMillis();
LOG.info("Building term doc vectors...");
BuildTermDocVectors termDocVectorsTool = new BuildTermDocVectors(conf);
termDocVectorsTool.run();
LOG.info("Job finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
// Get CF and DF counts
startTime = System.currentTimeMillis();
LOG.info("Counting terms...");
GetTermCount termCountWithDfAndCfTool = new GetTermCount(conf);
termCountWithDfAndCfTool.run();
LOG.info("TermCount = "+env.readCollectionTermCount()+"\nJob finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
// Build a map from terms to sequentially generated integer term ids
startTime = System.currentTimeMillis();
conf.setInt("Ivory.TermIndexWindow", TermIndexWindow);
LOG.info("Building term-to-integer id mapping...");
BuildTermIdMap termIDsDfCfTool = new BuildTermIdMap(conf);
termIDsDfCfTool.run();
LOG.info("Job finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
// Compute term weights, and output weighted term doc vectors
startTime = System.currentTimeMillis();
LOG.info("Building weighted term doc vectors...");
conf.set("Ivory.ScoringModel", "ivory.pwsim.score.Bm25");
conf.setInt("Ivory.MinNumTerms", MinNumTermsPerArticle);
conf.setBoolean("Ivory.Normalize", false);
if (mode == CROSS_LINGUAL_F) {
// Translate term doc vectors into English.
BuildTranslatedTermDocVectors weightedTermVectorsTool = new BuildTranslatedTermDocVectors(conf);
weightedTermVectorsTool.run();
} else {
// Compute weighted term doc vectors directly (no translation).
BuildWeightedTermDocVectors weightedTermVectorsTool = new BuildWeightedTermDocVectors(conf);
weightedTermVectorsTool.run();
}
LOG.info("Job finished in "+(System.currentTimeMillis()-startTime)/1000.0+" seconds");
// normalize (optional) and convert weighted term doc vectors into int doc vectors for efficiency
startTime = System.currentTimeMillis();
LOG.info("Building weighted integer doc vectors...");
conf.setBoolean("Ivory.Normalize", IsNormalized);
if (mode == MONO_LINGUAL) {
new BuildIntDocVectors(conf).run();
new BuildWeightedIntDocVectors(conf).run();
LOG.info("Job BuildWeightedIntDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
} else {
BuildTargetLangWeightedIntDocVectors weightedIntVectorsTool = new BuildTargetLangWeightedIntDocVectors(conf);
int finalNumDocs = weightedIntVectorsTool.run();
LOG.info("Job BuildTargetLangWeightedIntDocVectors finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
if (finalNumDocs > 0) {
LOG.info("Changed doc count from " + env.readCollectionDocumentCount() + " to " + finalNumDocs);
env.writeCollectionDocumentCount(finalNumDocs);
}
// Set CollectionTermCount to the size of the target vocabulary, since all docs are translated into that vocabulary. This property is read by WriteRandomVectors via RunComputeSignatures.
Vocab engVocabH = null;
try {
engVocabH = HadoopAlign.loadVocab(new Path(conf.get("Ivory.FinalVocab")), conf);
} catch (IOException e) {
e.printStackTrace();
}
if (engVocabH != null) {
LOG.info("Changed term count from " + env.readCollectionTermCount() + " to " + engVocabH.size());
env.writeCollectionTermCount(engVocabH.size());
}
}
LOG.info("Preprocessing job finished in "+(System.currentTimeMillis()-preprocessStartTime)/1000.0+" seconds");
return 0;