"-output_path=" + indexRootPath + "/wiki-docid-tmp",
"-output_file=" + mappingFile.toString(),
"-wiki_language=" + collectionLang };
LOG.info("Running BuildWikipediaDocnoMapping with args " + Arrays.toString(arr));
BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
tool.setConf(conf);
tool.run(arr);
fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
} else {
LOG.info("Docno mapping already exists at: " + mappingFile);
}
// Repack the raw Wikipedia collection with block compression, unless a
// part-00000 file is already present under seqCollection (in which case the
// repacking step is skipped and only a log line is written).
if (fs.exists(new Path(seqCollection + "/part-00000"))) {
  LOG.info("Repacked collection already exists at: " + seqCollection);
} else {
  LOG.info(seqCollection + " doesn't exist, creating...");

  // Command-line style arguments handed to the RepackWikipedia tool.
  String[] arr = {
      "-input=" + rawCollection,
      "-output=" + seqCollection,
      "-mapping_file=" + mappingFile.toString(),
      "-compression_type=block",
      "-wiki_language=" + collectionLang
  };
  LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));

  // The tool shares this driver's configuration object.
  // NOTE(review): anything returned by run(...) is discarded here, so a
  // failure that is reported via a return value (rather than an exception)
  // would go unnoticed -- confirm whether that is intended.
  RepackWikipedia tool = new RepackWikipedia();
  tool.setConf(conf);
  tool.run(arr);
}
conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);