"-output_path=" + indexRootPath + "/wiki-docid-tmp",
"-output_file=" + mappingFile.toString(),
"-lang=" + collectionLang };
LOG.info("Running BuildWikipediaDocnoMapping with args " + Arrays.toString(arr));
BuildWikipediaDocnoMapping tool = new BuildWikipediaDocnoMapping();
tool.setConf(conf);
tool.run(arr);
fs.delete(new Path(indexRootPath + "/wiki-docid-tmp"), true);
}
// Repack the raw Wikipedia collection (rawCollection) into seqCollection by running the
// RepackWikipedia tool with block compression. The tool is also handed mappingFile, the same
// file that is passed as -output_file to BuildWikipediaDocnoMapping earlier in this method.
p = new Path(seqCollection);
// NOTE(review): this message is logged unconditionally in the code visible here; no
// existence check on p appears in this section. Confirm whether a guard was intended.
LOG.info(seqCollection + " doesn't exist, creating...");
// Tool arguments are passed as "-name=value" strings.
String[] arr = new String[] { "-input=" + rawCollection,
"-output=" + seqCollection,
"-mapping_file=" + mappingFile.toString(),
"-compression_type=block",
"-wiki_language=" + collectionLang };
LOG.info("Running RepackWikipedia with args " + Arrays.toString(arr));
RepackWikipedia tool = new RepackWikipedia();
// Give the tool the same conf object that is updated with collection settings further below.
tool.setConf(conf);
// NOTE(review): any value returned by run() is discarded, so a failure reported through a
// return code (rather than an exception) would not stop the statements that follow. TODO confirm.
tool.run(arr);
// Store the collection settings in conf: a name derived from the collection language,
// the map and reduce task counts, and the collection path.
conf.set(Constants.CollectionName, "Wikipedia-"+collectionLang);
conf.setInt(Constants.NumMapTasks, numMappers);
conf.setInt(Constants.NumReduceTasks, numReducers);
// The collection path is seqCollection, i.e. the -output location given to RepackWikipedia.
conf.set(Constants.CollectionPath, seqCollection);