int blockSize = getConf().getInt("Ivory.BlockSize", -1);
int topN = getConf().getInt("Ivory.TopN", -1);
FileSystem fs = FileSystem.get(getConf());
RetrievalEnvironment re = new RetrievalEnvironment(indexPath, fs);
String collectionName = re.readCollectionName();
int numDocs = re.readCollectionDocumentCount();
Path docLengthPath = re.getDoclengthsData();
String scoringModel = getConf().get("Ivory.ScoringModel");
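// Log collection characteristics and job parameters for this run.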
sLogger.info("Characteristics of the collection:");
sLogger.info(" - CollectionName: " + collectionName);
sLogger.info(" - IndexPath: " + indexPath);
sLogger.info("Characteristics of the job:");
sLogger.info(" - NumMapTasks: " + mapTasks);
sLogger.info(" - NumReduceTasks: " + reduceTasks);
sLogger.info(" - DfCut: " + getConf().getInt("Ivory.DfCut", 0));
sLogger.info(" - BlockSize: " + blockSize);
sLogger.info(" - ScoringModel: " + scoringModel);
sLogger.info(" - topN: " + topN);
sLogger.info(" - OutputPath: " + outputPath);
getConf().setInt("Ivory.CollectionDocumentCount", numDocs);
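// Skip the computation entirely if output from a previous run is already present.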
if (fs.exists(new Path(outputPath))) {
sLogger.info("PCP output path already exists!");
return 0;
}
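// Split the document id space into fixed-size blocks and launch one MapReduce job per block.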
int numBlocks = (numDocs + blockSize - 1) / blockSize; // ceiling division: avoids an empty trailing block when numDocs is a multiple of blockSize
for (int i = 0; i < numBlocks; i++) {
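// Document id range covered by this block; the last block extends to numDocs.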
int start = blockSize * i;
int end = i == numBlocks - 1 ? numDocs : blockSize * (i + 1);
JobConf conf = new JobConf(getConf(), PCP.class);
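// Ship the document-length data to every task via the distributed cache.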
DistributedCache.addCacheFile(docLengthPath.toUri(), conf);
sLogger.info("block " + i + ": " + start + "-" + end);
conf.setInt("Ivory.BlockStart", start);
conf.setInt("Ivory.BlockEnd", end);
conf.setJobName("PCP:" + collectionName + "-dfCut=" + dfCut
+ (topN > 0 ? "-topN" + topN : "-all") + ":Block #" + i);
conf.setNumMapTasks(mapTasks);
conf.setNumReduceTasks(reduceTasks);
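// Each block writes to its own subdirectory under the main output path; the job reads the index's postings as input.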
String currentOutputPath = outputPath + "/block" + i;
FileInputFormat.setInputPaths(conf, new Path(re.getPostingsDirectory()));
FileOutputFormat.setOutputPath(conf, new Path(currentOutputPath));
conf.setInputFormat(SequenceFileInputFormat.class);
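// Job output: an IntWritable document id paired with an HMapIFW (int-to-float map) value.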
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(HMapIFW.class);