Configuration conf = getConf();
FileSystem fs = FileSystem.get(conf);
String indexPath = conf.get(Constants.IndexPath);

// Check the index path FIRST: the RetrievalEnvironment reads collection
// metadata from under this directory, so touching it before this guard
// would fail with an obscure error instead of skipping cleanly.
if (!fs.exists(new Path(indexPath))) {
  LOG.info("index path doesn't exist: skipping!");
  return 0;
}

RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
int reduceTasks = 10;
String collectionName = env.readCollectionName();
String termDocVectorsPath = env.getTermDocVectorsDirectory();
String termDfCfPath = env.getTermDfCfDirectory();

LOG.info("PowerTool: " + ComputeGlobalTermStatistics.class.getCanonicalName());
LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));

// Output already present: treat as done (idempotent re-runs).
Path outputPath = new Path(termDfCfPath);
if (fs.exists(outputPath)) {
  LOG.info("TermDfCf directory exists: skipping!");
  return 0;
}

Job job = new Job(getConf(), ComputeGlobalTermStatistics.class.getSimpleName() + ":"
    + collectionName);
job.setJarByClass(ComputeGlobalTermStatistics.class);
job.setNumReduceTasks(reduceTasks);

FileInputFormat.setInputPaths(job, new Path(termDocVectorsPath));
FileOutputFormat.setOutputPath(job, outputPath);

job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);

job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(PairOfIntLong.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(PairOfIntLong.class);

job.setMapperClass(MyMapper.class);
job.setCombinerClass(MyCombiner.class);
job.setReducerClass(MyReducer.class);

long startTime = System.currentTimeMillis();
// waitForCompletion returns false on job failure; don't ignore it, or we
// would persist statistics computed from a failed (possibly empty) job.
boolean success = job.waitForCompletion(true);
LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

if (!success) {
  LOG.error("Job failed: collection statistics not written.");
  return 1;
}

Counters counters = job.getCounters();
// Write out the term count and collection length. NOTE(review): the term
// count is presumably NOT the number of unique terms in the raw collection,
// because postings for non-English terms are discarded, or dropped as a
// result of the df cut — confirm against MyMapper/MyReducer.
env.writeCollectionTermCount((int) counters.findCounter(Statistics.Terms).getValue());
env.writeCollectionLength(counters.findCounter(Statistics.SumOfDocLengths).getValue());
return 0;
}