/**
 * Configures and runs the MapReduce job that builds the postings for the index located at
 * {@code Constants.IndexPath}.
 *
 * <p>Settings are read from the tool's {@code Configuration}: the index path, the number of
 * reduce tasks, the minimum split size and the postings list class (defaulting to
 * {@code PostingsListDocSortedPositional}). The collection name and document count are read
 * through the {@code RetrievalEnvironment}. The job reads sequence files from the environment's
 * int-doc-vectors directory and writes sequence files to its postings directory.
 *
 * <p>If the postings directory already exists, no job is run. The postings type is recorded in
 * the environment only after the job reports success.
 *
 * @return {@code 0} if the postings already existed or the job completed successfully;
 *         {@code -1} if the job failed
 * @throws ClassCastException if the configured postings type is not a {@code PostingsList}
 * @throws Exception if configuration, file system access or job submission fails
 */
public int runTool() throws Exception {
  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);

  String indexPath = conf.get(Constants.IndexPath);
  RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

  String collectionName = env.readCollectionName();
  int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
  int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
  int collectionDocCnt = env.readCollectionDocumentCount();

  String postingsType = conf.get(Constants.PostingsListsType,
      ivory.core.data.index.PostingsListDocSortedPositional.class.getCanonicalName());
  // asSubclass verifies the configured class really is a PostingsList, so a bad setting fails
  // here rather than later inside the job, and no unchecked cast is needed.
  Class<? extends PostingsList> postingsClass =
      Class.forName(postingsType).asSubclass(PostingsList.class);

  LOG.info("PowerTool: " + BuildIPInvertedIndexDocSorted.class.getCanonicalName());
  LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
  LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
  LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCnt));
  LOG.info(String.format(" - %s: %s", Constants.PostingsListsType,
      postingsClass.getCanonicalName()));
  LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
  LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));

  Path indexDir = new Path(indexPath);
  if (!fs.exists(indexDir)) {
    fs.mkdirs(indexDir);
  }

  Path inputPath = new Path(env.getIntDocVectorsDirectory());
  Path postingsPath = new Path(env.getPostingsDirectory());

  if (fs.exists(postingsPath)) {
    LOG.info("Postings already exist: no indexing will be performed.");
    return 0;
  }

  conf.setInt(Constants.CollectionDocumentCount, collectionDocCnt);
  conf.setInt("mapred.min.split.size", minSplitSize);
  conf.set("mapred.child.java.opts", "-Xmx2048m");

  Job job = new Job(conf,
      BuildIPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);
  job.setJarByClass(BuildIPInvertedIndexDocSorted.class);

  job.setNumReduceTasks(reduceTasks);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, postingsPath);

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  job.setMapOutputKeyClass(PairOfInts.class);
  job.setMapOutputValueClass(TermPositions.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(postingsClass);

  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);
  job.setPartitionerClass(MyPartitioner.class);

  long startTime = System.currentTimeMillis();
  boolean succeeded = job.waitForCompletion(true);
  double elapsedSeconds = (System.currentTimeMillis() - startTime) / 1000.0;

  if (!succeeded) {
    LOG.error("Job failed after " + elapsedSeconds + " seconds: postings were not built.");
    // The postings directory did not exist before this job started (checked above), so
    // anything there now is partial output. Remove it; otherwise the "already exist" guard
    // would make the next run skip indexing.
    if (fs.exists(postingsPath)) {
      fs.delete(postingsPath, true);
    }
    return -1;
  }

  LOG.info("Job Finished in " + elapsedSeconds + " seconds");

  // Record the postings type only once the postings have actually been written.
  env.writePostingsType(postingsClass.getCanonicalName());

  return 0;
}