// Dedup runs as three chained MapReduce jobs:
//   1. group index docs by URL, keeping the newest doc per URL;
//   2. group survivors by MD5 content hash, keeping one doc per hash;
//   3. delete the remaining duplicates from the source index(es).
// The first two jobs write to temporary dirs; a random suffix keeps
// concurrent dedup runs from colliding on the same working paths.
Random random = new Random();
Path outDir1 =
  new Path("dedup-urls-" + Integer.toString(random.nextInt(Integer.MAX_VALUE)));
Path outDir2 =
  new Path("dedup-hash-" + Integer.toString(random.nextInt(Integer.MAX_VALUE)));

try {
  // Job 1: "urls by time" -- read all index dirs, map docs keyed by URL
  // (Text), reduce via UrlsReducer, and emit survivors keyed by MD5Hash
  // into a SequenceFile for the next phase.
  JobConf job = new NutchJob(getConf());
  for (int i = 0; i < indexDirs.length; i++) {
    if (LOG.isInfoEnabled()) {
      LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
    }
    FileInputFormat.addInputPath(job, indexDirs[i]);
  }
  job.setJobName("dedup 1: urls by time");
  job.setInputFormat(InputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IndexDoc.class);
  job.setReducerClass(UrlsReducer.class);
  FileOutputFormat.setOutputPath(job, outDir1);
  job.setOutputKeyClass(MD5Hash.class);
  job.setOutputValueClass(IndexDoc.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  JobClient.runJob(job);

  // Job 2: "content by hash" -- partition by MD5Hash, reduce via
  // HashReducer. Speculative execution is disabled so duplicate reduce
  // attempts cannot emit conflicting keep/delete decisions.
  job = new NutchJob(getConf());
  job.setJobName("dedup 2: content by hash");
  FileInputFormat.addInputPath(job, outDir1);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapOutputKeyClass(MD5Hash.class);
  job.setMapOutputValueClass(IndexDoc.class);
  job.setPartitionerClass(HashPartitioner.class);
  job.setSpeculativeExecution(false);
  job.setReducerClass(HashReducer.class);
  FileOutputFormat.setOutputPath(job, outDir2);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IndexDoc.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  JobClient.runJob(job);

  // outDir1 has been fully consumed by job 2; drop it early to free
  // space before job 3 runs (the finally-delete below is then a no-op).
  fs.delete(outDir1, true);

  // Job 3: "delete from index(es)" -- DeleteDuplicates serves as mapper,
  // reducer and output format, performing the actual index deletions.
  job = new NutchJob(getConf());
  job.setJobName("dedup 3: delete from index(es)");
  FileInputFormat.addInputPath(job, outDir2);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setInt("io.file.buffer.size", 4096);
  job.setMapperClass(DeleteDuplicates.class);
  job.setReducerClass(DeleteDuplicates.class);
  job.setOutputFormat(DeleteDuplicates.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  JobClient.runJob(job);
} finally {
  // Always remove the temp dirs, even when one of the jobs failed
  // (JobClient.runJob throws IOException on job failure). Deleting an
  // already-removed or never-created path is harmless: FileSystem.delete
  // just returns false for a nonexistent path.
  fs.delete(outDir1, true);
  fs.delete(outDir2, true);
}