// Routes live under webGraphDb in ROUTES_DIR; every job first writes to a
// randomly suffixed sibling directory which is then swapped into place.
Path routes = new Path(webGraphDb, ROUTES_DIR);
Path tempRoute = new Path(webGraphDb,
  ROUTES_DIR + "-" + new Random().nextInt(Integer.MAX_VALUE));

// Configure the initializer job: it reads the outlink and node databases
// and writes Text -> Route records into the temporary routes directory.
JobConf initJob = new NutchJob(conf);
initJob.setJobName("Initializer: " + webGraphDb);

// input side (outlinkDb is registered before nodeDb, as before)
initJob.setInputFormat(SequenceFileInputFormat.class);
FileInputFormat.addInputPath(initJob, outlinkDb);
FileInputFormat.addInputPath(initJob, nodeDb);

// Initializer is used as both the mapper and the reducer
initJob.setMapperClass(Initializer.class);
initJob.setMapOutputKeyClass(Text.class);
initJob.setMapOutputValueClass(ObjectWritable.class);
initJob.setReducerClass(Initializer.class);
initJob.setOutputKeyClass(Text.class);
initJob.setOutputValueClass(Route.class);

// output side
initJob.setOutputFormat(SequenceFileOutputFormat.class);
FileOutputFormat.setOutputPath(initJob, tempRoute);

// Run the job, then replace the routes directory with its output. Any
// IOException is logged with its stack trace and rethrown to the caller.
try {
  LOG.info("Loops: starting initializer");
  JobClient.runJob(initJob);
  LOG.info("Loops: installing initializer " + routes);
  FSUtils.replace(fs, routes, tempRoute, true);
  LOG.info("Loops: finished initializer");
} catch (IOException e) {
  LOG.error(StringUtils.stringifyException(e));
  throw e;
}
// Run the looper job once per depth level. The depth is read from
// "link.loops.depth" and defaults to 2, which (per the original note)
// finds 3 link loop cycles.
int depth = conf.getInt("link.loops.depth", 2);
for (int pass = 0; pass < depth; pass++) {

  // Each pass reads the outlinks together with the currently installed
  // routes and writes new Text -> Route records to the temp directory.
  JobConf loopJob = new NutchJob(conf);
  loopJob.setJobName("Looper: " + (pass + 1) + " of " + depth);

  // input side (outlinkDb is registered before routes, as before)
  loopJob.setInputFormat(SequenceFileInputFormat.class);
  FileInputFormat.addInputPath(loopJob, outlinkDb);
  FileInputFormat.addInputPath(loopJob, routes);

  // Looper is used as both the mapper and the reducer
  loopJob.setMapperClass(Looper.class);
  loopJob.setMapOutputKeyClass(Text.class);
  loopJob.setMapOutputValueClass(ObjectWritable.class);
  loopJob.setReducerClass(Looper.class);
  loopJob.setOutputKeyClass(Text.class);
  loopJob.setOutputValueClass(Route.class);

  // output side
  loopJob.setOutputFormat(SequenceFileOutputFormat.class);
  FileOutputFormat.setOutputPath(loopJob, tempRoute);

  // flag the final pass in the job configuration under the key "last"
  boolean finalPass = (pass + 1 == depth);
  loopJob.setBoolean("last", finalPass);

  // Run the pass and swap its output into the routes directory. Any
  // IOException is logged with its stack trace and rethrown.
  try {
    LOG.info("Loops: starting looper");
    JobClient.runJob(loopJob);
    LOG.info("Loops: installing looper " + routes);
    FSUtils.replace(fs, routes, tempRoute, true);
    LOG.info("Loops: finished looper");
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
}
// Configure the finalizer job: it reads the installed routes and writes
// Text -> LoopSet records as a map file under LOOPS_DIR in webGraphDb.
JobConf finalizer = new NutchJob(conf);
finalizer.setJobName("Finalizer: " + webGraphDb);

// input side
finalizer.setInputFormat(SequenceFileInputFormat.class);
FileInputFormat.addInputPath(finalizer, routes);

// Finalizer is used as both the mapper and the reducer
finalizer.setMapperClass(Finalizer.class);
finalizer.setMapOutputKeyClass(Text.class);
finalizer.setMapOutputValueClass(Route.class);
finalizer.setReducerClass(Finalizer.class);
finalizer.setOutputKeyClass(Text.class);
finalizer.setOutputValueClass(LoopSet.class);

// output side
finalizer.setOutputFormat(MapFileOutputFormat.class);
FileOutputFormat.setOutputPath(finalizer, new Path(webGraphDb, LOOPS_DIR));
try {
LOG.info("Loops: starting finalizer");
JobClient.runJob(finalizer);
LOG.info("Loops: finished finalizer");