fs.mkdirs(outlinkDb);
}
// randomized temp output path so a failed job never clobbers the live outlinkdb
Path tempOutlinkDb = new Path(outlinkDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf outlinkJob = new NutchJob(conf);
outlinkJob.setJobName("Outlinkdb: " + outlinkDb);
// link.delete.gone: if true, remove outlinks of pages whose fetch marked them gone
boolean deleteGone = conf.getBoolean("link.delete.gone", false);
// db.preserve.backup: if true, keep the previous outlinkdb around after install
boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
if (deleteGone) {
LOG.info("OutlinkDb: deleting gone links");
}
// get the parse data and crawl fetch data for all segments
if (segments != null) {
for (int i = 0; i < segments.length; i++) {
Path parseData = new Path(segments[i], ParseData.DIR_NAME);
if (fs.exists(parseData)) {
LOG.info("OutlinkDb: adding input: " + parseData);
FileInputFormat.addInputPath(outlinkJob, parseData);
}
// fetch data is only needed when gone links are to be detected and dropped
if (deleteGone) {
Path crawlFetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
if (fs.exists(crawlFetch)) {
LOG.info("OutlinkDb: adding input: " + crawlFetch);
FileInputFormat.addInputPath(outlinkJob, crawlFetch);
}
}
}
}
// add the existing webgraph so previously collected outlinks are merged in
LOG.info("OutlinkDb: adding input: " + outlinkDb);
FileInputFormat.addInputPath(outlinkJob, outlinkDb);
outlinkJob.setBoolean(OutlinkDb.URL_NORMALIZING, normalize);
outlinkJob.setBoolean(OutlinkDb.URL_FILTERING, filter);
outlinkJob.setInputFormat(SequenceFileInputFormat.class);
// OutlinkDb serves as both mapper and reducer for this job
outlinkJob.setMapperClass(OutlinkDb.class);
outlinkJob.setReducerClass(OutlinkDb.class);
outlinkJob.setMapOutputKeyClass(Text.class);
outlinkJob.setMapOutputValueClass(NutchWritable.class);
outlinkJob.setOutputKeyClass(Text.class);
outlinkJob.setOutputValueClass(LinkDatum.class);
FileOutputFormat.setOutputPath(outlinkJob, tempOutlinkDb);
outlinkJob.setOutputFormat(MapFileOutputFormat.class);
// suppress _SUCCESS marker files; the output is read back as a MapFile directory
outlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
// run the outlinkdb job and replace any old outlinkdb with the new one
try {
LOG.info("OutlinkDb: running");
JobClient.runJob(outlinkJob);
LOG.info("OutlinkDb: installing " + outlinkDb);
// rotate: current -> backup, then temp -> current
FSUtils.replace(fs, oldOutlinkDb, outlinkDb, true);
FSUtils.replace(fs, outlinkDb, tempOutlinkDb, true);
// drop the backup unless configured to preserve it
if (!preserveBackup && fs.exists(oldOutlinkDb)) fs.delete(oldOutlinkDb, true);
LOG.info("OutlinkDb: finished");
}
catch (IOException e) {
// remove lock file and temporary directory if an error occurs
LockUtil.removeLockFile(fs, lock);
if (fs.exists(tempOutlinkDb)) {
fs.delete(tempOutlinkDb, true);
}
LOG.error(StringUtils.stringifyException(e));
throw e;
}
// inlink and temp link database paths
Path inlinkDb = new Path(webGraphDb, INLINK_DIR);
// randomized temp output path so a failed job never clobbers the live inlinkdb
Path tempInlinkDb = new Path(inlinkDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf inlinkJob = new NutchJob(conf);
inlinkJob.setJobName("Inlinkdb " + inlinkDb);
// the inlinkdb is derived by inverting the freshly installed outlinkdb
LOG.info("InlinkDb: adding input: " + outlinkDb);
FileInputFormat.addInputPath(inlinkJob, outlinkDb);
inlinkJob.setInputFormat(SequenceFileInputFormat.class);
// map-only inversion: InlinkDb is the mapper, no reducer is set
inlinkJob.setMapperClass(InlinkDb.class);
inlinkJob.setMapOutputKeyClass(Text.class);
inlinkJob.setMapOutputValueClass(LinkDatum.class);
inlinkJob.setOutputKeyClass(Text.class);
inlinkJob.setOutputValueClass(LinkDatum.class);
FileOutputFormat.setOutputPath(inlinkJob, tempInlinkDb);
inlinkJob.setOutputFormat(MapFileOutputFormat.class);
// suppress _SUCCESS marker files; the output is read back as a MapFile directory
inlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
try {
// run the inlink and replace any old with new
LOG.info("InlinkDb: running");
JobClient.runJob(inlinkJob);
LOG.info("InlinkDb: installing " + inlinkDb);
// temp -> current (no backup is kept for the inlinkdb)
FSUtils.replace(fs, inlinkDb, tempInlinkDb, true);
LOG.info("InlinkDb: finished");
}
catch (IOException e) {
// remove lock file and temporary directory if an error occurs
LockUtil.removeLockFile(fs, lock);
if (fs.exists(tempInlinkDb)) {
fs.delete(tempInlinkDb, true);
}
LOG.error(StringUtils.stringifyException(e));
throw e;
}
// node and temp node database paths
Path nodeDb = new Path(webGraphDb, NODE_DIR);
// randomized temp output path so a failed job never clobbers the live nodedb
Path tempNodeDb = new Path(nodeDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf nodeJob = new NutchJob(conf);
nodeJob.setJobName("NodeDb " + nodeDb);
// the nodedb combines the just-built outlink and inlink databases
LOG.info("NodeDb: adding input: " + outlinkDb);
LOG.info("NodeDb: adding input: " + inlinkDb);
FileInputFormat.addInputPath(nodeJob, outlinkDb);
FileInputFormat.addInputPath(nodeJob, inlinkDb);
nodeJob.setInputFormat(SequenceFileInputFormat.class);
// reduce-side aggregation only: NodeDb is the reducer, identity map is implied
nodeJob.setReducerClass(NodeDb.class);
nodeJob.setMapOutputKeyClass(Text.class);
nodeJob.setMapOutputValueClass(LinkDatum.class);
nodeJob.setOutputKeyClass(Text.class);
nodeJob.setOutputValueClass(Node.class);
FileOutputFormat.setOutputPath(nodeJob, tempNodeDb);
nodeJob.setOutputFormat(MapFileOutputFormat.class);
// suppress _SUCCESS marker files; the output is read back as a MapFile directory
nodeJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
try {
// run the node job and replace old nodedb with new
LOG.info("NodeDb: running");