// --- crawlDb phase (tail; the enclosing if/method opens before this view) ---
// Runs the ReprUrlFixer reduce job over the crawlDb's "current" directory,
// writing fixed entries to a temporary sibling directory, then installs that
// output as the new crawlDb.
LOG.info("ReprUrlFixer: crawlDb " + crawlDb);
Path crawlDbCurrent = new Path(crawlDb, CrawlDb.CURRENT_NAME);
// Temporary output dir with a random numeric suffix so concurrent runs (or
// leftovers from failed runs) don't collide.
Path newCrawlDb = new Path(crawlDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf updater = new NutchJob(conf);
// NOTE(review): "ReprUtilFixer" looks like a typo for "ReprUrlFixer".
// Cosmetic only (display string for the job tracker), but worth fixing.
updater.setJobName("ReprUtilFixer: " + crawlDb.toString());
// Job shape: no mapper is set (framework default applies), ReprUrlFixer as
// the reducer; SequenceFile in, MapFile out, Text keys / CrawlDatum values.
FileInputFormat.addInputPath(updater, crawlDbCurrent);
FileOutputFormat.setOutputPath(updater, newCrawlDb);
updater.setInputFormat(SequenceFileInputFormat.class);
updater.setReducerClass(ReprUrlFixer.class);
updater.setOutputKeyClass(Text.class);
updater.setOutputValueClass(CrawlDatum.class);
updater.setOutputFormat(MapFileOutputFormat.class);
try {
JobClient.runJob(updater);
LOG.info("ReprUrlFixer: installing new crawldb " + crawlDb);
// Swap the job output into place as the crawlDb's new current directory.
CrawlDb.install(updater, crawlDb);
}
catch (IOException e) {
// Log the full stack trace, then propagate to the caller unchanged.
LOG.error(StringUtils.stringifyException(e));
throw e;
}
}
// Run the segments through the repr fixer; the same logic is applied to both
// the crawl_fetch and crawl_parse directories of every segment specified.
if (segments != null) {
for (int i = 0; i < segments.length; i++) {
Path segment = segments[i];
// --- crawl_fetch pass for this segment ---
LOG.info("ReprUrlFixer: fetching segment " + segment);
Path segFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
// Temporary output next to the fetch dir; random suffix avoids collisions.
Path newSegFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf fetch = new NutchJob(conf);
fetch.setJobName("ReprUrlFixer: " + segment.toString());
// Same job shape as the crawlDb pass: no mapper set (framework default),
// ReprUrlFixer reducer, SequenceFile in, MapFile out.
FileInputFormat.addInputPath(fetch, segFetch);
FileOutputFormat.setOutputPath(fetch, newSegFetch);
fetch.setInputFormat(SequenceFileInputFormat.class);
fetch.setReducerClass(ReprUrlFixer.class);
fetch.setOutputKeyClass(Text.class);
fetch.setOutputValueClass(CrawlDatum.class);
fetch.setOutputFormat(MapFileOutputFormat.class);
try {
JobClient.runJob(fetch);
LOG.info("ReprUrlFixer: installing new segment fetch directory " + newSegFetch);
// Replace the old crawl_fetch dir with the fixed output.
// NOTE(review): presumably the boolean asks FSUtils.replace to remove
// the old directory — confirm against FSUtils.
FSUtils.replace(fs, segFetch, newSegFetch, true);
LOG.info("ReprUrlFixer: finished installing segment fetch directory");
}
catch (IOException e) {
// Log the full stack trace, then propagate to the caller unchanged.
LOG.error(StringUtils.stringifyException(e));
throw e;
}
// --- crawl_parse pass for this segment (same pattern as the fetch pass;
// the try block and loop are closed beyond this view) ---
LOG.info("ReprUrlFixer: parsing segment " + segment);
Path segParse = new Path(segment, CrawlDatum.PARSE_DIR_NAME);
// Temporary output next to the parse dir; random suffix avoids collisions.
Path newSegParse = new Path(segment, CrawlDatum.PARSE_DIR_NAME + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf parse = new NutchJob(conf);
parse.setJobName("ReprUrlFixer: " + segment.toString());
// No mapper set (framework default); ReprUrlFixer reducer, SequenceFile in,
// MapFile out, Text keys / CrawlDatum values.
FileInputFormat.addInputPath(parse, segParse);
FileOutputFormat.setOutputPath(parse, newSegParse);
parse.setInputFormat(SequenceFileInputFormat.class);
parse.setReducerClass(ReprUrlFixer.class);
parse.setOutputKeyClass(Text.class);
parse.setOutputValueClass(CrawlDatum.class);
parse.setOutputFormat(MapFileOutputFormat.class);
try {
JobClient.runJob(parse);
// NOTE(review): "directry" is a typo for "directory" in this log message.
LOG.info("ReprUrlFixer: installing new segment parse directry " + newSegParse);
// Replace the old crawl_parse dir with the fixed output.
FSUtils.replace(fs, segParse, newSegParse, true);