res.elapsed = System.currentTimeMillis(); // record the overall start time
// Set up the MapReduce jobs that make up one crawl cycle in the new
// (table-based) API: inject, generate, fetch, parse, and update.
InjectorJob injector = new InjectorJob(conf);
GeneratorJob generator = new GeneratorJob(conf);
FetcherJob fetcher = new FetcherJob(conf);
ParserJob parseSegment = new ParserJob(conf);
DbUpdaterJob crawlDbTool = new DbUpdaterJob(conf);
// not needed in the new API
//LinkDb linkDbTool = new LinkDb(getConf());
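
// Each step below is timed the same way: record a start timestamp, run
// the job, and store the elapsed delta under the step name and depth.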
long start = System.currentTimeMillis();
// initialize crawlDb
injector.inject(rootUrlDir);
long delta = System.currentTimeMillis() - start;
res.addTiming("inject", "0", delta);
int i; // declared outside the loop so the final depth is readable below
for (i = 0; i < depth; i++) { // generate and process a new batch
  start = System.currentTimeMillis();
  String batchId = generator.generate(topN, System.currentTimeMillis(),
      false, false);
  delta = System.currentTimeMillis() - start;
  res.addTiming("generate", i + "", delta);
  if (batchId == null) {
    LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
    break;
  }
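
  // With fetcher.parse set, the fetcher parses pages as it fetches them,
  // making the separate parse step below unnecessary.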
  boolean isParsing = conf.getBoolean("fetcher.parse", false);
  start = System.currentTimeMillis();
  fetcher.fetch(batchId, threads, false, -1); // fetch it
  delta = System.currentTimeMillis() - start;
  res.addTiming("fetch", i + "", delta);
  if (!isParsing) {
    start = System.currentTimeMillis();
    parseSegment.parse(batchId, false, false); // parse the fetched batch
    delta = System.currentTimeMillis() - start;
    res.addTiming("parse", i + "", delta);
  }
  start = System.currentTimeMillis();
  crawlDbTool.run(new String[0]); // update crawldb via its Tool entry point
  delta = System.currentTimeMillis() - start;
  res.addTiming("update", i + "", delta);
}
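
// i stayed at 0 only if the very first generate produced no batch.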
if (i == 0) {
  LOG.warn("No URLs to fetch - check your seed list and URL filters.");