// start the fetch server
FetchServer fetchserver = new FetchServer(fetcher, this);
fetchserver.start();
// prepare the pagedbs
PageDB oldPageDB = new PageDB(pagedbDir);
PageDB tmpPageDB;
PageDB newPageDB;
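// When building a new pagedb, fetch results go to a temporary db (distributed or local)
// and the trimmed result goes to a ".new" db; otherwise no-op placeholders are used.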
if (createNewPageDB) {
if (distributed) {
tmpPageDB = new DPageDB(pagedbDir + ".tmp", pageCatcher);
// if (starting) {
// logger.info("Waiting for other nodes to start...");
// ((DPageDB)tmpPageDB).synch();
// logger.info("All nodes started");
// starting = false;
// }
} else {
tmpPageDB = new PageDB(pagedbDir + ".tmp");
}
newPageDB = new PageDB(pagedbDir + ".new");
} else {
tmpPageDB = new NoPageDB();
newPageDB = new NoPageDB();
}
// delete leftover new pagedb
newPageDB.deleteDir(false);
// open the old pagedb for reading
oldPageDB.open(PageDB.READ);
// Crawl recovery attempt
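// If a progress report from an interrupted cycle is found, resume from the recorded stage:
// FETCH continues the cycle skipping the already processed docs, SORT/MERGE/TRIM skip fetching
// altogether, and START/STOP or an inconsistent report fall back to starting a fresh cycle.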
boolean skipFetch = false;
long skip = 0;
progress = CrawlerProgress.restartCrawlerProgress();
if (null != progress) {
// the previous cycle was interrupted
switch (progress.stage()) {
case CrawlerProgress.START:
case CrawlerProgress.STOP:
logger.info("Last crawler state is either before starting or after finishing, will start next cycle.");
progress = null;
break;
case CrawlerProgress.FETCH:
if (progress.cycle() == oldPageDB.getCycles() + 1) {
skip = progress.processed();
logger.info("Crawler was interrupted while fetching at cycle "+progress.cycle()+", will continue current cycle skipping "+skip+" docs.");
tmpPageDB.open(PageDB.WRITE + PageDB.APPEND);
} else {
logger.info("Last crawler report inconsistent with pagedb state, will restart.");
progress = null;
}
break;
case CrawlerProgress.SORT:
// fall through
case CrawlerProgress.MERGE:
logger.info("Crawler was interrupted while sorting at cycle "+progress.cycle()+", will continue current cycle.");
tmpPageDB.open(PageDB.WRITE + PageDB.APPEND); // this will force a sort upon closing
// fall through
case CrawlerProgress.TRIM:
if (progress.stage() == CrawlerProgress.TRIM) {
logger.info("Crawler was interrupted while trimming at cycle "+progress.cycle()+", will continue current cycle.");
}
skipFetch = true;
break;
default:
logger.error("Unknown crawler state report, will restart.");
progress = null;
break;
}
}
if (null == progress) {
// there was no interrupted previous cycle or it was inconsistent
tmpPageDB.deleteDir(false);
tmpPageDB.open(PageDB.WRITE);
tmpPageDB.setNextCycleOf(oldPageDB);
progress = new CrawlerProgress(tmpPageDB.getCycles());
}
tmpPageDB.setProgressHandler(progress);
if (!skipFetch) {
if (0 == skip) {
progress.startFetch(oldPageDB.getSize(), oldPageDB.getFetchedSize());
logger.info("Starting crawl cycle " + (oldPageDB.getCycles()+1));
} else {
logger.info("Continuing crawl cycle " + (oldPageDB.getCycles()+1));
}
declareStartCycle(oldPageDB);
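// Start the worker threads: one feeds fetchlists from the old pagedb, one feeds injected
// pages, and one consumes the fetched data and writes the results to the temporary pagedb.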
FetchlistQueueMonitor fetchlistFeeder = new FetchlistQueueMonitor(oldPageDB, tmpPageDB, skip);
InjectedFetchlistQueueMonitor injectedFetchlistFeeder = new InjectedFetchlistQueueMonitor(tmpPageDB);
FetchdataQueueMonitor fetchdataConsumer = new FetchdataQueueMonitor(oldPageDB, tmpPageDB, processor);
cycleFinished = false;
fetchlistFeeder.start();
injectedFetchlistFeeder.start();
fetchdataConsumer.start();
// This is where the main thread spends its time while the crawl cycle takes place.
// Wait until the fetchlist and fetchdata threads are done
synchronized(cycleFinishedMonitor) {
while (running() && !cycleFinished) {
logger.debug("Waiting: running="+running()+" cycleFinished="+cycleFinished+" fetchList="+fetchlistQueue.size()+" injectedFetchList="+injectedFetchlistQueue.size()+" fetchData="+fetchdataQueue.size());
cycleFinishedMonitor.wait(60000); // wake up every minute or when the cycle finishes
}
}
logger.debug("Waiting no more: running="+running()+" cycleFinished="+cycleFinished+" fetchList="+fetchlistQueue.size()+" injectedFetchList="+injectedFetchlistQueue.size()+" fetchData="+fetchdataQueue.size());
}
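// The fetch stage is done (or was skipped during recovery); if the crawler is still running,
// close the pagedbs and build the new pagedb from the accumulated data.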
if (running()) {
progress.report();
logger.debug("Closing old and temporary pagedbs");
oldPageDB.close();
tmpPageDB.close();
logger.debug("Old and temporary pagedbs closed");
if (createNewPageDB) {
// dedup & trim
new PageDBTrimmer().trimPageDB(tmpPageDB, newPageDB, progress);
// check the trimmed pagedb size
if (protectAgainstEmptyPageDB && newPageDB.getSize() == 0) {
logger.error("The new PageDB is empty, will stop the crawler before replacing the old PageDB. Please check the hotspots, modules and other settings before restarting.");
stopCrawler();
}
if (running()) {
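// Rotate the pagedb directories: delete any leftover pagedb.tmp, move the old pagedb aside
// as pagedb.tmp, promote pagedb.new to pagedb, and finally delete the displaced old pagedb.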
boolean ok = false;
String oldName = oldPageDB.getDir();
if (attempt(tmpPageDB.deleteDir(false), "deleting pagedb.tmp")) {
if (attempt(oldPageDB.rename(tmpPageDB.getDir()), "renaming pagedb -> pagedb.tmp")) {
if (attempt(newPageDB.rename(oldName), "renaming pagedb.new -> pagedb")) {
if (attempt(tmpPageDB.deleteDir(false), "deleting pagedb.tmp (2)")) {
ok = true;
}
}
}