// Read the database type and the two database names from the crawler configuration.
String dbType = config.getProperty("/crawler/database/param[@name='dbtype']", "");
String dbName = config.getProperty("/crawler/database/param[@name='dbname']", "");
String dbNameQueues = config.getProperty("/crawler/queues/param[@name='dbname']", "");

// The source queue is created first, then the crawler database handle.
ISourceQueue sourceQueue = QueueFactory.getSourceQueueInstance(dbType, dbConnection, dbName, "sources", test, interactiveOnly, suspiciousOnly, accountId, sourceId, engineId);
crawlerDB = CrawlerDBFactory.getCrawlerDBInstance(dbType, dbConnection, dbName, dbNameQueues, logger);

// Startup banner: version, concurrency limit, then one line per active filter or mode.
logger.log("=================================");
logger.log("Crawler starting (version: " + StringUtils.trimToEmpty(crawlerDB.getVersion()) + ")");
logger.log(" Simultaneous sources crawled : " + limit);
if (!"".equals(accountId)) {
    logger.log(" account : " + accountId);
}
if (!"".equals(engineId)) {
    logger.log(" engine : " + engineId);
}
if (once) {
    logger.log(" mode once");
}
if (!"".equals(sourceId)) {
    logger.log(" source : " + sourceId);
}
if (suspiciousOnly) {
    logger.log(" mode suspicious");
}
if (reScan) {
    logger.log(" mode rescan");
}
if (reset) {
    logger.log(" mode reset");
}
if (deeper) {
    logger.log(" mode deeper");
}
if (interactiveOnly) {
    logger.log(" mode interactive only");
}
if (test) {
    logger.log(" mode test");
}
if (verbose) {
    logger.log(" mode verbose");
}
logger.log("");
logger.log("=================================");
logger.log("");

// Fix source statuses at startup, before any source is dispatched.
crawlerDB.fixStartupSourcesStatus();

// Fixed-size pool with `limit` worker threads; cast so the loop can inspect its queue and active count.
ThreadPoolExecutor sourceExecutor = (ThreadPoolExecutor) Executors.newFixedThreadPool(limit);
boolean bFinished = false;
// Main crawl loop: iterates until a stop is requested or bFinished has been set.
while (!stopRequested && !bFinished) {
try {
// A stop is requested when the stop file exists or the PID file no longer exists.
stopRequested = fileStop.exists() || !filePid.exists();
if (stopRequested) break;
// Refresh PID file time
try {
FileUtils.touch(filePid);
}
catch (Exception e) {
// Failing to touch the PID file is fatal: print the stack trace and exit the JVM with status -1.
e.printStackTrace();
System.exit(-1);
}
// How many sources are enqueued ?
long countSource = sourceQueue.size();
logger.log(" Sources to be crawled : " + String.valueOf(countSource));
// If all threads are still processing a source wait and retry
// (the executor's pending queue is non-empty AND its active thread count equals limit).
if (!sourceExecutor.getQueue().isEmpty() && (sourceExecutor.getActiveCount()==limit)) {
logger.log(" All threads are busy : wait and retry in a few seconds");
// NOTE(review): presumably 15000 is in milliseconds (15 s) — confirm against Utils.sleep.
Utils.sleep(15000);
continue;
}
// We pop a new source from source queue only if thread pool queue is empty
// we want to pop a new source from source queue at the very last time
// as source queue content can change at any time
// "Once" mode with an empty queue: optionally wait for new sources, then end the crawl
// if the queue is still empty.
if (once && countSource == 0) {
    // try waiting 5 minutes
    logger.log(" No more source to crawl : start waiting 5 minutes");
    // The wait only happens when no specific source was requested (sourceId is empty).
    if ("".equals(sourceId)) {
        stopRequested = fileStop.exists() || !filePid.exists();
        // Re-check the queue and the stop condition after each sleep, up to a total of 300*1000
        // (presumably milliseconds, i.e. 5 minutes in 5 s steps — confirm against Utils.sleep).
        for (int waitedSoFar = 0; waitedSoFar < 300 * 1000 && countSource == 0 && !stopRequested; waitedSoFar += 5000) {
            Utils.sleep(5000);
            countSource = sourceQueue.size();
            stopRequested = fileStop.exists() || !filePid.exists();
        }
    }
    if (countSource == 0) {
        // No source to crawl after waiting 5 minutes => stop crawling
        // (the message is suppressed when the loop is ending because of a stop request).
        if (!stopRequested) {
            logger.log(" No more source to crawl after waiting 5 minutes and mode once : stop crawling");
        }
        bFinished = true;
        continue;
    }
}
// Take the next source from the queue; nothing more is done this iteration when pop() returns null.
Map<String,Object> srcData = sourceQueue.pop();
if (srcData!=null) {
    String srcId = String.valueOf(srcData.get("id"));
    // The country filter is bypassed when a specific source was requested (sourceId not empty).
    if (CrawlerUtils.isAcceptedCountry((String)srcData.get("country"), countryInclude, countryExclude) || !"".equals(sourceId)) {
        String sourceCrawlMode = CrawlerUtils.getSourceCrawlMode(Integer.parseInt((String)srcData.get("crawl_mode")), reScan, reset, deeper, resetFromCache);
        // Build the source item according to its type and so its class
        ISource src = ConnectorFactory.getSourceInstance(crawlerDB.getSourceClass((String)srcData.get("type")), srcId, sourceCrawlMode, srcData);
        if (src == null) {
            // No source instance was built: give the source back to the queue and report it
            // with srcId, because there is no src object whose getId() could be called.
            sourceQueue.unpop(Integer.valueOf(srcId));
            logger.log(" Skip source because it could not be instantiated : " + srcId);
        } else if (src.isCrawlAllowedBySchedule()) {
            // Hand the source to the worker pool.
            logger.log(" Pushing source : " + String.valueOf(src.getId()));
            sourceExecutor.submit(new ProcessorSource(src, config, logger, this));
        } else {
            // The source's schedule does not allow crawling now: give it back to the queue.
            sourceQueue.unpop(Integer.valueOf(srcId));
            logger.log(" Skip source due to schedule : " + String.valueOf(src.getId()));
        }
    } else {
        // Country filter rejected the source: give it back to the queue.
        sourceQueue.unpop(Integer.valueOf(srcId));
        logger.log(" Skip source due to country : " + srcId);
    }
}
} catch (Exception e) {
e.printStackTrace();