Package fr.eolya.crawler.queue

Examples of fr.eolya.crawler.queue.ISourceQueue
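The snippet below is the crawler's startup and main loop: it obtains an ISourceQueue from QueueFactory, then repeatedly pops sources from the queue and submits them to a fixed-size thread pool, putting skipped sources back with unpop(). Judging only from the calls made in this example, the interface can be sketched roughly as follows (a hypothetical reconstruction; the real declaration in fr.eolya.crawler.queue may differ):

    import java.util.Map;

    // Hypothetical sketch of ISourceQueue, inferred from the calls used below
    public interface ISourceQueue {
        // number of sources currently waiting to be crawled
        long size();

        // remove and return the next source, or null if the queue is empty
        Map<String, Object> pop();

        // put a previously popped source back into the queue (e.g. when it is skipped)
        void unpop(int id);
    }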


    String dbType = config.getProperty("/crawler/database/param[@name='dbtype']", "");
    String dbName = config.getProperty("/crawler/database/param[@name='dbname']", "");

    String dbNameQueues = config.getProperty("/crawler/queues/param[@name='dbname']", "");
   
    ISourceQueue sourceQueue = QueueFactory.getSourceQueueInstance(dbType, dbConnection, dbName, "sources", test, interactiveOnly, suspiciousOnly, accountId, sourceId, engineId);
    crawlerDB = CrawlerDBFactory.getCrawlerDBInstance(dbType, dbConnection, dbName, dbNameQueues, logger);

    logger.log("=================================");
    logger.log("Crawler starting (version: " + StringUtils.trimToEmpty(crawlerDB.getVersion()) + ")");
    logger.log("    Simultaneous sources crawled : " + String.valueOf(limit));
    if (!"".equals(accountId))
      logger.log("    account : " + accountId);
    if (!"".equals(engineId))
      logger.log("    engine : " + engineId);
    if (once)
      logger.log("    mode once");
    if (!"".equals(sourceId))
      logger.log("    source : " + sourceId);
    if (suspiciousOnly)
      logger.log("    mode suspicious");
    if (reScan)
      logger.log("    mode rescan");
    if (reset)
      logger.log("    mode reset");
    if (deeper)
      logger.log("    mode deeper");
    if (interactiveOnly)
      logger.log("    mode interactive only");
    if (test)
      logger.log("    mode test");
    if (verbose)
      logger.log("    mode verbose");
    logger.log("");

    logger.log("=================================");
    logger.log("");

    crawlerDB.fixStartupSourcesStatus();

    ThreadPoolExecutor sourceExecutor = (ThreadPoolExecutor) Executors.newFixedThreadPool(limit);

    boolean bFinished = false;
    while (!stopRequested && !bFinished) {
      try {
        stopRequested = fileStop.exists() || !filePid.exists();
        if (stopRequested) break;
       
        // Refresh PID file time
        try {
          FileUtils.touch(filePid);
        }
        catch (Exception e) {
          e.printStackTrace();
          System.exit(-1);
        }

        // How many sources are enqueued?
        long countSource = sourceQueue.size();
        logger.log("    Sources to be crawled : " + String.valueOf(countSource));

        // If all threads are still processing a source, wait and retry
        if (!sourceExecutor.getQueue().isEmpty() && (sourceExecutor.getActiveCount()==limit)) {
          logger.log("    All threads are busy : wait and retry in a few seconds");
          Utils.sleep(15000);
          continue;
        }

        // We pop a new source from the source queue only if the thread pool queue is empty:
        // we want to pop a new source as late as possible,
        // because the source queue content can change at any time.

        if (countSource==0 && once) {
          // try waiting 5 minutes
          logger.log("    No more source to crawl : start waiting 5 minutes");
          if ("".equals(sourceId)) {
            int waitingSince = 0;
            stopRequested = fileStop.exists() || !filePid.exists();
            while (waitingSince<300*1000 && countSource==0 && !stopRequested) {
              Utils.sleep(5000);
              waitingSince += 5000;
              countSource = sourceQueue.size();
              stopRequested = fileStop.exists() || !filePid.exists();
            }
          }
          if (countSource == 0) {
            // No source to crawl after waiting 5 minutes => stop crawling
            if (!stopRequested) logger.log("    No more source to crawl after waiting 5 minutes and mode once : stop crawling");
            bFinished = true;
            continue;
          }
        }

        Map<String,Object> srcData = sourceQueue.pop();
        if (srcData!=null) {
          String srcId = String.valueOf(srcData.get("id"));

          if (CrawlerUtils.isAcceptedCountry((String)srcData.get("country"), countryInclude, countryExclude) || !"".equals(sourceId)) {

            String sourceCrawlMode = CrawlerUtils.getSourceCrawlMode(Integer.parseInt((String)srcData.get("crawl_mode")), reScan, reset, deeper, resetFromCache);

            // Build the source item according to its type and so its class
            ISource src = ConnectorFactory.getSourceInstance(crawlerDB.getSourceClass((String)srcData.get("type")), srcId, sourceCrawlMode, srcData);

            if (src!=null && src.isCrawlAllowedBySchedule()) {
              logger.log("        Pushing source : " + String.valueOf(src.getId()));
              sourceExecutor.submit(new ProcessorSource(src, config, logger, this));
            } else {
              // src may be null here, so log srcId instead of src.getId() to avoid a NullPointerException
              sourceQueue.unpop(Integer.valueOf(srcId));
              logger.log("        Skip source due to schedule : " + srcId);
            }
          } else {
            sourceQueue.unpop(Integer.valueOf(srcId));
            logger.log("        Skip source due to country : " + srcId);                                                       
          }
        }
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
    // ... remainder of the method (executor shutdown, etc.) elided in this excerpt
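The loop above throttles itself by inspecting the ThreadPoolExecutor directly (getActiveCount() and getQueue()) instead of letting work pile up in the pool's queue. A minimal standalone sketch of that saturation check, using only standard java.util.concurrent calls (the task body and the limit value are placeholders, not part of the crawler):

    import java.util.concurrent.Executors;
    import java.util.concurrent.ThreadPoolExecutor;

    public class PoolSaturationCheck {
        public static void main(String[] args) throws InterruptedException {
            int limit = 2;
            // Executors.newFixedThreadPool actually returns a ThreadPoolExecutor, so the cast is safe
            ThreadPoolExecutor pool = (ThreadPoolExecutor) Executors.newFixedThreadPool(limit);

            for (int i = 0; i < 5; i++) {
                // Same test as in the crawl loop: tasks already queued and every worker busy
                while (!pool.getQueue().isEmpty() && pool.getActiveCount() == limit) {
                    Thread.sleep(500);   // wait and retry instead of queueing more work
                }
                final int taskId = i;
                pool.submit(() -> {
                    try { Thread.sleep(1000); } catch (InterruptedException ignored) { }
                    System.out.println("task " + taskId + " done");
                });
            }
            pool.shutdown();
        }
    }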

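Two plain files act as a lightweight control channel in the loop above: a stop file whose appearance requests a shutdown, and a PID file that is "touched" on every iteration so an external watchdog can tell the process is still alive (the loop exits if the PID file disappears). A minimal sketch of that pattern with Apache Commons IO's FileUtils.touch (the file names are placeholders):

    import java.io.File;
    import org.apache.commons.io.FileUtils;

    public class StopFileLoop {
        public static void main(String[] args) throws Exception {
            File filePid = new File("crawler.pid");    // placeholder paths
            File fileStop = new File("crawler.stop");
            FileUtils.touch(filePid);                  // create the PID file on startup

            boolean stopRequested = false;
            while (!stopRequested) {
                // stop when the stop file appears or the PID file has been removed
                stopRequested = fileStop.exists() || !filePid.exists();
                if (stopRequested) break;

                FileUtils.touch(filePid);              // heartbeat: refresh the PID file timestamp
                Thread.sleep(5000);                    // placeholder for one crawl-loop iteration
            }
        }
    }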

