Source Code of com.googlecode.flaxcrawler.frontier.DefaultScheduler

package com.googlecode.flaxcrawler.frontier;


import com.googlecode.flaxcrawler.concurrent.TaskQueue;
import com.googlecode.flaxcrawler.model.CrawlerTask;
import java.util.LinkedList;
import java.util.Queue;
import org.apache.log4j.Logger;


/**
 * Standard scheduler implementation. Starts an asyncronous worker thread that reads urls from the queue
 * and adds them to the {@code TaskQueue}.
 */
public class DefaultScheduler implements Scheduler {


    private Logger log = Logger.getLogger(this.getClass());
    private TaskQueue taskQueue;
    private StatisticsService statisticsService;
    private final Object syncRoot = new Object();
    private final Queue<CrawlerTask> schedulerQueue = new LinkedList<CrawlerTask>();
    private final Thread workerThread;


    public DefaultScheduler(TaskQueue taskQueue, StatisticsService statisticsService) {
        this.statisticsService = statisticsService;
        this.taskQueue = taskQueue;


        workerThread = new Thread(new Runnable() {


            public void run() {
                doWorkLoop();
            }
        });
        workerThread.setDaemon(true);
        workerThread.start();
        log.info("Scheduler was successfully initialized and started");
    }


    public void schedule(CrawlerTask crawlerTask) {
        synchronized (syncRoot) {
            log.debug("Enqueueing task " + crawlerTask.getUrl() + " to the scheduler queue");
            schedulerQueue.add(crawlerTask);
        }
    }


    /**
     * Reads urls from queue and adds them to the TaskQueue (if url was not crawled yet)
     */
    private void doWorkLoop() {
        while (true) {
            CrawlerTask task = null;


            try {
                synchronized (syncRoot) {
                    task = schedulerQueue.poll();
                }
                if (task != null) {
                    if (!statisticsService.isCrawled(task.getUrl())) {
                        taskQueue.enqueue(task);
                        statisticsService.afterScheduling(task);
                        log.debug("Scheduled crawling of the " + task.getUrl());
                    } else {
                        log.debug("Url " + task.getUrl() + " was already crawled");
                    }
                }


                // Yielding context to another thread
                Thread.sleep(1);
            } catch (Exception ex) {
                log.error("Error processing task " + task == null ? "NOTASK" : task.getUrl() + " from the scheduler queue", ex);
            }
        }
    }
}
Source Code of com.googlecode.flaxcrawler.frontier.DefaultScheduler

Related Classes of com.googlecode.flaxcrawler.frontier.DefaultScheduler