Package net.vidageek.crawler

Source Code of net.vidageek.crawler.PageCrawler

/**
*
*/
package net.vidageek.crawler;

import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import net.vidageek.crawler.component.Downloader;
import net.vidageek.crawler.component.ExecutorCounter;
import net.vidageek.crawler.component.LinkNormalizer;
import net.vidageek.crawler.component.PageCrawlerExecutor;
import net.vidageek.crawler.config.CrawlerConfiguration;
import net.vidageek.crawler.exception.CrawlerException;
import net.vidageek.crawler.queue.DelayedBlockingQueue;
import net.vidageek.crawler.visitor.DoesNotFollowVisitedUrlVisitor;

import org.apache.log4j.Logger;

/**
* @author jonasabreu
*
*/
public class PageCrawler {

  private final Logger log = Logger.getLogger(PageCrawler.class);

  private final CrawlerConfiguration config;

  public PageCrawler(final CrawlerConfiguration config) {
    this.config = config;
  }

  public PageCrawler(final String beginUrl) {
    this(CrawlerConfiguration.forStartPoint(beginUrl).build());
  }

  public PageCrawler(final String beginUrl, final Downloader downloader, final LinkNormalizer normalizer) {
    config = CrawlerConfiguration.forStartPoint(beginUrl).withDownloader(downloader).withLinkNormalizer(normalizer)
        .build();
  }

  public void crawl(final PageVisitor visitor) {
    if (visitor == null) {
      throw new IllegalArgumentException("visitor cannot be null");
    }

    ThreadPoolExecutor executor = new ThreadPoolExecutor(config.minPoolSize(), config.maxPoolSize(), config
        .keepAliveMilliseconds(), TimeUnit.MILLISECONDS, new DelayedBlockingQueue(config
        .requestDelayMilliseconds()));

    final ExecutorCounter counter = new ExecutorCounter();

    try {
      executor
          .execute(new PageCrawlerExecutor(new Url(config.beginUrl(), 0), executor, counter, config
              .downloader(), config.normalizer(), new DoesNotFollowVisitedUrlVisitor(config.beginUrl(),
              visitor)));

      while (counter.value() != 0) {
        log.debug("executors that finished: " + executor.getCompletedTaskCount());
        log.debug("Number of Executors alive: " + counter.value());
        sleep();
      }
    } finally {
      executor.shutdown();
    }
  }

  private void sleep() {
    try {
      Thread.sleep(1000);
    } catch (InterruptedException e) {
      throw new CrawlerException("main thread died. ", e);
    }

  }
}
TOP

Related Classes of net.vidageek.crawler.PageCrawler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.