Source Code of de.jungblut.crawl.MultithreadedCrawler

package de.jungblut.crawl;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

import de.jungblut.crawl.extraction.Extractor;
import de.jungblut.crawl.extraction.OutlinkExtractor;

/**
 * Fast multithreaded crawler. It starts a fixed thread pool of 32 threads,
 * each fed with batches of 10 URLs at a time. It is designed primarily for
 * speed and to saturate the available bandwidth. Depending on your
 * connection, you may want to retune the thread pool size and the batch
 * size; for a 6k ADSL line, 32 threads with batches of 10 URLs work fine.
 * You can scale this up almost linearly, since this class has nearly no
 * contention and only a small amount of sequential code. Visited URLs are
 * tracked with a Bloom filter, so the memory footprint stays low.
 *
 * @author thomas.jungblut
 */
public final class MultithreadedCrawler<T extends FetchResult> implements
    Crawler<T> {

  private static final Log LOG = LogFactory.getLog(MultithreadedCrawler.class);

  private static final int THREAD_POOL_SIZE = 32;
  private static final int BATCH_SIZE = 10;

  private Extractor<T> extractor;
  private FetchResultPersister<T> persister;
  private Thread persisterThread;
  private int fetches = 100000;
  private int poolSize = THREAD_POOL_SIZE;
  private int batchSize = BATCH_SIZE;

  /**
   * Constructs a new multithreaded crawler.
   *
   * @param threadPoolSize the number of threads to use.
   * @param batchSize the number of URLs a single thread should process per
   *          batch.
   * @param fetches the total number of URLs to fetch.
   * @param extractor the extraction logic.
   * @param writer the writer used to persist the fetched results.
   */
  public MultithreadedCrawler(int threadPoolSize, int batchSize, int fetches,
      Extractor<T> extractor, ResultWriter<T> writer) throws IOException {
    this.poolSize = threadPoolSize;
    this.batchSize = batchSize;
    setup(fetches, extractor, writer);
  }

  /**
   * Constructs a new multithreaded crawler with the default of 32 threads,
   * each working on batches of 10 URLs at a time.
   *
   * @param fetches the total number of URLs to fetch.
   * @param extractor the extraction logic.
   * @param writer the writer used to persist the fetched results.
   */
  public MultithreadedCrawler(int fetches, Extractor<T> extractor,
      ResultWriter<T> writer) throws IOException {
    setup(fetches, extractor, writer);
  }

  @Override
  public final void setup(int fetches, Extractor<T> extractor,
      ResultWriter<T> writer) throws IOException {
    this.fetches = fetches;
    this.extractor = extractor;

    // start the persisting thread
    persister = new FetchResultPersister<>(writer);
    persisterThread = new Thread(persister);
    persisterThread.start();
  }

  @Override
  public final void process(String... seedUrls) throws InterruptedException,
      ExecutionException {
    final Deque<String> linksToCrawl = new LinkedList<>();
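    // probabilistic visited set, sized for the expected number of fetches; a
    // false positive can cause a URL to be skipped, but no URL is crawled
    // twice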
    BloomFilter<CharSequence> visited = BloomFilter.create(
        Funnels.stringFunnel(Charset.defaultCharset()), fetches);
    ExecutorService threadPool = Executors.newFixedThreadPool(poolSize);
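    // wrap the pool in a completion service so that finished batches can be
    // harvested in completion order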
    final CompletionService<Set<T>> completionService = new ExecutorCompletionService<>(
        threadPool);

    long appStart = System.currentTimeMillis();

    LOG.info("Num sites to fetch " + fetches);

    int currentRunningThreads = 0;
    // seed the crawl queue with the start URLs
    linksToCrawl.addAll(Arrays.asList(seedUrls));
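    // mark the seeds as visited so they are not re-queued if rediscovered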
    for (String seed : seedUrls) {
      visited.put(seed);
    }
    // while we have not fetched enough sites yet
    while (true) {
      // batch together up to batchSize items, or as many as are left in the
      // queue
      final int length = Math.min(batchSize, linksToCrawl.size());
      // only schedule a new batch if we have fetches left and there is
      // actually something queued, otherwise we would submit empty batches
      if (fetches > 0 && length > 0) {
        fetches -= length;
        List<String> linkList = new ArrayList<>(length);
        for (int i = 0; i < length; i++) {
          linkList.add(linksToCrawl.poll());
        }
        // submit a new thread for a batch
        completionService.submit(new FetchThread<>(linkList, extractor));
        currentRunningThreads++;
      }
      // now check whether other batches have completed. Block on take() when
      // nothing can be scheduled (queue drained) or the pool is
      // oversubscribed; otherwise use a non-blocking poll() so we can keep
      // scheduling new batches.
      Future<Set<T>> poll = null;
      if ((linksToCrawl.isEmpty() && currentRunningThreads > 0)
          || currentRunningThreads > poolSize) {
        poll = completionService.take();
      } else {
        poll = completionService.poll();
      }
      if (poll != null) {
        currentRunningThreads--;
        Set<T> set = poll.get();
        if (set != null) {
          // for each of our crawling results
          for (T v : set) {
            // go through the found outlinks
            for (String out : v.outlinks) {
              // if we haven't visited them yet
              if (!visited.mightContain(out)) {
                // queue them up
                linksToCrawl.offer(out);
                visited.put(out);
              }
            }
            persister.add(v);
          }
        }
      } else {
        // sleep for a second if none completed yet
        Thread.sleep(1000L);
      }
      if (fetches <= 0 && currentRunningThreads == 0) {
        break;
      }
    }

    persister.stop();
    persisterThread.join();
    threadPool.shutdownNow();
    LOG.info("Took overall time of " + (System.currentTimeMillis() - appStart)
        / 1000 + "s.");
  }

  public static void main(String[] args) throws InterruptedException,
      ExecutionException, IOException {
    String seedUrl = "http://news.google.de/";
    new MultithreadedCrawler<>(1000, new OutlinkExtractor(),
        new SequenceFileResultWriter<>()).process(seedUrl);
  }

}
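
For reference, a minimal driver sketch that tunes the pool and batch sizes explicitly instead of relying on the 32/10 defaults. It assumes, as the diamond inference in main() above suggests, that OutlinkExtractor produces plain FetchResult instances; the thread count, batch size, fetch budget, and TunedCrawlDriver class name are arbitrary illustration values, not part of the original code.

import java.io.IOException;
import java.util.concurrent.ExecutionException;

import de.jungblut.crawl.FetchResult;
import de.jungblut.crawl.MultithreadedCrawler;
import de.jungblut.crawl.SequenceFileResultWriter;
import de.jungblut.crawl.extraction.OutlinkExtractor;

public class TunedCrawlDriver {

  public static void main(String[] args) throws InterruptedException,
      ExecutionException, IOException {
    // hypothetical tuning: 16 threads, 5 URLs per batch, 500 fetches total
    MultithreadedCrawler<FetchResult> crawler = new MultithreadedCrawler<>(16,
        5, 500, new OutlinkExtractor(), new SequenceFileResultWriter<>());
    // crawl starting from a single seed URL
    crawler.process("http://news.google.de/");
  }
}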