Package net.sf.jpluck.spider

Source Code of net.sf.jpluck.spider.Spider

package net.sf.jpluck.spider;

import net.sf.jpluck.http.CookieStore;
import net.sf.jpluck.http.HttpCache;
import net.sf.jpluck.http.HttpResponse;
import net.sf.jpluck.jxl.Feed;

import org.apache.commons.threadpool.DefaultThreadPool;

import java.net.URI;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Vector;
import java.util.logging.Logger;


public class Spider implements Runnable {
  public static final int DEFAULT_MAX_RETRIEVE_THREADS = 2;
  public static final int DEFAULT_MAX_PARSE_THREADS = 1;
  CookieStore cookieStore;
  net.sf.jpluck.jxl.Document jxlDocument;
  net.sf.jpluck.plucker.Document pluckerDocument;
  int httpTimeout;
  private DefaultThreadPool parseQueue;
  private DefaultThreadPool retrieveQueue;
  private HttpCache httpCache;
  private List listenerList = new Vector();
  private Logger logger = Logger.getLogger("spider");
  private Set uriSet = new HashSet();
  private volatile boolean running = false;
  private int activeThreads = 0;
  private int timeoutCount;
  private static boolean offline=false;   

  public Spider(net.sf.jpluck.plucker.Document pluckerDocument, net.sf.jpluck.jxl.Document jxlDocument,
          HttpCache httpCache, CookieStore cookieStore, int maxRetrieveThreads, int maxParseThreads,
          int httpTimeout) {
    this.pluckerDocument = pluckerDocument;
    this.jxlDocument = jxlDocument;
    this.httpCache = httpCache;
    this.cookieStore = cookieStore;
    retrieveQueue = new DefaultThreadPool(maxRetrieveThreads);
    parseQueue = new DefaultThreadPool(maxParseThreads);
    this.httpTimeout = httpTimeout;
  }

  public boolean isRunning() {
    return running;
  }

  public void addSpiderListener(SpiderListener listener) {
    if (!listenerList.contains(listener)) {
      listenerList.add(listener);
    }
  }

  public void cancel() {
    running = false;
  }

  public void removeSpiderListener(SpiderListener listener) {
    if (listenerList.contains(listener)) {
      listenerList.remove(listener);
    }
  }

  public void run() {
    if (running) {
      throw new IllegalStateException("Spider is already running.");
    }
    running = true;
    retrieve(jxlDocument.getUri(), LinkFilter.EXTERNAL_LINK, 0, jxlDocument.getReferrer());
    while (running && (activeThreads > 0)) {
      synchronized (this) {
        try {
          wait();
        } catch (InterruptedException e) {
        }
      }
    }
    retrieveQueue.stop();
    parseQueue.stop();
    running = false;
  }

  synchronized void fireParsingCompleted(String uri) {
    for (Iterator iterator = listenerList.iterator(); iterator.hasNext();) {
      SpiderListener spiderListener = (SpiderListener) iterator.next();
      spiderListener.parsingCompleted(uri);
    }
  }

  synchronized void fireParsingScheduled(String uri) {
    for (Iterator iterator = listenerList.iterator(); iterator.hasNext();) {
      SpiderListener spiderListener = (SpiderListener) iterator.next();
      spiderListener.parsingScheduled(uri);
    }
  }

  synchronized void fireParsingStarted(String uri) {
    for (Iterator iterator = listenerList.iterator(); iterator.hasNext();) {
      SpiderListener spiderListener = (SpiderListener) iterator.next();
      spiderListener.parsingStarted(uri);
    }
  }

  synchronized void fireRetrievalCompleted(String uri) {
    for (Iterator iterator = listenerList.iterator(); iterator.hasNext();) {
      SpiderListener spiderListener = (SpiderListener) iterator.next();
      spiderListener.retrievalCompleted(uri);
    }
  }

  synchronized void fireRetrievalScheduled(String uri) {
    for (Iterator iterator = listenerList.iterator(); iterator.hasNext();) {
      SpiderListener spiderListener = (SpiderListener) iterator.next();
      spiderListener.retrievalScheduled(uri);
    }
  }

  synchronized void fireRetrievalStarted(String uri) {
    for (Iterator iterator = listenerList.iterator(); iterator.hasNext();) {
      SpiderListener spiderListener = (SpiderListener) iterator.next();
      spiderListener.retrievalStarted(uri);
    }
  }

  public synchronized void fireStatusMessage(String message) {
    for (Iterator iterator = listenerList.iterator(); iterator.hasNext();) {
      SpiderListener spiderListener = (SpiderListener) iterator.next();
      spiderListener.statusMessage(message);
    }
  }
  synchronized void notifyTaskDone() {
    activeThreads--;
    notify();
  }

  synchronized void notifyTaskScheduled() {
    activeThreads++;
  }

  synchronized void parse(Resource resource) {
    if (pluckerDocument.contains(resource.getURI())) {
      logger.finer(resource.getURI() + " already parsed. Skippimg.");
      return;
    }

    Parser parser = null;
    if (jxlDocument instanceof Feed) {
      parser = new FeedParser(this, resource);
    } else {
      parser = new SiteParser(this, resource);
    }
    notifyTaskScheduled();
    parseQueue.invokeLater(parser);
    logger.finer("Added to parsing queue: " + resource.getURI());
    fireParsingScheduled(resource.getURI());
  }

  synchronized void retrieve(String uri, int linkType, int level, String referrer) {
    if ((linkType == LinkFilter.EMBEDDED_IMAGE_LINK) && !jxlDocument.isIncludeEmbeddedImages()) {
      return;
    }
    if (!uriSet.contains(uri) && (level <= jxlDocument.getLinkDepth())) {
      uriSet.add(uri);
      if (jxlDocument.shouldExclude(uri) || !jxlDocument.shouldInclude(uri, linkType)) {
        logger.fine("Excluding " + uri);
        return;
      }

      String scheme = URI.create(uri).getScheme();
      if (scheme.equals("file")) {
        notifyTaskScheduled();
        retrieveQueue.invokeLater(new FileRetriever(this, uri, level,
                              (linkType == LinkFilter.EMBEDDED_IMAGE_LINK)));
      } else if (scheme.equals("http")) {
        notifyTaskScheduled();
        retrieveQueue.invokeLater(new HttpRetriever(this, uri, level,
                              (linkType == LinkFilter.EMBEDDED_IMAGE_LINK), referrer));
      } else {
        logger.warning("Scheme not handled: " + scheme + " (" + uri + ")");
        return;
      }
      fireRetrievalScheduled(uri);
      logger.finer("Added to retrieval queue: " + uri);
    }
  }

  HttpResponse retrieveFromCache(String uri) {
    HttpResponse response = null;
    if (jxlDocument.isUseHTTPCache() && httpCache != null) {
      synchronized (httpCache) {
        if (!running) {
          return null;
        }
        if (!httpCache.isOpen()) {
          logger.info("Opening HTTP cache.");
          fireStatusMessage("Opening HTTP cache");
          httpCache.open();
        }
      }
      response = httpCache.retrieve(uri);
    }

    return response;
  }

  void storeInCache(String uri, HttpResponse response) {
    if ((httpCache != null) && jxlDocument.isUseHTTPCache() && running) {
      logger.finest("Storing in cache: " + uri);
      httpCache.add(uri, response);
    }
  }
  public static boolean isOffline() {
    return offline;
  }

  public static void setOffline(boolean b) {
    offline = b;
  }
}
TOP

Related Classes of net.sf.jpluck.spider.Spider

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.