package net.sf.jpluck.spider;
import net.sf.jpluck.http.CookieStore;
import net.sf.jpluck.http.HttpCache;
import net.sf.jpluck.http.HttpResponse;
import net.sf.jpluck.jxl.Feed;
import org.apache.commons.threadpool.DefaultThreadPool;
import java.net.URI;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Vector;
import java.util.logging.Logger;
/**
 * Schedules retrieval and parsing tasks for a JXL document and waits until all of them
 * have completed or the spider has been cancelled.
 *
 * <p>Retrieval tasks ({@code FileRetriever}, {@code HttpRetriever}) are queued on one thread
 * pool and parse tasks ({@code FeedParser}, {@code SiteParser}) on another. Every scheduled
 * task is counted through {@link #notifyTaskScheduled()}; {@link #run()} blocks until the
 * count drops to zero via {@link #notifyTaskDone()}. The tasks themselves are not visible in
 * this file, so they are presumably responsible for calling {@code notifyTaskDone()} when
 * they finish.
 *
 * <p>Listener callbacks are delivered while holding this object's monitor, so deliveries are
 * serialized. The outstanding-task counter is guarded by a separate private lock so that
 * {@link #cancel()} never has to wait for a listener callback to return.
 */
public class Spider implements Runnable {
    /** Suggested default for the {@code maxRetrieveThreads} constructor argument; not referenced in this class. */
    public static final int DEFAULT_MAX_RETRIEVE_THREADS = 2;

    /** Suggested default for the {@code maxParseThreads} constructor argument; not referenced in this class. */
    public static final int DEFAULT_MAX_PARSE_THREADS = 1;

    /** Global flag exposed through {@link #isOffline()}; volatile so writes are visible to all threads. */
    private static volatile boolean offline = false;

    // Package-visible state, presumably read by the retriever/parser tasks of this package.
    CookieStore cookieStore;
    net.sf.jpluck.jxl.Document jxlDocument;
    net.sf.jpluck.plucker.Document pluckerDocument;
    int httpTimeout;

    private final DefaultThreadPool parseQueue;
    private final DefaultThreadPool retrieveQueue;
    private final HttpCache httpCache;

    /** Registered {@code SpiderListener}s; a Vector, so single operations lock on the list itself. */
    private final List listenerList = new Vector();
    private final Logger logger = Logger.getLogger("spider");

    /** URIs already offered to {@link #retrieve}; guarded by this object's monitor. */
    private final Set uriSet = new HashSet();

    /** Guards {@link #activeThreads} and is the monitor {@link #run()} waits on. */
    private final Object taskLock = new Object();

    private volatile boolean running = false;

    /** Number of scheduled tasks that have not yet reported completion; guarded by {@link #taskLock}. */
    private int activeThreads = 0;

    /**
     * Creates a spider for the given documents.
     *
     * @param pluckerDocument document consulted by {@link #parse} to skip already parsed resources
     * @param jxlDocument document supplying the start URI, referrer, link depth and include/exclude rules
     * @param httpCache HTTP cache, or {@code null} to disable caching
     * @param cookieStore cookie store made available to the tasks of this package
     * @param maxRetrieveThreads value passed to the retrieval thread pool constructor
     * @param maxParseThreads value passed to the parse thread pool constructor
     * @param httpTimeout HTTP timeout made available to the tasks of this package
     */
    public Spider(net.sf.jpluck.plucker.Document pluckerDocument, net.sf.jpluck.jxl.Document jxlDocument,
                  HttpCache httpCache, CookieStore cookieStore, int maxRetrieveThreads, int maxParseThreads,
                  int httpTimeout) {
        this.pluckerDocument = pluckerDocument;
        this.jxlDocument = jxlDocument;
        this.httpCache = httpCache;
        this.cookieStore = cookieStore;
        retrieveQueue = new DefaultThreadPool(maxRetrieveThreads);
        parseQueue = new DefaultThreadPool(maxParseThreads);
        this.httpTimeout = httpTimeout;
    }

    /**
     * @return {@code true} between the start of {@link #run()} and its completion or a call to
     *         {@link #cancel()}
     */
    public boolean isRunning() {
        return running;
    }

    /**
     * Registers a listener unless it is already registered.
     *
     * @param listener the listener to add
     */
    public void addSpiderListener(SpiderListener listener) {
        // Lock the list so the contains/add pair is atomic with respect to other registrations.
        synchronized (listenerList) {
            if (!listenerList.contains(listener)) {
                listenerList.add(listener);
            }
        }
    }

    /**
     * Requests the spider to stop and wakes up {@link #run()} if it is waiting for tasks.
     */
    public void cancel() {
        running = false;
        // Without this notification run() would stay blocked until some task happened to finish.
        synchronized (taskLock) {
            taskLock.notifyAll();
        }
    }

    /**
     * Unregisters a listener; does nothing if it is not registered.
     *
     * @param listener the listener to remove
     */
    public void removeSpiderListener(SpiderListener listener) {
        listenerList.remove(listener);
    }

    /**
     * Schedules retrieval of the document URI (as an external link at level 0), then blocks
     * until no scheduled task is outstanding or the spider is cancelled. Both thread pools
     * are stopped before this method returns, also when it exits with an exception.
     *
     * @throws IllegalStateException if the spider is already running
     */
    public void run() {
        synchronized (taskLock) {
            if (running) {
                throw new IllegalStateException("Spider is already running.");
            }
            running = true;
        }
        boolean interrupted = false;
        try {
            retrieve(jxlDocument.getUri(), LinkFilter.EXTERNAL_LINK, 0, jxlDocument.getReferrer());
            // The condition is tested while holding the lock so a notification cannot slip in
            // between the test and the wait.
            synchronized (taskLock) {
                while (running && (activeThreads > 0)) {
                    try {
                        taskLock.wait();
                    } catch (InterruptedException e) {
                        // Keep waiting for the tasks, but remember the interrupt for the caller.
                        interrupted = true;
                    }
                }
            }
        } finally {
            retrieveQueue.stop();
            parseQueue.stop();
            running = false;
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /** Notifies all listeners that parsing of {@code uri} has completed. */
    synchronized void fireParsingCompleted(String uri) {
        SpiderListener[] listeners = listenerSnapshot();
        for (int i = 0; i < listeners.length; i++) {
            listeners[i].parsingCompleted(uri);
        }
    }

    /** Notifies all listeners that parsing of {@code uri} has been scheduled. */
    synchronized void fireParsingScheduled(String uri) {
        SpiderListener[] listeners = listenerSnapshot();
        for (int i = 0; i < listeners.length; i++) {
            listeners[i].parsingScheduled(uri);
        }
    }

    /** Notifies all listeners that parsing of {@code uri} has started. */
    synchronized void fireParsingStarted(String uri) {
        SpiderListener[] listeners = listenerSnapshot();
        for (int i = 0; i < listeners.length; i++) {
            listeners[i].parsingStarted(uri);
        }
    }

    /** Notifies all listeners that retrieval of {@code uri} has completed. */
    synchronized void fireRetrievalCompleted(String uri) {
        SpiderListener[] listeners = listenerSnapshot();
        for (int i = 0; i < listeners.length; i++) {
            listeners[i].retrievalCompleted(uri);
        }
    }

    /** Notifies all listeners that retrieval of {@code uri} has been scheduled. */
    synchronized void fireRetrievalScheduled(String uri) {
        SpiderListener[] listeners = listenerSnapshot();
        for (int i = 0; i < listeners.length; i++) {
            listeners[i].retrievalScheduled(uri);
        }
    }

    /** Notifies all listeners that retrieval of {@code uri} has started. */
    synchronized void fireRetrievalStarted(String uri) {
        SpiderListener[] listeners = listenerSnapshot();
        for (int i = 0; i < listeners.length; i++) {
            listeners[i].retrievalStarted(uri);
        }
    }

    /**
     * Passes a status message to all listeners.
     *
     * @param message the message to deliver
     */
    public synchronized void fireStatusMessage(String message) {
        SpiderListener[] listeners = listenerSnapshot();
        for (int i = 0; i < listeners.length; i++) {
            listeners[i].statusMessage(message);
        }
    }

    /** Records that one scheduled task has finished and wakes up {@link #run()}. */
    void notifyTaskDone() {
        synchronized (taskLock) {
            activeThreads--;
            taskLock.notifyAll();
        }
    }

    /** Records that one more task has been scheduled. */
    void notifyTaskScheduled() {
        synchronized (taskLock) {
            activeThreads++;
        }
    }

    /**
     * Queues a parse task for the resource unless the Plucker document already contains its
     * URI. A {@code FeedParser} is used when the JXL document is a {@code Feed}, otherwise a
     * {@code SiteParser}.
     *
     * @param resource the resource to parse
     */
    synchronized void parse(Resource resource) {
        if (pluckerDocument.contains(resource.getURI())) {
            logger.finer(resource.getURI() + " already parsed. Skipping.");
            return;
        }

        Parser parser;
        if (jxlDocument instanceof Feed) {
            parser = new FeedParser(this, resource);
        } else {
            parser = new SiteParser(this, resource);
        }

        // Count the task before queueing it so run() cannot observe zero outstanding tasks.
        notifyTaskScheduled();
        parseQueue.invokeLater(parser);
        logger.finer("Added to parsing queue: " + resource.getURI());
        fireParsingScheduled(resource.getURI());
    }

    /**
     * Queues a retrieval task for the URI if it has not been offered before, lies within the
     * document's link depth and passes the document's include/exclude rules. Embedded image
     * links are ignored when the document does not include embedded images. Only the
     * {@code file} and {@code http} schemes are handled; anything else, including a URI
     * without a scheme, is logged and dropped.
     *
     * @param uri the URI to retrieve
     * @param linkType one of the {@code LinkFilter} link type constants
     * @param level link level of the URI, compared against the document's link depth
     * @param referrer referrer passed on to HTTP retrievals
     */
    synchronized void retrieve(String uri, int linkType, int level, String referrer) {
        boolean embeddedImage = (linkType == LinkFilter.EMBEDDED_IMAGE_LINK);
        if (embeddedImage && !jxlDocument.isIncludeEmbeddedImages()) {
            return;
        }
        if (uriSet.contains(uri) || (level > jxlDocument.getLinkDepth())) {
            return;
        }

        // Remember the URI even when it ends up excluded, so it is evaluated only once.
        uriSet.add(uri);
        if (jxlDocument.shouldExclude(uri) || !jxlDocument.shouldInclude(uri, linkType)) {
            logger.fine("Excluding " + uri);
            return;
        }

        // getScheme() returns null for a URI without a scheme, hence the constant-first equals.
        String scheme = URI.create(uri).getScheme();
        if ("file".equals(scheme)) {
            notifyTaskScheduled();
            retrieveQueue.invokeLater(new FileRetriever(this, uri, level, embeddedImage));
        } else if ("http".equals(scheme)) {
            notifyTaskScheduled();
            retrieveQueue.invokeLater(new HttpRetriever(this, uri, level, embeddedImage, referrer));
        } else {
            logger.warning("Scheme not handled: " + scheme + " (" + uri + ")");
            return;
        }
        fireRetrievalScheduled(uri);
        logger.finer("Added to retrieval queue: " + uri);
    }

    /**
     * Looks up a response in the HTTP cache, opening the cache first if necessary.
     *
     * @param uri the URI to look up
     * @return the response returned by the cache, or {@code null} when caching is disabled for
     *         the document, no cache was supplied, or the spider is not running
     */
    HttpResponse retrieveFromCache(String uri) {
        HttpResponse response = null;
        if (jxlDocument.isUseHTTPCache() && httpCache != null) {
            // The lock ensures only one thread opens the cache.
            synchronized (httpCache) {
                if (!running) {
                    return null;
                }
                if (!httpCache.isOpen()) {
                    logger.info("Opening HTTP cache.");
                    fireStatusMessage("Opening HTTP cache");
                    httpCache.open();
                }
            }
            response = httpCache.retrieve(uri);
        }
        return response;
    }

    /**
     * Adds a response to the HTTP cache when a cache was supplied, the document uses the
     * cache and the spider is running.
     *
     * @param uri the URI the response belongs to
     * @param response the response to store
     */
    void storeInCache(String uri, HttpResponse response) {
        if ((httpCache != null) && jxlDocument.isUseHTTPCache() && running) {
            logger.finest("Storing in cache: " + uri);
            httpCache.add(uri, response);
        }
    }

    /**
     * @return the value last passed to {@link #setOffline(boolean)}, initially {@code false}
     */
    public static boolean isOffline() {
        return offline;
    }

    /**
     * Sets the global offline flag.
     *
     * @param b the new flag value
     */
    public static void setOffline(boolean b) {
        offline = b;
    }

    /**
     * Copies the registered listeners into an array so that listeners added or removed
     * during a callback cannot break the iteration over them.
     */
    private SpiderListener[] listenerSnapshot() {
        return (SpiderListener[]) listenerList.toArray(new SpiderListener[0]);
    }
}