package net.matuschek.spider;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;
import java.util.Vector;
import net.matuschek.html.FormFiller;
import net.matuschek.html.HtmlDocument;
import net.matuschek.http.DocManagerException;
import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.ExtendedURL;
import net.matuschek.http.HttpConstants;
import net.matuschek.http.HttpDoc;
import net.matuschek.http.HttpDocManager;
import net.matuschek.http.HttpException;
import net.matuschek.http.HttpHeader;
import net.matuschek.http.HttpTool;
import net.matuschek.http.HttpToolCallback;
import net.matuschek.http.NTLMAuthorization;
import net.matuschek.http.cookie.CookieManager;
import net.matuschek.spider.docfilter.FilterChain;
import net.matuschek.spider.docfilter.FilterException;
import org.apache.log4j.Category;
import org.w3c.dom.Element;
/**
 * This class implements a web robot that does a search through
 * the web starting from a given start document up to a given
 * search depth.
 *
 * @author Daniel Matuschek / Oliver Schmidt
 * @version $Revision: 1.35 $
 */
public class WebRobot implements Runnable, Cloneable {
/** the name of the robot */
private final static String ROBOT_NAME = "JoBo";
/** the default agent name */
private final static String AGENT_NAME =
ROBOT_NAME+"/1.4 (http://www.matuschek.net/jobo.html)";
/** the robot exception handler*/
protected RobotExceptionHandler exceptionHandler =
new DefaultRobotExceptionHandler();
/** default maximal search depth */
private final static int DEFAULT_DEPTH = 10;
/** the URL where the robot's walk starts from */
protected URL startURL = null;
/** the host and directory where retrieval started from */
protected String startDir = "";
/** maximal search depth */
protected int maxDepth = DEFAULT_DEPTH;
/** is it allowed to walk to other hosts than the starting host ? */
protected boolean walkToOtherHosts = false;
/** DocManager will store or process retrieved documents */
protected HttpDocManager docManager;
/** HttpTool will be used to retrieve documents from a web server */
protected HttpTool httpTool = new HttpTool();
/** Log4J category for logging */
protected Category log;
/** Referer used to retrieve the first document */
protected String startReferer = "-";
/** test for robots.txt */
protected NoRobots robCheck;
/** current tasks */
protected TaskList todo = null;
/** a list of all URLs we got already */
protected TaskList visited = null;
/** ignore settings in /robots.txt ? */
protected boolean ignoreRobotsTxt = false;
/** sleep that number of seconds after every retrieved document */
protected int sleepTime = 1;
/** fill out forms */
protected FormFiller formFiller = new FormFiller();
/** these URLs can be visited more than once */
protected Vector visitMany = new Vector();
/** for callback to the user interface **/
protected WebRobotCallback webRobotCallback = null;
/** should we stop robot operation ? **/
protected boolean stopIt = false;
/** to check if it is allowed to travel to a given URL **/
protected URLCheck urlCheck = null;
/** should the robot suspend the current walk() **/
protected boolean sleep;
/** list of allowed URLs (even if walkToOtherHosts is false) **/
protected Vector allowedURLs = new Vector();
/** allow travelling the whole host ? */
protected boolean allowWholeHost = true;
/**
* maximum document age in seconds, negative value means
* no limit
*/
protected long maxDocumentAge = -1; // no limit
/**
* allow travelling to all subdomains of the start host ?
* @see #setAllowWholeDomain(boolean)
*/
protected boolean allowWholeDomain = true;
/**
* do more flexible tests if the new URL is on the same host
* @see #basicURLCheck(URL)
*/
protected boolean flexibleHostCheck = false;
/**
* FilterChain to filter the document before storing it
*/
protected FilterChain filters = null;
/**
* don't retrieve pages again that are already stored in the DocManager
*/
protected boolean allowCaching = true;
/**
* Check for documents with the same content
*/
protected boolean duplicateCheck = false;
/**
* initializes the robot with the default implementation
* of the TaskList interface
*
* @param expectedDocumentCount the expected number of documents
*/
public WebRobot(int expectedDocumentCount) {
log = Category.getInstance(getClass().getName());
content2UrlMap = new HashMap(expectedDocumentCount);
registerVisitedList(new HashedMemoryTaskList(false,
expectedDocumentCount));
registerToDoList(new HashedMemoryTaskList(true,
expectedDocumentCount));
this.expectedDocumentCount = expectedDocumentCount;
this.setAgentName(AGENT_NAME);
}
/**
* initializes the robot with the default implementation of the TaskList
* interface
*/
public WebRobot() {
this(DEFAULT_EXPECTED_DOCUMENT_COUNT);
}
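/*
 * Typical usage (a sketch, not part of the API itself): configure a robot,
 * point it at a start URL and let it run. The HttpDocManager instance
 * ("myDocManager") is only a placeholder for whatever implementation you use.
 *
 *   WebRobot robot = new WebRobot();
 *   robot.setStart("http://www.example.com/");
 *   robot.setMaxDepth(3);
 *   robot.setSleepTime(1);
 *   robot.setDocManager(myDocManager);
 *   robot.run();                        // or start it in its own Thread
 */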
/**
* Sets the implementation class for the backend task list storage.
* WebRobot uses the TaskList interface to store future tasks.
*
* If you want to use your own TaskList implementation, just call
* this method.
*
* @param todo TaskList to be used for the "to do" list
*/
public void registerToDoList(TaskList todo) {
this.todo = todo;
}
/**
* Sets the implementation class for the backend task list storage.
* WebRobot uses the TaskList interface to store URLs that have
* been retrieved before.
*
* If you want to use your own TaskList implementation, just call
* this method.
*
* @param visited TaskList to be used for the list of visited URLs
*/
public void registerVisitedList(TaskList visited) {
this.visited = visited;
}
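/*
 * Sketch: both task lists can be replaced before the robot starts, e.g. with
 * larger pre-sized HashedMemoryTaskList instances (same constructor arguments
 * as used in WebRobot(int) above); the sizes here are just example values.
 *
 *   robot.registerToDoList(new HashedMemoryTaskList(true, 200000));
 *   robot.registerVisitedList(new HashedMemoryTaskList(false, 200000));
 */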
/**
* @return the start URL for this robot
*/
public URL getStartURL() {
return startURL;
}
/**
* Sets the start URL for this robot
* @param startURL the start URL
*/
public void setStartURL(URL startURL) {
String path = startURL.getPath();
this.startURL = startURL;
// is it a directory ?
if (path.endsWith("/")) {
this.startDir = startURL.getHost() + path;
} else {
int pos = path.lastIndexOf("/");
if (pos < 0) {
// this happens for URLs without a path
this.startDir = startURL.getHost() + "/";
} else {
this.startDir = startURL.getHost() + path.substring(0, pos + 1);
}
}
}
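/*
 * Examples of the startDir values computed above (host + directory part):
 *   http://host/a/b/     -> startDir = "host/a/b/"
 *   http://host/a/b.html -> startDir = "host/a/"
 *   http://host          -> startDir = "host/"
 */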
/**
* @return the maximal allowed search depth
*/
public int getMaxDepth() {
return maxDepth;
}
/**
* sets the maximal search depth
* @param maxDepth
*/
public void setMaxDepth(int maxDepth) {
this.maxDepth = maxDepth;
}
/**
* Get the value of bandwidth of the used HttpTool
* @return value of bandwidth.
*/
public int getBandwidth() {
return httpTool.getBandwidth();
}
/**
* Set the value of bandwidth of the used HttpTool
* @param bandwidth Value to assign to bandwidth.
*/
public void setBandwidth(int bandwidth) {
httpTool.setBandwidth(bandwidth);
}
/**
* gets the WalkToOtherHost status
* @return true if the Robot is allowed to travel to other
* hosts than the start host, false otherwise
*/
public boolean getWalkToOtherHosts() {
return walkToOtherHosts;
}
/**
* sets the WalkToOtherHosts status
* @param walkToOtherHosts true if the Robot is allowed to travel to other
* hosts than the start host, false otherwise
*/
public void setWalkToOtherHosts(boolean walkToOtherHosts) {
this.walkToOtherHosts = walkToOtherHosts;
}
/**
* gets the AllowWholeHost value
* @return true if the Robot is allowed to travel to the whole
* host where it started from, false otherwise. If false, it is only
* allowed to travel to URLs below the start URL
*/
public boolean getAllowWholeHost() {
return allowWholeHost;
}
/**
* sets the AllowWholeHost status
* @param allowWholeHost if true, the Robot is allowed to
* travel to the whole host where it started from. Otherwise it is only
* allowed to travel to URLs below the start URL.
*/
public void setAllowWholeHost(boolean allowWholeHost) {
this.allowWholeHost = allowWholeHost;
}
/**
* Gets the AllowWholeDomain value.
* @return true if the Robot is allowed to travel to the whole
* domain of the start host, false otherwise.
* @see #setAllowWholeDomain(boolean)
*/
public boolean getAllowWholeDomain() {
return allowWholeDomain;
}
/**
* Sets the AllowWholeDomain status
* @param allowWholeDomain if true, the Robot is allowed to travel
* to all hosts in the same domain as the starting host. E.g. if you
* start at www.apache.org, it is also allowed to travel to
* jakarta.apache.org, xml.apache.org ...
*/
public void setAllowWholeDomain(boolean allowWholeDomain) {
this.allowWholeDomain = allowWholeDomain;
}
/**
* Gets the state of flexible host checking (enabled or disabled).
*
* To find out if a new URL is on the same host, the robot usually
* compares the host part of both. Some web servers have an inconsistent
* addressing scheme and use the hostname www.domain.com and domain.com.
* With flexible host check enabled, the robot will consider both
* hosts as equal.
*
* @return true, if flexible host checking is enabled
*/
public boolean getFlexibleHostCheck() {
return flexibleHostCheck;
}
/**
* Defines if the host test should be more flexible.
*
* To find out if a new URL is on the same host, the robot usually
* compares the host part of both. Some web servers have an inconsistent
* addressing scheme and use the hostname www.domain.com and domain.com.
* With flexible host check enabled, the robot will consider both
* hosts as equal.
*
* @param flexibleHostCheck set this true, to enable flexible host checking
* (disabled by default)
*/
public void setFlexibleHostCheck(boolean flexibleHostCheck) {
this.flexibleHostCheck = flexibleHostCheck;
}
/**
* Gets the AllowCaching value.
* @return true if the Robot is allowed to cache documents in the
* docManager
* @see #setAllowCaching(boolean)
*/
public boolean getAllowCaching() {
return allowCaching;
}
/**
* Sets the AllowCaching status
*
* @param allowCaching if true, the Robot is allowed to use
* cached documents. That means it will first try to get the document
* from the docManager cache and will only retrieve it if it is
* not found in the cache. If the cache returns a document, the robot
* will NEVER retrieve it again. Therefore, expiration mechanisms have
* to be included in the HttpDocManager method retrieveFromCache.
* @see net.matuschek.http.HttpDocManager#retrieveFromCache(java.net.URL)
*/
public void setAllowCaching(boolean allowCaching) {
this.allowCaching = allowCaching;
}
/**
* @return the document manager of this robot
* @see HttpDocManager
*/
public HttpDocManager getDocManager() {
return docManager;
}
/**
* Sets the document manager for this robot <br />
* Without a document manager, the robot will travel through the web but
* won't do anything with the retrieved documents (it simply forgets
* them).
* A document manager can store them, extract information or
* whatever you like.
* There can be only one document manager, but you are free to combine
* functionalities of available document managers in a new object (e.g.
* to store the document and extract meta information).
* @param docManager
*/
public void setDocManager(HttpDocManager docManager) {
this.docManager = docManager;
}
/**
* Sets the CookieManager used by the HttpTool
* By default a MemoryCookieManager will be used, but you can
* use this method to use your own CookieManager implementation.
*
* @param cm an object that implements the CookieManager interface
*/
public void setCookieManager(CookieManager cm) {
httpTool.setCookieManager(cm);
}
/**
* Gets the CookieManager used by the HttpTool
*
* @return the CookieManager that will be used by the HttpTool
*/
public CookieManager getCookieManager() {
return httpTool.getCookieManager();
}
/**
* Sets the DownloadRuleSet
* @param rules the download rule set to use
*/
public void setDownloadRuleSet(DownloadRuleSet rules) {
httpTool.setDownloadRuleSet(rules);
}
/**
* Sets the URLCheck for this robot
* @param check
*/
public void setURLCheck(URLCheck check) {
this.urlCheck = check;
}
/**
* sets a proxy to use
* @param proxyDescr the Proxy definition in the format host:port
*/
public void setProxy(String proxyDescr) throws HttpException {
httpTool.setProxy(proxyDescr);
}
/**
* @return the current proxy setting in the format host:port
*/
public String getProxy() {
return httpTool.getProxy();
}
/**
* @return the Referer setting for the first HTTP request
*/
public String getStartReferer() {
return startReferer;
}
/**
* sets the Referer setting for the first HTTP request
* @param startReferer a URL (e.g. http://www.matuschek.net)
*/
public void setStartReferer(String startReferer) {
this.startReferer = startReferer;
}
/**
* should we ignore robots.txt Robot Exclusion protocol ?
* @param ignoreRobotsTxt if set to true, the robot will ignore
* the settings of the /robots.txt file on the webserver
* <b>Know what you are doing if you change this setting</b>
*/
public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
this.ignoreRobotsTxt = ignoreRobotsTxt;
robCheck.setIgnore(ignoreRobotsTxt);
}
/**
* @return the sleeptime setting
*/
public int getSleepTime() {
return sleepTime;
}
/**
* sets the sleep time<br />
* after every retrieved document the robot will wait this time
* before getting the next document. This allows it to limit the
* load on the server.
* @param sleepTime wait time in seconds
*/
public void setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
}
/**
* sets the From: HTTP header<br />
* this should be a valid email address. It is not needed for the robot,
* but you should use it, because the administrator of the web server
* can contact you if the robot is doing things that they don't want
* @param fromAddress an RFC 822 email address
*/
public void setFromAddress(String fromAddress) {
httpTool.setFromAddress(fromAddress);
}
/**
* sets the list of form handlers
* @see net.matuschek.html.FormHandler for more
* information about form handlers
*/
public void setFormHandlers(Vector handlers) {
formFiller.setFormHandlers(handlers);
if (handlers != null && handlers.size() > 0) {
hasFormHandlers = true;
}
}
/**
* @return the list of form handlers
* @see net.matuschek.html.FormHandler for more information
* about form handlers
*/
public Vector getFormHandlers() {
return formFiller.getFormHandlers();
}
/**
* Gets the name of the "User-Agent" header that the robot will use
* @return the user agent name
*/
public String getAgentName() {
if (httpTool != null) {
return httpTool.getAgentName();
} else {
return null;
}
}
/**
* sets the agent name (User-Agent) for this robot
* @param name a name for this robot
* (e.g. "Mozilla 4.0 (compatible; Robot)")
*/
public void setAgentName(String name) {
httpTool.setAgentName(name);
// robCheck = new NoRobots(ROBOT_NAME, httpTool);
robCheck = new NoRobots(name, httpTool);
}
/**
* Gets the timeout (in seconds) for getting data of the used HttpTool
* @return the value of socketTimeout
* @see #setTimeout(int)
*/
public int getTimeout() {
if (httpTool != null) {
return httpTool.getTimeout();
} else {
return -1;
}
}
/**
* Sets the timeout for getting data. If HttpTool can't read data from a
* remote web server after this number of seconds it will stop the download
* of the current file
* @param timeout Timeout in seconds
*/
public void setTimeout(int timeout) {
httpTool.setTimeout(timeout);
}
/**
* Gets the NTLM authorization of the robot
* @return the ntlmAuthorization
*/
public NTLMAuthorization getNtlmAuthorization() {
if (httpTool != null) {
return httpTool.getNtlmAuthorization();
} else {
return null;
}
}
/**
* sets an NTLM authorization for this robot
* @param ntlmAuthorization the NTLM authorization for this robot
*/
public void setNtlmAuthorization(NTLMAuthorization ntlmAuthorization) {
httpTool.setNtlmAuthorization(ntlmAuthorization);
}
/**
* Gets the setting of the IgnoreRobotsTxt property
* @return true if robots.txt will be ignored, false otherwise
*/
public boolean getIgnoreRobotsTxt() {
return ignoreRobotsTxt;
}
/**
* Gets a vector of URLs that can be visited more than once
* @return a vector containing URLs formatted as Strings
*/
public Vector getVisitMany() {
return visitMany;
}
public void setVisitMany(Vector visitMany) {
this.visitMany = visitMany;
}
public void setHttpToolCallback(HttpToolCallback callback) {
httpTool.setCallback(callback);
}
public WebRobotCallback getWebRobotCallback() {
return webRobotCallback;
}
public void setWebRobotCallback(WebRobotCallback webRobotCallback) {
this.webRobotCallback = webRobotCallback;
}
/**
* Sets the sleep status for this robot. If a WebRobot is set to sleep
* after starting run(), it will pause after retrieving the current document
* and wait until setSleep(false) is called.
*/
public void setSleep(boolean sleep) {
this.sleep = sleep;
}
/**
* Is the robot sleeping ?
*/
public boolean isSleeping() {
return this.sleep;
}
/**
* Set the list of allowed URLs
* @param allowed a Vector containing Strings. URLs are allowed
* if they begin with one of the strings in this vector
*/
public void setAllowedURLs(Vector allowed) {
this.allowedURLs = allowed;
}
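/*
 * Sketch: entries are compared against host + path (without the protocol),
 * see basicURLCheck(URL). The host and path used here are just examples.
 *
 *   Vector allowed = new Vector();
 *   allowed.add("www.example.org/docs/");
 *   robot.setAllowedURLs(allowed);
 */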
/**
* Gets the list of allowed URLs
* @return a Vector containing Strings
* @see #setAllowedURLs(Vector)
*/
public Vector getAllowedURLs() {
return this.allowedURLs;
}
/**
* Enable/disable cookies
* @param enable if true, HTTP cookies will be enabled, if false
* the robot will not use cookies
*/
public void setEnableCookies(boolean enable) {
httpTool.setEnableCookies(enable);
}
/**
* Get the status of the cookie engine
* @return true, if HTTP cookies are enabled, false otherwise
*/
public boolean getEnableCookies() {
return httpTool.getEnableCookies();
}
/**
* Set the maximum age of documents to retrieve to this number
* of seconds
* @param maxAge integer value of the maximum document age
* (in seconds), negative value means no limit.
*/
public void setMaxDocumentAge(long maxAge) {
this.maxDocumentAge = maxAge;
}
/**
* Gets the maximum age of documents to retrieve
* @return maximum document age (in seconds), negative value means
* no limit.
*/
public long getMaxDocumentAge() {
return this.maxDocumentAge;
}
/**
* Sets a FilterChain. If the WebRobot uses a FilterChain it will
* pass every retrieved document through this FilterChain before
* storing it
*
* @param filters a FilterChain to use for filtering HttpDocs
*/
public void setFilters(FilterChain filters) {
this.filters = filters;
}
/**
* Delete all cookies
*/
public void clearCookies() {
httpTool.clearCookies();
}
/**
* thread run() method, simply calls work()
* @see #work()
*/
public void run() {
work();
}
/**
* does the robot's job: travels through the web using the configured
* parameters and retrieves documents
*/
public void work() {
RobotTask task = createRobotTask(startURL, maxDepth, startReferer);
todo.add(task);
walkTree();
// ok, we did it, clean up dynamic data (the visited vector)
cleanUp();
log.info("Documents retrieved by: Web=" + countWeb + " Cache=" + countCache + " Refresh=" + countRefresh+ " NoRefresh=" + countNoRefresh);
}
/**
* stop the current robot run
* note that this will not abort the current download but stop after
* the current download has finished
*/
public void stopRobot() {
stopIt = true;
}
/**
* Holds information about memory status.
* @see #handleMemoryError(OutOfMemoryError)
*/
private int memoryLevel = 0;
/** Can new tasks be added? (may depend on memoryLevel) */
protected boolean activatedNewTasks = true;
/** Are visited URLs collected? (may depend on memoryLevel) */
protected boolean activatedUrlHistory = true;
/** Are visited contents collected? (may depend on memoryLevel) */
protected boolean activatedContentHistory = true;
/** memory buffer of 200 KB to be freed in case of urgent memory needs */
private byte memoryBuffer[] = new byte[200 * 1024];
/**
* main loop: processes the task list until it is empty or the robot is stopped
*/
public void walkTree() {
while ((todo.size() > 0) && (!stopIt)) {
RobotTask task;
synchronized(visited) {
task = todo.removeFirst();
if (visited.contains(task) && (!visitMany.contains(task.getUrl().toString()))) {
log.debug("already visited: " + task.getUrl());
continue;
}
if (activatedUrlHistory) {
visited.add(task);
}
}
boolean repeat = true;
while (repeat) {
try {
retrieveURL(task);
repeat = false;
} catch (OutOfMemoryError memoryError) {
handleMemoryError(memoryError);
}
}
// sleep, if sleep is set to true
while (sleep) {
// callback
if (webRobotCallback != null) {
webRobotCallback.webRobotSleeping(true);
}
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
};
}
// callback
if (webRobotCallback != null) {
webRobotCallback.webRobotSleeping(false);
}
// callback
if (webRobotCallback != null) {
webRobotCallback.webRobotUpdateQueueStatus(todo.size());
}
spawnThread();
}
// callback
if (webRobotCallback != null) {
finishThreads();
}
}
/**
* Implements OutOfMemory handling strategies.
* Action depends on memoryLevel
* @param memoryError
* @throws OutOfMemoryError
*/
protected void handleMemoryError(OutOfMemoryError memoryError)
throws OutOfMemoryError {
memoryLevel++;
log.error("OutOfMemoryError level=" + memoryLevel + "! (visited=" + visited.size() + ", todo=" + todo.size() + ")");
switch (memoryLevel) {
case 1:
// don't remember visited URLs and contents any more
// and try it again
visited.clear(); activatedUrlHistory = false;
content2UrlMap.clear(); activatedContentHistory = false;
System.gc();
break;
case 2:
// stop adding new Tasks, just process todo-list.
// free memory buffer
// and try it again
activatedNewTasks = false;
memoryBuffer = null;
System.gc();
break;
case 3:
// there is nothing we can do any more.
// throw exception to stop robot
throw memoryError;
default :
// Should never be reached.
if (memoryBuffer != null) {
// avoid removal of memoryBuffer by compiler
System.err.println(memoryBuffer[0]);
}
throw memoryError;
}
}
/**
* calls webRobotDone and finishes docManager if
* executed in mainThread
*/
protected void finishThreads() {
webRobotCallback.webRobotDone();
if (docManager != null) {
docManager.finish();
}
}
/**
* Start subThreads for spidering.
* WARNING: Should only be implemented and used for local
* spidering purposes!
*/
protected synchronized void spawnThread() {
}
/** counter for calls of retrieveURL */
protected int iteration = 0;
/**
* retrieve the next URL, save it, extract all included links and
* add those links to the task list
* @param task task to retrieve, function does nothing if this is null
*/
public void retrieveURL(RobotTask task) {
if (task == null) {
log.debug("Empty task found, ignoring");
return;
}
long now = System.currentTimeMillis();
updateProgressInfo();
URL u = task.getUrl();
String urlString = u.toString();
String referer = task.getReferer();
int depth = task.getMaxDepth();
if (depth < 0) {
log.info("Max search depth reached");
return;
}
// we may need this additional check even if we
// tested it during adding to the tasks list
if (!isAllowed(u)) {
log.info("Url '" + u + "' filtered out.");
return;
}
if (u.getFile().equals("")) {
try {
urlString = urlString + "/";
u = new URL(urlString);
// fix for double retrieved files
task.setUrl(u);
} catch (MalformedURLException e) {
log.error("URL not well formed: " + e.toString());
// use exception handler to handle exception
exceptionHandler.handleException(this, u, e);
return;
}
}
log.info("retrieving " + urlString);
httpTool.setReferer(referer);
HttpDoc doc = null;
Vector links = null;
boolean cached = false;
// look in the cache first, but only for static pages
boolean reScan = true;
if ((docManager != null && allowCaching)
&& (task.getMethod() == HttpConstants.GET)
&& (task.getParamString() == null)) {
doc = docManager.retrieveFromCache(u);
/* if (doc != null) {
try {
links = ((UrlCollector) docManager).retrieveLinks(doc);
} catch (IOException e) {
log.info("Could not get links for " + u + ": " + e.getMessage());
links = null;
}
}*/
if (doc != null) {
countCache++;
long lastRetrieved = doc.getDateAsMilliSeconds();
double ageInSeconds = (now - lastRetrieved) / 1000;
if (ageInSeconds < 0) {
log.warn("DocumentAge < 0!");
}
reScan = maxDocumentAge >= 0 && ageInSeconds > maxDocumentAge;
if (reScan) {
long lastModified = doc.getLastModifiedAsMilliSeconds();
Date lastModifiedDate = new Date(lastModified);
httpTool.setIfModifiedSince(lastModifiedDate);
}
} else {
httpTool.setIfModifiedSince(null);
}
}
// if not found in cache, retrieve from the web page
if (reScan) {
HttpDoc newDoc;
boolean error = false;
try {
if (u.getProtocol().equalsIgnoreCase("file")) {
// retrieve from file
newDoc = retrieveFileURL(u, httpTool.getIfModifiedSince());
} else {
// retrieve from Web
newDoc = httpTool.retrieveDocument(u, task.getMethod(), task.getParamString());
if (newDoc != null) {
newDoc.setDate(now);
}
sleepNow();
}
if (newDoc!= null && !newDoc.isNotModified()) {
if (!(newDoc.isOk() || newDoc.isRedirect())) {
error = true;
}
} else {
// (newDoc == null || newDoc.isNotModified()) && doc != null
// -> Not modified
// -> refresh time stamp
if (doc != null) {
doc.setDate(now);
doc.setCached(false);
newDoc = null;
}
}
} catch (HttpException hex) {
error = true; newDoc = null;
}
if (error) {
int retry = task.retry();
if (retry <= maxRetries) {
synchronized(visited) {
todo.add(task);
visited.remove(task);
}
log.info("Adding " + u + " for retry no. " + retry);
return;
} else {
doc = docManager.retrieveFromCache(u);
if (doc == null) {
log.warn("Unsuccessfull retries for " + u);
return;
} else {
long docDate = doc.getDateAsMilliSeconds();
long age = (now - docDate);
age /= 1000;
if (expirationAge < 0 || age < expirationAge) {
newDoc = doc;
cached = true;
log.info("Cached document not expired: " + u);
} else {
log.warn("Cached document expired: " + u);
docManager.removeDocument(u);
return;
}
}
}
}
if (newDoc != null) {
countWeb++;
doc = newDoc;
links = null; // force recalculation of links
countRefresh++;
} else {
cached = true;
countNoRefresh++;
}
} else {
cached = true;
log.debug("Page " + u + " retrieved from cache");
}
// Add it to the visited vector
// needs to be synchronized with todo-list
// visited.add(task);
// got a NULL document, that doc was not retrieved
// usually, it was not downloaded because a rule didn't allow
// to download it
if (doc == null) {
log.info("not downloaded " + u);
return;
}
// Duplicate check
String duplicate=null;
if (duplicateCheck) {
duplicate = getContentVisitedURL(doc);
if (duplicate != null) {
log.info("URLs with same content found: " + urlString + " = " + duplicate);
} else {
try {
duplicate = docManager.findDuplicate(doc);
if (duplicate != null) {
log.info("URLs with same content found in cache: " + urlString + " = " + duplicate);
}
} catch (IOException e) {
e.printStackTrace();
}
}
if (duplicate != null) {
String pureDuplicate = removeParameters(duplicate);
String pureUrl = removeParameters(urlString);
if (!pureUrl.equals(pureDuplicate) && !cached) {
// different url not yet stored -> store it
try {
// retrieve links from original
HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
if (linksDoc != null) {
doc.setLinks(linksDoc.getLinks());
}
docManager.storeDocument(doc);
} catch (Exception e) {
e.printStackTrace();
}
}
RobotTask newTask;
try {
newTask = createRobotTask(new URL(duplicate), depth, referer);
// check already here for visited tasks to save memory
if (!visited.contains(newTask)) {
addTask(newTask);
}
} catch (MalformedURLException e) {
e.printStackTrace(); // Can't happen
}
return;
}
}
// was it an UnAuthorized document ?
if (doc.isUnauthorized()) {
log.info("got HTTP Unauthorized for URL " + u);
}
if (doc.isOk() || cached) {
// callback
if (webRobotCallback != null) {
int contentLength=0;
if (doc.getContent() != null) { contentLength=doc.getContent().length; }
webRobotCallback.webRobotRetrievedDoc(urlString, contentLength);
}
// extract links
try {
if (doc.isHTML() && (depth > 0)) {
// solving encoding problem
// HtmlDocument htmlDoc = new HtmlDocument(u, doc.getContent());
HtmlDocument htmlDoc = null;
HttpHeader contentTypeHeader = doc.getHeader("Content-type");
if (contentTypeHeader != null) {
String contentType = contentTypeHeader.getValue();
int index = contentType.toLowerCase().indexOf("charset=");
if (index > 0) {
htmlDoc = new HtmlDocument(u, doc.getContent(), contentType.substring(index+8));
} else {
htmlDoc = new HtmlDocument(u, doc.getContent());
}
} else {
htmlDoc = new HtmlDocument(u, doc.getContent());
}
// add links
// this depth-check is critical!
// otherwise far too many RobotTasks will be created
// this will cause a premature OutOfMemoryException!
if (depth > 0) {
if (duplicate != null) {
HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
doc.setLinks(linksDoc.getLinks());
} else if (cached) {
}
if (links == null) {
links = htmlDoc.getLinks();
doc.setLinks(links);
}
if (duplicate == null) {
HashSet checkedLinks = new HashSet();
for (int i = 0; i < links.size(); i++) {
URL link = (URL) links.elementAt(i);
log.info("Link: "+link);
// check already here for duplicate links to avoid expensive
// creation of RobotTasks
if (!checkedLinks.contains(link)) {
checkedLinks.add(link);
String myReferer = u.toString();
if (u.getUserInfo() != null) {
// remove userinfo from referer
int endindex = myReferer.indexOf("@")+1;
myReferer = "http://"+ myReferer.substring(endindex);
}
RobotTask newTask = createRobotTask((URL) links.elementAt(i), depth - 1, myReferer);
// check already here for visited tasks to save memory
if (!visited.contains(newTask)) {
// bad workaround to retrieve images first
if (newTask.urlString.endsWith(".jpg")) {
addTaskAtStart(newTask);
} else {
addTask(newTask);
}
}
}
}
}
}
if (hasFormHandlers) {
// add forms
Vector forms = htmlDoc.getElements("form");
for (int i = 0; i < forms.size(); i++) {
ExtendedURL eurl = formFiller.fillForm(u, (Element) forms.elementAt(i));
if (eurl != null) {
RobotTask newTask = createRobotTask(eurl.getURL(), depth - 1, u.toString());
newTask.setParamString(eurl.getParams());
newTask.setMethod(eurl.getRequestMethod());
addTask(newTask);
}
}
}
}
// catch any occurring error to keep on processing
} catch (OutOfMemoryError e) {
throw e;
} catch (Throwable e){
log.error("Unexpected error while extraction links from url '" + u + "':"+e);
e.printStackTrace();
// continue processing
}
// filter and store the document
if ((docManager != null)) {
try {
if (filters != null) {
doc = filters.process(doc);
} else {
log.debug("No filters defined");
}
if (isProcessingAllowed(doc)) {
docManager.processDocument(doc);
} else {
String md5 = doc.getHeaderValue(HttpHeader.CONTENT_MD5);
doc.setContent("Not for indexing".getBytes());
doc.setHeaderValue(HttpHeader.CONTENT_MD5, md5);
}
try {
docManager.storeDocument(doc);
} catch (Exception e) {
log.warn("could not store (not for indexing) " + urlString + ": " + e.getMessage());
}
if (activatedContentHistory && duplicate==null) {
setContentVisitedURL(doc, urlString);
}
} catch (DocManagerException e1) {
log.error("could not process document: " + e1.getMessage());
exceptionHandler.handleException(this, u, e1);
} catch (FilterException e2) {
log.error(e2.getMessage());
}
}
} else {
// it was NOT a 200 return code !
if (doc.isRedirect()) {
String ref = doc.getLocation();
log.info("Got redirect to " + ref);
try {
URL u2 = new URL(u, ref);
// is it on another host ?
// On a redirect, browsers use the old Referer instead of the
// URL that got this redirect
// Therefore we do not use u.toString as Referer but the old Referer
RobotTask newTask = createRobotTask(u2, depth - 1, referer);
// it will be inserted at the beginning of the vector !
addTaskAtStart(newTask);
} catch (MalformedURLException e) {
// ignore this URL
}
// handle other values
} else if (doc.isNotFound()) {
// the document was not found
exceptionHandler.handleException(this, u, new HttpException("Document not found"));
} else if (doc.isUnauthorized()) {
// access to the document was not authorized
exceptionHandler.handleException(
this,
u,
new HttpException("No authorization for the document."));
} else {
// another error occurred.
exceptionHandler.handleException(this, u, new HttpException("Unknown document error (Http return code "+doc.getHttpCode()+")."));
}
}
}
/**
* Inform about spidering progress.
* May use iteration, startTime,
* countCache, countWeb, countRefresh, countNoRefresh
*/
public void updateProgressInfo() {
}
/**
* sleep for sleepTime seconds.
*/
public void sleepNow() {
if (sleepTime > 0) {
synchronized(this) {
if (webRobotCallback != null) {
webRobotCallback.webRobotSleeping(true);
}
try {
Thread.sleep(sleepTime * 1000);
} catch (InterruptedException e) {
}
if (webRobotCallback != null) {
webRobotCallback.webRobotSleeping(false);
}
}
}
}
/**
* retrieves a file from the local file system.
* @param url the url of the file to retrieve
* @param ifModifiedSince only return the file content if it was modified
* after this date (may be null to always return the content)
* @return HttpDoc containing the content and mime type
*/
private HttpDoc retrieveFileURL(URL url, Date ifModifiedSince) throws HttpException {
HttpDoc doc = new HttpDoc();
try {
String host = url.getHost();
String filename = url.getFile();
if ((host == null) || (host.equals(""))) {
// local file
// remove leading / or \
if ((filename.startsWith("\\")) || (filename.startsWith("/"))) {
filename = filename.substring(1);
}
} else {
filename = "//" + host + filename;
}
// get the mimetype and put in the http header
String mimetypestr = getMimeTypeForFilename(filename);
if (mimetypestr != null) {
HttpHeader header = new HttpHeader("content-type", mimetypestr);
doc.addHeader(header);
}
// get the content from the file
File file = new File(filename);
if (!file.exists()) {
doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTFOUND);
return doc;
}
long fileLastModified = file.lastModified();
long ifModifiedSinceTime = ifModifiedSince == null ? 0 : ifModifiedSince.getTime();
if (fileLastModified > ifModifiedSinceTime) {
byte[] content = readFileToByteArray(file);
doc.setContent(content);
doc.setHttpCode("httpcode " + HttpConstants.HTTP_OK);
} else {
doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTMODIFIED);
}
doc.setLastModified(fileLastModified);
doc.setDate(System.currentTimeMillis());
doc.setURL(url);
return doc;
} catch (Exception e) {
throw new HttpException(e.getMessage());
}
}
/**
* Get the Mime type for the given filename.
* @param filename
* @return Mime type
*/
protected String getMimeTypeForFilename(String filename) {
if (filename.endsWith(".html") || filename.endsWith(".htm")) {
return "text/html";
} else {
return null;
}
}
/**
* Clean up temporary data
*/
protected void cleanUp() {
stopIt = false;
visited.clear();
todo.clear();
}
/**
* adds a new task to the task list after checking whether adding it is allowed
*/
protected void addTask(RobotTask task) {
if (taskAddAllowed(task) && activatedNewTasks) {
todo.add(task);
}
}
/**
* adds a new task at the beginning of the task list
* @see #addTask(RobotTask)
*/
protected void addTaskAtStart(RobotTask task) {
if (taskAddAllowed(task) && activatedNewTasks) {
todo.addAtStart(task);
}
}
/**
* Checks if a task should be added to the task list
* @param task the task to check
* @return true if this task can be added to the task list,
* false otherwise
*/
protected boolean taskAddAllowed(RobotTask task) {
if (task == null) {
log.info("Null task not allowed");
return false;
}
if (!isAllowed(task.getUrl())) {
return false;
}
if (todo.contains(task)) {
return false;
}
return true;
}
/**
* Is it allowed to travel to this new URL ?
* @param u the URL to test
* @return true if traveling to this URL is allowed, false otherwise
*/
protected boolean isAllowed(URL u) {
// do the basic checks
if (basicURLCheck(u)) {
// if we have an URLCheck then test this URL against it
if ((urlCheck != null) && (!urlCheck.checkURL(u))) {
log.debug("not allowed by URLCheck:" + u);
return false;
}
if (robCheck.ok(u)) {
return true;
} else {
log.debug("not allowed by robots.txt:" + u);
return false;
}
}
return false;
}
/**
* Is it allowed to process this document ?
* @param doc the document to check
* @return true if processing of this URL is allowed
*/
protected boolean isProcessingAllowed(HttpDoc doc) {
URL u = doc.getURL();
if ((urlCheck != null) && (!urlCheck.checkURLForProcessing(u))) {
log.debug("processing not allowed by URLCheck:" + u);
return false;
}
DownloadRuleSet downloadRuleSet = httpTool.getDownloadRuleSet();
if (downloadRuleSet != null && !downloadRuleSet.processAllowed(doc.getHttpHeaders())) {
log.debug("processing not allowed by DownloadRuleSet:" + u);
return false;
}
return true;
}
/**
* Basic URL allow check
* it is allowed to walk to a new URL if <ul>
* <li>WalkToOtherHost is true. In this case there will be no additional
* tests.</li>
* <li>The new URL is located below the start URL, e.g. if the start URL
* is http://localhost/test, the URL http://localhost/test/index.html
* is allowed, but http://localhost/ is not allowed.</li>
* <li>AllowWholeHost is true and the new URL is located on the same host
* as the start URL.</li>
* <li>FlexibleHostCheck is true and the host part of the current URL
* is equal to the host part of the start URL modulo the prefix "www."
* </li>
* <li>The URL starts with a string in the "AllowedURLs" list.</li>
* </ul>
* @param currURL the URL to check
* @return true if walking to this URL is allowed
*/
protected boolean basicURLCheck(URL currURL) {
String currURLStr = currURL.getHost() + currURL.getPath();
String currHost = currURL.getHost().toLowerCase();
String startHost = startURL.getHost().toLowerCase();
// no more checks, if walkToOtherHosts is true
if (walkToOtherHosts) {
return true;
}
// new URL below start URL ?
if (currURLStr.startsWith(startDir)) {
return true;
}
// on the same host ?
if (allowWholeHost && (currURL.getHost().equalsIgnoreCase(startURL.getHost()))) {
return true;
}
// on the same host with flexible test (host name with and without "www.")
if (flexibleHostCheck) {
if (cutWWW(currHost).equalsIgnoreCase(cutWWW(startHost))) {
return true;
}
}
// allow whole domain ?
if (allowWholeDomain) {
if (currHost.endsWith(getDomain(startHost))) {
return true;
}
}
// in the list of allowed URLs ?
for (int i = 0; i < allowedURLs.size(); i++) {
String s = (String) allowedURLs.elementAt(i);
if (currURLStr.startsWith(s)) {
return true;
}
}
log.debug("URL " + currURLStr + " not allowed");
return false;
}
/**
* remove a leading www. from a given hostname
*
* @param hostname some hostname
* @return the hostname if it doesn't start with "www." otherwise
* the hostname without the leading www.
*/
private String cutWWW(String hostname) {
if (hostname.toLowerCase().startsWith("www.")) {
return hostname.substring(4);
} else {
return hostname;
}
}
/**
* Gets the domain name of a given host (just deletes everything
* up to and including the first ".")
*
* @param hostname some hostname
* @return the domain part of this hostname
*/
private String getDomain(String hostname) {
int pos = hostname.indexOf(".");
if (pos < 0) {
// this should not happen !
return hostname;
} else {
return hostname.substring(pos + 1);
}
}
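// example: getDomain("jakarta.apache.org") returns "apache.org",
// so the allowWholeDomain check also admits hosts like xml.apache.org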
/**
* Method getExceptionHandler.
* @return RobotExceptionHandler the exception handler of the robot
*/
public RobotExceptionHandler getExceptionHandler() {
return exceptionHandler;
}
/**
* Method setExceptionHandler.
* sets the exception handler of the robot
* @param newExceptionHandler the new exception handler
*/
public void setExceptionHandler(RobotExceptionHandler newExceptionHandler) {
if (newExceptionHandler != null) {
exceptionHandler = newExceptionHandler;
}
}
/**
* Method setStart.
* sets the start URL
* @param startURL the start URL as String
*/
public void setStart(String startURL) {
try {
setStartURL(new URL(startURL));
} catch (MalformedURLException e) {
e.printStackTrace();
}
}
/**
* Method getStart.
* gets the start url as string
* @return String
*/
public String getStart() {
URL url = getStartURL();
if (url != null) {
return url.toExternalForm();
} else {
return null;
}
}
/**
* This method finishes HttpTool, NoRobots, HttpDocManager.
*/
public void finish() {
if (httpTool != null) {
httpTool.finish();
}
if (robCheck != null) {
robCheck.finish();
}
if (docManager != null) {
docManager.finish();
}
}
public static void main(String[] args) {
if (args.length > 0) System.err.println("Arguments will be ignored!");
Field[] fields = WebRobot.class.getDeclaredFields();
StringBuffer str = new StringBuffer(60);
for (int i = 0; i < fields.length; i++) {
if (!Modifier.isFinal(fields[i].getModifiers())
&& !Modifier.isStatic(fields[i].getModifiers())) {
str.delete(0, str.length());
str.append(" robot." + fields[i].getName() + " = " + fields[i].getName() + ";");
while (str.length() < 50) {
str.append(" ");
}
System.out.println(str.toString()+"// ("+fields[i].getType().getName()+")");
}
}
}
/** default expected count of documents */
private static final int DEFAULT_EXPECTED_DOCUMENT_COUNT = 50000;
/** expected count of documents */
protected int expectedDocumentCount = DEFAULT_EXPECTED_DOCUMENT_COUNT;
/** remember visited content here (md5, urlString) */
protected HashMap content2UrlMap;
/** counter for pages that were found in cache */
long countCache = 0;
/** counter for pages retrieved by web */
long countWeb = 0;
/** counter for pages that didn't need a refresh */
long countNoRefresh = 0;
/** counter for refreshed pages (=cache+web) */
long countRefresh = 0;
/**
* Method getContentVisitedURL.
* Checks if the content was visited before and retrieves the corresponding URL.
* @param doc the document whose content is checked
* @return found url or null if not found
*/
public String getContentVisitedURL(HttpDoc doc) {
Object key = doc.getContentMD5();
synchronized(content2UrlMap) {
String url = (String) content2UrlMap.get(key);
return url;
}
}
/**
* Method setContentVisitedURL.
* Makes an URL retrievable by its content by entering it in content2UrlMap.
* @param doc the document whose content identifies the URL
* @param url the URL to remember
*/
public void setContentVisitedURL(HttpDoc doc, String url) {
Object key = doc.getContentMD5();
synchronized(content2UrlMap) {
content2UrlMap.put(key, url);
}
}
private final RobotTask createRobotTask(URL url, int maxDepth, String startReferer) {
url = removeWasteParameters(url);
return new RobotTask(url, maxDepth, startReferer);
}
/** only true if form-handlers are defined */
boolean hasFormHandlers = false;
/** list of wasteParameters (will be removed from URLs) **/
protected Vector wasteParameters = new Vector();
/**
* Set the list of wasteParameters (will be removed from URLs)
* @param wasteParameters a Vector of parameter names; query parameters
* whose name matches an entry will be removed from URLs
*/
public void setWasteParameters(Vector wasteParameters) {
this.wasteParameters = wasteParameters;
}
/**
* Gets the list of wasteParameters (will be removed from URLs)
* @return a Vector containing Strings
*/
public Vector getWasteParameters() {
return this.wasteParameters;
}
/** Removes wasteParameters from URL.
* (e.g. IDs)
* @param url
* @return URL
*/
public URL removeWasteParameters(URL url) {
String urlString = url.toExternalForm();
String newUrlString = removeParametersFromString(urlString, wasteParameters);
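// note: the identity comparison below works because removeParametersFromString
// returns the same String instance when nothing was removed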
if (urlString != newUrlString) {
try {
url = new URL(newUrlString);
} catch (MalformedURLException ex) {
ex.printStackTrace();
}
};
return url;
}
/**
* Remove passed Parameters from UrlString
* @param urlString
* @param wasteParameters
* @return String
*/
public static String removeParametersFromString(String urlString, Vector wasteParameters) {
if (wasteParameters != null && wasteParameters.size() > 0) {
int questionMark = urlString.indexOf("?");
if (questionMark>0 && questionMark<urlString.length()) {
int restPosition = urlString.indexOf("#", questionMark);
String parameters;
String rest;
if (restPosition<0) {
parameters = urlString.substring(questionMark+1);
rest = null;
} else {
parameters = urlString.substring(questionMark+1,restPosition);
rest = urlString.substring(restPosition);
}
StringBuffer filteredUrl = new StringBuffer(urlString.substring(0,questionMark));
StringTokenizer tokenizer = new StringTokenizer(parameters, "&");
String and = "?";
boolean changed = false;
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
boolean keep = true;
for (int w=0; w<wasteParameters.size(); w++) {
String wasteParameter = (String) wasteParameters.elementAt(w);
if (token.startsWith(wasteParameter + "=")) {
keep = false;
changed = true;
break;
}
}
if (keep) {
filteredUrl.append(and);
filteredUrl.append(token);
and = "&";
}
}
if (rest != null) filteredUrl.append(rest);
if (changed) {
urlString = filteredUrl.toString();
}
}
}
return urlString;
}
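/*
 * Example (derived from the code above): with wasteParameters = ["sid"],
 *   removeParametersFromString("http://host/p?sid=42&q=1#top", wasteParameters)
 * returns "http://host/p?q=1#top". If no parameter matches, the original
 * String instance is returned unchanged (removeWasteParameters relies on this).
 */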
/** time of WebRobot start in milliseconds */
protected long startTime = System.currentTimeMillis();
/** number of allowed retries for document retrieval */
protected int maxRetries = 0;
/**
* Set allowed retries for document retrieval
* @param maxRetries
*/
public void setMaxRetries(int maxRetries) { this.maxRetries = maxRetries; }
/**
* Get allowed retries for document retrieval
* @return maxRetries
*/
public int getMaxRetries() { return maxRetries; }
/**
* expiration age of documents in cache.
* Documents older than expirationAge will be removed,
* negative value means no limit.
*/
protected long expirationAge = -1;
/**
* set expiration age of documents in cache.
* Documents older than expirationAge will be removed,
* negative value means no limit.
* @param age
*/
public void setExpirationAge(long age) { expirationAge = age; }
/**
* get expiration age of documents in cache.
* @return long
*/
public long getExpirationAge() { return expirationAge; }
/**
* Remove Parameters from Url
* @param url
* @return url without parameters
*/
private final static String removeParameters(String url) {
int pos = url.indexOf("?");
return pos >= 0 ? url.substring(0,pos) : url;
}
/**
* Reads a File to a byte array.
* @param file
* @return byte[]
* @throws IOException
*/
protected byte[] readFileToByteArray(File file) throws IOException
{
FileInputStream in = null;
try
{
byte[] buffer = new byte[(int) file.length()];
in = new FileInputStream(file);
// read() may return fewer bytes than requested, so loop until the
// whole buffer is filled or EOF is reached
int offset = 0;
while (offset < buffer.length) {
int count = in.read(buffer, offset, buffer.length - offset);
if (count < 0) {
break;
}
offset += count;
}
return buffer;
}
finally
{
if (in != null)
{
try
{
in.close();
}
catch (IOException e)
{
}
}
}
}
}