package net.matuschek.spider;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.StringTokenizer;
import java.util.Vector;
import net.matuschek.html.FormFiller;
import net.matuschek.html.HtmlDocument;
import net.matuschek.http.DocManagerException;
import net.matuschek.http.DownloadRuleSet;
import net.matuschek.http.ExtendedURL;
import net.matuschek.http.HttpConstants;
import net.matuschek.http.HttpDoc;
import net.matuschek.http.HttpDocManager;
import net.matuschek.http.HttpException;
import net.matuschek.http.HttpHeader;
import net.matuschek.http.HttpTool;
import net.matuschek.http.HttpToolCallback;
import net.matuschek.http.NTLMAuthorization;
import net.matuschek.http.cookie.CookieManager;
import net.matuschek.spider.docfilter.FilterChain;
import net.matuschek.spider.docfilter.FilterException;
import org.apache.log4j.Category;
import org.w3c.dom.Element;
/**
 * This class implements a web robot that does a search through
 * the web starting from a given start document up to a given
 * search depth.
 *
 * @author Daniel Matuschek / Oliver Schmidt
 * @version $Revision: 1.35 $
 */
public class WebRobot implements Runnable, Cloneable {
/** the name of the robot */
private final static String ROBOT_NAME = "JoBo";
/** the default agent name */
private final static String AGENT_NAME =
ROBOT_NAME+"/1.4 (http://www.matuschek.net/jobo.html)";
/** the robot exception handler*/
protected RobotExceptionHandler exceptionHandler =
new DefaultRobotExceptionHandler();
/** default maximal search depth */
private final static int DEFAULT_DEPTH = 10;
/** the URL where the robot's walk starts from */
protected URL startURL = null;
/** the host and directory where retrieval started from */
protected String startDir = "";
/** maximal search depth */
protected int maxDepth = DEFAULT_DEPTH;
/** is it allowed to walk to other hosts than the starting host ? */
protected boolean walkToOtherHosts = false;
/** DocManager will store or process retrieved documents */
protected HttpDocManager docManager;
/** HttpTool will be used to retrieve documents from a web server */
protected HttpTool httpTool = new HttpTool();
/** Log4J category for logging */
protected Category log;
/** Referer used to retrieve the first document */
protected String startReferer = "-";
/** test for robots.txt */
protected NoRobots robCheck;
/** current tasks */
protected TaskList todo = null;
/** a list of all URLs we got already */
protected TaskList visited = null;
/** ignore settings in /robots.txt ? */
protected boolean ignoreRobotsTxt = false;
/** sleep that number of seconds after every retrieved document */
protected int sleepTime = 1;
/** fill out forms */
protected FormFiller formFiller = new FormFiller();
/** these URLs can be visited more than once */
protected Vector visitMany = new Vector();
/** for callback to the user interface **/
protected WebRobotCallback webRobotCallback = null;
/** should we stop robot operation ? **/
protected boolean stopIt = false;
/** to check if it is allowed to travel to a given URL **/
protected URLCheck urlCheck = null;
/** should the robot suspend the current walk() **/
protected boolean sleep;
/** list of allowed URLs (even if walkToOtherHosts is false) **/
protected Vector allowedURLs = new Vector();
/** allow travelling the whole host ? */
protected boolean allowWholeHost = true;
/**
* maximum document age in seconds, negative value means
* no limit
*/
protected long maxDocumentAge = -1; // no limit
/**
* allow travelling to all subdomains of the start host ?
* @see #setAllowWholeDomain(boolean)
*/
protected boolean allowWholeDomain = true;
/**
* do more flexible tests if the new URL is on the same host
* @see #basicURLCheck(URL)
*/
protected boolean flexibleHostCheck = false;
/**
* FilterChain to filter the document before storing it
*/
protected FilterChain filters = null;
/**
* don't retrieve pages again that are already stored in the DocManager
*/
protected boolean allowCaching = true;
/**
* Check for documents with the same content
*/
protected boolean duplicateCheck = false;
/**
* initializes the robot with the default implementation
* of the TaskList interface
*
* @param expectedDocumentCount the expected number of documents
*/
public WebRobot(int expectedDocumentCount) {
log = Category.getInstance(getClass().getName());
content2UrlMap = new HashMap(expectedDocumentCount);
registerVisitedList(new HashedMemoryTaskList(false,
expectedDocumentCount));
registerToDoList(new HashedMemoryTaskList(true,
expectedDocumentCount));
this.expectedDocumentCount = expectedDocumentCount;
this.setAgentName(AGENT_NAME);
}
/**
* initializes the robot with the default implementation of the TaskList
* interface
*/
public WebRobot() {
this(DEFAULT_EXPECTED_DOCUMENT_COUNT);
}
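/*
 * Typical usage (a sketch, not part of the API itself): configure a robot,
 * point it at a start URL and let it run. The HttpDocManager instance
 * ("myDocManager") is only a placeholder for whatever implementation you use.
 *
 *   WebRobot robot = new WebRobot();
 *   robot.setStart("http://www.example.com/");
 *   robot.setMaxDepth(3);
 *   robot.setSleepTime(1);
 *   robot.setDocManager(myDocManager);
 *   robot.run();                        // or start it in its own Thread
 */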
/**
* Sets the implementation class for the backend task list storage.
* WebRobot uses the TaskList interface to store future tasks.
*
* If you want to use your own TaskList implementation, just call
* this method.
*
* @param todo TaskList to be used for the "to do" list
*/
public void registerToDoList(TaskList todo) {
this.todo = todo;
}
/**
* Sets the implementation class for the backend task list storage.
* WebRobot uses the TaskList interface to store URLs that have
* been retrieved before.
*
* If you want to use your own TaskList implementation, just call
* this method.
*
* @param visited TaskList to be used for the list of visited URLs
*/
public void registerVisitedList(TaskList visited) {
this.visited = visited;
}
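/*
 * Sketch: both task lists can be replaced before the robot starts, e.g. with
 * larger pre-sized HashedMemoryTaskList instances (same constructor arguments
 * as used in WebRobot(int) above); the sizes here are just example values.
 *
 *   robot.registerToDoList(new HashedMemoryTaskList(true, 200000));
 *   robot.registerVisitedList(new HashedMemoryTaskList(false, 200000));
 */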
/**
* @return the start URL for this robot
*/
public URL getStartURL() {
return startURL;
}
/**
* Sets the start URL for this robot
* @param startURL the start URL
*/
public void setStartURL(URL startURL) {
String path = startURL.getPath();
this.startURL = startURL;
// is it a directory ?
if (path.endsWith("/")) {
this.startDir = startURL.getHost() + path;
} else {
int pos = path.lastIndexOf("/");
if (pos < 0) {
// this happens for URLs without a path
this.startDir = startURL.getHost() + "/";
} else {
this.startDir = startURL.getHost() + path.substring(0, pos + 1);
}
}
}
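/*
 * Examples of the startDir values computed above (host + directory part):
 *   http://host/a/b/     -> startDir = "host/a/b/"
 *   http://host/a/b.html -> startDir = "host/a/"
 *   http://host          -> startDir = "host/"
 */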
/**
* @return the maximal allowed search depth
*/
public int getMaxDepth() {
return maxDepth;
}
/**
* sets the maximal search depth
* @param maxDepth
*/
public void setMaxDepth(int maxDepth) {
this.maxDepth = maxDepth;
}
/**
* Get the value of bandwidth of the used HttpTool
* @return value of bandwidth.
*/
public int getBandwidth() {
return httpTool.getBandwidth();
}
/**
* Set the value of bandwidth of the used HttpTool
* @param bandwidth Value to assign to bandwidth.
*/
public void setBandwidth(int bandwidth) {
httpTool.setBandwidth(bandwidth);
}
/**
* gets the WalkToOtherHost status
* @return true if the Robot is allowed to travel to other
* hosts than the start host, false otherwise
*/
public boolean getWalkToOtherHosts() {
return walkToOtherHosts;
}
/**
* sets the WalkToOtherHosts status
* @param walkToOtherHosts true if the Robot is allowed to travel to other
* hosts than the start host, false otherwise
*/
public void setWalkToOtherHosts(boolean walkToOtherHosts) {
this.walkToOtherHosts = walkToOtherHosts;
}
/**
* gets the AllowWholeHost value
* @return true if the Robot is allowed to travel to the whole
* host where it started from, false otherwise. If false, it is only
* allowed to travel to URLs below the start URL
*/
public boolean getAllowWholeHost() {
return allowWholeHost;
}
/**
* sets the AllowWholeHost status
* @param allowWholeHost if true, the Robot is allowed to
* travel to the whole host where it started from. Otherwise it is only
* allowed to travel to URLs below the start URL.
*/
public void setAllowWholeHost(boolean allowWholeHost) {
this.allowWholeHost = allowWholeHost;
}
/**
* Gets the AllowWholeDomain value.
* @return true if the Robot is allowed to travel to the whole
* domain of the start host, false otherwise.
* @see #setAllowWholeDomain(boolean)
*/
public boolean getAllowWholeDomain() {
return allowWholeDomain;
}
/**
* Sets the AllowWholeDomain status
* @param allowWholeDomain if true, the Robot is allowed to travel
* to all hosts in the same domain as the starting host. E.g. if you
* start at www.apache.org, it is also allowed to travel to
* jakarta.apache.org, xml.apache.org ...
*/
public void setAllowWholeDomain(boolean allowWholeDomain) {
this.allowWholeDomain = allowWholeDomain;
}
/**
* Gets the state of flexible host checking (enabled or disabled).
*
* To find out if a new URL is on the same host, the robot usually
* compares the host part of both. Some web servers have an inconsistent
* addressing scheme and use the hostname www.domain.com and domain.com.
* With flexible host check enabled, the robot will consider both
* hosts as equal.
*
* @return true, if flexible host checking is enabled
*/
public boolean getFlexibleHostCheck() {
return flexibleHostCheck;
}
/**
* Defines if the host test should be more flexible.
*
* To find out if a new URL is on the same host, the robot usually
* compares the host part of both. Some web servers have an inconsistent
* addressing scheme and use the hostname www.domain.com and domain.com.
* With flexible host check enabled, the robot will consider both
* hosts as equal.
*
* @param flexibleHostCheck set this true, to enable flexible host checking
* (disabled by default)
*/
public void setFlexibleHostCheck(boolean flexibleHostCheck) {
this.flexibleHostCheck = flexibleHostCheck;
}
/**
* Gets the AllowCaching value.
* @return true if the Robot is allowed to cache documents in the
* docManager
* @see #setAllowCaching(boolean)
*/
public boolean getAllowCaching() {
return allowCaching;
}
/**
* Sets the AllowCaching status
*
* @param allowCaching if true, the Robot is allowed to use
* cached documents. That means it will first try to get the document
* from the docManager cache and will only retrieve it if it is
* not found in the cache. If the cache returns a document, the robot
* will NEVER retrieve it again. Therefore, expiration mechanisms have
* to be included in the HttpDocManager method retrieveFromCache.
* @see net.matuschek.http.HttpDocManager#retrieveFromCache(java.net.URL)
*/
public void setAllowCaching(boolean allowCaching) {
this.allowCaching = allowCaching;
}
/**
* @return the document manager of this robot
* @see HttpDocManager
*/
public HttpDocManager getDocManager() {
return docManager;
}
/**
* Sets the document manager for this robot <br />
* Without a document manager, the robot will travel through the web but
* won't do anything with the retrieved documents (it simply forgets
* them).
* A document manager can store them, extract information or
* whatever you like.
* There can be only one document manager, but you are free to combine
* functionalities of available document managers in a new object (e.g.
* to store the document and extract meta information).
* @param docManager
*/
public void setDocManager(HttpDocManager docManager) {
this.docManager = docManager;
}
/**
* Sets the CookieManager used by the HttpTool
* By default a MemoryCookieManager will be used, but you can
* use this method to use your own CookieManager implementation.
*
* @param cm an object that implements the CookieManager interface
*/
public void setCookieManager(CookieManager cm) {
httpTool.setCookieManager(cm);
}
/**
* Gets the CookieManager used by the HttpTool
*
* @return the CookieManager that will be used by the HttpTool
*/
public CookieManager getCookieManager() {
return httpTool.getCookieManager();
}
/**
* Sets the DownloadRuleSet
* @param rules the download rule set to use
*/
public void setDownloadRuleSet(DownloadRuleSet rules) {
httpTool.setDownloadRuleSet(rules);
}
/**
* Sets the URLCheck for this robot
* @param check
*/
public void setURLCheck(URLCheck check) {
this.urlCheck = check;
}
/**
* sets a proxy to use
* @param proxyDescr the Proxy definition in the format host:port
*/
public void setProxy(String proxyDescr) throws HttpException {
httpTool.setProxy(proxyDescr);
}
/**
* @return the current proxy setting in the format host:port
*/
public String getProxy() {
return httpTool.getProxy();
}
/**
* @return the Referer setting for the first HTTP request
*/
public String getStartReferer() {
return startReferer;
}
/**
* sets the Referer setting for the first HTTP request
* @param startReferer a URL (e.g. http://www.matuschek.net)
*/
public void setStartReferer(String startReferer) {
this.startReferer = startReferer;
}
/**
* should we ignore robots.txt Robot Exclusion protocol ?
* @param ignoreRobotsTxt if set to true, the robot will ignore
* the settings of the /robots.txt file on the webserver
* <b>Know what you are doing if you change this setting</b>
*/
public void setIgnoreRobotsTxt(boolean ignoreRobotsTxt) {
this.ignoreRobotsTxt = ignoreRobotsTxt;
robCheck.setIgnore(ignoreRobotsTxt);
}
/**
* @return the sleeptime setting
*/
public int getSleepTime() {
return sleepTime;
}
/**
* sets the sleep time<br />
* after every retrieved document the robot will wait this time
* before getting the next document. This allows it to limit the
* load on the server.
* @param sleepTime wait time in seconds
*/
public void setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
}
/**
* sets the From: HTTP header<br />
* this should be a valid email address. It is not needed for the robot,
* but you should use it, because the administrator of the web server
* can contact you if the robot is doing things that they don't want
* @param fromAddress an RFC 822 email address
*/
public void setFromAddress(String fromAddress) {
httpTool.setFromAddress(fromAddress);
}
/**
* sets the list of form handlers
* @see net.matuschek.html.FormHandler for more
* information about form handlers
*/
public void setFormHandlers(Vector handlers) {
formFiller.setFormHandlers(handlers);
if (handlers != null && handlers.size() > 0) {
hasFormHandlers = true;
}
}
/**
* @return the list of form handlers
* @see net.matuschek.html.FormHandler for more information
* about form handlers
*/
public Vector getFormHandlers() {
return formFiller.getFormHandlers();
}
/**
* Gets the name of the "User-Agent" header that the robot will use
* @return the user agent name
*/
public String getAgentName() {
if (httpTool != null) {
return httpTool.getAgentName();
} else {
return null;
}
}
/**
* sets the agent name (User-Agent) for this robot
* @param name a name for this robot
* (e.g. "Mozilla 4.0 (compatible; Robot)")
*/
public void setAgentName(String name) {
httpTool.setAgentName(name);
// robCheck = new NoRobots(ROBOT_NAME, httpTool);
robCheck = new NoRobots(name, httpTool);
}
/**
* Gets the timeout (in seconds) for getting data of the used HttpTool
* @return the value of socketTimeout
* @see #setTimeout(int)
*/
public int getTimeout() {
if (httpTool != null) {
return httpTool.getTimeout();
} else {
return -1;
}
}
/**
* Sets the timeout for getting data. If HttpTool can't read data from a
* remote web server after this number of seconds it will stop the download
* of the current file
* @param timeout Timeout in seconds
*/
public void setTimeout(int timeout) {
httpTool.setTimeout(timeout);
}
/**
* Gets the NTLM authorization of the robot
* @return the ntlmAuthorization
*/
public NTLMAuthorization getNtlmAuthorization() {
if (httpTool != null) {
return httpTool.getNtlmAuthorization();
} else {
return null;
}
}
/**
* sets an NTLM authorization for this robot
* @param ntlmAuthorization the NTLM authorization for this robot
*/
public void setNtlmAuthorization(NTLMAuthorization ntlmAuthorization) {
httpTool.setNtlmAuthorization(ntlmAuthorization);
}
/**
* Gets the setting of the IgnoreRobotsTxt property
* @return true if robots.txt will be ignored, false otherwise
*/
public boolean getIgnoreRobotsTxt() {
return ignoreRobotsTxt;
}
/**
* Gets a vector of URLs that can be visited more than once
* @return a vector containing URLs formatted as Strings
*/
public Vector getVisitMany() {
return visitMany;
}
public void setVisitMany(Vector visitMany) {
this.visitMany = visitMany;
}
public void setHttpToolCallback(HttpToolCallback callback) {
httpTool.setCallback(callback);
}
public WebRobotCallback getWebRobotCallback() {
return webRobotCallback;
}
public void setWebRobotCallback(WebRobotCallback webRobotCallback) {
this.webRobotCallback = webRobotCallback;
}
/**
* Sets the sleep status for this robot. If a WebRobot is set to sleep
* after starting run(), it will pause after retrieving the current document
* and wait until setSleep(false) is called.
*/
public void setSleep(boolean sleep) {
this.sleep = sleep;
}
/**
* Is the robot sleeping ?
*/
public boolean isSleeping() {
return this.sleep;
}
/**
* Set the list of allowed URLs
* @param allowed a Vector containing Strings. URLs are allowed
* if they begin with one of the strings in this vector
*/
public void setAllowedURLs(Vector allowed) {
this.allowedURLs = allowed;
}
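/*
 * Sketch: entries are compared against host + path (without the protocol),
 * see basicURLCheck(URL). The host and path used here are just examples.
 *
 *   Vector allowed = new Vector();
 *   allowed.add("www.example.org/docs/");
 *   robot.setAllowedURLs(allowed);
 */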
/**
* Gets the list of allowed URLs
* @return a Vector containing Strings
* @see #setAllowedURLs(Vector)
*/
public Vector getAllowedURLs() {
return this.allowedURLs;
}
/**
* Enable/disable cookies
* @param enable if true, HTTP cookies will be enabled, if false
* the robot will not use cookies
*/
public void setEnableCookies(boolean enable) {
httpTool.setEnableCookies(enable);
}
/**
* Get the status of the cookie engine
* @return true, if HTTP cookies are enabled, false otherwise
*/
public boolean getEnableCookies() {
return httpTool.getEnableCookies();
}
/**
* Set the maximum age of documents to retrieve to this number
* of seconds
* @param maxAge integer value of the maximum document age
* (in seconds), negative value means no limit.
*/
public void setMaxDocumentAge(long maxAge) {
this.maxDocumentAge = maxAge;
}
/**
* Gets the maximum age of documents to retrieve
* @return maximum document age (in seconds), negative value means
* no limit.
*/
public long getMaxDocumentAge() {
return this.maxDocumentAge;
}
/**
* Sets a FilterChain. If the WebRobot uses a FilterChain it will
* pass every retrieved document through this FilterChain before
* storing it
*
* @param filters a FilterChain to use for filtering HttpDocs
*/
public void setFilters(FilterChain filters) {
this.filters = filters;
}
/**
* Delete all cookies
*/
public void clearCookies() {
httpTool.clearCookies();
}
/**
* thread run() method, simply calls work()
* @see #work()
*/
public void run() {
work();
}
/**
* does the robot's job: travels through the web using the configured
* parameters and retrieves documents
*/
public void work() {
RobotTask task = createRobotTask(startURL, maxDepth, startReferer);
todo.add(task);
walkTree();
// ok, we did it, clean up dynamic data (the visited vector)
cleanUp();
log.info("Documents retrieved by: Web=" + countWeb + " Cache=" + countCache + " Refresh=" + countRefresh+ " NoRefresh=" + countNoRefresh);
}
/**
* stop the current robot run
* note that this will not abort the current download but stop after
* the current download has finished
*/
public void stopRobot() {
stopIt = true;
}
/**
* Holds information about memory status.
* @see #handleMemoryError(OutOfMemoryError)
*/
private int memoryLevel = 0;
/** Can new tasks be added? (may depend on memoryLevel) */
protected boolean activatedNewTasks = true;
/** Are visited URLs collected? (may depend on memoryLevel) */
protected boolean activatedUrlHistory = true;
/** Are visited contents collected? (may depend on memoryLevel) */
protected boolean activatedContentHistory = true;
/** memory buffer of 200 KB to be freed in case of urgent memory needs */
private byte memoryBuffer[] = new byte[200 * 1024];
/**
* main loop: processes the task list until it is empty or the robot is stopped
*/
public void walkTree() {
while ((todo.size() > 0) && (!stopIt)) {
RobotTask task;
synchronized(visited) {
task = todo.removeFirst();
if (visited.contains(task) && (!visitMany.contains(task.getUrl().toString()))) {
log.debug("already visited: " + task.getUrl());
continue;
}
if (activatedUrlHistory) {
visited.add(task);
}
}
boolean repeat = true;
while (repeat) {
try {
retrieveURL(task);
repeat = false;
} catch (OutOfMemoryError memoryError) {
handleMemoryError(memoryError);
}
}
// sleep, if sleep is set to true
while (sleep) {
// callback
if (webRobotCallback != null) {
webRobotCallback.webRobotSleeping(true);
}
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
};
}
// callback
if (webRobotCallback != null) {
webRobotCallback.webRobotSleeping(false);
}
// callback
if (webRobotCallback != null) {
webRobotCallback.webRobotUpdateQueueStatus(todo.size());
}
spawnThread();
}
// callback
if (webRobotCallback != null) {
finishThreads();
}
}
/**
* Implements OutOfMemory handling strategies.
* Action depends on memoryLevel
* @param memoryError
* @throws OutOfMemoryError
*/
protected void handleMemoryError(OutOfMemoryError memoryError)
throws OutOfMemoryError {
memoryLevel++;
log.error("OutOfMemoryError level=" + memoryLevel + "! (visited=" + visited.size() + ", todo=" + todo.size() + ")");
switch (memoryLevel) {
case 1:
// don't remember visited URLs and contents any more
// and try it again
visited.clear(); activatedUrlHistory = false;
content2UrlMap.clear(); activatedContentHistory = false;
System.gc();
break;
case 2:
// stop adding new Tasks, just process todo-list.
// free memory buffer
// and try it again
activatedNewTasks = false;
memoryBuffer = null;
System.gc();
break;
case 3:
// there is nothing we can do any more.
// throw exception to stop robot
throw memoryError;
default :
// Should never be reached.
if (memoryBuffer != null) {
// avoid removal of memoryBuffer by compiler
System.err.println(memoryBuffer[0]);
}
throw memoryError;
}
}
/**
* calls webRobotDone and finishes docManager if
* executed in mainThread
*/
protected void finishThreads() {
webRobotCallback.webRobotDone();
if (docManager != null) {
docManager.finish();
}
}
/**
* Start subThreads for spidering.
* WARNING: Should only be implemented and used for local
* spidering purposes!
*/
protected synchronized void spawnThread() {
}
/** counter for calls of retrieveURL */
protected int iteration = 0;
/**
* retrieve the next URL, save it, extract all included links and
* add those links to the task list
* @param task task to retrieve, function does nothing if this is null
*/
public void retrieveURL(RobotTask task) {
if (task == null) {
log.debug("Empty task found, ignoring");
return;
}
long now = System.currentTimeMillis();
updateProgressInfo();
URL u = task.getUrl();
String urlString = u.toString();
String referer = task.getReferer();
int depth = task.getMaxDepth();
if (depth < 0) {
log.info("Max search depth reached");
return;
}
// we may need this additional check even if we
// tested it during adding to the tasks list
if (!isAllowed(u)) {
log.info("Url '" + u + "' filtered out.");
return;
}
if (u.getFile().equals("")) {
try {
urlString = urlString + "/";
u = new URL(urlString);
// fix for double retrieved files
task.setUrl(u);
} catch (MalformedURLException e) {
log.error("URL not well formed: " + e.toString());
// use exception handler to handle exception
exceptionHandler.handleException(this, u, e);
return;
}
}
log.info("retrieving " + urlString);
httpTool.setReferer(referer);
HttpDoc doc = null;
Vector links = null;
boolean cached = false;
// look in the cache first, but only for static pages
boolean reScan = true;
if ((docManager != null && allowCaching)
&& (task.getMethod() == HttpConstants.GET)
&& (task.getParamString() == null)) {
doc = docManager.retrieveFromCache(u);
/* if (doc != null) {
try {
links = ((UrlCollector) docManager).retrieveLinks(doc);
} catch (IOException e) {
log.info("Could not get links for " + u + ": " + e.getMessage());
links = null;
}
}*/
if (doc != null) {
countCache++;
long lastRetrieved = doc.getDateAsMilliSeconds();
double ageInSeconds = (now - lastRetrieved) / 1000;
if (ageInSeconds < 0) {
log.warn("DocumentAge < 0!");
}
reScan = maxDocumentAge >= 0 && ageInSeconds > maxDocumentAge;
if (reScan) {
long lastModified = doc.getLastModifiedAsMilliSeconds();
Date lastModifiedDate = new Date(lastModified);
httpTool.setIfModifiedSince(lastModifiedDate);
}
} else {
httpTool.setIfModifiedSince(null);
}
}
// if not found in cache, retrieve from the web page
if (reScan) {
HttpDoc newDoc;
boolean error = false;
try {
if (u.getProtocol().equalsIgnoreCase("file")) {
// retrieve from file
newDoc = retrieveFileURL(u, httpTool.getIfModifiedSince());
} else {
// retrieve from Web
newDoc = httpTool.retrieveDocument(u, task.getMethod(), task.getParamString());
if (newDoc != null) {
newDoc.setDate(now);
}
sleepNow();
}
if (newDoc!= null && !newDoc.isNotModified()) {
if (!(newDoc.isOk() || newDoc.isRedirect())) {
error = true;
}
} else {
// (newDoc == null || newDoc.isNotModified()) && doc != null
// -> Not modified
// -> refresh time stamp
if (doc != null) {
doc.setDate(now);
doc.setCached(false);
newDoc = null;
}
}
} catch (HttpException hex) {
error = true; newDoc = null;
}
if (error) {
int retry = task.retry();
if (retry <= maxRetries) {
synchronized(visited) {
todo.add(task);
visited.remove(task);
}
log.info("Adding " + u + " for retry no. " + retry);
return;
} else {
doc = docManager.retrieveFromCache(u);
if (doc == null) {
log.warn("Unsuccessfull retries for " + u);
return;
} else {
long docDate = doc.getDateAsMilliSeconds();
long age = (now - docDate);
age /= 1000;
if (expirationAge < 0 || age < expirationAge) {
newDoc = doc;
cached = true;
log.info("Cached document not expired: " + u);
} else {
log.warn("Cached document expired: " + u);
docManager.removeDocument(u);
return;
}
}
}
}
if (newDoc != null) {
countWeb++;
doc = newDoc;
links = null; // force recalculation of links
countRefresh++;
} else {
cached = true;
countNoRefresh++;
}
} else {
cached = true;
log.debug("Page " + u + " retrieved from cache");
}
// Add it to the visited vector
// needs to be synchronized with todo-list
// visited.add(task);
// got a NULL document, that doc was not retrieved
// usually, it was not downloaded because a rule didn't allow
// to download it
if (doc == null) {
log.info("not downloaded " + u);
return;
}
// Duplicate check
String duplicate=null;
if (duplicateCheck) {
duplicate = getContentVisitedURL(doc);
if (duplicate != null) {
log.info("URLs with same content found: " + urlString + " = " + duplicate);
} else {
try {
duplicate = docManager.findDuplicate(doc);
if (duplicate != null) {
log.info("URLs with same content found in cache: " + urlString + " = " + duplicate);
}
} catch (IOException e) {
e.printStackTrace();
}
}
if (duplicate != null) {
String pureDuplicate = removeParameters(duplicate);
String pureUrl = removeParameters(urlString);
if (!pureUrl.equals(pureDuplicate) && !cached) {
// different url not yet stored -> store it
try {
// retrieve links from original
HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
if (linksDoc != null) {
doc.setLinks(linksDoc.getLinks());
}
docManager.storeDocument(doc);
} catch (Exception e) {
e.printStackTrace();
}
}
RobotTask newTask;
try {
newTask = createRobotTask(new URL(duplicate), depth, referer);
// check already here for visited tasks to save memory
if (!visited.contains(newTask)) {
addTask(newTask);
}
} catch (MalformedURLException e) {
e.printStackTrace(); // Can't happen
}
return;
}
}
// was it an UnAuthorized document ?
if (doc.isUnauthorized()) {
log.info("got HTTP Unauthorized for URL " + u);
}
if (doc.isOk() || cached) {
// callback
if (webRobotCallback != null) {
int contentLength=0;
if (doc.getContent() != null) { contentLength=doc.getContent().length; }
webRobotCallback.webRobotRetrievedDoc(urlString, contentLength);
}
// extract links
try {
if (doc.isHTML() && (depth > 0)) {
// solving encoding problem
// HtmlDocument htmlDoc = new HtmlDocument(u, doc.getContent());
HtmlDocument htmlDoc = null;
HttpHeader contentTypeHeader = doc.getHeader("Content-type");
if (contentTypeHeader != null) {
String contentType = contentTypeHeader.getValue();
int index = contentType.toLowerCase().indexOf("charset=");
if (index > 0) {
htmlDoc = new HtmlDocument(u, doc.getContent(), contentType.substring(index+8));
} else {
htmlDoc = new HtmlDocument(u, doc.getContent());
}
} else {
htmlDoc = new HtmlDocument(u, doc.getContent());
}
// add links
// this depth-check is critical!
// otherwise far too many RobotTasks will be created
// this will cause a premature OutOfMemoryException!
if (depth > 0) {
if (duplicate != null) {
HttpDoc linksDoc = docManager.retrieveFromCache(new URL(duplicate));
doc.setLinks(linksDoc.getLinks());
} else if (cached) {
}
if (links == null) {
links = htmlDoc.getLinks();
doc.setLinks(links);
}
if (duplicate == null) {
HashSet checkedLinks = new HashSet();
for (int i = 0; i < links.size(); i++) {
URL link = (URL) links.elementAt(i);
log.info("Link: "+link);
// check already here for duplicate links to avoid expensive
// creation of RobotTasks
if (!checkedLinks.contains(link)) {
checkedLinks.add(link);
String myReferer = u.toString();
if (u.getUserInfo() != null) {
// remove userinfo from referer
int endindex = myReferer.indexOf("@")+1;
myReferer = "http://"+ myReferer.substring(endindex);
}
RobotTask newTask = createRobotTask((URL) links.elementAt(i), depth - 1, myReferer);
// check already here for visited tasks to save memory
if (!visited.contains(newTask)) {
// bad workaround to retrieve images first
if (newTask.urlString.endsWith(".jpg")) {
addTaskAtStart(newTask);
} else {
addTask(newTask);
}
}
}
}
}
}
if (hasFormHandlers) {
// add forms
Vector forms = htmlDoc.getElements("form");
for (int i = 0; i < forms.size(); i++) {
ExtendedURL eurl = formFiller.fillForm(u, (Element) forms.elementAt(i));
if (eurl != null) {
RobotTask newTask = createRobotTask(eurl.getURL(), depth - 1, u.toString());
newTask.setParamString(eurl.getParams());
newTask.setMethod(eurl.getRequestMethod());
addTask(newTask);
}
}
}
}
// catch any occurring error to keep on processing
} catch (OutOfMemoryError e) {
throw e;
} catch (Throwable e){
log.error("Unexpected error while extraction links from url '" + u + "':"+e);
e.printStackTrace();
// continue processing
}
// filter and store the document
if ((docManager != null)) {
try {
if (filters != null) {
doc = filters.process(doc);
} else {
log.debug("No filters defined");
}
if (isProcessingAllowed(doc)) {
docManager.processDocument(doc);
} else {
String md5 = doc.getHeaderValue(HttpHeader.CONTENT_MD5);
doc.setContent("Not for indexing".getBytes());
doc.setHeaderValue(HttpHeader.CONTENT_MD5, md5);
}
try {
docManager.storeDocument(doc);
} catch (Exception e) {
log.warn("could not store (not for indexing) " + urlString + ": " + e.getMessage());
}
if (activatedContentHistory && duplicate==null) {
setContentVisitedURL(doc, urlString);
}
} catch (DocManagerException e1) {
log.error("could not process document: " + e1.getMessage());
exceptionHandler.handleException(this, u, e1);
} catch (FilterException e2) {
log.error(e2.getMessage());
}
}
} else {
// it was NOT a 200 return code !
if (doc.isRedirect()) {
String ref = doc.getLocation();
log.info("Got redirect to " + ref);
try {
URL u2 = new URL(u, ref);
// is it on another host ?
// On a redirect, browsers use the old Referer instead of the
// URL that got this redirect
// Therefore we do not use u.toString as Referer but the old Referer
RobotTask newTask = createRobotTask(u2, depth - 1, referer);
// it will be inserted at the beginning of the vector !
addTaskAtStart(newTask);
} catch (MalformedURLException e) {
// ignore this URL
}
// handle other values
} else if (doc.isNotFound()) {
// the document was not found
exceptionHandler.handleException(this, u, new HttpException("Document not found"));
} else if (doc.isUnauthorized()) {
// access to the document was not authorized
exceptionHandler.handleException(
this,
u,
new HttpException("No authorization for the document."));
} else {
// another error occurred.
exceptionHandler.handleException(this, u, new HttpException("Unknown document error (Http return code "+doc.getHttpCode()+")."));
}
}
}
/**
* Inform about spidering progress.
* May use iteration, startTime,
* countCache, countWeb, countRefresh, countNoRefresh
*/
public void updateProgressInfo() {
}
/**
* sleep for sleepTime seconds.
*/
public void sleepNow() {
if (sleepTime > 0) {
synchronized(this) {
if (webRobotCallback != null) {
webRobotCallback.webRobotSleeping(true);
}
try {
Thread.sleep(sleepTime * 1000);
} catch (InterruptedException e) {
}
if (webRobotCallback != null) {
webRobotCallback.webRobotSleeping(false);
}
}
}
}
/**
* retrieves a file from the local file system.
* @param url the url of the file to retrieve
* @param ifModifiedSince only return the file content if it was modified
* after this date (may be null to always return the content)
* @return HttpDoc containing the content and mime type
*/
private HttpDoc retrieveFileURL(URL url, Date ifModifiedSince) throws HttpException {
HttpDoc doc = new HttpDoc();
try {
String host = url.getHost();
String filename = url.getFile();
if ((host == null) || (host.equals(""))) {
// local file
// remove leading / or \
if ((filename.startsWith("\\")) || (filename.startsWith("/"))) {
filename = filename.substring(1);
}
} else {
filename = "//" + host + filename;
}
// get the mimetype and put in the http header
String mimetypestr = getMimeTypeForFilename(filename);
if (mimetypestr != null) {
HttpHeader header = new HttpHeader("content-type", mimetypestr);
doc.addHeader(header);
}
// get the content from the file
File file = new File(filename);
if (!file.exists()) {
doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTFOUND);
return doc;
}
long fileLastModified = file.lastModified();
long ifModifiedSinceTime = ifModifiedSince == null ? 0 : ifModifiedSince.getTime();
if (fileLastModified > ifModifiedSinceTime) {
byte[] content = readFileToByteArray(file);
doc.setContent(content);
doc.setHttpCode("httpcode " + HttpConstants.HTTP_OK);
} else {
doc.setHttpCode("httpcode " + HttpConstants.HTTP_NOTMODIFIED);
}
doc.setLastModified(fileLastModified);
doc.setDate(System.currentTimeMillis());
doc.setURL(url);
return doc;
} catch (Exception e) {
throw new HttpException(e.getMessage());
}
}
/**
* Get the Mime type for the given filename.
* @param filename
* @return Mime type
*/
protected String getMimeTypeForFilename(String filename) {
if (filename.endsWith(".html") || filename.endsWith(".htm")) {
return "text/html";
} else {
return null;
}
}
/**
* Clean up temporary data
*/
protected void cleanUp() {
stopIt = false;
visited.clear();
todo.clear();
}
/**
* adds a new task to the task list after checking whether adding it is allowed
*/
protected void addTask(RobotTask task) {
if (taskAddAllowed(task) && activatedNewTasks) {
todo.add(task);
}
}
/**
* adds a new task at the beginning of the task list
* @see #addTask(RobotTask)
*/
protected void addTaskAtStart(RobotTask task) {
if (taskAddAllowed(task) && activatedNewTasks) {
todo.addAtStart(task);
}
}
/**
* Checks if a task should be added to the task list
* @param task the task to check
* @return true if this task can be added to the task list,
* false otherwise
*/
protected boolean taskAddAllowed(RobotTask task) {
if (task == null) {
log.info("Null task not allowed");
return false;
}
if (!isAllowed(task.getUrl())) {
return false;
}
if (todo.contains(task)) {
return false;
}
return true;
}
/**
* Is it allowed to travel to this new URL ?
* @param u the URL to test
* @return true if traveling to this URL is allowed, false otherwise
*/
protected boolean isAllowed(URL u) {
// do the basic checks
if (basicURLCheck(u)) {
// if we have an URLCheck then test this URL against it
if ((urlCheck != null) && (!urlCheck.checkURL(u))) {
log.debug("not allowed by URLCheck:" + u);
return false;
}
if (robCheck.ok(u)) {
return true;
} else {
log.debug("not allowed by robots.txt:" + u);
return false;
}
}
return false;
}
/**
* Is it allowed to process this document ?
* @param doc the document to check
* @return true if processing of this URL is allowed
*/
protected boolean isProcessingAllowed(HttpDoc doc) {
URL u = doc.getURL();
if ((urlCheck != null) && (!urlCheck.checkURLForProcessing(u))) {
log.debug("processing not allowed by URLCheck:" + u);
return false;
}
DownloadRuleSet downloadRuleSet = httpTool.getDownloadRuleSet();
if (downloadRuleSet != null && !downloadRuleSet.processAllowed(doc.getHttpHeaders())) {
log.debug("processing not allowed by DownloadRuleSet:" + u);
return false;
}
return true;
}
/**
* Basic URL allow check
* it is allowed to walk to a new URL if <ul>
* <li>WalkToOtherHost is true. In this case there will be no additional
* tests.</li>
* <li>The new URL is located below the start URL, e.g. if the start URL
* is http://localhost/test, the URL http://localhost/test/index.html
* is allowed, but http://localhost/ is not allowed.</li>
* <li>AllowWholeHost is true and the new URL is located on the same host
* as the start URL.</li>
* <li>FlexibleHostCheck is true and the host part of the current URL
* is equal to the host part of the start URL modulo the prefix "www."
* </li>
* <li>The URL starts with a string in the "AllowedURLs" list.</li>
* </ul>
* @param currURL the URL to check
* @return true if walking to this URL is allowed
*/
protected boolean basicURLCheck(URL currURL) {
String currURLStr = currURL.getHost() + currURL.getPath();
String currHost = currURL.getHost().toLowerCase();
String startHost = startURL.getHost().toLowerCase();
// no more checks, if walkToOtherHosts is true
if (walkToOtherHosts) {
return true;
}
// new URL below start URL ?
if (currURLStr.startsWith(startDir)) {
return true;
}
// on the same host ?
if (allowWholeHost && (currURL.getHost().equalsIgnoreCase(startURL.getHost()))) {
return true;
}
// on the same host with flexible test (host name with and without "www.")
if (flexibleHostCheck) {
if (cutWWW(currHost).equalsIgnoreCase(cutWWW(startHost))) {
return true;
}
}
// allow whole domain ?
if (allowWholeDomain) {
if (currHost.endsWith(getDomain(startHost))) {
return true;
}
}
// in the list of allowed URLs ?
for (int i = 0; i < allowedURLs.size(); i++) {
String s = (String) allowedURLs.elementAt(i);
if (currURLStr.startsWith(s)) {
return true;
}
}
log.debug("URL " + currURLStr + " not allowed");
return false;
}
/**
* remove a leading www. from a given hostname
*
* @param hostname some hostname
* @return the hostname if it doesn't start with "www." otherwise
* the hostname without the leading www.
*/
private String cutWWW(String hostname) {
if (hostname.toLowerCase().startsWith("www.")) {
return hostname.substring(4);
} else {
return hostname;
}
}
/**
* Gets the domain name of a given host (just deletes everything
* up to and including the first ".")
*
* @param hostname some hostname
* @return the domain part of this hostname
*/
private String getDomain(String hostname) {
int pos = hostname.indexOf(".");
if (pos < 0) {
// this should not happen !
return hostname;
} else {
return hostname.substring(pos + 1);
}
}
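// example: getDomain("jakarta.apache.org") returns "apache.org",
// so the allowWholeDomain check also admits hosts like xml.apache.org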
/**
* Method getExceptionHandler.
* @return RobotExceptionHandler the exception handler of the robot
*/
public RobotExceptionHandler getExceptionHandler() {
return exceptionHandler;
}
/**
* Method setExceptionHandler.
* sets the exception handler of the robot
* @param newExceptionHandler the new exception handler
*/
public void setExceptionHandler(RobotExceptionHandler newExceptionHandler) {
if (newExceptionHandler != null) {
exceptionHandler = newExceptionHandler;
}
}
/**
* Method setStart.
* sets the start URL
* @param startURL the start URL as String
*/
public void setStart(String startURL) {
try {
setStartURL(new URL(startURL));
} catch (MalformedURLException e) {
e.printStackTrace();
}
}
/**
* Method getStart.
* gets the start url as string
* @return String
*/
public String getStart() {
URL url = getStartURL();
if (url != null) {
return url.toExternalForm();
} else {
return null;
}
}
/**
* This method finishes HttpTool, NoRobots, HttpDocManager.
*/
public void finish() {
if (httpTool != null) {
httpTool.finish();
}
if (robCheck != null) {
robCheck.finish();
}
if (docManager != null) {
docManager.finish();
}
}
public static void main(String[] args) {
if (args.length > 0) System.err.println("Arguments will be ignored!");
Field[] fields = WebRobot.class.getDeclaredFields();
StringBuffer str = new StringBuffer(60);
for (int i = 0; i < fields.length; i++) {
if (!Modifier.isFinal(fields[i].getModifiers())
&& !Modifier.isStatic(fields[i].getModifiers())) {
str.delete(0, str.length());
str.append(" robot." + fields[i].getName() + " = " + fields[i].getName() + ";");
while (str.length() < 50) {
str.append(" ");
}
System.out.println(str.toString()+"// ("+fields[i].getType().getName()+")");
}
}
}
/** default expected count of documents */
private static final int DEFAULT_EXPECTED_DOCUMENT_COUNT = 50000;
/** expected count of documents */
protected int expectedDocumentCount = DEFAULT_EXPECTED_DOCUMENT_COUNT;
/** remember visited content here (md5, urlString) */
protected HashMap content2UrlMap;
/** counter for pages that were found in cache */
long countCache = 0;
/** counter for pages retrieved by web */
long countWeb = 0;
/** counter for pages that didn't need a refresh */
long countNoRefresh = 0;
/** counter for refreshed pages (=cache+web) */
long countRefresh = 0;
/**
* Method getContentVisitedURL.
* Checks if the content was visited before and retrieves the corresponding URL.
* @param doc the document whose content is checked
* @return found url or null if not found
*/
public String getContentVisitedURL(HttpDoc doc) {
Object key = doc.getContentMD5();
synchronized(content2UrlMap) {
String url = (String) content2UrlMap.get(key);
return url;
}
}
/**
* Method setContentVisitedURL.
* Makes an URL retrievable by its content by entering it in content2UrlMap.
* @param doc the document whose content identifies the URL
* @param url the URL to remember
*/
public void setContentVisitedURL(HttpDoc doc, String url) {
Object key = doc.getContentMD5();
synchronized(content2UrlMap) {
content2UrlMap.put(key, url);
}
}
private final RobotTask createRobotTask(URL url, int maxDepth, String startReferer) {
url = removeWasteParameters(url);
return new RobotTask(url, maxDepth, startReferer);
}
/** only true if form-handlers are defined */
boolean hasFormHandlers = false;
/** list of wasteParameters (will be removed from URLs) **/
protected Vector wasteParameters = new Vector();
/**
* Set the list of wasteParameters (will be removed from URLs)
* @param wasteParameters a Vector of parameter names; query parameters
* whose name matches an entry will be removed from URLs
*/
public void setWasteParameters(Vector wasteParameters) {
this.wasteParameters = wasteParameters;
}
/**
* Gets the list of wasteParameters (will be removed from URLs)
* @return a Vector containing Strings
*/
public Vector getWasteParameters() {
return this.wasteParameters;
}
/** Removes wasteParameters from URL.
* (e.g. IDs)
* @param url
* @return URL
*/
public URL removeWasteParameters(URL url) {
String urlString = url.toExternalForm();
String newUrlString = removeParametersFromString(urlString, wasteParameters);
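// note: the identity comparison below works because removeParametersFromString
// returns the same String instance when nothing was removed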
if (urlString != newUrlString) {
try {
url = new URL(newUrlString);
} catch (MalformedURLException ex) {
ex.printStackTrace();
}
};
return url;
}
/**
* Remove passed Parameters from UrlString
* @param urlString
* @param wasteParameters
* @return String
*/
public static String removeParametersFromString(String urlString, Vector wasteParameters) {
if (wasteParameters != null && wasteParameters.size() > 0) {
int questionMark = urlString.indexOf("?");
if (questionMark>0 && questionMark<urlString.length()) {
int restPosition = urlString.indexOf("#", questionMark);
String parameters;
String rest;
if (restPosition<0) {
parameters = urlString.substring(questionMark+1);
rest = null;
} else {
parameters = urlString.substring(questionMark+1,restPosition);
rest = urlString.substring(restPosition);
}
StringBuffer filteredUrl = new StringBuffer(urlString.substring(0,questionMark));
StringTokenizer tokenizer = new StringTokenizer(parameters, "&");
String and = "?";
boolean changed = false;
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
boolean keep = true;
for (int w=0; w<wasteParameters.size(); w++) {
String wasteParameter = (String) wasteParameters.elementAt(w);
if (token.startsWith(wasteParameter + "=")) {
keep = false;
changed = true;
break;
}
}
if (keep) {
filteredUrl.append(and);
filteredUrl.append(token);
and = "&";
}
}
if (rest != null) filteredUrl.append(rest);
if (changed) {
urlString = filteredUrl.toString();
}
}
}
return urlString;
}
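/*
 * Example (derived from the code above): with wasteParameters = ["sid"],
 *   removeParametersFromString("http://host/p?sid=42&q=1#top", wasteParameters)
 * returns "http://host/p?q=1#top". If no parameter matches, the original
 * String instance is returned unchanged (removeWasteParameters relies on this).
 */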
/** time of WebRobot start in milliseconds */
protected long startTime = System.currentTimeMillis();
/** number of allowed retries for document retrieval */
protected int maxRetries = 0;
/**
* Set allowed retries for document retrieval
* @param maxRetries
*/
public void setMaxRetries(int maxRetries) { this.maxRetries = maxRetries; }
/**
* Get allowed retries for document retrieval
* @return maxRetries
*/
public int getMaxRetries() { return maxRetries; }
/**
* expiration age of documents in cache.
* Documents older than expirationAge will be removed,
* negative value means no limit.
*/
protected long expirationAge = -1;
/**
* set expiration age of documents in cache.
* Documents older than expirationAge will be removed,
* negative value means no limit.
* @param age
*/
public void setExpirationAge(long age) { expirationAge = age; }
/**
* get expiration age of documents in cache.
* @return long
*/
public long getExpirationAge() { return expirationAge; }
/**
* Remove Parameters from Url
* @param url
* @return url without parameters
*/
private final static String removeParameters(String url) {
int pos = url.indexOf("?");
return pos >= 0 ? url.substring(0,pos) : url;
}
/**
* Reads a File to a byte array.
* @param file
* @return byte[]
* @throws IOException
*/
protected byte[] readFileToByteArray(File file) throws IOException
{
FileInputStream in = null;
try
{
byte[] buffer = new byte[(int) file.length()];
in = new FileInputStream(file);
// read() may return fewer bytes than requested, so loop until the
// whole buffer is filled or EOF is reached
int offset = 0;
while (offset < buffer.length) {
int count = in.read(buffer, offset, buffer.length - offset);
if (count < 0) {
break;
}
offset += count;
}
return buffer;
}
finally
{
if (in != null)
{
try
{
in.close();
}
catch (IOException e)
{
}
}
}
}
}