/*
 * Source listing of com.crawljax.core.Crawler.
 * (Scraper header converted to a comment so the file compiles.)
 */

package com.crawljax.core;

import com.crawljax.browser.EmbeddedBrowser;
import com.crawljax.core.configuration.CrawljaxConfigurationReader;
import com.crawljax.core.exception.BrowserConnectionException;
import com.crawljax.core.exception.CrawlPathToException;
import com.crawljax.core.plugin.CrawljaxPluginsUtil;
import com.crawljax.core.state.CrawlPath;
import com.crawljax.core.state.Eventable;
import com.crawljax.core.state.Eventable.EventType;
import com.crawljax.core.state.Identification;
import com.crawljax.core.state.StateFlowGraph;
import com.crawljax.core.state.StateMachine;
import com.crawljax.core.state.StateVertix;
import com.crawljax.forms.FormHandler;
import com.crawljax.forms.FormInput;
import com.crawljax.util.ElementResolver;

import org.apache.log4j.Logger;

import java.util.List;

/**
* Class that performs crawl actions. It is designed to be run inside a Thread.
*
* @see #run()
* @author dannyroest@gmail.com (Danny Roest)
* @author Stefan Lenselink <S.R.Lenselink@student.tudelft.nl>
* @version $Id: Crawler.java 451 2010-09-16 15:01:59Z slenselink@google.com $
*/
public class Crawler implements Runnable {

  private static final Logger LOGGER = Logger.getLogger(Crawler.class.getName());

  private static final int ONE_SECOND = 1000;

  /**
   * The main browser window 1 to 1 relation; Every Thread will get on browser assigned in the run
   * function.
   */
  private EmbeddedBrowser browser;

  /**
   * The central DataController. This is a multiple to 1 relation Every Thread shares an instance
   * of the same controller! All operations / fields used in the controller should be checked for
   * thread safety.
   */
  private final CrawljaxController controller;

  /**
   * Depth register.
   */
  private int depth = 0;

  /**
   * The path followed from the index to the current state.
   */
  private final CrawlPath backTrackPath;

  /**
   * The utility which is used to extract the candidate clickables.
   */
  private CandidateElementExtractor candidateExtractor;

  private boolean fired = false;

  /**
   * The name of this Crawler when not default (automatic) this will be added to the Thread name
   * in the thread as (name). In the {@link CrawlerExecutor#beforeExecute(Thread, Runnable)} the
   * name is retrieved using the {@link #toString()} function.
   *
   * @see Crawler#toString()
   * @see CrawlerExecutor#beforeExecute(Thread, Runnable)
   */
  private String name = "";

  /**
   * The sateMachine for this Crawler, keeping track of the path crawled by this Crawler.
   */
  private final StateMachine stateMachine;

  private final CrawljaxConfigurationReader configurationReader;

  private FormHandler formHandler;

  /**
   * The object to places calls to add new Crawlers or to remove one.
   */
  private final CrawlQueueManager crawlQueueManager;

  /**
   * Enum for describing what has happened after a {@link Crawler#clickTag(Eventable)} has been
   * performed.
   *
   * @see Crawler#clickTag(Eventable)
   */
  private enum ClickResult {
    cloneDetected, newState, domUnChanged
  }

  /**
   * @param mother
   *            the main CrawljaxController
   * @param exactEventPath
   *            the event path up till this moment.
   * @param name
   *            a name for this crawler (default is empty).
   */
  public Crawler(CrawljaxController mother, List<Eventable> exactEventPath, String name) {
    this(mother, new CrawlPath(exactEventPath));
    this.name = name;
  }

  /**
   * Private Crawler constructor for a 'reload' crawler. Only used internally.
   *
   * @param mother
   *            the main CrawljaxController
   * @param returnPath
   *            the path used to return to the last state, this can be a empty list
   * @deprecated better to use {@link #Crawler(CrawljaxController, CrawlPath)}
   */
  @Deprecated
  protected Crawler(CrawljaxController mother, List<Eventable> returnPath) {
    this(mother, new CrawlPath(returnPath));
  }

  /**
   * Private Crawler constructor for a 'reload' crawler. Only used internally.
   *
   * @param mother
   *            the main CrawljaxController
   * @param returnPath
   *            the path used to return to the last state, this can be a empty list
   */
  protected Crawler(CrawljaxController mother, CrawlPath returnPath) {
    this.backTrackPath = returnPath;
    this.controller = mother;
    this.configurationReader = controller.getConfigurationReader();
    this.crawlQueueManager = mother.getCrawlQueueManager();
    if (controller.getSession() != null) {
      this.stateMachine =
              new StateMachine(controller.getSession().getStateFlowGraph(), controller
                      .getSession().getInitialState(), controller.getInvariantList());
    } else {
      /**
       * Reset the state machine to null, because there is no session where to load the
       * stateFlowGraph from.
       */
      this.stateMachine = null;
    }
  }

  /**
   * Brings the browser to the initial state.
   */
  public void goToInitialURL() {
    LOGGER.info("Loading Page "
            + configurationReader.getCrawlSpecificationReader().getSiteUrl());
    getBrowser().goToUrl(configurationReader.getCrawlSpecificationReader().getSiteUrl());
    /**
     * Thread safe
     */
    controller.doBrowserWait(getBrowser());
    CrawljaxPluginsUtil.runOnUrlLoadPlugins(getBrowser());
  }

  /**
   * Try to fire a given event on the Browser.
   *
   * @param eventable
   *            the eventable to fire
   * @return true iff the event is fired
   */
  private boolean fireEvent(Eventable eventable) {
    if (eventable.getIdentification().getHow().toString().equals("xpath")
            && eventable.getRelatedFrame().equals("")) {

      /**
       * The path in the page to the 'clickable' (link, div, span, etc)
       */
      String xpath = eventable.getIdentification().getValue();

      /**
       * The type of event to execute on the 'clickable' like onClick, mouseOver, hover, etc
       */
      EventType eventType = eventable.getEventType();

      /**
       * Try to find a 'better' / 'quicker' xpath
       */
      String newXPath = new ElementResolver(eventable, getBrowser()).resolve();
      if (newXPath != null && !xpath.equals(newXPath)) {
        LOGGER.info("XPath changed from " + xpath + " to " + newXPath + " relatedFrame:"
                + eventable.getRelatedFrame());
        eventable = new Eventable(
                new Identification(Identification.How.xpath, newXPath), eventType);
      }
    }

    if (getBrowser().fireEvent(eventable)) {

      /**
       * Let the controller execute its specified wait operation on the browser thread safe.
       */
      controller.doBrowserWait(getBrowser());

      /**
       * Close opened windows
       */
      getBrowser().closeOtherWindows();

      return true; // A event fired
    } else {
      /**
       * Execute the OnFireEventFailedPlugins with the current crawlPath with the crawlPath
       * removed 1 state to represent the path TO here.
       */
      CrawljaxPluginsUtil.runOnFireEventFailedPlugins(
              eventable, controller.getSession().getCurrentCrawlPath().immutableCopy(true));
      return false; // no event fired
    }
  }

  /**
   * Enters the form data. First, the related input elements (if any) to the eventable are filled
   * in and then it tries to fill in the remaining input elements.
   *
   * @param eventable
   *            the eventable element.
   */
  private void handleInputElements(Eventable eventable) {
    List<FormInput> formInputs = eventable.getRelatedFormInputs();

    for (FormInput formInput : formHandler.getFormInputs()) {
      if (!formInputs.contains(formInput)) {
        formInputs.add(formInput);
      }
    }
    eventable.setRelatedFormInputs(formInputs);
    formHandler.handleFormElements(formInputs);
  }

  /**
   * Reload the browser following the {@link #backTrackPath} to the given currentEvent.
   *
   * @throws CrawljaxException
   *             if the {@link Eventable#getTargetStateVertix()} encounters an error.
   */
  private void goBackExact() throws CrawljaxException {
    /**
     * Thread safe
     */
    StateVertix curState = controller.getSession().getInitialState();

    for (Eventable clickable : backTrackPath) {

      if (!controller.getElementChecker().checkCrawlCondition(getBrowser())) {
        return;
      }

      LOGGER.info("Backtracking by executing " + clickable.getEventType() + " on element: "
              + clickable);

      this.getStateMachine().changeState(clickable.getTargetStateVertix());

      curState = clickable.getTargetStateVertix();

      controller.getSession().addEventableToCrawlPath(clickable);

      this.handleInputElements(clickable);

      if (this.fireEvent(clickable)) {

        // TODO ali, do not increase depth if eventable is from guidedcrawling
        depth++;

        /**
         * Run the onRevisitStateValidator(s)
         */
        CrawljaxPluginsUtil.runOnRevisitStatePlugins(this.controller.getSession(),
                curState);
      }

      if (!controller.getElementChecker().checkCrawlCondition(getBrowser())) {
        return;
      }
    }
  }

  /**
   * @param eventable
   *            the element to execute an action on.
   * @return the result of the click operation
   * @throws CrawljaxException
   *             an exception.
   */
  private ClickResult clickTag(final Eventable eventable) throws CrawljaxException {
    // load input element values
    this.handleInputElements(eventable);

    LOGGER.info("Executing " + eventable.getEventType() + " on element: " + eventable
            + "; State: " + this.getStateMachine().getCurrentState().getName());
    if (this.fireEvent(eventable)) {
      StateVertix newState =
              new StateVertix(getBrowser().getCurrentUrl(), controller.getSession()
                      .getStateFlowGraph().getNewStateName(), getBrowser().getDom(),
                      this.controller.getStrippedDom(getBrowser()));

      if (isDomChanged(this.getStateMachine().getCurrentState(), newState)) {
        // Dom is changed, so data might need be filled in again
        controller.getSession().addEventableToCrawlPath(eventable);
        if (this.getStateMachine().update(eventable, newState, this.getBrowser(),
                this.controller.getSession())) {
          // Dom changed
          // No Clone
          // TODO change the interface of runGuidedCrawlingPlugins to remove the
          // controller.getSession().getCurrentCrawlPath() call because its from the
          // session now.
          CrawljaxPluginsUtil.runGuidedCrawlingPlugins(controller, controller
                  .getSession(), controller.getSession().getCurrentCrawlPath(), this
                  .getStateMachine());

          return ClickResult.newState;
        } else {
          // Dom changed; Clone
          return ClickResult.cloneDetected;
        }
      }
    }
    // Event not fired or, Dom not changed
    return ClickResult.domUnChanged;
  }

  /**
   * Return the Exacteventpath.
   *
   * @return the exacteventpath
   * @deprecated use {@link CrawlSession#getCurrentCrawlPath()}
   */
  @Deprecated
  public final List<Eventable> getExacteventpath() {
    return controller.getSession().getCurrentCrawlPath();
  }

  /**
   * Have we reached the depth limit?
   *
   * @param depth
   *            the current depth. Added as argument so this call can be moved out if desired.
   * @return true if the limit has been reached
   */
  private boolean depthLimitReached(int depth) {

    if (this.depth >= configurationReader.getCrawlSpecificationReader().getDepth()
            && configurationReader.getCrawlSpecificationReader().getDepth() != 0) {
      LOGGER.info("DEPTH " + depth + " reached returning from rec call. Given depth: "
              + configurationReader.getCrawlSpecificationReader().getDepth());
      return true;
    } else {
      return false;
    }
  }

  private void spawnThreads(StateVertix state) {
    Crawler c = null;
    do {
      if (c != null) {
        this.crawlQueueManager.addWorkToQueue(c);
      }
      c =
              new Crawler(this.controller,
                      controller.getSession().getCurrentCrawlPath().immutableCopy(true));
    } while (state.registerCrawler(c));
  }

  private ClickResult crawlAction(CandidateCrawlAction action) throws CrawljaxException {
    CandidateElement candidateElement = action.getCandidateElement();
    EventType eventType = action.getEventType();

    StateVertix orrigionalState = this.getStateMachine().getCurrentState();

    if (candidateElement.allConditionsSatisfied(getBrowser())) {
      ClickResult clickResult = clickTag(new Eventable(candidateElement, eventType));
      switch (clickResult) {
        case cloneDetected:
          fired = false;
          // We are in the clone state so we continue with the cloned version to search
          // for work.
          this.controller.getSession().branchCrawlPath();
          spawnThreads(orrigionalState);
          break;
        case newState:
          fired = true;
          // Recurse because new state found
          spawnThreads(orrigionalState);
          break;
        case domUnChanged:
          // Dom not updated, continue with the next
          break;
        default:
          break;
      }
      return clickResult;
    } else {

      LOGGER.info("Conditions not satisfied for element: " + candidateElement + "; State: "
              + this.getStateMachine().getCurrentState().getName());
    }
    return ClickResult.domUnChanged;
  }

  /**
   * Crawl through the clickables.
   *
   * @throws CrawljaxException
   *             if an exception is thrown.
   */
  private boolean crawl() throws CrawljaxException {
    if (depthLimitReached(depth)) {
      return true;
    }

    if (!checkConstraints()) {
      return false;
    }

    // Store the currentState to be able to 'back-track' later.
    StateVertix orrigionalState = this.getStateMachine().getCurrentState();

    if (orrigionalState.searchForCandidateElements(candidateExtractor, configurationReader
            .getTagElements(), configurationReader.getExcludeTagElements(),
            configurationReader.getCrawlSpecificationReader().getClickOnce())) {
      // Only execute the preStateCrawlingPlugins when it's the first time
      LOGGER.info("Starting preStateCrawlingPlugins...");
      CrawljaxPluginsUtil.runPreStateCrawlingPlugins(controller.getSession(),
              orrigionalState.getUnprocessedCandidateElements());
    }

    CandidateCrawlAction action =
            orrigionalState.pollCandidateCrawlAction(this, crawlQueueManager);
    while (action != null) {
      if (depthLimitReached(depth)) {
        return true;
      }

      if (!checkConstraints()) {
        return false;
      }
      ClickResult result = this.crawlAction(action);
      orrigionalState.finishedWorking(this, action);
      switch (result) {
        case newState:
          return newStateDetected(orrigionalState);
        case cloneDetected:
          return true;
        default:
          break;
      }
      action = orrigionalState.pollCandidateCrawlAction(this, crawlQueueManager);
    }
    return true;
  }

  /**
   * A new state has been found!
   *
   * @param orrigionalState
   *            the current state
   * @return true if crawling must continue false otherwise.
   * @throws CrawljaxException
   */
  private boolean newStateDetected(StateVertix orrigionalState) throws CrawljaxException {

    /**
     * An event has been fired so we are one level deeper
     */
    depth++;
    LOGGER.info("RECURSIVE Call crawl; Current DEPTH= " + depth);
    if (!this.crawl()) {
      // Crawling has stopped
      controller.terminate(false);
      return false;
    }
    this.getStateMachine().changeState(orrigionalState);
    return true;
  }

  /**
   * Initialize the Crawler, retrieve a Browser and go to the initial URL when no browser was
   * present. rewind the state machine and goBack to the state if there is exactEventPath is
   * specified.
   *
   * @throws InterruptedException
   *             when the request for a browser is interrupted.
   */
  public void init() throws InterruptedException {
    // Start a new CrawlPath for this Crawler
    controller.getSession().startNewPath();

    this.browser = this.getBrowser();
    if (this.browser == null) {
      /**
       * As the browser is null, request one and got to the initial URL, if the browser is
       * Already set the browser will be in the initial URL.
       */
      this.browser = controller.getBrowserPool().requestBrowser();
      LOGGER.info("Reloading page for navigating back");
      this.goToInitialURL();
    }
    // TODO Stefan ideally this should be placed in the constructor
    this.formHandler =
            new FormHandler(getBrowser(), configurationReader.getInputSpecification(),
                    configurationReader.getCrawlSpecificationReader().getRandomInputInForms());

    this.candidateExtractor =
            new CandidateElementExtractor(controller.getElementChecker(), this.getBrowser(),
                    formHandler, configurationReader.getCrawlSpecificationReader());
    /**
     * go back into the previous state.
     */
    try {
      this.goBackExact();
    } catch (CrawljaxException e) {
      LOGGER.error("Failed to backtrack", e);
    }
  }

  /**
   * Terminate and clean up this Crawler, release the acquired browser. Notice that other Crawlers
   * might still be active. So this function does NOT shutdown all Crawlers active that should be
   * done with {@link CrawlerExecutor#shutdown()}
   */
  public void shutdown() {
    controller.getBrowserPool().freeBrowser(this.getBrowser());
  }

  /**
   * The main function stated by the ExecutorService. Crawlers add themselves to the list by
   * calling {@link CrawlQueueManager#addWorkToQueue(Crawler)}. When the ExecutorService finds a
   * free thread this method is called and when this method ends the Thread is released again and
   * a new Thread is started
   *
   * @see java.util.concurrent.Executors#newFixedThreadPool(int)
   * @see java.util.concurrent.ExecutorService
   */
  @Override
  public void run() {
    if (!checkConstraints()) {
      // Constrains are not met at start of this Crawler, so stop immediately
      return;
    }
    if (backTrackPath.last() != null) {
      try {
        if (!backTrackPath.last().getTargetStateVertix().startWorking(this)) {
          return;
        }
      } catch (CrawljaxException e) {
        LOGGER.error("Received Crawljax exception", e);
      }
    }

    try {
      /**
       * Init the Crawler
       */
      try {
        this.init();
      } catch (InterruptedException e) {
        if (this.getBrowser() == null) {
          return;
        }
      }

      /**
       * Hand over the main crawling
       */
      if (!this.crawl()) {
        controller.terminate(false);
      }

      /**
       * Crawling is done; so the crawlPath of the current branch is known
       */
      if (!fired) {
        controller.getSession().removeCrawlPath();
      }
    } catch (BrowserConnectionException e) {
      // The connection of the browser has gone down, most of the times it means that the
      // browser process has crashed.
      LOGGER.error("Crawler failed because the used browser died during Crawling",
              new CrawlPathToException("Crawler failed due to browser crash",
                      controller.getSession().getCurrentCrawlPath(), e));
      // removeBrowser will throw a RuntimeException if the current browser is the last
      // browser in the pool.
      this.controller.getBrowserPool().removeBrowser(
              this.getBrowser(), this.controller.getCrawlQueueManager());
      return;
    } catch (CrawljaxException e) {
      LOGGER.error("Crawl failed!", e);
    }
    /**
     * At last failure or non shutdown the Crawler.
     */
    this.shutdown();
  }

  /**
   * Return the browser used in this Crawler Thread.
   *
   * @return the browser used in this Crawler Thread
   */
  public EmbeddedBrowser getBrowser() {
    return browser;
  }

  @Override
  public String toString() {
    return this.name;
  }

  /**
   * @return the state machine.
   */
  public StateMachine getStateMachine() {
    return stateMachine;
  }

  /**
   * Test to see if the (new) DOM is changed with regards to the old DOM. This method is Thread
   * safe.
   *
   * @param stateBefore
   *            the state before the event.
   * @param stateAfter
   *            the state after the event.
   * @return true if the state is changed according to the compare method of the oracle.
   */
  private boolean isDomChanged(final StateVertix stateBefore, final StateVertix stateAfter) {
    boolean isChanged = false;

    // do not need Oracle Comparators now, because hash of stripped dom is
    // already calculated
    // isChanged = !stateComparator.compare(stateBefore.getDom(),
    // stateAfter.getDom(), browser);
    isChanged = !stateAfter.equals(stateBefore);
    if (isChanged) {
      LOGGER.info("Dom is Changed!");
    } else {
      LOGGER.info("Dom Not Changed!");
    }

    return isChanged;
  }

  /**
   * Checks the state and time constraints. This function is nearly Thread-safe.
   *
   * @return true if all conditions are met.
   */
  private boolean checkConstraints() {
    long timePassed = System.currentTimeMillis() - controller.getSession().getStartTime();
    int maxCrawlTime = configurationReader.getCrawlSpecificationReader().getMaximumRunTime();
    if ((maxCrawlTime != 0) && (timePassed > maxCrawlTime * ONE_SECOND)) {

      LOGGER.info("Max time " + maxCrawlTime + " seconds passed!");
      /* stop crawling */
      return false;
    }
    StateFlowGraph graph = controller.getSession().getStateFlowGraph();
    int maxNumberOfStates =
            configurationReader.getCrawlSpecificationReader().getMaxNumberOfStates();
    if ((maxNumberOfStates != 0) && (graph.getAllStates().size() >= maxNumberOfStates)) {
      LOGGER.info("Max number of states " + maxNumberOfStates + " reached!");
      /* stop crawling */
      return false;
    }
    /* continue crawling */
    return true;
  }

}
// Related Classes of com.crawljax.core.Crawler
// (Scraper footer converted to comments so the file compiles.)
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun
// Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.