/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.crawler4j.crawler;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.CustomFetchStatus;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.parser.Parser;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;
import org.apache.http.HttpStatus;
import org.apache.log4j.Logger;
import java.util.ArrayList;
import java.util.List;
/**
* WebCrawler is the Runnable class that is executed by each crawler
* thread.
*
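* <p>
* A minimal sketch of a typical subclass (the {@code MyCrawler} name and the
* example host are illustrative, not part of this class):
* <pre>{@code
* public class MyCrawler extends WebCrawler {
*     public boolean shouldVisit(WebURL url) {
*         // only crawl pages under a (hypothetical) example.com host
*         return url.getURL().startsWith("http://www.example.com/");
*     }
*
*     public void visit(Page page) {
*         System.out.println("Visited: " + page.getWebURL().getURL());
*     }
* }
* }</pre>
*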
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class WebCrawler implements Runnable {
protected static final Logger logger = Logger.getLogger(WebCrawler.class.getName());
/**
* The id associated with the crawler thread running this instance
*/
protected int myId;
/**
* The controller instance that has created this crawler thread. This
* reference to the controller can be used for getting the configuration of
* the current crawl or adding new seeds at runtime.
*/
protected CrawlController myController;
/**
* The thread within which this crawler instance is running.
*/
private Thread myThread;
/**
* The parser that is used by this crawler instance to parse the content of
* the fetched pages.
*/
private Parser parser;
/**
* The fetcher that is used by this crawler instance to fetch the content of
* pages from the web.
*/
private PageFetcher pageFetcher;
/**
* The RobotstxtServer instance that is used by this crawler instance to
* determine whether the crawler is allowed to crawl the content of each
* page.
*/
private RobotstxtServer robotstxtServer;
/**
* The DocIDServer that is used by this crawler instance to map each URL to
* a unique docid.
*/
private DocIDServer docIdServer;
/**
* The Frontier object that manages the crawl queue.
*/
private Frontier frontier;
/**
* Is the current crawler instance waiting for new URLs? This field is
* mainly used by the controller to detect whether all of the crawler
* instances are waiting for new URLs, in which case there is no more work
* and crawling can be stopped.
*/
private boolean isWaitingForNewURLs;
/**
* Initializes the current instance of the crawler
*
* @param myId
* the id of this crawler instance
* @param crawlController
* the controller that manages this crawling session
*/
public void init(int myId, CrawlController crawlController) {
this.myId = myId;
this.pageFetcher = crawlController.getPageFetcher();
this.robotstxtServer = crawlController.getRobotstxtServer();
this.docIdServer = crawlController.getDocIdServer();
this.frontier = crawlController.getFrontier();
this.parser = new Parser(crawlController.getConfig());
this.myController = crawlController;
this.isWaitingForNewURLs = false;
}
/**
* Get the id of the current crawler instance
*
* @return the id of the current crawler instance
*/
public int getMyId() {
return myId;
}
public CrawlController getMyController() {
return myController;
}
/**
* This function is called just before this crawler instance starts the
* crawl. It can be used to set up data structures or perform other
* initializations needed by this crawler instance.
*/
public void onStart() {
}
/**
* This function is called just before the termination of the current
* crawler instance. It can be used for persisting in-memory data or other
* finalization tasks.
*/
public void onBeforeExit() {
}
/**
* This function is called once the header of a page is fetched. It can be
* overridden by subclasses to perform custom logic for different status
* codes. For example, 404 pages can be logged.
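* <p>
* A minimal sketch of an override that logs broken links (the log message is
* illustrative):
* <pre>{@code
* protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
*     if (statusCode == HttpStatus.SC_NOT_FOUND) {
*         logger.warn("Broken link: " + webUrl.getURL());
*     }
* }
* }</pre>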
*/
protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
}
/**
* The CrawlController instance that has created this crawler instance will
* call this function just before terminating this crawler thread. Classes
* that extend WebCrawler can override this function to pass their local
* data to their controller. The controller then collects this local data in
* a List, which can later be used to process the local data of all crawlers
* (if needed).
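* <p>
* A minimal sketch of an override (the visitedPages counter is illustrative
* and would be maintained elsewhere, e.g. in visit()):
* <pre>{@code
* private int visitedPages; // incremented in visit()
*
* public Object getMyLocalData() {
*     return visitedPages;
* }
* }</pre>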
*/
public Object getMyLocalData() {
return null;
}
@Override
public void run() {
onStart();
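// Repeatedly pull batches of up to 50 URLs from the frontier; wait and retry
// when the frontier is temporarily empty, and exit once the frontier reports
// that the crawl is finished or the controller is shutting down.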
while (true) {
List<WebURL> assignedURLs = new ArrayList<WebURL>(50);
isWaitingForNewURLs = true;
frontier.getNextURLs(50, assignedURLs);
isWaitingForNewURLs = false;
if (assignedURLs.isEmpty()) {
if (frontier.isFinished()) {
return;
}
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
logger.error("Crawler thread interrupted while waiting for new URLs", e);
}
} else {
for (WebURL curURL : assignedURLs) {
if (curURL != null) {
processPage(curURL);
frontier.setProcessed(curURL);
}
if (myController.isShuttingDown()) {
logger.info("Exiting because of controller shutdown.");
return;
}
}
}
}
}
/**
* Classes that extend WebCrawler can override this function to tell the
* crawler whether the given url should be crawled or not. The default
* implementation indicates that all urls should be included in the crawl.
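* <p>
* A minimal sketch of an override that skips common style and image resources
* (the java.util.regex.Pattern filter is illustrative):
* <pre>{@code
* private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|ico))$");
*
* public boolean shouldVisit(WebURL url) {
*     String href = url.getURL().toLowerCase();
*     return !FILTERS.matcher(href).matches();
* }
* }</pre>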
*
* @param url
* the url to be checked for inclusion in the crawl
* @return true if the url should be included in the crawl, false otherwise
*/
public boolean shouldVisit(WebURL url) {
return true;
}
/**
* Classes that extend WebCrawler can override this function to process the
* content of the fetched and parsed page.
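* <p>
* A minimal sketch of an override (assuming the Page and HtmlParseData
* accessors used below, such as getWebURL() and getText(), are available in
* this version):
* <pre>{@code
* public void visit(Page page) {
*     String url = page.getWebURL().getURL();
*     ParseData parseData = page.getParseData();
*     if (parseData instanceof HtmlParseData) {
*         String text = ((HtmlParseData) parseData).getText();
*         System.out.println(url + " contains " + text.length() + " characters of text");
*     }
* }
* }</pre>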
*
* @param page
* the page object that has just been fetched and parsed.
*/
public void visit(Page page) {
}
private void processPage(WebURL curURL) {
if (curURL == null) {
return;
}
PageFetchResult fetchResult = null;
try {
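// Fetch the response headers first; the content is only consumed later
// (fetchContent) if the page is going to be processed, and is otherwise
// discarded in the finally block.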
fetchResult = pageFetcher.fetchHeader(curURL);
int statusCode = fetchResult.getStatusCode();
handlePageStatusCode(curURL, statusCode, CustomFetchStatus.getStatusDescription(statusCode));
if (statusCode != HttpStatus.SC_OK) {
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
if (myController.getConfig().isFollowRedirects()) {
String movedToUrl = fetchResult.getMovedToUrl();
if (movedToUrl == null) {
return;
}
int newDocId = docIdServer.getDocId(movedToUrl);
if (newDocId > 0) {
// Redirect page is already seen
return;
} else {
WebURL webURL = new WebURL();
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setParentUrl(curURL.getParentUrl());
webURL.setDepth(curURL.getDepth());
webURL.setDocid(-1);
if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
frontier.schedule(webURL);
}
}
}
} else if (statusCode == CustomFetchStatus.PageTooBig) {
logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
}
return;
}
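// The fetcher may have followed redirects, so the final URL can differ from
// the requested one; if so, re-key this page under the URL that was actually fetched.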
if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
// Redirect page is already seen
return;
}
curURL.setURL(fetchResult.getFetchedUrl());
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
}
Page page = new Page(curURL);
int docid = curURL.getDocid();
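// Download and parse the content. For HTML pages, assign doc ids to outgoing
// links and schedule those that pass shouldVisit() and robots.txt checks,
// then hand the parsed page to visit().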
if (fetchResult.fetchContent(page) && parser.parse(page, curURL.getURL())) {
ParseData parseData = page.getParseData();
if (parseData instanceof HtmlParseData) {
HtmlParseData htmlParseData = (HtmlParseData) parseData;
List<WebURL> toSchedule = new ArrayList<WebURL>();
int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
webURL.setParentDocid(docid);
webURL.setParentUrl(curURL.getURL());
int newdocid = docIdServer.getDocId(webURL.getURL());
if (newdocid > 0) {
// This is not the first time that this Url is
// visited. So, we set the depth to a negative
// number.
webURL.setDepth((short) -1);
webURL.setDocid(newdocid);
} else {
webURL.setDocid(-1);
webURL.setDepth((short) (curURL.getDepth() + 1));
if (maxCrawlDepth == -1 || curURL.getDepth() < maxCrawlDepth) {
if (shouldVisit(webURL) && robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
toSchedule.add(webURL);
}
}
}
}
frontier.scheduleAll(toSchedule);
}
visit(page);
}
} catch (Exception e) {
logger.error("Exception while processing: " + curURL.getURL(), e);
} finally {
if (fetchResult != null) {
fetchResult.discardContentIfNotConsumed();
}
}
}
public Thread getThread() {
return myThread;
}
public void setThread(Thread myThread) {
this.myThread = myThread;
}
public boolean isNotWaitingForNewURLs() {
return !isWaitingForNewURLs;
}
}