* can use the second parameter to the CrawlController constructor.
*
* Note: if you enable the resuming feature and want to start a fresh
* crawl, you need to delete the contents of rootFolder manually.
*/
CrawlController controller = new CrawlController(rootFolder);
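/*
 * For example, to enable resumable crawling you could pass true as the
 * second argument. (This assumes the second parameter is a boolean
 * "resumable" flag; check the CrawlController constructors in your
 * crawler4j version.)
 *
 * CrawlController controller = new CrawlController(rootFolder, true);
 */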
/*
* For each crawl, you need to add some seed URLs.
* These are the first URLs to be fetched; the
* crawler then follows the links found in these
* pages.
*/
controller.addSeed("http://www.ics.uci.edu/~yganjisa/");
controller.addSeed("http://www.ics.uci.edu/~lopes/");
controller.addSeed("http://www.ics.uci.edu/");
/*
* Be polite:
* Make sure that we don't send more than 5 requests per
* second (200 milliseconds between requests).
*/
controller.setPolitenessDelay(200);
/*
* Optional:
* You can set the maximum crawl depth here.
* The default value is -1 (unlimited depth).
*/
controller.setMaximumCrawlDepth(2);
/*
* Optional:
* You can set the maximum number of pages to crawl.
* The default value is -1 (unlimited number of pages).
*/
controller.setMaximumPagesToFetch(500);
/*
* Do you need to set a proxy?
* If so, you can use:
* controller.setProxy("proxyserver.example.com", 8080);
* OR
* controller.setProxy("proxyserver.example.com", 8080, username, password);
*/
/*
* Note: you can configure several other parameters by modifying
* the crawler4j.properties file.
*/
/*
* Start the crawl. This is a blocking operation, meaning
* that your code will only reach the line after this call
* once crawling is finished.
*/
controller.start(MyCrawler.class, numberOfCrawlers);
}