Package edu.uci.ics.crawler4j.examples.multiple

Source Code of edu.uci.ics.crawler4j.examples.multiple.MultipleCrawlerController

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package edu.uci.ics.crawler4j.examples.multiple;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

/**
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/

public class MultipleCrawlerController {

  public static void main(String[] args) throws Exception {
    if (args.length != 1) {
      System.out.println("Needed parameter: ");
      System.out.println("\t rootFolder (it will contain intermediate crawl data)");
      return;
    }

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];

    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();

    /*
     * The two crawlers should have different storage folders for their
     * intermediate data
     */
    config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
    config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");

    config1.setPolitenessDelay(1000);
    config2.setPolitenessDelay(2000);

    config1.setMaxPagesToFetch(50);
    config2.setMaxPagesToFetch(100);

    /*
     * We will use different PageFetchers for the two crawlers.
     */
    PageFetcher pageFetcher1 = new PageFetcher(config1);
    PageFetcher pageFetcher2 = new PageFetcher(config2);

    /*
     * We will use the same RobotstxtServer for both of the crawlers.
     */
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher1);

    CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
    CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);

    String[] crawler1Domains = new String[] { "http://www.ics.uci.edu/", "http://www.cnn.com/" };
    String[] crawler2Domains = new String[] { "http://en.wikipedia.org/" };

    controller1.setCustomData(crawler1Domains);
    controller2.setCustomData(crawler2Domains);

    controller1.addSeed("http://www.ics.uci.edu/");
    controller1.addSeed("http://www.cnn.com/");
    controller1.addSeed("http://www.ics.uci.edu/~lopes/");
    controller1.addSeed("http://www.cnn.com/POLITICS/");

    controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
    controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
    controller2.addSeed("http://en.wikipedia.org/wiki/Bing");

    /*
     * The first crawler will have 5 cuncurrent threads and the second
     * crawler will have 7 threads.
     */
    controller1.startNonBlocking(BasicCrawler.class, 5);
    controller2.startNonBlocking(BasicCrawler.class, 7);

    controller1.waitUntilFinish();
    System.out.println("Crawler 1 is finished.");

    controller2.waitUntilFinish();
    System.out.println("Crawler 2 is finished.");
  }
}
TOP

Related Classes of edu.uci.ics.crawler4j.examples.multiple.MultipleCrawlerController

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.