Package edu.uci.ics.crawler4j.crawler

Examples of edu.uci.ics.crawler4j.crawler.CrawlConfig

     * @param storageFolder location used to store the temporary data structures used by the crawler.
    public SiteCrawler(File storageFolder) {
        try {
            crawlConfig = new CrawlConfig();
            crawlConfig.setCrawlStorageFolder( storageFolder.getAbsolutePath() );
            crawlConfig.setUserAgentString("Apache Any23 Web Crawler");
            final PageFetcher pageFetcher = new PageFetcher(crawlConfig);

View Full Code Here

     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();


     * Be polite: Make sure that we don't send more than 1 request per
     * second (1000 milliseconds between requests).

     * You can set the maximum crawl depth here. The default value is -1 for
     * unlimited depth

     * You can set the maximum number of pages to crawl. The default value
     * is -1 for unlimited number of pages

     * Do you need to set a proxy? If so, you can use:
     * config.setProxyHost("");
     * config.setProxyPort(8080);
     * If your proxy also needs authentication:
     * config.setProxyUsername(username); config.setProxyPassword(password);

     * This config parameter can be used to set your crawl to be resumable
     * (meaning that you can resume the crawl from a previously
     * interrupted/crashed crawl). Note: if you enable the resuming feature and
     * want to start a fresh crawl, you need to delete the contents of
     * rootFolder manually.

     * Instantiate the controller for this crawl.
    PageFetcher pageFetcher = new PageFetcher(config);
View Full Code Here

     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
    String crawlStorageFolder = args[0];

    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();

     * The two crawlers should have different storage folders for their
     * intermediate data
    config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
    config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");



     * We will use different PageFetchers for the two crawlers.
    PageFetcher pageFetcher1 = new PageFetcher(config1);
View Full Code Here

     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();



    // Unlimited number of pages can be crawled.

     * Instantiate the controller for this crawl.
    PageFetcher pageFetcher = new PageFetcher(config);
View Full Code Here

    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);
    String storageFolder = args[2];

    CrawlConfig config = new CrawlConfig();


     * Since images are binary content, we need to set this parameter to
     * true to make sure they are included in the crawl.

    String[] crawlDomains = new String[] { "" };

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
View Full Code Here

  private Parser parser;
  private PageFetcher pageFetcher;

  public Downloader() {
    CrawlConfig config = new CrawlConfig();
    parser = new Parser(config);
    pageFetcher = new PageFetcher(config);
View Full Code Here

    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
View Full Code Here

     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();


     * Be polite: Make sure that we don't send more than 1 request per
     * second (1000 milliseconds between requests).

     * You can set the maximum crawl depth here. The default value is -1 for
     * unlimited depth

     * You can set the maximum number of pages to crawl. The default value
     * is -1 for unlimited number of pages

     * Do you need to set a proxy? If so, you can use:
     * config.setProxyHost("");
     * config.setProxyPort(8080);
     * If your proxy also needs authentication:
     * config.setProxyUsername(username); config.setProxyPassword(password);

     * This config parameter can be used to set your crawl to be resumable
     * (meaning that you can resume the crawl from a previously
     * interrupted/crashed crawl). Note: if you enable the resuming feature and
     * want to start a fresh crawl, you need to delete the contents of
     * rootFolder manually.

     * Instantiate the controller for this crawl.
    PageFetcher pageFetcher = new PageFetcher(config);
View Full Code Here


Related Classes of edu.uci.ics.crawler4j.crawler.CrawlConfig

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact