Package org.apache.nutch.fetcher

Examples of org.apache.nutch.fetcher.Fetcher


    Path linkDb = new Path(dir + "/linkdb");
    Path segments = new Path(dir + "/segments");
    res.elapsed = System.currentTimeMillis();
    Injector injector = new Injector(getConf());
    Generator generator = new Generator(getConf());
    Fetcher fetcher = new Fetcher(getConf());
    ParseSegment parseSegment = new ParseSegment(getConf());
    CrawlDb crawlDbTool = new CrawlDb(getConf());
    LinkDb linkDbTool = new LinkDb(getConf());
     
    // initialize crawlDb
    long start = System.currentTimeMillis();
    injector.inject(crawlDb, rootUrlDir);
    long delta = System.currentTimeMillis() - start;
    res.addTiming("inject", "0", delta);
    int i;
    for (i = 0; i < depth; i++) {             // generate new segment
      start = System.currentTimeMillis();
      Path[] segs = generator.generate(crawlDb, segments, -1, topN, System
          .currentTimeMillis());
      delta = System.currentTimeMillis() - start;
      res.addTiming("generate", i + "", delta);
      if (segs == null) {
        LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
        break;
      }
      start = System.currentTimeMillis();
      fetcher.fetch(segs[0], threads)// fetch it
      delta = System.currentTimeMillis() - start;
      res.addTiming("fetch", i + "", delta);
      if (!Fetcher.isParsing(job)) {
        start = System.currentTimeMillis();
        parseSegment.parse(segs[0]);    // parse it, if needed
View Full Code Here


    Path index = new Path(dir + "/index");

    Path tmpDir = job.getLocalPath("crawl"+Path.SEPARATOR+getDate());
    Injector injector = new Injector(getConf());
    Generator generator = new Generator(getConf());
    Fetcher fetcher = new Fetcher(getConf());
    ParseSegment parseSegment = new ParseSegment(getConf());
    CrawlDb crawlDbTool = new CrawlDb(getConf());
    LinkDb linkDbTool = new LinkDb(getConf());
     
    // initialize crawlDb
    injector.inject(crawlDb, rootUrlDir);
    int i;
    for (i = 0; i < depth; i++) {             // generate new segment
      Path[] segs = generator.generate(crawlDb, segments, -1, topN, System
          .currentTimeMillis());
      if (segs == null) {
        LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
        break;
      }
      fetcher.fetch(segs[0], threads)// fetch it
      if (!Fetcher.isParsing(job)) {
        parseSegment.parse(segs[0]);    // parse it, if needed
      }
      crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
    }
View Full Code Here

    Path index = new Path(dir + "/index");

    Path tmpDir = job.getLocalPath("crawl"+Path.SEPARATOR+getDate());
    Injector injector = new Injector(getConf());
    Generator generator = new Generator(getConf());
    Fetcher fetcher = new Fetcher(getConf());
    ParseSegment parseSegment = new ParseSegment(getConf());
    CrawlDb crawlDbTool = new CrawlDb(getConf());
    LinkDb linkDbTool = new LinkDb(getConf());
     
    // initialize crawlDb
    injector.inject(crawlDb, rootUrlDir);
    int i;
    for (i = 0; i < depth; i++) {             // generate new segment
      Path[] segs = generator.generate(crawlDb, segments, -1, topN, System
          .currentTimeMillis());
      if (segs == null) {
        LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
        break;
      }
      fetcher.fetch(segs[0], threads)// fetch it
      if (!Fetcher.isParsing(job)) {
        parseSegment.parse(segs[0]);    // parse it, if needed
      }
      crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
    }
View Full Code Here

     
    for (int i = 0; i < depth; i++) {             // generate new segment
      Path segment =
        new Generator(job).generate(crawlDb, segments, -1,
                                     topN, System.currentTimeMillis());
      new Fetcher(job).fetch(segment, threads)// fetch it
      if (!Fetcher.isParsing(job)) {
        new ParseSegment(job).parse(segment);    // parse it, if needed
      }
      new CrawlDb(job).update(crawlDb, segment); // update crawldb
    }
View Full Code Here

TOP

Related Classes of org.apache.nutch.fetcher.Fetcher

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.