Package org.apache.nutch.crawl

Examples of org.apache.nutch.crawl.DbUpdaterJob


    // Record the wall-clock start time of the run in the result object.
    // NOTE(review): presumably converted to an elapsed duration later in the
    // method; that code is not visible in this fragment.
    res.elapsed = System.currentTimeMillis();
    // One job object per crawl phase, all built from the same configuration.
    InjectorJob injector = new InjectorJob(conf);
    GeneratorJob generator = new GeneratorJob(conf);
    FetcherJob fetcher = new FetcherJob(conf);
    ParserJob parseSegment = new ParserJob(conf);
    DbUpdaterJob crawlDbTool = new DbUpdaterJob(conf);
    // not needed in the new API
    //LinkDb linkDbTool = new LinkDb(getConf());

    // Time the inject phase: `start` and `delta` are reused below for every
    // subsequent phase measurement.
    long start = System.currentTimeMillis();
    // initialize crawlDb
    injector.inject(rootUrlDir);
    long delta = System.currentTimeMillis() - start;
    res.addTiming("inject", "0", delta);
    // Declared outside the loop so the round counter can be inspected after
    // the loop finishes.
    int i;
    for (i = 0; i < depth; i++) {             // generate new segment
      start = System.currentTimeMillis();
      String batchId = generator.generate(topN, System.currentTimeMillis(),
              false, false);
      delta = System.currentTimeMillis() - start;
      res.addTiming("generate", i + "", delta);
      if (batchId == null) {
        LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
        break;
      }
      boolean isParsing = getConf().getBoolean("fetcher.parse", false);
      start = System.currentTimeMillis();
      fetcher.fetch(batchId, threads, false, -1)// fetch it
      delta = System.currentTimeMillis() - start;
      res.addTiming("fetch", i + "", delta);
      if (!isParsing) {
        start = System.currentTimeMillis();
        parseSegment.parse(batchId, false, false);    // parse it, if needed
        delta = System.currentTimeMillis() - start;
        res.addTiming("parse", i + "", delta);
      }
      start = System.currentTimeMillis();
      crawlDbTool.run(new String[0]); // update crawldb
      delta = System.currentTimeMillis() - start;
      res.addTiming("update", i + "", delta);
    }
    if (i == 0) {
      LOG.warn("No URLs to fetch - check your seed list and URL filters.");
View Full Code Here


    // Record the wall-clock start time of the run in the result object.
    // NOTE(review): presumably converted to an elapsed duration later in the
    // method; that code is not visible in this fragment.
    res.elapsed = System.currentTimeMillis();
    // One job object per crawl phase, all built from the same configuration.
    InjectorJob injector = new InjectorJob(conf);
    GeneratorJob generator = new GeneratorJob(conf);
    FetcherJob fetcher = new FetcherJob(conf);
    ParserJob parseBatch = new ParserJob(conf);
    DbUpdaterJob crawlDbTool = new DbUpdaterJob(conf);
    // not needed in the new API
    //LinkDb linkDbTool = new LinkDb(getConf());

    // Time the inject phase: `start` and `delta` are reused below for every
    // subsequent phase measurement.
    long start = System.currentTimeMillis();
    // initialize crawlDb
    injector.inject(rootUrlDir);
    long delta = System.currentTimeMillis() - start;
    res.addTiming("inject", "0", delta);
    // Declared outside the loop so the round counter can be inspected after
    // the loop finishes.
    int i;
    for (i = 0; i < depth; i++) {             // generate new batch
      start = System.currentTimeMillis();
      String batchId = generator.generate(topN, System.currentTimeMillis(),
              false, false);
      delta = System.currentTimeMillis() - start;
      res.addTiming("generate", i + "", delta);
      if (batchId == null) {
        LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
        break;
      }
      boolean isParsing = getConf().getBoolean("fetcher.parse", false);
      start = System.currentTimeMillis();
      fetcher.fetch(batchId, threads, false, -1)// fetch it
      delta = System.currentTimeMillis() - start;
      res.addTiming("fetch", i + "", delta);
      if (!isParsing) {
        start = System.currentTimeMillis();
        parseBatch.parse(batchId, false, false);    // parse it, if needed
        delta = System.currentTimeMillis() - start;
        res.addTiming("parse", i + "", delta);
      }
      start = System.currentTimeMillis();
      crawlDbTool.run(new String[0]); // update crawldb
      delta = System.currentTimeMillis() - start;
      res.addTiming("update", i + "", delta);
    }
    if (i == 0) {
      LOG.warn("No URLs to fetch - check your seed list and URL filters.");
View Full Code Here

TOP

Related Classes of org.apache.nutch.crawl.DbUpdaterJob

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.