Package org.apache.nutch.util

Examples of org.apache.nutch.util.NutchJob
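
The examples below all follow the same pattern from the old org.apache.hadoop.mapred API: construct a NutchJob, configure its input, mapper/reducer, and output classes, then submit it with JobClient.runJob(). NutchJob itself is just a thin wrapper around Hadoop's JobConf; a minimal sketch (matching the mapred-era API these examples use) looks like this:

    package org.apache.nutch.util;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapred.JobConf;

    // A JobConf preconfigured for Nutch: passing NutchJob.class to the JobConf
    // constructor lets Hadoop locate the jar containing the Nutch classes when
    // the job is submitted.
    public class NutchJob extends JobConf {
      public NutchJob(Configuration conf) {
        super(conf, NutchJob.class);
      }
    }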


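    // CrawlDb statistics: aggregate counts over the current CrawlDb into a
    // temporary SequenceFile output, then read the result back.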
      LOG.info("CrawlDb statistics start: " + crawlDb);
    }
   
    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());

    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);

    job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);

    job.setOutputPath(tmpFolder);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    JobClient.runJob(job);

    // reading the result
    FileSystem fileSystem = FileSystem.get(config);


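    // CrawlDb dump: write the current CrawlDb out as plain text.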
      LOG.info("CrawlDb db: " + crawlDb);
    }
   
    Path outFolder = new Path(output);

    JobConf job = new NutchJob(config);
    job.setJobName("dump " + crawlDb);

    job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setOutputPath(outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) { LOG.info("CrawlDb dump: done"); }
  }

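    // CrawlDb topN (CrawlDbReader): two chained jobs -- "topN prepare" maps
    // scores into a temporary dir, then "topN collect" gathers the global
    // top N entries with a single reducer.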
    Path tempDir =
      new Path(config.get("mapred.temp.dir", ".") +
               "/readdb-topN-temp-"+
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("topN prepare " + crawlDb);
    job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbTopNMapper.class);
    job.setReducerClass(IdentityReducer.class);

    job.setOutputPath(tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    // XXX hmmm, no setFloat() in the API ... :(
    job.setLong("CrawlDbReader.topN.min", Math.round(1000000.0 * min));
    JobClient.runJob(job);
   
    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb topN: collecting topN scores.");
    }
    job = new NutchJob(config);
    job.setJobName("topN collect " + crawlDb);
    job.setLong("CrawlDbReader.topN", topN);

    job.addInputPath(tempDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(CrawlDbTopNReducer.class);

    job.setOutputPath(outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    // XXX *sigh* this apparently doesn't work ... :-((
    job.setNumReduceTasks(1); // create a single file.
   
    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(config);
    fs.delete(tempDir);
    if (LOG.isInfoEnabled()) { LOG.info("CrawlDb topN: done"); }

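    // Indexer: index the fetch, parse-data, and parse-text output of every
    // segment together with the CrawlDb and LinkDb.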
    if (LOG.isInfoEnabled()) {
      LOG.info("Indexer: starting");
      LOG.info("Indexer: linkdb: " + linkDb);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("index " + indexDir);

    for (int i = 0; i < segments.length; i++) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Indexer: adding segment: " + segments[i]);
      }
      job.addInputPath(new Path(segments[i], CrawlDatum.FETCH_DIR_NAME));
      job.addInputPath(new Path(segments[i], ParseData.DIR_NAME));
      job.addInputPath(new Path(segments[i], ParseText.DIR_NAME));
    }

    job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Indexer.class);
    job.setReducerClass(Indexer.class);

    job.setOutputPath(indexDir);
    job.setOutputFormat(OutputFormat.class); // the Indexer's own nested OutputFormat, not a Hadoop class
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ObjectWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) { LOG.info("Indexer: done"); }
  }

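    // CrawlDb update: build and return (without running) the job that writes
    // an updated CrawlDb into a fresh, randomly named subdirectory; the
    // enclosing method signature is cut off in this excerpt.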
    throws IOException {
    Path newCrawlDb =
      new Path(crawlDb,
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("crawldb " + crawlDb);


    Path current = new Path(crawlDb, CURRENT_NAME);
    if (FileSystem.get(job).exists(current)) {
      job.addInputPath(current);
    }
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbFilter.class);
    job.setReducerClass(CrawlDbReducer.class);

    job.setOutputPath(newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    return job;
  }

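    // Crawl tool: parse command-line options and construct the tools for a
    // full crawl cycle (inject, generate, fetch, parse, update). This excerpt
    // opens mid-way through the usage check.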
      return;
    }

    Configuration conf = NutchConfiguration.create();
    conf.addDefaultResource("crawl-tool.xml");
    JobConf job = new NutchJob(conf);

    Path rootUrlDir = null;
    Path dir = new Path("crawl-" + getDate());
    int threads = job.getInt("fetcher.threads.fetch", 10);
    int depth = 5;
    int topN = Integer.MAX_VALUE;

    for (int i = 0; i < args.length; i++) {
      if ("-dir".equals(args[i])) {
        dir = new Path(args[i+1]);
        i++;
      } else if ("-threads".equals(args[i])) {
        threads = Integer.parseInt(args[i+1]);
        i++;
      } else if ("-depth".equals(args[i])) {
        depth = Integer.parseInt(args[i+1]);
        i++;
      } else if ("-topN".equals(args[i])) {
        topN = Integer.parseInt(args[i+1]);
        i++;
      } else if (args[i] != null) {
        rootUrlDir = new Path(args[i]);
      }
    }

    FileSystem fs = FileSystem.get(job);
    if (fs.exists(dir)) {
      throw new RuntimeException(dir + " already exists.");
    }

    if (LOG.isInfoEnabled()) {
      LOG.info("crawl started in: " + dir);
      LOG.info("rootUrlDir = " + rootUrlDir);
      LOG.info("threads = " + threads);
      LOG.info("depth = " + depth);
      if (topN != Integer.MAX_VALUE)
        LOG.info("topN = " + topN);
    }
   
    Path crawlDb = new Path(dir + "/crawldb");
    Path linkDb = new Path(dir + "/linkdb");
    Path segments = new Path(dir + "/segments");
    Path indexes = new Path(dir + "/indexes");
    Path index = new Path(dir + "/index");

    Path tmpDir = job.getLocalPath("crawl"+Path.SEPARATOR+getDate());
    Injector injector = new Injector(conf);
    Generator generator = new Generator(conf);
    Fetcher fetcher = new Fetcher(conf);
    ParseSegment parseSegment = new ParseSegment(conf);
    CrawlDb crawlDbTool = new CrawlDb(conf);

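    // Injector: two jobs -- the sort job below converts a plain-text URL list
    // into CrawlDatum entries, and a second job (truncated at the end of this
    // excerpt) merges them into the existing CrawlDb.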
    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    sortJob.setInputPath(urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    sortJob.setOutputPath(tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    JobClient.runJob(sortJob);

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected urls into crawl db.");

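        // Fetcher setup: build a configuration from nutch-default.xml and
        // nutch-site.xml, read the fetch thread count, and configure the Fetcher.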
        fetcher = new Fetcher();
        Configuration conf = new Configuration();
        // conf.addDefaultResource("crawl-tool.xml");
        conf.addDefaultResource("nutch-default.xml");
        conf.addDefaultResource("nutch-site.xml");
        JobConf job = new NutchJob(conf);
        threads = job.getInt("fetcher.threads.fetch", 10);
        fetcher.setConf(conf);
    }

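    // DeleteDuplicates: three chained jobs -- group index docs by URL, then by
    // content hash, then delete the marked duplicates from the index(es).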
    Path outDir1 =
      new Path("dedup-urls-"+
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(getConf());

    for (int i = 0; i < indexDirs.length; i++) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
      }
      FileInputFormat.addInputPath(job, indexDirs[i]);
    }
    job.setJobName("dedup 1: urls by time");

    job.setInputFormat(InputFormat.class); // the dedup tool's own nested InputFormat, not a Hadoop class
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IndexDoc.class);

    job.setReducerClass(UrlsReducer.class);
    FileOutputFormat.setOutputPath(job, outDir1);

    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    Path outDir2 =
      new Path("dedup-hash-"+
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    job = new NutchJob(getConf());
    job.setJobName("dedup 2: content by hash");

    FileInputFormat.addInputPath(job, outDir1);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(MD5Hash.class);
    job.setMapOutputValueClass(IndexDoc.class);
    job.setPartitionerClass(HashPartitioner.class);
    job.setSpeculativeExecution(false);
   
    job.setReducerClass(HashReducer.class);
    FileOutputFormat.setOutputPath(job, outDir2);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    // remove outDir1 - no longer needed
    fs.delete(outDir1, true);
   
    job = new NutchJob(getConf());
    job.setJobName("dedup 3: delete from index(es)");

    FileInputFormat.addInputPath(job, outDir2);
    job.setInputFormat(SequenceFileInputFormat.class);
    //job.setInputKeyClass(Text.class);
    //job.setInputValueClass(IndexDoc.class);

    job.setInt("io.file.buffer.size", 4096);
    job.setMapperClass(DeleteDuplicates.class);
    job.setReducerClass(DeleteDuplicates.class);

    job.setOutputFormat(DeleteDuplicates.class); // DeleteDuplicates itself acts as the OutputFormat, deleting the duplicates
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    JobClient.runJob(job);

    fs.delete(outDir2, true);

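    // Free generation (apparently FreeGenerator): filter/normalize a plain-text
    // URL list and write it straight out as a generated segment, partitioned by
    // host. The opening argument checks are cut off in this excerpt.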
          return -1;
        }
      }
    }
   
    JobConf job = new NutchJob(getConf());
    job.setBoolean(FILTER_KEY, filter);
    job.setBoolean(NORMALIZE_KEY, normalize);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormat(TextInputFormat.class);
    job.setMapperClass(FG.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Generator.SelectorEntry.class);
    job.setPartitionerClass(PartitionUrlByHost.class);
    job.setReducerClass(FG.class);
    String segName = Generator.generateSegmentName();
    job.setNumReduceTasks(job.getNumMapTasks());
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(Generator.HashComparator.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1],
        new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
    try {
      JobClient.runJob(job);
      return 0;
