Package org.apache.nutch.util

Examples of org.apache.nutch.util.NutchJob
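
Every example below follows the same skeleton: a tool constructs a NutchJob
(a JobConf subclass, typically wrapping the configuration produced by
NutchConfiguration.create()), wires up input and output paths, formats, and
mapper/reducer classes, then submits the job with JobClient.runJob. A minimal
sketch of that skeleton follows; the paths, the job name, and the identity
map/reduce classes are placeholders rather than Nutch code.

    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileOutputFormat;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.SequenceFileInputFormat;
    import org.apache.hadoop.mapred.TextOutputFormat;
    import org.apache.hadoop.mapred.lib.IdentityMapper;
    import org.apache.hadoop.mapred.lib.IdentityReducer;
    import org.apache.nutch.util.NutchConfiguration;
    import org.apache.nutch.util.NutchJob;

    public class NutchJobSkeleton {
      public static void main(String[] args) throws IOException {
        // NutchConfiguration.create() loads nutch-default.xml/nutch-site.xml;
        // NutchJob wraps the result as a Hadoop (old-API) JobConf
        JobConf job = new NutchJob(NutchConfiguration.create());
        job.setJobName("skeleton example");

        // placeholder paths; the real tools point these at crawldb/webgraphdb dirs
        FileInputFormat.addInputPath(job, new Path("example-input"));
        FileOutputFormat.setOutputPath(job, new Path("example-output"));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);

        // identity map/reduce stand in for the job-specific classes used below
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(IdentityReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // submit synchronously; throws IOException if the job fails
        JobClient.runJob(job);
      }
    }

The excerpts that follow differ mainly in the input/output formats, the
mapper/reducer classes, and the job-specific properties they set.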


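    // From the Loops job (WebGraph link-cycle detection): an initializer, a
    // depth-bounded looper, and a finalizer run as three chained NutchJobs.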
    Path routes = new Path(webGraphDb, ROUTES_DIR);
    Path tempRoute = new Path(webGraphDb, ROUTES_DIR + "-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // run the initializer
    JobConf init = new NutchJob(conf);
    init.setJobName("Initializer: " + webGraphDb);
    FileInputFormat.addInputPath(init, outlinkDb);
    FileInputFormat.addInputPath(init, nodeDb);
    init.setInputFormat(SequenceFileInputFormat.class);
    init.setMapperClass(Initializer.class);
    init.setReducerClass(Initializer.class);
    init.setMapOutputKeyClass(Text.class);
    init.setMapOutputValueClass(ObjectWritable.class);
    init.setOutputKeyClass(Text.class);
    init.setOutputValueClass(Route.class);
    FileOutputFormat.setOutputPath(init, tempRoute);
    init.setOutputFormat(SequenceFileOutputFormat.class);

    try {
      LOG.info("Loops: starting initializer");
      JobClient.runJob(init);
      LOG.info("Loops: installing initializer " + routes);
      FSUtils.replace(fs, routes, tempRoute, true);
      LOG.info("Loops: finished initializer");
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

    // run the looper job up to a maximum depth (default 2), which will find
    // link-loop cycles of up to 3 links
    int depth = conf.getInt("link.loops.depth", 2);
    for (int i = 0; i < depth; i++) {

      JobConf looper = new NutchJob(conf);
      looper.setJobName("Looper: " + (i + 1) + " of " + depth);
      FileInputFormat.addInputPath(looper, outlinkDb);
      FileInputFormat.addInputPath(looper, routes);
      looper.setInputFormat(SequenceFileInputFormat.class);
      looper.setMapperClass(Looper.class);
      looper.setReducerClass(Looper.class);
      looper.setMapOutputKeyClass(Text.class);
      looper.setMapOutputValueClass(ObjectWritable.class);
      looper.setOutputKeyClass(Text.class);
      looper.setOutputValueClass(Route.class);
      FileOutputFormat.setOutputPath(looper, tempRoute);
      looper.setOutputFormat(SequenceFileOutputFormat.class);
      looper.setBoolean("last", i == (depth - 1));

      try {
        LOG.info("Loops: starting looper");
        JobClient.runJob(looper);
        LOG.info("Loops: installing looper " + routes);
        FSUtils.replace(fs, routes, tempRoute, true);
        LOG.info("Loops: finished looper");
      }
      catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
      }
    }

    // run the finalizer
    JobConf finalizer = new NutchJob(conf);
    finalizer.setJobName("Finalizer: " + webGraphDb);
    FileInputFormat.addInputPath(finalizer, routes);
    finalizer.setInputFormat(SequenceFileInputFormat.class);
    finalizer.setMapperClass(Finalizer.class);
    finalizer.setReducerClass(Finalizer.class);
    finalizer.setMapOutputKeyClass(Text.class);
    finalizer.setMapOutputValueClass(Route.class);
    finalizer.setOutputKeyClass(Text.class);
    finalizer.setOutputValueClass(LoopSet.class);
    FileOutputFormat.setOutputPath(finalizer, new Path(webGraphDb, LOOPS_DIR));
    finalizer.setOutputFormat(MapFileOutputFormat.class);

    try {
      LOG.info("Loops: starting finalizer");
      JobClient.runJob(finalizer);
      LOG.info("Loops: finished finalizer");
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }


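        // From the deduplication job: map/reduce the current crawldb into a
        // temp dir, then read the job counters for deduplication statistics.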
        Path tempDir = new Path(getConf().get("mapred.temp.dir", ".")
                + "/dedup-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        JobConf job = new NutchJob(getConf());
       
        job.setJobName("Deduplication on "+crawldb);

        FileInputFormat.addInputPath(job, new Path(crawldb,
                CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        job.setMapOutputKeyClass(BytesWritable.class);
        job.setMapOutputValueClass(CrawlDatum.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);

        job.setMapperClass(DBFilter.class);
        job.setReducerClass(DedupReducer.class);

        try {
            RunningJob rj = JobClient.runJob(job);
            Group g = rj.getCounters().getGroup("DeduplicationJobStatus");
            if (g != null){
                // ...

    throws IOException {

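    // LinkRank counter: a single-reducer job that counts the nodes in the
    // WebGraph node database and writes the total to a single text file.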
    // configure the counter job
    Path numLinksPath = new Path(webGraphDb, NUM_NODES);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    JobConf counter = new NutchJob(getConf());
    counter.setJobName("LinkRank Counter");
    FileInputFormat.addInputPath(counter, nodeDb);
    FileOutputFormat.setOutputPath(counter, numLinksPath);
    counter.setInputFormat(SequenceFileInputFormat.class);
    counter.setMapperClass(Counter.class);
    counter.setCombinerClass(Counter.class);
    counter.setReducerClass(Counter.class);
    counter.setMapOutputKeyClass(Text.class);
    counter.setMapOutputValueClass(LongWritable.class);
    counter.setOutputKeyClass(Text.class);
    counter.setOutputValueClass(LongWritable.class);
    counter.setNumReduceTasks(1);
    counter.setOutputFormat(TextOutputFormat.class);
    counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the counter job, which outputs to a single reduce task and file
    LOG.info("Starting link counter job");
    try {
      JobClient.runJob(counter);
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

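  // LinkAnalysis initializer: maps the node database into the initial scoring
  // output (no custom reducer is set).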
  private void runInitializer(Path nodeDb, Path output)
    throws IOException {

    // configure the initializer
    JobConf initializer = new NutchJob(getConf());
    initializer.setJobName("LinkAnalysis Initializer");
    FileInputFormat.addInputPath(initializer, nodeDb);
    FileOutputFormat.setOutputPath(initializer, output);
    initializer.setInputFormat(SequenceFileInputFormat.class);
    initializer.setMapperClass(Initializer.class);
    initializer.setMapOutputKeyClass(Text.class);
    initializer.setMapOutputValueClass(Node.class);
    initializer.setOutputKeyClass(Text.class);
    initializer.setOutputValueClass(Node.class);
    initializer.setOutputFormat(MapFileOutputFormat.class);
    initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the initializer
    LOG.info("Starting initialization job");
    try {
      JobClient.runJob(initializer);
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

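  // LinkAnalysis inverter: joins the node and outlink databases (plus an
  // optional loop database) and inverts the outlinks into LinkDatum records.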
  private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output)
    throws IOException {

    // configure the inverter
    JobConf inverter = new NutchJob(getConf());
    inverter.setJobName("LinkAnalysis Inverter");
    FileInputFormat.addInputPath(inverter, nodeDb);
    FileInputFormat.addInputPath(inverter, outlinkDb);

    // add the loop database if it exists (i.e. is not null)
    if (loopDb != null) {
      FileInputFormat.addInputPath(inverter, loopDb);
    }
    FileOutputFormat.setOutputPath(inverter, output);
    inverter.setInputFormat(SequenceFileInputFormat.class);
    inverter.setMapperClass(Inverter.class);
    inverter.setReducerClass(Inverter.class);
    inverter.setMapOutputKeyClass(Text.class);
    inverter.setMapOutputValueClass(ObjectWritable.class);
    inverter.setOutputKeyClass(Text.class);
    inverter.setOutputValueClass(LinkDatum.class);
    inverter.setOutputFormat(SequenceFileOutputFormat.class);
    inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the inverter job
    LOG.info("Starting inverter job");
    try {
      JobClient.runJob(inverter);
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

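  // LinkAnalysis analyzer: runs one scoring iteration over the node database
  // and the inverted links produced by the inverter job.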
  private void runAnalysis(Path nodeDb, Path inverted, Path output,
    int iteration, int numIterations, float rankOne)
    throws IOException {

    JobConf analyzer = new NutchJob(getConf());
    analyzer.set("link.analyze.iteration", String.valueOf(iteration + 1));
    analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1)
      + " of " + numIterations);
    FileInputFormat.addInputPath(analyzer, nodeDb);
    FileInputFormat.addInputPath(analyzer, inverted);
    FileOutputFormat.setOutputPath(analyzer, output);
    analyzer.set("link.analyze.rank.one", String.valueOf(rankOne));
    analyzer.setMapOutputKeyClass(Text.class);
    analyzer.setMapOutputValueClass(ObjectWritable.class);
    analyzer.setInputFormat(SequenceFileInputFormat.class);
    analyzer.setMapperClass(Analyzer.class);
    analyzer.setReducerClass(Analyzer.class);
    analyzer.setOutputKeyClass(Text.class);
    analyzer.setOutputValueClass(Node.class);
    analyzer.setOutputFormat(MapFileOutputFormat.class);
    analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    LOG.info("Starting analysis job");
    try {
      JobClient.runJob(analyzer);
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

      LOG.info("LinkDb dump: starting at " + sdf.format(start));
      LOG.info("LinkDb dump: db: " + linkdb);
    }
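    // LinkDb dump: reads the current linkdb and writes each URL's Inlinks as text.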
    Path outFolder = new Path(output);

    JobConf job = new NutchJob(getConf());
    job.setJobName("read " + linkdb);

    FileInputFormat.addInputPath(job, new Path(linkdb, LinkDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("LinkDb dump: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

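  // CrawlDb: builds (but does not run) the update job that writes a new crawl
  // database alongside the current one.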
  public static JobConf createJob(Configuration config, Path crawlDb)
    throws IOException {
    Path newCrawlDb =
      new Path(crawlDb,
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("crawldb " + crawlDb);

    Path current = new Path(crawlDb, CURRENT_NAME);
    if (FileSystem.get(job).exists(current)) {
      FileInputFormat.addInputPath(job, current);
    }
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbFilter.class);
    job.setReducerClass(CrawlDbReducer.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    // https://issues.apache.org/jira/browse/NUTCH-1110
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    return job;
  }

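    // ScoreUpdater: pushes link scores from the WebGraph node database back
    // into the crawl database.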
    Path crawlDbCurrent = new Path(crawlDb, CrawlDb.CURRENT_NAME);
    Path newCrawlDb = new Path(crawlDb,
      Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // run the updater job outputting to the temp crawl database
    JobConf updater = new NutchJob(conf);
    updater.setJobName("Update CrawlDb from WebGraph");
    FileInputFormat.addInputPath(updater, crawlDbCurrent);
    FileInputFormat.addInputPath(updater, nodeDb);
    FileOutputFormat.setOutputPath(updater, newCrawlDb);
    updater.setInputFormat(SequenceFileInputFormat.class);
    updater.setMapperClass(ScoreUpdater.class);
    updater.setReducerClass(ScoreUpdater.class);
    updater.setMapOutputKeyClass(Text.class);
    updater.setMapOutputValueClass(ObjectWritable.class);
    updater.setOutputKeyClass(Text.class);
    updater.setOutputValueClass(CrawlDatum.class);
    updater.setOutputFormat(MapFileOutputFormat.class);

    try {
      JobClient.runJob(updater);
    }
    catch (IOException e) {
      // ...
    }

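    // NodeDumper: dumps node scores, inlinks, or outlinks from the WebGraph,
    // optionally aggregated by host or domain and limited to the top N entries.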
    long start = System.currentTimeMillis();
    LOG.info("NodeDumper: starting at " + sdf.format(start));
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Configuration conf = getConf();

    JobConf dumper = new NutchJob(conf);
    dumper.setJobName("NodeDumper: " + webGraphDb);
    FileInputFormat.addInputPath(dumper, nodeDb);
    dumper.setInputFormat(SequenceFileInputFormat.class);

    if (nameType == null) {
      dumper.setMapperClass(Sorter.class);
      dumper.setReducerClass(Sorter.class);
      dumper.setMapOutputKeyClass(FloatWritable.class);
      dumper.setMapOutputValueClass(Text.class);
    } else {
      dumper.setMapperClass(Dumper.class);
      dumper.setReducerClass(Dumper.class);
      dumper.setMapOutputKeyClass(Text.class);
      dumper.setMapOutputValueClass(FloatWritable.class);
    }

    dumper.setOutputKeyClass(Text.class);
    dumper.setOutputValueClass(FloatWritable.class);
    FileOutputFormat.setOutputPath(dumper, output);

    if (asSequenceFile) {
      dumper.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
      dumper.setOutputFormat(TextOutputFormat.class);
    }

    dumper.setNumReduceTasks(1);
    dumper.setBoolean("inlinks", type == DumpType.INLINKS);
    dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
    dumper.setBoolean("scores", type == DumpType.SCORES);

    dumper.setBoolean("host", nameType == NameType.HOST);
    dumper.setBoolean("domain", nameType == NameType.DOMAIN);
    dumper.setBoolean("sum", aggrType == AggrType.SUM);
    dumper.setBoolean("max", aggrType == AggrType.MAX);

    dumper.setLong("topn", topN);

    // Set equals-sign as separator for Solr's ExternalFileField
    if (asEff) {
      dumper.set("mapred.textoutputformat.separator", "=");
    }

    try {
      LOG.info("NodeDumper: running");
      JobClient.runJob(dumper);
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
