Examples of NutchJob
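
Every snippet below follows the same pattern: build a NutchJob (a JobConf wrapping a Nutch Configuration, which loads nutch-default.xml and nutch-site.xml), wire up input/output formats, mapper/reducer classes, and key/value types, then submit with JobClient.runJob(). Here is a minimal sketch of that pattern, assuming the old org.apache.hadoop.mapred API these examples target; the paths and the identity map/reduce are illustrative, not taken from any one snippet:

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileOutputFormat;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.SequenceFileInputFormat;
    import org.apache.hadoop.mapred.TextOutputFormat;
    import org.apache.nutch.util.NutchConfiguration;
    import org.apache.nutch.util.NutchJob;

    public class NutchJobSketch {
      public static void main(String[] args) throws Exception {
        // NutchJob wraps the Nutch Configuration, so nutch-default.xml and
        // nutch-site.xml settings are visible to the MapReduce job.
        JobConf job = new NutchJob(NutchConfiguration.create());
        job.setJobName("nutchjob-sketch");

        // assumes args[0] is a SequenceFile of Text keys and Text values
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormat(SequenceFileInputFormat.class);

        // no mapper/reducer set: Hadoop falls back to identity map/reduce
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        JobClient.runJob(job); // submit and block until completion
      }
    }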


Examples of org.apache.nutch.util.NutchJob
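
From CrawlDbReader's topN, judging by the mapper and reducer classes: the first job maps CrawlDb entries to (score, URL) pairs in a temp directory, encoding the float minimum score as a long since JobConf has no setFloat(); the second job then collects the top N entries into a single text file (one reduce task).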

    Path tempDir =
      new Path(config.get("mapred.temp.dir", ".") +
               "/readdb-topN-temp-"+
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("topN prepare " + crawlDb);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbTopNMapper.class);
    job.setReducerClass(IdentityReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    // XXX hmmm, no setFloat() in the API ... :(
    job.setLong("db.reader.topn.min", Math.round(1000000.0 * min));
    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb topN: collecting topN scores.");
    }
    job = new NutchJob(config);
    job.setJobName("topN collect " + crawlDb);
    job.setLong("db.reader.topn", topN);

    FileInputFormat.addInputPath(job, tempDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(CrawlDbTopNReducer.class);

    FileOutputFormat.setOutputPath(job, outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(1); // create a single file.

    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(config);
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) { LOG.info("CrawlDb topN: done"); }

Examples of org.apache.nutch.util.NutchJob
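
From IndexingJob: IndexerMapReduce.initMRJob() wires the crawl db, link db, and segments into the job; the delete/filter/normalize options are passed as booleans, speculative reduce execution is disabled, and the FileOutputFormat path is a throwaway temp directory, since documents actually go to the configured IndexWriters.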

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("Indexer: starting at " + sdf.format(start));

        final JobConf job = new NutchJob(getConf());
        job.setJobName("Indexer");

        LOG.info("Indexer: deleting gone documents: " + deleteGone);
        LOG.info("Indexer: URL filtering: " + filter);
        LOG.info("Indexer: URL normalizing: " + normalize);  
       
        IndexWriters writers = new IndexWriters(getConf());
        LOG.info(writers.describe());

        IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

        // NOW PASSED ON THE COMMAND LINE AS A HADOOP PARAM
        // job.set(SolrConstants.SERVER_URL, solrUrl);

        job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
        job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
        job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);

        if (params != null) {
            job.set(IndexerMapReduce.INDEXER_PARAMS, params);
        }

        job.setReduceSpeculativeExecution(false);

        final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-"
                + new Random().nextInt());

        FileOutputFormat.setOutputPath(job, tmp);

Examples of org.apache.nutch.util.NutchJob
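
A unit test for PluginRepository caching: a repository fetched via a NutchJob wrapping the same Configuration must be the same cached instance, while a freshly built Configuration (one without the cache UUID) must yield a new one.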

  @Test
  public void testRepositoryCache() {
    Configuration config = NutchConfiguration.create();
    PluginRepository repo = PluginRepository.get(config);
    JobConf job = new NutchJob(config);
    PluginRepository repo1 = PluginRepository.get(job);
    Assert.assertTrue(repo == repo1);
    // now construct a config without UUID
    config = new Configuration();
    config.addResource("nutch-default.xml");
    config.addResource("nutch-site.xml");
    repo = PluginRepository.get(config);
    job = new NutchJob(config);
    repo1 = PluginRepository.get(job);
    Assert.assertTrue(repo1 != repo);
  }

Examples of org.apache.nutch.util.NutchJob
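
From SegmentReader (the snippet opens with the tail of the previous method): createJobConf() builds a NutchJob and copies in the boolean flags that select which segment subdirectories to read.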

      LOG.error("IOException:", e);
    }
  }

  private JobConf createJobConf() {
    JobConf job = new NutchJob(getConf());
    job.setBoolean("segment.reader.co", this.co);
    job.setBoolean("segment.reader.fe", this.fe);
    job.setBoolean("segment.reader.ge", this.ge);
    job.setBoolean("segment.reader.pa", this.pa);
    job.setBoolean("segment.reader.pd", this.pd);
    job.setBoolean("segment.reader.pt", this.pt);
    return job;
  }

Examples of org.apache.nutch.util.NutchJob
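
From WebGraph: three chained jobs build the graph databases. The outlink job merges segment parse data (plus fetch data, when gone links are deleted) with the existing outlink db; the inlink job inverts the outlinks; the node job aggregates both into per-URL Node records. Each job writes to a temp directory that is swapped into place with FSUtils.replace, and the lock file and temp directory are cleaned up on failure.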

      fs.mkdirs(outlinkDb);
    }

    Path tempOutlinkDb = new Path(outlinkDb + "-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf outlinkJob = new NutchJob(conf);
    outlinkJob.setJobName("Outlinkdb: " + outlinkDb);

    boolean deleteGone = conf.getBoolean("link.delete.gone", false);
    boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);

    if (deleteGone) {
      LOG.info("OutlinkDb: deleting gone links");
    }

    // get the parse data and crawl fetch data for all segments
    if (segments != null) {
      for (int i = 0; i < segments.length; i++) {
        Path parseData = new Path(segments[i], ParseData.DIR_NAME);
        if (fs.exists(parseData)) {
          LOG.info("OutlinkDb: adding input: " + parseData);
          FileInputFormat.addInputPath(outlinkJob, parseData);
        }

        if (deleteGone) {
          Path crawlFetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
          if (fs.exists(crawlFetch)) {
            LOG.info("OutlinkDb: adding input: " + crawlFetch);
            FileInputFormat.addInputPath(outlinkJob, crawlFetch);
          }
        }
      }
    }

    // add the existing webgraph
    LOG.info("OutlinkDb: adding input: " + outlinkDb);
    FileInputFormat.addInputPath(outlinkJob, outlinkDb);

    outlinkJob.setBoolean(OutlinkDb.URL_NORMALIZING, normalize);
    outlinkJob.setBoolean(OutlinkDb.URL_FILTERING, filter);

    outlinkJob.setInputFormat(SequenceFileInputFormat.class);
    outlinkJob.setMapperClass(OutlinkDb.class);
    outlinkJob.setReducerClass(OutlinkDb.class);
    outlinkJob.setMapOutputKeyClass(Text.class);
    outlinkJob.setMapOutputValueClass(NutchWritable.class);
    outlinkJob.setOutputKeyClass(Text.class);
    outlinkJob.setOutputValueClass(LinkDatum.class);
    FileOutputFormat.setOutputPath(outlinkJob, tempOutlinkDb);
    outlinkJob.setOutputFormat(MapFileOutputFormat.class);
    outlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the outlinkdb job and replace any old outlinkdb with the new one
    try {
      LOG.info("OutlinkDb: running");
      JobClient.runJob(outlinkJob);
      LOG.info("OutlinkDb: installing " + outlinkDb);
      FSUtils.replace(fs, oldOutlinkDb, outlinkDb, true);
      FSUtils.replace(fs, outlinkDb, tempOutlinkDb, true);
      if (!preserveBackup && fs.exists(oldOutlinkDb)) fs.delete(oldOutlinkDb, true);
      LOG.info("OutlinkDb: finished");
    }
    catch (IOException e) {
     
      // remove lock file and temporary directory if an error occurs
      LockUtil.removeLockFile(fs, lock);
      if (fs.exists(tempOutlinkDb)) {
        fs.delete(tempOutlinkDb, true);
      }
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

    // inlink and temp link database paths
    Path inlinkDb = new Path(webGraphDb, INLINK_DIR);
    Path tempInlinkDb = new Path(inlinkDb + "-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf inlinkJob = new NutchJob(conf);
    inlinkJob.setJobName("Inlinkdb " + inlinkDb);
    LOG.info("InlinkDb: adding input: " + outlinkDb);
    FileInputFormat.addInputPath(inlinkJob, outlinkDb);
    inlinkJob.setInputFormat(SequenceFileInputFormat.class);
    inlinkJob.setMapperClass(InlinkDb.class);
    inlinkJob.setMapOutputKeyClass(Text.class);
    inlinkJob.setMapOutputValueClass(LinkDatum.class);
    inlinkJob.setOutputKeyClass(Text.class);
    inlinkJob.setOutputValueClass(LinkDatum.class);
    FileOutputFormat.setOutputPath(inlinkJob, tempInlinkDb);
    inlinkJob.setOutputFormat(MapFileOutputFormat.class);
    inlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    try {
     
      // run the inlink and replace any old with new
      LOG.info("InlinkDb: running");
      JobClient.runJob(inlinkJob);
      LOG.info("InlinkDb: installing " + inlinkDb);
      FSUtils.replace(fs, inlinkDb, tempInlinkDb, true);
      LOG.info("InlinkDb: finished");
    }
    catch (IOException e) {
     
      // remove lock file and temporary directory if an error occurs
      LockUtil.removeLockFile(fs, lock);
      if (fs.exists(tempInlinkDb)) {
        fs.delete(tempInlinkDb, true);
      }     
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }

    // node and temp node database paths
    Path nodeDb = new Path(webGraphDb, NODE_DIR);
    Path tempNodeDb = new Path(nodeDb + "-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf nodeJob = new NutchJob(conf);
    nodeJob.setJobName("NodeDb " + nodeDb);
    LOG.info("NodeDb: adding input: " + outlinkDb);
    LOG.info("NodeDb: adding input: " + inlinkDb);
    FileInputFormat.addInputPath(nodeJob, outlinkDb);
    FileInputFormat.addInputPath(nodeJob, inlinkDb);
    nodeJob.setInputFormat(SequenceFileInputFormat.class);
    nodeJob.setReducerClass(NodeDb.class);
    nodeJob.setMapOutputKeyClass(Text.class);
    nodeJob.setMapOutputValueClass(LinkDatum.class);
    nodeJob.setOutputKeyClass(Text.class);
    nodeJob.setOutputValueClass(Node.class);
    FileOutputFormat.setOutputPath(nodeJob, tempNodeDb);
    nodeJob.setOutputFormat(MapFileOutputFormat.class);
    nodeJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    try {
     
      // run the node job and replace old nodedb with new
      LOG.info("NodeDb: running");

Examples of org.apache.nutch.util.NutchJob
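
CleaningJob.delete(): reads the CrawlDb, selects entries with DBFilter, and removes the matching documents from the index in DeleterReducer; NullOutputFormat is used because the job produces no file output.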

    public void delete(String crawldb, boolean noCommit) throws IOException {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("CleaningJob: starting at " + sdf.format(start));

        JobConf job = new NutchJob(getConf());

        FileInputFormat.addInputPath(job, new Path(crawldb,
                CrawlDb.CURRENT_NAME));
        job.setBoolean("noCommit", noCommit);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(NullOutputFormat.class);
        job.setMapOutputKeyClass(ByteWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setMapperClass(DBFilter.class);
        job.setReducerClass(DeleterReducer.class);
       
        job.setJobName("CleaningJob");

        // need to explicitly allow deletions
        job.setBoolean(IndexerMapReduce.INDEXER_DELETE, true);

        JobClient.runJob(job);

        long end = System.currentTimeMillis();
        LOG.info("CleaningJob: finished at " + sdf.format(end) + ", elapsed: "

Examples of org.apache.nutch.util.NutchJob
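
A convenience overload of Generator.generate() that reads the filter and normalise defaults from a fresh NutchJob before delegating to the full method.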

  }

  public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime)
      throws IOException {

    JobConf job = new NutchJob(getConf());
    boolean filter = job.getBoolean(GENERATOR_FILTER, true);
    boolean normalise = job.getBoolean(GENERATOR_NORMALISE, true);
    return generate(dbDir, segments, numLists, topN, curTime, filter, normalise, false, 1);
  }

Examples of org.apache.nutch.util.NutchJob
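
The core of Generator's select phase: a Selector map/partition/reduce job sorts fetch candidates by decreasing score into a temp directory; the resulting fetchlist-* subdirectories are then partitioned into segments, and, if GENERATE_UPDATE_CRAWLDB is set, a second job folds the generate timestamps back into the CrawlDb.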

    if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))){
      LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
      numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
      // override
      LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
      numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
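    // note: this second setOutputFormat() call overrides the
    // SequenceFileOutputFormat set a few lines above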
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
      JobClient.runJob(job);
    } catch (IOException e) {
      LockUtil.removeLockFile(fs, lock);
      fs.delete(tempDir, true);
      throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
      for (FileStatus stat : status) {
        Path subfetchlist = stat.getPath();
        if (!subfetchlist.getName().startsWith("fetchlist-")) continue;
        // start a new partition job for this segment
        Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
        generatedSegments.add(newSeg);
      }
    } catch (Exception e) {
      LOG.warn("Generator: exception while partitioning segments, exiting ...");
      fs.delete(tempDir, true);
      return null;
    }

    if (generatedSegments.size() == 0) {
      LOG.warn("Generator: 0 records selected for fetching, exiting ...");
      LockUtil.removeLockFile(fs, lock);
      fs.delete(tempDir, true);
      return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
      // update the db from tempDir
      Path tempDir2 = new Path(getConf().get("mapred.temp.dir", ".") + "/generate-temp-"
          + java.util.UUID.randomUUID().toString());

      job = new NutchJob(getConf());
      job.setJobName("generate: updatedb " + dbDir);
      job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
      for (Path segmpaths : generatedSegments) {
        Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
        FileInputFormat.addInputPath(job, subGenDir);
      }
      FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
      job.setInputFormat(SequenceFileInputFormat.class);
      job.setMapperClass(CrawlDbUpdater.class);
      job.setReducerClass(CrawlDbUpdater.class);
      job.setOutputFormat(MapFileOutputFormat.class);
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(CrawlDatum.class);
      FileOutputFormat.setOutputPath(job, tempDir2);
      try {
        JobClient.runJob(job);
        CrawlDb.install(job, dbDir);
      } catch (IOException e) {

Examples of org.apache.nutch.util.NutchJob
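
Generator.partitionSegment(): inverts the selector entries back to URL keys and partitions them with URLPartitioner into numLists fetch lists, one per reduce task, writing the new segment's crawl_generate directory.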

    Path segment = new Path(segmentsDir, generateSegmentName());
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);

    LOG.info("Generator: segment: " + segment);

    NutchJob job = new NutchJob(getConf());
    job.setJobName("generate: partition " + segment);

    job.setInt("partition.url.seed", new Random().nextInt());

    FileInputFormat.addInputPath(job, inputDir);
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(SelectorInverseMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(SelectorEntry.class);
    job.setPartitionerClass(URLPartitioner.class);
    job.setReducerClass(PartitionReducer.class);
    job.setNumReduceTasks(numLists);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    JobClient.runJob(job);
    return segment;
  }

Examples of org.apache.nutch.util.NutchJob
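
From Injector: the sort job parses a directory of seed URLs into CrawlDatum entries. If a CrawlDb already exists, it emits a plain SequenceFile (map-only) for a later merge job; otherwise it reduces straight into a MapFileOutputFormat crawl db.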

    FileSystem fs = FileSystem.get(getConf());
    // determine if the crawldb already exists
    boolean dbExists = fs.exists(crawlDb);

    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    FileInputFormat.addInputPath(sortJob, urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    if (dbExists) {
      // Don't run merge injected urls, wait for merge with
      // existing DB
      sortJob.setOutputFormat(SequenceFileOutputFormat.class);
      sortJob.setNumReduceTasks(0);
    } else {
      sortJob.setOutputFormat(MapFileOutputFormat.class);
      sortJob.setReducerClass(InjectReducer.class);
      sortJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
          false);
    }
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());

    RunningJob mapJob = null;
    try {
      mapJob = JobClient.runJob(sortJob);
    } catch (IOException e) {