Examples of NutchJob
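
The snippets below show how org.apache.nutch.util.NutchJob is used across Nutch jobs. The Nutch 2.x examples construct it from a Configuration plus a job name and run it with waitForCompletion, while the Nutch 1.x examples use it as an old-API JobConf submitted through JobClient.runJob. As a minimal sketch of the JobConf flavour (the class name, paths, job name and Text/Text key/value types here are placeholders, not taken from any example below):

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

public class NutchJobSketch {

  // Runs a bare pass-through job: with no mapper or reducer set, the old
  // mapred API defaults to identity classes that simply copy the records.
  public static void run(Path input, Path output) throws IOException {
    // In Nutch 1.x, NutchJob extends org.apache.hadoop.mapred.JobConf.
    JobConf job = new NutchJob(NutchConfiguration.create());
    job.setJobName("nutchjob-sketch " + output);

    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // Assumes the input sequence files hold Text/Text pairs.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Submit synchronously, as the JobConf-based examples below do.
    JobClient.runJob(job);
  }
}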


Examples of org.apache.nutch.util.NutchJob

      getConf().set(GENERATOR_COUNT_MODE, GENERATOR_COUNT_VALUE_HOST);
      getConf().set(URLPartitioner.PARTITION_MODE_KEY, URLPartitioner.PARTITION_MODE_HOST);
    }
    numJobs = 1;
    currentJobNum = 0;
    currentJob = new NutchJob(getConf(), "generate: " + batchId);
    StorageUtils.initMapperJob(currentJob, FIELDS, SelectorEntry.class,
        WebPage.class, GeneratorMapper.class, SelectorEntryPartitioner.class, true);
    StorageUtils.initReducerJob(currentJob, GeneratorReducer.class);
    currentJob.waitForCompletion(true);
    ToolUtil.recordJobStatus(null, currentJob, results);
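
In this generate example the counting mode and URL partitioning are first switched to per-host, then a NutchJob named after the batch id is created from the tool's configuration. StorageUtils wires GeneratorMapper, SelectorEntryPartitioner and GeneratorReducer over SelectorEntry/WebPage records, the job runs synchronously, and its outcome is recorded with ToolUtil.recordJobStatus.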

Examples of org.apache.nutch.util.NutchJob

   
  public Map<String,Object> run(Map<String,Object> args) throws Exception {
    String crawlId = (String)args.get(Nutch.ARG_CRAWL);
    numJobs = 1;
    currentJobNum = 0;
    currentJob = new NutchJob(getConf(), "update-table");
    if (crawlId != null) {
      currentJob.getConfiguration().set(Nutch.CRAWL_ID_KEY, crawlId);
    }
    //job.setBoolean(ALL, updateAll);
    ScoringFilters scoringFilters = new ScoringFilters(getConf());
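
This run(Map) method builds the "update-table" NutchJob; when a crawl id is supplied in the arguments it is copied into the job configuration under Nutch.CRAWL_ID_KEY before the scoring filters are instantiated.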

Examples of org.apache.nutch.util.NutchJob

      input = new Path(path.toString());
    }
    numJobs = 2;
    currentJobNum = 0;
    status.put(Nutch.STAT_PHASE, "convert input");
    currentJob = new NutchJob(getConf(), "inject-p1 " + input);
    FileInputFormat.addInputPath(currentJob, input);
    currentJob.setMapperClass(UrlMapper.class);
    currentJob.setMapOutputKeyClass(String.class);
    currentJob.setMapOutputValueClass(WebPage.class);
    currentJob.setOutputFormatClass(GoraOutputFormat.class);
    DataStore<String, WebPage> store = StorageUtils.createWebStore(currentJob.getConfiguration(),
        String.class, WebPage.class);
    GoraOutputFormat.setOutput(currentJob, store, true);
    currentJob.setReducerClass(Reducer.class);
    currentJob.setNumReduceTasks(0);
    currentJob.waitForCompletion(true);
    ToolUtil.recordJobStatus(null, currentJob, results);
    currentJob = null;

    status.put(Nutch.STAT_PHASE, "merge input with db");
    status.put(Nutch.STAT_PROGRESS, 0.5f);
    currentJobNum = 1;
    currentJob = new NutchJob(getConf(), "inject-p2 " + input);
    StorageUtils.initMapperJob(currentJob, FIELDS, String.class,
        WebPage.class, InjectorMapper.class);
    currentJob.setNumReduceTasks(0);
    ToolUtil.recordJobStatus(null, currentJob, results);
    status.put(Nutch.STAT_PROGRESS, 1.0f);
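
Injection runs as two chained NutchJobs: "inject-p1" converts the seed input into WebPage records with UrlMapper and writes them through GoraOutputFormat with zero reduce tasks, and "inject-p2" merges the converted input with the existing store via InjectorMapper, while the tool's status map tracks the current phase and progress.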

Examples of org.apache.nutch.util.NutchJob

     
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("ElasticsearchIndexer: starting at " + sdf.format(start));

        final JobConf job = new NutchJob(getConf());
        job.setJobName("index-elasticsearch " + elasticsearchUrl);

        IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

        job.set(ElasticsearchConstants.SERVER_URL, elasticsearchUrl);
        job.set(ElasticsearchConstants.SERVER_PORT, elasticsearchPort);

        NutchIndexWriterFactory.addClassToConf(job, ElasticsearchWriter.class);

        job.setReduceSpeculativeExecution(false);

        final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" +
                             new Random().nextInt());

        FileOutputFormat.setOutputPath(job, tmp);
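
The Elasticsearch indexer uses the old-API form of NutchJob as a JobConf: IndexerMapReduce.initMRJob adds the crawldb, linkdb and segments, the server URL and port are passed through the configuration, ElasticsearchWriter is registered via NutchIndexWriterFactory, reduce-side speculative execution is disabled, and output goes to a throwaway temporary path.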

Examples of org.apache.nutch.util.NutchJob

   */
  private void runExtractor(Path nodeDb, Path segment, Path outputDir)
    throws IOException {

    LOG.info("BasicFields: starting extractor");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);

    LOG.info("BasicFields: extractor adding segment: " + segment);
    FileInputFormat.addInputPath(job, new Path(segment,
      CrawlDatum.FETCH_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
    FileInputFormat.addInputPath(job, nodeDb);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Extractor.class);
    job.setReducerClass(Extractor.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("BasicFields: finished extractor");
    }
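
runExtractor feeds the segment's fetch, parse-data and parse-text directories plus the node database into a single sequence-file job, with Extractor acting as both mapper and reducer and FieldsWritable as the final value type.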

Examples of org.apache.nutch.util.NutchJob

   */
  private void runFlipper(Path basicFields, Path nodeDb, Path outputDir)
    throws IOException {

    LOG.info("BasicFields: starting flipper");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    FileInputFormat.addInputPath(job, nodeDb);
    FileInputFormat.addInputPath(job, basicFields);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Flipper.class);
    job.setReducerClass(Flipper.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LinkDatum.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("BasicFields: finished flipper");
    }
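
runFlipper follows the same pattern with the node database and the previously extracted basic fields as inputs; Flipper runs on both sides of the shuffle and the job emits LinkDatum values keyed by Text.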

Examples of org.apache.nutch.util.NutchJob

   */
  private void runScorer(Path basicFields, Path links, Path outputDir)
    throws IOException {

    LOG.info("BasicFields: starting scorer");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    FileInputFormat.addInputPath(job, links);
    FileInputFormat.addInputPath(job, basicFields);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Scorer.class);
    job.setReducerClass(Scorer.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("BasicFields: finished scorer");
    }
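
runScorer joins the link data with the basic fields through the Scorer class and writes FieldsWritable output, again submitted synchronously with JobClient.runJob.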

Examples of org.apache.nutch.util.NutchJob

   */
  private void runMerger(Path[] basicFields, Path outputDir)
    throws IOException {

    LOG.info("BasicFields: starting merger");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    for (Path basic : basicFields) {
      FileInputFormat.addInputPath(job, basic);
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setReducerClass(Merger.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("BasicFields: finished merger");
    }
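
runMerger sets no mapper, so the old API's identity mapper simply forwards each basic-fields input to the Merger reducer, which collapses them into a single FieldsWritable output.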

Examples of org.apache.nutch.util.NutchJob

  throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Indexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-lucene " + luceneDir);

    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

    FileOutputFormat.setOutputPath(job, luceneDir);
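
The Lucene indexer mirrors the Elasticsearch example: IndexerMapReduce.initMRJob wires the crawldb, linkdb and segments, and the Lucene directory becomes the job's output path.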

Examples of org.apache.nutch.util.NutchJob

   * @throws IOException If an error occurs while running the extractor.
   */
  private void runExtractor(Path webGraphDb, Path output)
    throws IOException {

    JobConf extractor = new NutchJob(getConf());
    extractor.setJobName("AnchorFields Extractor");
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb,
      WebGraph.OUTLINK_DIR));
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb,
      WebGraph.NODE_DIR));
    FileOutputFormat.setOutputPath(extractor, output);
    extractor.setInputFormat(SequenceFileInputFormat.class);
    extractor.setMapperClass(Extractor.class);
    extractor.setReducerClass(Extractor.class);
    extractor.setMapOutputKeyClass(Text.class);
    extractor.setMapOutputValueClass(ObjectWritable.class);
    extractor.setOutputKeyClass(Text.class);
    extractor.setOutputValueClass(LinkDatum.class);
    extractor.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting extractor job");
    try {
      JobClient.runJob(extractor);
    }
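
The AnchorFields extractor reads the WebGraph's outlink and node directories, runs Extractor as both mapper and reducer to produce LinkDatum values, and submits the job inside a try block; the matching catch clause lies beyond the end of this excerpt.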