Examples of NutchJob
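
Every snippet below follows the same recipe: org.apache.nutch.util.NutchJob extends Hadoop's old-API JobConf and pre-loads the Nutch configuration; the caller then wires up input/output paths, formats, and mapper/reducer classes, and JobClient.runJob submits the result. A minimal, self-contained sketch of that recipe (the class name, job name, and paths are placeholders, not taken from any one tool below):

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

public class NutchJobSketch {
  public static void main(String[] args) throws IOException {
    // JobConf seeded with nutch-default.xml / nutch-site.xml
    JobConf job = new NutchJob(NutchConfiguration.create());
    job.setJobName("nutchjob-sketch");
    FileInputFormat.addInputPath(job, new Path("input"));    // placeholder path
    FileOutputFormat.setOutputPath(job, new Path("output")); // placeholder path
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    JobClient.runJob(job); // submit and block until the job finishes
  }
}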


Examples of org.apache.nutch.util.NutchJob

   * @throws IOException If an error occurs while running the collector.
   */
  private void runCollector(Path basicFields, Path links, Path output)
    throws IOException {

    JobConf collector = new NutchJob(getConf());
    collector.setJobName("AnchorFields Collector");
    FileInputFormat.addInputPath(collector, links);
    FileInputFormat.addInputPath(collector, basicFields);
    FileOutputFormat.setOutputPath(collector, output);
    collector.setInputFormat(SequenceFileInputFormat.class);
    collector.setMapOutputKeyClass(Text.class);
    collector.setMapOutputValueClass(ObjectWritable.class);
    collector.setMapperClass(Collector.class);
    collector.setReducerClass(Collector.class);
    collector.setOutputKeyClass(Text.class);
    collector.setOutputValueClass(FieldWritable.class);
    collector.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting collector job");
    try {
      JobClient.runJob(collector);
    }
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

      LOG.info("ReprUrlFixer: crawlDb " + crawlDb);
      Path crawlDbCurrent = new Path(crawlDb, CrawlDb.CURRENT_NAME);
      Path newCrawlDb = new Path(crawlDb,
        Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

      JobConf updater = new NutchJob(conf);
      updater.setJobName("ReprUtilFixer: " + crawlDb.toString());
      FileInputFormat.addInputPath(updater, crawlDbCurrent);
      FileOutputFormat.setOutputPath(updater, newCrawlDb);
      updater.setInputFormat(SequenceFileInputFormat.class);
      updater.setReducerClass(ReprUrlFixer.class);
      updater.setOutputKeyClass(Text.class);
      updater.setOutputValueClass(CrawlDatum.class);
      updater.setOutputFormat(MapFileOutputFormat.class);

      try {
        JobClient.runJob(updater);
        LOG.info("ReprUrlFixer: installing new crawldb " + crawlDb);
        CrawlDb.install(updater, crawlDb);
      }
      catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
      }
    }

    // run the segments through the repr fixer; the logic is applied to both
    // the crawl_fetch and crawl_parse directories of every segment specified
    if (segments != null) {

      for (int i = 0; i < segments.length; i++) {

        Path segment = segments[i];
        LOG.info("ReprUrlFixer: fetching segment " + segment);
        Path segFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
        Path newSegFetch = new Path(segment, CrawlDatum.FETCH_DIR_NAME + "-"
          + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        JobConf fetch = new NutchJob(conf);
        fetch.setJobName("ReprUrlFixer: " + segment.toString());
        FileInputFormat.addInputPath(fetch, segFetch);
        FileOutputFormat.setOutputPath(fetch, newSegFetch);
        fetch.setInputFormat(SequenceFileInputFormat.class);
        fetch.setReducerClass(ReprUrlFixer.class);
        fetch.setOutputKeyClass(Text.class);
        fetch.setOutputValueClass(CrawlDatum.class);
        fetch.setOutputFormat(MapFileOutputFormat.class);

        try {
          JobClient.runJob(fetch);
          LOG.info("ReprUrlFixer: installing new segment fetch directory " + newSegFetch);
          FSUtils.replace(fs, segFetch, newSegFetch, true);
          LOG.info("ReprUrlFixer: finished installing segment fetch directory");
        }
        catch (IOException e) {
          LOG.error(StringUtils.stringifyException(e));
          throw e;
        }

        LOG.info("ReprUrlFixer: parsing segment " + segment);
        Path segParse = new Path(segment, CrawlDatum.PARSE_DIR_NAME);
        Path newSegParse = new Path(segment, CrawlDatum.PARSE_DIR_NAME + "-"
          + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        JobConf parse = new NutchJob(conf);
        parse.setJobName("ReprUrlFixer: " + segment.toString());
        FileInputFormat.addInputPath(parse, segParse);
        FileOutputFormat.setOutputPath(parse, newSegParse);
        parse.setInputFormat(SequenceFileInputFormat.class);
        parse.setReducerClass(ReprUrlFixer.class);
        parse.setOutputKeyClass(Text.class);
        parse.setOutputValueClass(CrawlDatum.class);
        parse.setOutputFormat(MapFileOutputFormat.class);

        try {
          JobClient.runJob(parse);
          LOG.info("ReprUrlFixer: installing new segment parse directry " + newSegParse);
          FSUtils.replace(fs, segParse, newSegParse, true);
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

  }

  public static JobConf createMergeJob(Configuration conf, Path output, boolean normalize, boolean filter) {
    Path newCrawlDb = new Path("crawldb-merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(conf);
    job.setJobName("crawldb merge " + output);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbFilter.class);
    job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
    job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
    job.setReducerClass(Merger.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    return job;
  }
View Full Code Here
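
A JobConf built by a factory method like createMergeJob is finished off by its caller: input paths are added per source db, the job is run, and the result is installed over the target. A hedged sketch of that calling sequence (assuming CrawlDbMerger-style surroundings; dbs, outputDb, and the flags are illustrative):

JobConf mergeJob = createMergeJob(getConf(), outputDb, normalize, filter);
for (int i = 0; i < dbs.length; i++) {
  // each source crawldb contributes its "current" directory as input
  FileInputFormat.addInputPath(mergeJob, new Path(dbs[i], CrawlDb.CURRENT_NAME));
}
JobClient.runJob(mergeJob);
CrawlDb.install(mergeJob, outputDb); // move the merged db into place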

Examples of org.apache.nutch.util.NutchJob

      System.err.println("\toldDb\tname of the crawldb that uses UTF8 class.");
      System.err.println("\tnewDb\tname of the output crawldb that will use Text class.");
      System.err.println("\twithMetadata\tconvert also all metadata keys that use UTF8 to Text.");
      return -1;
    }
    JobConf job = new NutchJob(getConf());
    FileSystem fs = FileSystem.get(getConf());
    Path oldDb = new Path(args[0], CrawlDb.CURRENT_NAME);
    Path newDb =
      new Path(oldDb,
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    if (!fs.exists(oldDb)) {
      LOG.fatal("Old db doesn't exist in '" + args[0] + "'");
      return -1;
    }
    boolean withMetadata = false;
    if (args.length > 2 && args[2].equalsIgnoreCase("-withMetadata"))
      withMetadata = true;
   
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("CrawlDbConverter: starting at " + sdf.format(start));

    job.setBoolean(CONVERT_META_KEY, withMetadata);
    FileInputFormat.addInputPath(job, oldDb);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbConverter.class);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    FileOutputFormat.setOutputPath(job, newDb);
    try {
      JobClient.runJob(job);
      CrawlDb.install(job, new Path(args[1]));
    } catch (Exception e) {
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

        " [-solr solrURL]");
      return;
    }

    Configuration conf = NutchConfiguration.createCrawlConfiguration();
    JobConf job = new NutchJob(conf);

    Path rootUrlDir = null;
    Path dir = new Path("crawl-" + getDate());
    int threads = job.getInt("fetcher.threads.fetch", 10);
    int depth = 5;
    long topN = Long.MAX_VALUE;
    String indexerName = "lucene";
    String solrUrl = null;
   
    for (int i = 0; i < args.length; i++) {
      if ("-dir".equals(args[i])) {
        dir = new Path(args[i+1]);
        i++;
      } else if ("-threads".equals(args[i])) {
        threads = Integer.parseInt(args[i+1]);
        i++;
      } else if ("-depth".equals(args[i])) {
        depth = Integer.parseInt(args[i+1]);
        i++;
      } else if ("-topN".equals(args[i])) {
          topN = Integer.parseInt(args[i+1]);
          i++;
      } else if ("-solr".equals(args[i])) {
        indexerName = "solr";
        solrUrl = StringUtils.lowerCase(args[i + 1]);
        i++;
      } else if (args[i] != null) {
        rootUrlDir = new Path(args[i]);
      }
    }

    boolean isSolrIndex = StringUtils.equalsIgnoreCase(indexerName, "solr");
    FileSystem fs = FileSystem.get(job);

    if (LOG.isInfoEnabled()) {
      LOG.info("crawl started in: " + dir);
      LOG.info("rootUrlDir = " + rootUrlDir);
      LOG.info("threads = " + threads);
      LOG.info("depth = " + depth);     
      LOG.info("indexer=" + indexerName);
      if (isSolrIndex) {
        LOG.info("solrUrl=" + solrUrl);
      }
      if (topN != Long.MAX_VALUE)
        LOG.info("topN = " + topN);
    }
   
    Path crawlDb = new Path(dir + "/crawldb");
    Path linkDb = new Path(dir + "/linkdb");
    Path segments = new Path(dir + "/segments");
    Path indexes = new Path(dir + "/indexes");
    Path index = new Path(dir + "/index");

    Path tmpDir = job.getLocalPath("crawl"+Path.SEPARATOR+getDate());
    Injector injector = new Injector(conf);
    Generator generator = new Generator(conf);
    Fetcher fetcher = new Fetcher(conf);
    ParseSegment parseSegment = new ParseSegment(conf);
    CrawlDb crawlDbTool = new CrawlDb(conf);
View Full Code Here
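
The snippet stops just after constructing the tools; in the full source they are chained into the classic generate/fetch/parse/update loop. A hedged sketch of that loop using the variables above (method signatures vary across Nutch 1.x releases, so treat the exact calls as illustrative):

injector.inject(crawlDb, rootUrlDir);   // seed the crawldb with the root URLs
for (int i = 0; i < depth; i++) {
  // generate a fetch list; returns null when nothing is due for fetching
  Path[] segs = generator.generate(crawlDb, segments, -1, topN, System.currentTimeMillis());
  if (segs == null) {
    LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
    break;
  }
  fetcher.fetch(segs[0], threads);               // fetch the new segment
  parseSegment.parse(segs[0]);                   // parse its content
  crawlDbTool.update(crawlDb, segs, true, true); // fold results back into the crawldb
}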

Examples of org.apache.nutch.util.NutchJob

   * @throws IOException If an error occurs while converting.
   */
  private void runConverter(Path[] inputs, Path output)
    throws IOException {

    JobConf converter = new NutchJob(getConf());
    converter.setJobName("CustomFields Converter");
    for (int i = 0; i < inputs.length; i++) {
      FileInputFormat.addInputPath(converter, inputs[i]);
    }
    FileOutputFormat.setOutputPath(converter, output);
    converter.setInputFormat(TextInputFormat.class);
    converter.setMapperClass(Converter.class);
    converter.setReducerClass(Converter.class);
    converter.setMapOutputKeyClass(Text.class);
    converter.setMapOutputValueClass(FieldWritable.class);
    converter.setOutputKeyClass(Text.class);
    converter.setOutputValueClass(FieldWritable.class);
    converter.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting converter job");
    try {
      JobClient.runJob(converter);
    }
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

   * @throws IOException If an error occurs while collecting.
   */
  private void runCollector(Path basicFields, Path converted, Path output)
    throws IOException {

    JobConf collector = new NutchJob(getConf());
    collector.setJobName("CustomFields Collector");
    FileInputFormat.addInputPath(collector, converted);
    FileInputFormat.addInputPath(collector, basicFields);
    FileOutputFormat.setOutputPath(collector, output);
    collector.setInputFormat(SequenceFileInputFormat.class);
    collector.setMapOutputKeyClass(Text.class);
    collector.setMapOutputValueClass(ObjectWritable.class);
    collector.setMapperClass(Collector.class);
    collector.setReducerClass(Collector.class);
    collector.setOutputKeyClass(Text.class);
    collector.setOutputValueClass(FieldWritable.class);
    collector.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting collector job");
    try {
      JobClient.runJob(collector);
    }
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("FieldIndexer: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());
    job.setJobName("FieldIndexer: " + indexDir);

    for (int i = 0; i < fields.length; i++) {
      Path fieldsDb = fields[i];
      LOG.info("FieldIndexer: adding fields db: " + fieldsDb);
      FileInputFormat.addInputPath(job, fieldsDb);
    }

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(FieldIndexer.class);
    job.setReducerClass(FieldIndexer.class);
    FileOutputFormat.setOutputPath(job, indexDir);
    job.setOutputFormat(OutputFormat.class); // presumably FieldIndexer's own inner OutputFormat class, not the Hadoop interface
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(FieldWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LuceneDocumentWrapper.class);

    JobClient.runJob(job);
    long end = System.currentTimeMillis();
    LOG.info("FieldIndexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  }
View Full Code Here

Examples of org.apache.nutch.util.NutchJob

      List<Path> segments) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

    job.set(SolrConstants.SERVER_URL, solrUrl);

    NutchIndexWriterFactory.addClassToConf(job, SolrWriter.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" +
                         new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
View Full Code Here
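
The excerpt is cut off just before submission; in the full SolrIndexer source the job runs against the temporary output path, which is deleted afterwards. A hedged sketch of that tail:

try {
  JobClient.runJob(job);
}
finally {
  FileSystem.get(job).delete(tmp, true); // discard the throw-away output dir
}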

Examples of org.apache.nutch.util.NutchJob

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrDeleteDuplicates: starting at " + sdf.format(start));
    LOG.info("SolrDeleteDuplicates: Solr url: " + solrUrl);
   
    JobConf job = new NutchJob(getConf());

    job.set(SolrConstants.SERVER_URL, solrUrl);
    job.setInputFormat(SolrInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(SolrRecord.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(SolrDeleteDuplicates.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("SolrDeleteDuplicates: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
View Full Code Here