/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.crawl;

import java.io.*;
import java.util.*;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;

import org.apache.nutch.net.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/** This class takes a flat file of URLs and adds them to the database of
 * pages to be crawled.  Useful for bootstrapping the system. */
public class Injector extends Configured {
  public static final Log LOG = LogFactory.getLog(Injector.class);


  /** Normalize and filter injected urls. */
  public static class InjectMapper implements Mapper {
    private UrlNormalizer urlNormalizer;
    private float interval;
    private float scoreInjected;
    private JobConf jobConf;
    private URLFilters filters;
    private ScoringFilters scfilters;
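
    /*
     * Configuration read in configure() below:
     *   db.default.fetch.interval - days between re-fetches (default 30)
     *   db.score.injected         - initial score for new pages (default 1.0)
     */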

    public void configure(JobConf job) {
      this.jobConf = job;
      urlNormalizer = new UrlNormalizerFactory(jobConf).getNormalizer();
      interval = jobConf.getFloat("db.default.fetch.interval", 30f);
      filters = new URLFilters(jobConf);
      scfilters = new ScoringFilters(jobConf);
      scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
    }

    public void close() {}
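
    /*
     * Each input value is one line of text holding a URL. The pipeline:
     * normalize the URL, pass it through the configured URLFilters (which
     * reject a URL by returning null), wrap survivors in an unfetched
     * CrawlDatum, ask the scoring filters for an initial score, and emit
     * the <url, datum> pair.
     */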

    public void map(WritableComparable key, Writable val,
                    OutputCollector output, Reporter reporter)
      throws IOException {
      UTF8 value = (UTF8)val;
      String url = value.toString();              // value is line of text
      // System.out.println("url: " +url);
      try {
        url = urlNormalizer.normalize(url);       // normalize the url
        url = filters.filter(url);             // filter the url
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) { LOG.warn("Skipping " + url + ": " + e); }
        url = null;
      }
      if (url != null) {                          // if it passes
        value.set(url);                           // collect it
        CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, interval);
        datum.setScore(scoreInjected);
        try {
          scfilters.initialScore(value, datum);
        } catch (ScoringFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Cannot filter init score for url " + url +
                     ", using default (" + e.getMessage() + ")");
          }
          datum.setScore(scoreInjected);
        }
        output.collect(value, datum);
      }
    }
  }
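
  /*
   * Example (illustrative, assuming the default basic normalizer and no
   * rejecting filter): an input line "HTTP://Example.COM/index.html" is
   * normalized to "http://example.com/index.html" and emitted as
   *   <UTF8("http://example.com/index.html"),
   *    CrawlDatum(STATUS_DB_UNFETCHED, interval=30, score=1.0)>
   */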

  /** Combine multiple new entries for a url. */
  public static class InjectReducer implements Reducer {
    public void configure(JobConf job) {}
    public void close() {}

    public void reduce(WritableComparable key, Iterator values,
                       OutputCollector output, Reporter reporter)
      throws IOException {
      output.collect(key, (Writable)values.next()); // just collect first value
    }
  }
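
  /*
   * Note: this reducer only de-duplicates URLs within the newly injected
   * set; merging with entries already in an existing crawl db is done by
   * the separate merge job that inject() builds via CrawlDb.createJob().
   */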

  /** Construct an Injector. */
  public Injector(Configuration conf) {
    super(conf);
  }

  public void inject(Path crawlDb, Path urlDir) throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: starting");
      LOG.info("Injector: crawlDb: " + crawlDb);
      LOG.info("Injector: urlDir: " + urlDir);
    }

    Path tempDir =
      new Path(getConf().get("mapred.temp.dir", ".") +
               "/inject-temp-"+
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
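    // (Random suffix keeps concurrent injections from colliding; the
    // temp dir is deleted once the merge job finishes.)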

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    sortJob.setInputPath(urlDir);
    sortJob.setMapperClass(InjectMapper.class);
    sortJob.setReducerClass(InjectReducer.class);

    sortJob.setOutputPath(tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(UTF8.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    JobClient.runJob(sortJob);

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    mergeJob.addInputPath(tempDir);
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    FileSystem fs = new JobClient(getConf()).getFs();
    fs.delete(tempDir);
    if (LOG.isInfoEnabled()) { LOG.info("Injector: done"); }

  }

  public static void main(String[] args) throws Exception {
    Injector injector = new Injector(NutchConfiguration.create());
   
    if (args.length < 2) {
      System.err.println("Usage: Injector <crawldb> <url_dir>");
      return;
    }
   
    injector.inject(new Path(args[0]), new Path(args[1]));
  }

}
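
/*
 * Usage sketch (paths illustrative): from a Nutch installation,
 *
 *   bin/nutch inject crawl/crawldb urls
 *
 * where "urls" is a directory of plain-text files with one URL per line,
 * and "crawl/crawldb" is the crawl db to create or update.
 */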