/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;
import java.io.*;
import java.net.*;
import java.util.*;
import java.text.*;
// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
/** Generates a subset of a crawl db to fetch. */
public class Generator extends Configured {
public static final Log LOG = LogFactory.getLog(Generator.class);
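/** A (url, CrawlDatum) pair; the select job emits these as map output values,
 * keyed by the url's generator score. */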
public static class SelectorEntry implements Writable {
public UTF8 url;
public CrawlDatum datum;
public SelectorEntry() {
url = new UTF8();
datum = new CrawlDatum();
}
public void readFields(DataInput in) throws IOException {
url.readFields(in);
datum.readFields(in);
}
public void write(DataOutput out) throws IOException {
url.write(out);
datum.write(out);
}
}
/** Selects entries due for fetch. */
public static class Selector implements Mapper, Partitioner, Reducer {
private long curTime;
private long limit;
private long count;
private HashMap hostCounts = new HashMap();
private int maxPerHost;
private Partitioner hostPartitioner = new PartitionUrlByHost();
private URLFilters filters;
private ScoringFilters scfilters;
private SelectorEntry entry = new SelectorEntry();
private FloatWritable sortValue = new FloatWritable();
private boolean byIP;
private long dnsFailure = 0L;
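/** Reads generation parameters from the job configuration: crawl.gen.curTime
 * (the generation cut-off time), crawl.topN (overall limit, divided evenly
 * among reduce tasks), generate.max.per.host (maximum urls kept per host,
 * -1 disables the check) and generate.max.per.host.by.ip (count hosts by
 * resolved IP address rather than by name). */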
public void configure(JobConf job) {
curTime = job.getLong("crawl.gen.curTime", System.currentTimeMillis());
limit = job.getLong("crawl.topN",Long.MAX_VALUE)/job.getNumReduceTasks();
maxPerHost = job.getInt("generate.max.per.host", -1);
byIP = job.getBoolean("generate.max.per.host.by.ip", false);
filters = new URLFilters(job);
scfilters = new ScoringFilters(job);
}
public void close() {}
/** Select & invert subset due for fetch. */
public void map(WritableComparable key, Writable value,
OutputCollector output, Reporter reporter)
throws IOException {
UTF8 url = (UTF8)key;
// don't generate URLs that don't pass URLFilters
try {
if (filters.filter(url.toString()) == null)
return;
} catch (URLFilterException e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
}
}
CrawlDatum crawlDatum = (CrawlDatum)value;
if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_GONE)
return; // don't retry
if (crawlDatum.getFetchTime() > curTime)
return; // not time yet
float sort = 1.0f;
try {
sort = scfilters.generatorSortValue((UTF8)key, crawlDatum, sort);
} catch (ScoringFilterException sfe) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't filter generatorSortValue for " + key + ": " + sfe);
}
}
// sort by decreasing score
sortValue.set(sort);
entry.datum = crawlDatum;
entry.url = (UTF8)key;
output.collect(sortValue, entry); // invert for sort by score
}
/** Partition by host. */
public int getPartition(WritableComparable key, Writable value,
int numReduceTasks) {
return hostPartitioner.getPartition(((SelectorEntry)value).url, key,
numReduceTasks);
}
/** Collect entries in score order until the per-reducer limit is reached,
optionally capping the number of urls kept per host. */
public void reduce(WritableComparable key, Iterator values,
OutputCollector output, Reporter reporter)
throws IOException {
while (values.hasNext() && count < limit) {
SelectorEntry entry = (SelectorEntry)values.next();
UTF8 url = entry.url;
if (maxPerHost > 0) { // are we counting hosts?
String host = new URL(url.toString()).getHost();
if (host == null) {
// unknown host, skip
continue;
}
host = host.toLowerCase();
if (byIP) {
try {
InetAddress ia = InetAddress.getByName(host);
host = ia.getHostAddress();
} catch (UnknownHostException uhe) {
if (LOG.isDebugEnabled()) {
LOG.debug("DNS lookup failed: " + host + ", skipping.");
}
dnsFailure++;
if ((dnsFailure % 1000 == 0) && (LOG.isWarnEnabled())) {
LOG.warn("DNS failures: " + dnsFailure);
}
continue;
}
}
IntWritable hostCount = (IntWritable)hostCounts.get(host);
if (hostCount == null) {
hostCount = new IntWritable();
hostCounts.put(host, hostCount);
}
// increment hostCount
hostCount.set(hostCount.get() + 1);
// skip URL if above the limit per host.
if (hostCount.get() > maxPerHost) {
if (hostCount.get() == maxPerHost + 1) {
if (LOG.isInfoEnabled()) {
LOG.info("Host " + host + " has more than " + maxPerHost +
" URLs." + " Skipping additional.");
}
}
continue;
}
}
output.collect(key, entry);
// Count is incremented only when we keep the URL;
// maxPerHost may cause us to skip it.
count++;
}
}
}
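/** Restores (url, CrawlDatum) pairs from the score-keyed SelectorEntry
 * output of the select job. */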
public static class SelectorInverseMapper extends MapReduceBase implements Mapper {
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
SelectorEntry entry = (SelectorEntry)value;
output.collect(entry.url, entry.datum);
}
}
/** Sort fetch lists by hash of URL, so that urls from the same host are
spread through the list rather than clustered together. */
public static class HashComparator extends WritableComparator {
public HashComparator() { super(UTF8.class); }
public int compare(WritableComparable a, WritableComparable b) {
UTF8 url1 = (UTF8)a;
UTF8 url2 = (UTF8)b;
int hash1 = hash(url1.getBytes(), 0, url1.getLength());
int hash2 = hash(url2.getBytes(), 0, url2.getLength());
if (hash1 != hash2) {
return hash1 - hash2;
}
return compareBytes(url1.getBytes(), 0, url1.getLength(),
url2.getBytes(), 0, url2.getLength());
}
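// Raw comparison over the serialized UTF8 values: a two-byte length prefix
// followed by the url bytes.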
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
int n1 = readUnsignedShort(b1, s1);
int n2 = readUnsignedShort(b2, s2);
int hash1 = hash(b1, s1+2, n1);
int hash2 = hash(b2, s2+2, n2);
if (hash1 != hash2) {
return hash1 - hash2;
}
return compareBytes(b1, s1+2, n1, b2, s2+2, n2);
}
private static int hash(byte[] bytes, int start, int length) {
int hash = 1;
// make later bytes more significant in hash code, so that sorting by
// hashcode correlates less with by-host ordering.
for (int i = length-1; i >= 0; i--)
hash = (31 * hash) + (int)bytes[start+i];
return hash;
}
}
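/** Sorts FloatWritable keys in decreasing order. The select job's reducer
 * keeps only the first 'limit' entries it sees, and FloatWritable's natural
 * order is increasing, so without a decreasing comparator the lowest-scoring
 * urls would be kept; this comparator is a minimal fix for that and is wired
 * into the select job below. */
public static class DecreasingFloatComparator extends FloatWritable.Comparator {
/** Compares two FloatWritables decreasing. */
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
return super.compare(b2, s2, l2, b1, s1, l1);
}
}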
/** Construct a generator. */
public Generator(Configuration conf) {
super(conf);
}
/** Generate fetchlists in a segment. */
public Path generate(Path dbDir, Path segments)
throws IOException {
return generate(dbDir, segments,
-1, Long.MAX_VALUE, System.currentTimeMillis());
}
/** Generate fetchlists in a segment. */
public Path generate(Path dbDir, Path segments,
int numLists, long topN, long curTime)
throws IOException {
Path tempDir =
new Path(getConf().get("mapred.temp.dir", ".") +
"/generate-temp-"+
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
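// Segments are named by timestamp (see generateSegmentName()); the fetch list
// is written under the segment's CrawlDatum.GENERATE_DIR_NAME subdirectory.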
Path segment = new Path(segments, generateSegmentName());
Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
if (LOG.isInfoEnabled()) {
LOG.info("Generator: starting");
LOG.info("Generator: segment: " + segment);
LOG.info("Generator: Selecting most-linked urls due for fetch.");
}
// map to inverted subset due for fetch, sort by score
JobConf job = new NutchJob(getConf());
job.setJobName("generate: select " + segment);
if (numLists == -1) { // for politeness make
numLists = job.getNumMapTasks(); // a partition per fetch task
}
job.setLong("crawl.gen.curTime", curTime);
job.setLong("crawl.topN", topN);
job.setInputPath(new Path(dbDir, CrawlDatum.DB_DIR_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
job.setInputKeyClass(UTF8.class);
job.setInputValueClass(CrawlDatum.class);
job.setMapperClass(Selector.class);
job.setPartitionerClass(Selector.class);
job.setReducerClass(Selector.class);
job.setOutputPath(tempDir);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setOutputKeyClass(FloatWritable.class);
job.setOutputValueClass(SelectorEntry.class);
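// FloatWritable's natural order is increasing; sort the score keys in
// decreasing order so that the reducer's limit keeps the best-scoring urls.
job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);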
JobClient.runJob(job);
// invert again, partition by host, sort by url hash
if (LOG.isInfoEnabled()) {
LOG.info("Generator: Partitioning selected urls by host, for politeness.");
}
job = new NutchJob(getConf());
job.setJobName("generate: partition " + segment);
job.setInt("partition.url.by.host.seed", new Random().nextInt());
job.setInputPath(tempDir);
job.setInputFormat(SequenceFileInputFormat.class);
job.setInputKeyClass(FloatWritable.class);
job.setInputValueClass(SelectorEntry.class);
job.setMapperClass(SelectorInverseMapper.class);
job.setPartitionerClass(PartitionUrlByHost.class);
job.setNumReduceTasks(numLists);
job.setOutputPath(output);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setOutputKeyClass(UTF8.class);
job.setOutputValueClass(CrawlDatum.class);
job.setOutputKeyComparatorClass(HashComparator.class);
JobClient.runJob(job);
new JobClient(getConf()).getFs().delete(tempDir);
if (LOG.isInfoEnabled()) { LOG.info("Generator: done."); }
return segment;
}
private static SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
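/** Returns a new segment name derived from the current time. Sleeps for one
 * second first so that two calls within the same JVM cannot return the same
 * second-resolution name. */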
public static synchronized String generateSegmentName() {
try {
Thread.sleep(1000);
} catch (Throwable t) {}
return sdf.format(new Date(System.currentTimeMillis()));
}
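/*
 * Example invocation (a sketch: the crawldb and segments paths are
 * placeholders, and this assumes the standard bin/nutch launcher, which can
 * run a class by name):
 *
 *   bin/nutch org.apache.nutch.crawl.Generator crawl/crawldb crawl/segments \
 *     -topN 1000 -numFetchers 4
 *
 * This selects up to 1000 of the urls due for fetch and writes them into a
 * new timestamped segment under crawl/segments, split into 4 fetch lists.
 */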
/**
 * Generate a fetchlist from the crawl db.
 */
public static void main(String args[]) throws Exception {
if (args.length < 2) {
System.out.println("Usage: Generator <crawldb> <segments_dir> [-topN N] [-numFetchers numFetchers] [-adddays numDays]");
return;
}
Path dbDir = new Path(args[0]);
Path segmentsDir = new Path(args[1]);
long curTime = System.currentTimeMillis();
long topN = Long.MAX_VALUE;
int numFetchers = -1;
for (int i = 2; i < args.length; i++) {
if ("-topN".equals(args[i])) {
topN = Long.parseLong(args[i+1]);
i++;
} else if ("-numFetchers".equals(args[i])) {
numFetchers = Integer.parseInt(args[i+1]);
i++;
} else if ("-adddays".equals(args[i])) {
long numDays = Integer.parseInt(args[i+1]);
curTime += numDays * 1000L * 60 * 60 * 24;
}
}
if ((LOG.isInfoEnabled()) && (topN != Long.MAX_VALUE)) {
LOG.info("topN: " + topN);
}
Generator gen = new Generator(NutchConfiguration.create());
gen.generate(dbDir, segmentsDir, numFetchers, topN, curTime);
}
}