Source Code of org.apache.nutch.crawl.LinkDb$Merger

/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.crawl;

import java.io.*;
import java.util.*;
import java.net.*;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;

import org.apache.nutch.net.URLFilters;
import org.apache.nutch.parse.*;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/** Maintains an inverted link map, listing incoming links for each url. */
public class LinkDb extends Configured implements Mapper, Reducer {

  public static final Log LOG = LogFactory.getLog(LinkDb.class);

  public static String CURRENT_NAME = "current";

  private int maxAnchorLength;
  private int maxInlinks;
  private boolean ignoreInternalLinks;
 
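  /**
   * Reducer used when merging a newly inverted LinkDb with an existing one.
   * It concatenates the Inlinks lists emitted for the same URL, optionally
   * re-applies URL filters, and keeps at most db.max.inlinks entries per URL.
   */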
  public static class Merger extends MapReduceBase implements Reducer {
    private int _maxInlinks;
    private URLFilters filters = null;
   
    public void configure(JobConf job) {
      super.configure(job);
      _maxInlinks = job.getInt("db.max.inlinks", 10000);
      if (job.getBoolean("linkdb.merger.urlfilters", false)) {
        filters = new URLFilters(job);
      }
    }

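    /**
     * Keeps the first Inlinks value as the base and appends entries from the
     * remaining values, skipping filtered URLs, until the limit is reached.
     */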
    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
      if (filters != null) {
        try {
          if (filters.filter(((UTF8)key).toString()) == null)
            return;
        } catch (Exception e) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Can't filter " + key + ": " + e);
          }
        }
      }
      Inlinks inlinks = null;
      while (values.hasNext()) {
        if (inlinks == null) {
          inlinks = (Inlinks)values.next();
          continue;
        }
        Inlinks val = (Inlinks)values.next();
        for (Iterator it = val.iterator(); it.hasNext(); ) {
          if (inlinks.size() >= _maxInlinks) {
            output.collect(key, inlinks);
            return;
          }
          Inlink in = (Inlink)it.next();
          if (filters != null) {
            try {
              if (filters.filter(in.getFromUrl()) == null)
                continue;
            } catch (Exception e) {
              if (LOG.isDebugEnabled()) {
                LOG.debug("Can't filter " + key + ": " + e);
              }
            }
          }
          inlinks.add(in);
        }
      }
      if (inlinks.size() == 0) return;
      output.collect(key, inlinks);
    }
  }

  public LinkDb() {
    super(null);
  }

  /** Construct a LinkDb. */
  public LinkDb(Configuration conf) {
    super(conf);
  }

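  /** Reads the anchor-length, inlink-count and internal-link settings from the job configuration. */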
  public void configure(JobConf job) {
    maxAnchorLength = job.getInt("db.max.anchor.length", 100);
    maxInlinks = job.getInt("db.max.inlinks", 10000);
    ignoreInternalLinks = job.getBoolean("db.ignore.internal.links", true);
  }

  public void close() {}

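  /**
   * Inverts the outlinks of a single page: for each outlink in the ParseData,
   * emits the target URL as key and a one-entry Inlinks value holding the
   * source URL and its (truncated) anchor text. Internal links are skipped
   * when db.ignore.internal.links is true.
   */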
  public void map(WritableComparable key, Writable value,
                  OutputCollector output, Reporter reporter)
    throws IOException {
    String fromUrl = key.toString();
    String fromHost = getHost(fromUrl);

    ParseData parseData = (ParseData)value;
    Outlink[] outlinks = parseData.getOutlinks();
    Inlinks inlinks = new Inlinks();
    for (int i = 0; i < outlinks.length; i++) {
      Outlink outlink = outlinks[i];
      String toUrl = outlink.getToUrl();

      if (ignoreInternalLinks) {
        String toHost = getHost(toUrl);
        if (toHost == null || toHost.equals(fromHost)) { // internal link
          continue;                               // skip it
        }
      }

      inlinks.clear();
      String anchor = outlink.getAnchor();        // truncate long anchors
      if (anchor.length() > maxAnchorLength) {
        anchor = anchor.substring(0, maxAnchorLength);
      }
      inlinks.add(new Inlink(fromUrl, anchor));   // collect inverted link
      output.collect(new UTF8(toUrl), inlinks);
    }
  }

  private String getHost(String url) {
    try {
      return new URL(url).getHost().toLowerCase();
    } catch (MalformedURLException e) {
      return null;
    }
  }

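  /**
   * Combines all Inlinks emitted for one URL into a single value, keeping at
   * most db.max.inlinks entries.
   */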
  public void reduce(WritableComparable key, Iterator values,
                     OutputCollector output, Reporter reporter)
    throws IOException {

    Inlinks result = null;

    while (values.hasNext()) {
      Inlinks inlinks = (Inlinks)values.next();

      if (result == null) {                       // optimize a common case
        if (inlinks.size() < maxInlinks) {
          result = inlinks;
          continue;
        } else {
          result = new Inlinks();
        }
      }

      int end = Math.min(maxInlinks - result.size(), inlinks.size());
      Iterator it = inlinks.iterator();
      int i = 0;
      while(it.hasNext() && i++ < end) {
        result.add((Inlink)it.next());
      }
    }
    output.collect(key, result);
  }

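  /** Inverts the links of every segment directory found under segmentsDir. */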
  public void invert(Path linkDb, final Path segmentsDir) throws IOException {
    final FileSystem fs = FileSystem.get(getConf());
    Path[] files = fs.listPaths(segmentsDir, new PathFilter() {
      public boolean accept(Path f) {
        try {
          if (fs.isDirectory(f)) return true;
        } catch (IOException ioe) {
          // ignore: treat paths that cannot be checked as non-directories
        }
        return false;
      }
    });
    invert(linkDb, files);
  }

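  /**
   * Runs the inversion job over the given segments, merges the result with an
   * existing LinkDb when one is present, and installs the output as the new
   * current LinkDb.
   */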
  public void invert(Path linkDb, Path[] segments) throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("LinkDb: starting");
      LOG.info("LinkDb: linkdb: " + linkDb);
    }
    JobConf job = LinkDb.createJob(getConf(), linkDb);
    for (int i = 0; i < segments.length; i++) {
      if (LOG.isInfoEnabled()) {
        LOG.info("LinkDb: adding segment: " + segments[i]);
      }
      job.addInputPath(new Path(segments[i], ParseData.DIR_NAME));
    }
    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(getConf());
    if (fs.exists(linkDb)) {
      if (LOG.isInfoEnabled()) {
        LOG.info("LinkDb: merging with existing linkdb: " + linkDb);
      }
      // try to merge
      Path newLinkDb = job.getOutputPath();
      job = LinkDb.createMergeJob(getConf(), linkDb);
      job.addInputPath(new Path(linkDb, CURRENT_NAME));
      job.addInputPath(newLinkDb);
      JobClient.runJob(job);
      fs.delete(newLinkDb);
    }
    LinkDb.install(job, linkDb);
    if (LOG.isInfoEnabled()) { LOG.info("LinkDb: done"); }
  }

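  /** Builds the job that inverts segment ParseData into Inlinks, writing to a temporary linkdb-* path. */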
  private static JobConf createJob(Configuration config, Path linkDb) {
    Path newLinkDb =
      new Path("linkdb-" +
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setInputKeyClass(UTF8.class);
    job.setInputValueClass(ParseData.class);

    job.setMapperClass(LinkDb.class);
    job.setReducerClass(LinkDb.class);

    job.setOutputPath(newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(UTF8.class);
    job.setOutputValueClass(Inlinks.class);

    return job;
  }

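  /** Builds the job that merges two Inlinks sources, using {@link Merger} as the reducer. */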
  public static JobConf createMergeJob(Configuration config, Path linkDb) {
    Path newLinkDb =
      new Path("linkdb-merge-" +
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb merge " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setInputKeyClass(UTF8.class);
    job.setInputValueClass(Inlinks.class);

    job.setReducerClass(Merger.class);

    job.setOutputPath(newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(UTF8.class);
    job.setOutputValueClass(Inlinks.class);

    return job;
  }

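  /**
   * Swaps in the job output as the "current" LinkDb; the previous version is
   * moved to "old" and deleted once the swap completes.
   */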
  public static void install(JobConf job, Path linkDb) throws IOException {
    Path newLinkDb = job.getOutputPath();
    FileSystem fs = new JobClient(job).getFs();
    Path old = new Path(linkDb, "old");
    Path current = new Path(linkDb, CURRENT_NAME);
    fs.delete(old);
    fs.rename(current, old);
    fs.mkdirs(linkDb);
    fs.rename(newLinkDb, current);
    fs.delete(old);
  }

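  /** Command-line entry point: &lt;linkdb&gt; followed by -dir segmentsDir or an explicit list of segment paths. */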
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    LinkDb linkDb = new LinkDb(conf);
   
    if (args.length < 2) {
      System.err.println("Usage: <linkdb> (-dir segmentsDir | segment1 segment2 ...)");
      return;
    }
    Path segDir = null;
    final FileSystem fs = FileSystem.get(conf);
    Path db = new Path(args[0]);
    ArrayList segs = new ArrayList();
    for (int i = 1; i < args.length; i++) {
      if (args[i].equals("-dir")) {
        segDir = new Path(args[++i]);
        Path[] files = fs.listPaths(segDir, new PathFilter() {
          public boolean accept(Path f) {
            try {
              if (fs.isDirectory(f)) return true;
            } catch (IOException ioe) {
              // ignore: treat paths that cannot be checked as non-directories
            }
            return false;
          }
        });
        if (files != null) segs.addAll(Arrays.asList(files));
        break;
      } else segs.add(new Path(args[i]));
    }
    linkDb.invert(db, (Path[])segs.toArray(new Path[segs.size()]));
  }



}
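
A minimal usage sketch (not part of the original file): it drives the class above programmatically, doing the same work as main(). The class name, crawl directory layout and segment names below are hypothetical examples.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.util.NutchConfiguration;

public class LinkDbInvertExample {                   // hypothetical helper class
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    LinkDb linkDb = new LinkDb(conf);
    // Invert two segments into (or merge them with) the LinkDb at crawl/linkdb.
    linkDb.invert(new Path("crawl/linkdb"),                      // hypothetical linkdb path
                  new Path[] {
                    new Path("crawl/segments/20060101000000"),   // hypothetical segment
                    new Path("crawl/segments/20060102000000") });
  }
}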