/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer.field;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.scoring.webgraph.LinkDatum;
import org.apache.nutch.scoring.webgraph.Node;
import org.apache.nutch.scoring.webgraph.WebGraph;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/**
 * Creates FieldWritable objects for inbound anchor text.  These FieldWritable
 * objects are then included in the input to the FieldIndexer to be converted
 * to Lucene Field objects and indexed.
 *
 * Any empty or null anchor text is ignored.  Anchors are sorted in descending
 * order by the score of their parent pages, and there are settings for the
 * maximum number of anchors to index and for whether those anchors should be
 * stored and tokenized.  Sorting by descending score and capping the number of
 * anchors indexed ensures that only the best anchors are kept, on the
 * assumption that a higher link analysis score means a better page and better
 * inbound anchor text.
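 *
 * <p>A minimal programmatic sketch of running the tool (the paths below are
 * illustrative assumptions, not values required by this class):</p>
 * <pre>
 * int res = ToolRunner.run(NutchConfiguration.create(), new AnchorFields(),
 *   new String[] { "-webgraphdb", "crawl/webgraphdb",
 *     "-basicfields", "crawl/basicfields",
 *     "-output", "crawl/anchorfields" });
 * </pre>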
*/
public class AnchorFields
  extends Configured
  implements Tool {

  public static final Log LOG = LogFactory.getLog(AnchorFields.class);

  /**
   * Comparator to sort links in descending order by score.
   */
  private static class DescendingLinkDatumScoreComparator
    implements Comparator<LinkDatum> {

    public int compare(LinkDatum one, LinkDatum two) {
      float scoreOne = one.getScore();
      float scoreTwo = two.getScore();
      return (scoreOne == scoreTwo ? 0 : (scoreOne > scoreTwo ? -1 : 1));
    }
  }

  /**
   * Runs the Extractor job.  Gets outlinks to be converted, ignoring empty
   * and null anchors.
   *
   * @param webGraphDb The WebGraphDb to pull from.
   * @param output The extractor output.
   *
   * @throws IOException If an error occurs while running the extractor.
   */
  private void runExtractor(Path webGraphDb, Path output)
    throws IOException {

    JobConf extractor = new NutchJob(getConf());
    extractor.setJobName("AnchorFields Extractor");
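    // take both the outlink and node directories as input so the reducer can
    // join each page's outlinks with its link analysis score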
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb,
      WebGraph.OUTLINK_DIR));
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb,
      WebGraph.NODE_DIR));
    FileOutputFormat.setOutputPath(extractor, output);
    extractor.setInputFormat(SequenceFileInputFormat.class);
    extractor.setMapperClass(Extractor.class);
    extractor.setReducerClass(Extractor.class);
    extractor.setMapOutputKeyClass(Text.class);
    extractor.setMapOutputValueClass(ObjectWritable.class);
    extractor.setOutputKeyClass(Text.class);
    extractor.setOutputValueClass(LinkDatum.class);
    extractor.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting extractor job");
    try {
      JobClient.runJob(extractor);
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
    LOG.info("Finished extractor job.");
  }

  /**
   * Runs the collector job.  Aggregates extracted inlinks, sorts them by
   * score, and converts the highest scoring into FieldWritable objects.  Only
   * inlinks for which basic fields exist are collected, to avoid orphan fields.
   *
   * @param basicFields The BasicFields which must be present to collect anchors
   * to avoid orphan fields.
   * @param links The outlinks path.
   * @param output The collector output.
   *
   * @throws IOException If an error occurs while running the collector.
   */
  private void runCollector(Path basicFields, Path links, Path output)
    throws IOException {

    JobConf collector = new NutchJob(getConf());
    collector.setJobName("AnchorFields Collector");
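    // take both the extracted links and the basic fields as input so anchors
    // are only collected for urls that also have basic fields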
    FileInputFormat.addInputPath(collector, links);
    FileInputFormat.addInputPath(collector, basicFields);
    FileOutputFormat.setOutputPath(collector, output);
    collector.setInputFormat(SequenceFileInputFormat.class);
    collector.setMapOutputKeyClass(Text.class);
    collector.setMapOutputValueClass(ObjectWritable.class);
    collector.setMapperClass(Collector.class);
    collector.setReducerClass(Collector.class);
    collector.setOutputKeyClass(Text.class);
    collector.setOutputValueClass(FieldWritable.class);
    collector.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting collector job");
    try {
      JobClient.runJob(collector);
    }
    catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
    LOG.info("Finished collector job.");
  }

  /**
   * Extracts and inverts outlinks so they can later be converted into
   * FieldWritable objects.  Ignores empty and null anchors.
   */
  public static class Extractor
    extends Configured
    implements Mapper<Text, Writable, Text, ObjectWritable>,
    Reducer<Text, ObjectWritable, Text, LinkDatum> {

    private boolean ignoreEmptyAnchors = true;
    private JobConf conf;

    /**
     * Default constructor.
     */
    public Extractor() {
    }

    /**
     * Configurable constructor.
     */
    public Extractor(Configuration conf) {
      setConf(conf);
    }

    /**
     * Configures the job, setting whether to ignore empty anchors.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
      ignoreEmptyAnchors = conf.getBoolean("link.ignore.empty.anchors", true);
    }

    /**
     * Wraps values in ObjectWritable
     */
    public void map(Text key, Writable value,
      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
      throws IOException {

      ObjectWritable objWrite = new ObjectWritable();
      objWrite.set(value);
      output.collect(key, objWrite);
    }

    /**
     * Extracts and inverts outlinks, ignoring empty anchors.
     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, LinkDatum> output, Reporter reporter)
      throws IOException {

      List<LinkDatum> outlinkList = new ArrayList<LinkDatum>();
      Node node = null;

      // collect the outlinks while ignoring links with empty anchor text, also
      // assign the node
      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object obj = objWrite.get();
        if (obj instanceof LinkDatum) {
          LinkDatum next = (LinkDatum)obj;
          String anchor = next.getAnchor();
          if (anchor != null) {
            anchor = anchor.trim();
          }
          if (ignoreEmptyAnchors && (anchor == null || anchor.length() == 0)) {
            continue;
          }
          outlinkList.add(next);
        }
        else if (obj instanceof Node) {
          node = (Node)obj;
        }
      }

      // must have a node and at least one outlink to index
      if (node != null && outlinkList.size() > 0) {
        String fromUrl = key.toString();
        float outlinkScore = node.getInlinkScore();
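        // invert each outlink: re-key by the target url so it becomes an
        // inlink of the target, carrying the source url, the source's link
        // analysis score, and the anchor text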
        for (LinkDatum datum : outlinkList) {
          String toUrl = datum.getUrl();
          datum.setUrl(fromUrl);
          datum.setScore(outlinkScore);
          datum.setLinkType(LinkDatum.INLINK);
          output.collect(new Text(toUrl), datum);
        }
      }
    }

    public void close() {
    }
  }

  /**
   * Collects and creates FieldWritable objects from the inlinks.  Inlinks are
   * sorted by descending score before being collected.
   */
  public static class Collector
    extends Configured
    implements Mapper<Text, Writable, Text, ObjectWritable>,
    Reducer<Text, ObjectWritable, Text, FieldWritable> {

    private int maxInlinks = 1000;
    private boolean tokenize = true;
    private boolean stored = false;
    private Comparator<LinkDatum> descLinkComp = new DescendingLinkDatumScoreComparator();

    /**
     * Configures the job.  Sets the maximum number of inlinks and whether to
     * tokenize and store anchors.
     */
    public void configure(JobConf conf) {
      this.maxInlinks = conf.getInt("link.max.inlinks", 1000);
      this.tokenize = conf.getBoolean("indexer.anchor.tokenize", true);
      this.stored = conf.getBoolean("indexer.anchor.stored", false);
    }

    public void close() {
    }

    /**
     * Wraps values in ObjectWritable
     */
    public void map(Text key, Writable value,
      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
      throws IOException {

      ObjectWritable objWrite = new ObjectWritable();
      objWrite.set(value);
      output.collect(key, objWrite);
    }

    /**
     * Aggregates and sorts inlinks, then converts up to a maximum number into
     * FieldWritable objects.
     */
    public void reduce(Text key, Iterator<ObjectWritable> values,
      OutputCollector<Text, FieldWritable> output, Reporter reporter)
      throws IOException {

      List<LinkDatum> anchors = new ArrayList<LinkDatum>();
      FieldsWritable basicFields = null;

      // aggregate the inlinks and assign the basic fields
      while (values.hasNext()) {
        ObjectWritable objWrite = values.next();
        Object obj = objWrite.get();
        if (obj instanceof LinkDatum) {
          anchors.add((LinkDatum)obj);
        }
        else if (obj instanceof FieldsWritable) {
          basicFields = (FieldsWritable)obj;
        }
      }

      // only collect anchors for those urls that have basic fields, otherwise
      // we get orphan entries indexed only under anchor text
      if (basicFields != null && anchors.size() > 0) {

        // sort according to score descending
        Collections.sort(anchors, descLinkComp);

        // collect to maximum number of inlinks
        int numToCollect = (maxInlinks > anchors.size() ? anchors.size()
          : maxInlinks);
        for (int i = 0; i < numToCollect; i++) {
          LinkDatum datum = anchors.get(i);
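          // each surviving anchor becomes an ANCHOR field, honoring the
          // configured stored and tokenize settings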
          FieldWritable anchorField = new FieldWritable(Fields.ANCHOR,
            datum.getAnchor(), FieldType.CONTENT, true, stored, tokenize);
          output.collect(key, anchorField);
        }
      }
    }
  }

  /**
   * Creates FieldWritable objects from the anchors.
   *
   * @param webGraphDb The WebGraph from which to pull outlinks.
   * @param basicFields The BasicFields that must be present to avoid orphan
   * anchor fields.
   * @param output The AnchorFields output.
   *
   * @throws IOException If an error occurs while creating the fields.
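   *
   * <p>A minimal usage sketch (paths are illustrative assumptions):</p>
   * <pre>
   * AnchorFields anchorFields = new AnchorFields();
   * anchorFields.setConf(NutchConfiguration.create());
   * anchorFields.createFields(new Path("crawl/webgraphdb"),
   *   new Path("crawl/basicfields"), new Path("crawl/anchorfields"));
   * </pre>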
   */
  public void createFields(Path webGraphDb, Path basicFields, Path output)
    throws IOException {

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
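    // temporary path for the extractor output, made unique with a random suffix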
    Path tempLinks = new Path(output + "-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    runExtractor(webGraphDb, tempLinks);
    runCollector(basicFields, tempLinks, output);
    fs.delete(tempLinks, true);
  }

  public static void main(String[] args)
    throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new AnchorFields(),
      args);
    System.exit(res);
  }

  /**
   * Runs the AnchorFields job.
   */
  public int run(String[] args)
    throws Exception {

    Options options = new Options();
    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
      "show this help message").create("help");
    Option outputOpts = OptionBuilder.withArgName("output").hasArg().withDescription(
      "the output index directory").create("output");
    Option webGraphDbOpts = OptionBuilder.withArgName("webgraphdb").hasArg().withDescription(
      "the webgraphdb to use").create("webgraphdb");
    Option basicFieldOpts = OptionBuilder.withArgName("basicfields").hasArgs().withDescription(
      "the basicfields to use").create("basicfields");
    options.addOption(helpOpts);
    options.addOption(webGraphDbOpts);
    options.addOption(basicFieldOpts);
    options.addOption(outputOpts);

    CommandLineParser parser = new GnuParser();
    try {

      CommandLine line = parser.parse(options, args);
      if (line.hasOption("help") || !line.hasOption("webgraphdb")
        || !line.hasOption("output") || !line.hasOption("basicfields")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("AnchorFields", options);
        return -1;
      }

      String webGraphDb = line.getOptionValue("webgraphdb");
      String output = line.getOptionValue("output");
      String basicFields = line.getOptionValue("basicfields");

      createFields(new Path(webGraphDb), new Path(basicFields),
        new Path(output));
      return 0;
    }
    catch (Exception e) {
      LOG.fatal("AnchorFields: " + StringUtils.stringifyException(e));
      return -2;
    }
  }
}