Package ivory.ptc

Source Code of ivory.ptc.AnchorTextInvertedIndex$MyReducer

/**
* Ivory: A Hadoop toolkit for Web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package ivory.ptc;

import ivory.ptc.data.AnchorTextTarget;
import ivory.ptc.judgments.weighting.WeightingScheme;

import java.io.IOException;
import java.net.URI;
import java.util.Collections;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;


import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.io.array.ArrayListOfIntsWritable;
import edu.umd.cloud9.io.array.ArrayListWritable;


/**
* Map-Reduce job that constructs anchor text-inverted index.
* The inverted index contains, for each unique anchor text,
* a list of documents that are pointed to by that anchor text.
*
* @author Nima Asadi
*/
@SuppressWarnings("deprecation")
public class AnchorTextInvertedIndex extends PowerTool {
  private static final Logger LOG = Logger.getLogger(AnchorTextInvertedIndex.class);
  public static final String PARAMETER_SEPARATER = ",";

  static {
    LOG.setLevel(Level.INFO);
  }

  private static class MyMapper extends MapReduceBase implements
      Mapper<IntWritable, ArrayListWritable<AnchorText>, Text, AnchorTextTarget> {
    private static final AnchorTextTarget anchorTextTarget = new AnchorTextTarget();
    private static final Text keyOut = new Text();
    // Weighting scheme used to rank target documents
    private static WeightingScheme weightingScheme;

    @Override
    public void configure(JobConf job) {
      Path[] localFiles;
      try {
        localFiles = DistributedCache.getLocalCacheFiles(job);
      } catch (IOException e) {
        throw new RuntimeException("Local cache files not read properly.");
      }
     
      String[] params = new String[job.get("Ivory.WeightingSchemeParameters").split(PARAMETER_SEPARATER).length];
      for (int i = 0; i < params.length; i++) {
        params[i] = localFiles[i].toString();
      }

      try {
        weightingScheme = (WeightingScheme) Class.forName(
            job.get("Ivory.WeightingScheme")).newInstance();
        weightingScheme.initialize(FileSystem.getLocal(job), params);
      } catch (Exception e) {
        throw new RuntimeException("Mapper failed to initialize the weighting scheme: "
            + job.get("Ivory.WeightingScheme") + " with parameters: "
                + job.get("Ivory.WeightingSchemeParameters"));
      }
    }

    @Override
    public void map(IntWritable key, ArrayListWritable<AnchorText> anchors,
        OutputCollector<Text, AnchorTextTarget> output, Reporter reporter) throws IOException {
      anchorTextTarget.setTarget(key.get());
      for (AnchorText anchor : anchors) {
        // Internal links provide navigational information which are not useful for our purposes.
        if (!anchor.isExternalInLink() && !anchor.isInternalInLink()) {
          continue;
        }

        keyOut.set(anchor.getText());
        anchorTextTarget.setSources(new ArrayListOfIntsWritable(anchor.getDocuments()));
        anchorTextTarget.setWeight(weightingScheme.getWeight(key.get(), anchor));
        output.collect(keyOut, anchorTextTarget);
      }
    }
  }

  private static class MyReducer extends MapReduceBase implements
      Reducer<Text, AnchorTextTarget, Text, ArrayListWritable<AnchorTextTarget>> {
    private static final ArrayListWritable<AnchorTextTarget> outList =
        new ArrayListWritable<AnchorTextTarget>();

    @Override
    public void reduce(Text anchorText, Iterator<AnchorTextTarget> values,
        OutputCollector<Text, ArrayListWritable<AnchorTextTarget>> output, Reporter reporter)
            throws IOException {
      outList.clear();
      while (values.hasNext()) {
        outList.add(new AnchorTextTarget(values.next()));
      }

      Collections.sort(outList);
      output.collect(anchorText, outList);
    }
  }

  public static final String[] RequiredParameters = {
    "Ivory.NumMapTasks",
    "Ivory.NumReduceTasks",
    "Ivory.InputPath",
    "Ivory.OutputPath",
    "Ivory.WeightingScheme",
    "Ivory.WeightingSchemeParameters",
  };

  @Override
  public String[] getRequiredParameters() {
    return RequiredParameters;
  }

  public AnchorTextInvertedIndex(Configuration conf) {
    super(conf);
  }

  @Override
  public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), AnchorTextInvertedIndex.class);
    FileSystem fs = FileSystem.get(conf);
    String inPath = conf.get("Ivory.InputPath");
    String outPath = conf.get("Ivory.OutputPath");
    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 1);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 100);
    String weightingSchemeParameters = conf.get("Ivory.WeightingSchemeParameters");

    LOG.info("BuildAnchorTextInvertedIndex");
    LOG.info(" - input path: " + inPath);
    LOG.info(" - output path: " + outPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    LOG.info(" - weighting scheme: " + conf.get("Ivory.WeightingScheme"));
    LOG.info(" - weighting scheme parameters: " + weightingSchemeParameters);

    String[] params = weightingSchemeParameters.split(PARAMETER_SEPARATER);
    for(String param : params) {
      DistributedCache.addCacheFile(new URI(param), conf);
    }

    conf.setJobName("BuildAnchorTextInvertedIndex");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    conf.setInt("mapred.task.timeout", 60000000);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(AnchorTextTarget.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ArrayListWritable.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    fs.delete(outputPath);
    JobClient.runJob(conf);
    return 0;
  }
}
TOP

Related Classes of ivory.ptc.AnchorTextInvertedIndex$MyReducer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.