Package ivory.core.preprocess

Source Code of ivory.core.preprocess.ComputeGlobalTermStatistics$MyMapper

/*
* Ivory: A Hadoop toolkit for web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package ivory.core.preprocess;

import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.document.TermDocVector;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Logger;

import edu.umd.cloud9.io.pair.PairOfIntLong;
import edu.umd.cloud9.util.PowerTool;

public class ComputeGlobalTermStatistics extends PowerTool {
  private static final Logger LOG = Logger.getLogger(ComputeGlobalTermStatistics.class);

  protected static enum Statistics {
    Docs, Terms, SumOfDocLengths
  }

  private static class MyMapper extends Mapper<IntWritable, TermDocVector, Text, PairOfIntLong> {
    private static final Text term = new Text();
    private static final PairOfIntLong pair = new PairOfIntLong();

    @Override
    public void map(IntWritable key, TermDocVector doc, Context context)
        throws IOException, InterruptedException {
      TermDocVector.Reader r = doc.getReader();
      int dl = 0, tf = 0;
      while (r.hasMoreTerms()) {
        term.set(r.nextTerm());
        tf = r.getTf();
        dl += tf;
        pair.set(1, tf);
        context.write(term, pair);
      }

      context.getCounter(Statistics.Docs).increment(1);
      context.getCounter(Statistics.SumOfDocLengths).increment(dl);
    }
  }

  private static class MyCombiner extends Reducer<Text, PairOfIntLong, Text, PairOfIntLong> {
    private static final PairOfIntLong output = new PairOfIntLong();

    @Override
    public void reduce(Text key, Iterable<PairOfIntLong> values, Context context)
        throws IOException, InterruptedException {
      int df = 0;
      long cf = 0;
      for (PairOfIntLong pair : values) {
        df += pair.getLeftElement();
        cf += pair.getRightElement();
      }

      output.set(df, cf);
      context.write(key, output);
    }
  }

  private static class MyReducer extends Reducer<Text, PairOfIntLong, Text, PairOfIntLong> {
    private static final PairOfIntLong output = new PairOfIntLong();
    private int minDf, maxDf;

    @Override
    public void setup(Reducer<Text, PairOfIntLong, Text, PairOfIntLong>.Context context) {
      minDf = context.getConfiguration().getInt(Constants.MinDf, 2);
      maxDf = context.getConfiguration().getInt(Constants.MaxDf, Integer.MAX_VALUE);
    }

    @Override
    public void reduce(Text key, Iterable<PairOfIntLong> values, Context context)
        throws IOException, InterruptedException {
      int df = 0;
      long cf = 0;
      for (PairOfIntLong pair : values) {
        df += pair.getLeftElement();
        cf += pair.getRightElement();
      }
      if (df < minDf || df > maxDf) {
        return;
      }
      context.getCounter(Statistics.Terms).increment(1);
      output.set(df, cf);
      context.write(key, output);
    }
  }

  public static final String[] RequiredParameters = {
          Constants.CollectionName, Constants.IndexPath, Constants.MinDf, Constants.MaxDf };

  public String[] getRequiredParameters() {
    return RequiredParameters;
  }

  public ComputeGlobalTermStatistics(Configuration conf) {
    super(conf);
  }

  public int runTool() throws Exception {
    Configuration conf = getConf();

    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int reduceTasks = 10;

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
      LOG.info("index path doesn't existing: skipping!");
      return 0;
    }

    LOG.info("PowerTool: " + ComputeGlobalTermStatistics.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
      LOG.info("TermDfCf directory exist: skipping!");
      return 0;
    }

    Job job = new Job(getConf(), ComputeGlobalTermStatistics.class.getSimpleName() + ":"
        + collectionName);
    job.setJarByClass(ComputeGlobalTermStatistics.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    // Write out number of postings. NOTE: this value is not the same as
    // number of postings, because postings for non-English terms are
    // discarded, or as result of df cut.
    env.writeCollectionTermCount((int) counters.findCounter(Statistics.Terms).getValue());

    env.writeCollectionLength(counters.findCounter(Statistics.SumOfDocLengths).getValue());
    return 0;
  }
}
TOP

Related Classes of ivory.core.preprocess.ComputeGlobalTermStatistics$MyMapper

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.