Package ivory.core.index

Source Code of ivory.core.index.BuildIPInvertedIndexDocSorted$MyPartitioner

/*
* Ivory: A Hadoop toolkit for web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package ivory.core.index;

import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.document.IntDocVector;
import ivory.core.data.document.IntDocVector.Reader;
import ivory.core.data.index.PostingsList;
import ivory.core.data.index.PostingsListDocSortedPositional;
import ivory.core.data.index.TermPositions;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import edu.umd.cloud9.io.pair.PairOfInts;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.util.map.HMapII;
import edu.umd.cloud9.util.map.MapII;

/**
* Indexer for building document-sorted inverted indexes.
*
* @author Jimmy Lin
* @author Tamer Elsayed
*/
public class BuildIPInvertedIndexDocSorted extends PowerTool {
  private static final Logger LOG = Logger.getLogger(BuildIPInvertedIndexDocSorted.class);

  // Hadoop counters updated by the mapper and reducer of this job.
  // Docs.Total: incremented by one for every map() call (i.e., per input document vector).
  protected static enum Docs { Total }
  // IndexedTerms.Total: incremented per document by the sum of term frequencies seen in map().
  // IndexedTerms.Unique: incremented by one each time the reducer writes a postings list.
  protected static enum IndexedTerms { Unique, Total }
  // MapTime.Total: wall-clock milliseconds accumulated inside map().
  protected static enum MapTime { Total }
  // ReduceTime.Total: wall-clock milliseconds accumulated inside the reducer.
  protected static enum ReduceTime { Total }

  private static class MyMapper
      extends Mapper<IntWritable, IntDocVector, PairOfInts, TermPositions> {
    private static final TermPositions termPositions = new TermPositions();
    private static final PairOfInts pair = new PairOfInts();
    private static final HMapII dfs = new HMapII(); // Holds dfs of terms processed by this mapper.

    private int docno;

    @Override
    public void setup(Context context) {
      dfs.clear();
    }

    @Override
    public void map(IntWritable key, IntDocVector doc, Context context)
        throws IOException, InterruptedException {
      docno = key.get();

      long startTime = System.currentTimeMillis();
      Reader r = doc.getReader();

      int dl = 0;
      while (r.hasMoreTerms()) {
        int term = r.nextTerm();
        r.getPositions(termPositions);

        // Set up the key and value, and emit.
        pair.set(term, docno);
        context.write(pair, termPositions);

        // Document length of the current doc.
        dl += termPositions.getTf();

        // Df of the term in the partition handled by this mapper.
        dfs.increment(term);
      }

      context.getCounter(IndexedTerms.Total).increment(dl); // Update number of indexed terms.
      context.getCounter(Docs.Total).increment(1);          // Update number of docs.
      context.getCounter(MapTime.Total).increment(System.currentTimeMillis() - startTime);
    }

    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
      int[] arr = new int[1];

      // Emit dfs for terms encountered in this partition of the collection.
      for (MapII.Entry e : dfs.entrySet()) {
        arr[0] = e.getValue();
        termPositions.set(arr, (short) 1); // Dummy value.
        // Special docno of "-1" to make sure this key-value pair comes before all other postings in
        // the reducer phase.
        pair.set(e.getKey(), -1);
        context.write(pair, termPositions);
      }
    }
  }

  private static class MyReducer
      extends Reducer<PairOfInts, TermPositions, IntWritable, PostingsList> {
    private static final IntWritable term = new IntWritable();
    private static final PostingsList postings = new PostingsListDocSortedPositional();

    private int prevTerm = -1;
    private int numPostings = 0;

    @Override
    public void setup(Context context) {
      LOG.setLevel(Level.WARN);
      int cnt = context.getConfiguration().getInt(Constants.CollectionDocumentCount, 0);
      if (cnt == 0) {
        throw new RuntimeException("Error: size of collection cannot be zero!");
      }
      postings.setCollectionDocumentCount(cnt);
    }

    @Override
    public void reduce(PairOfInts pair, Iterable<TermPositions> values, Context context)
        throws IOException, InterruptedException {
      long start = System.currentTimeMillis();
      int curTerm = pair.getLeftElement();

      if (pair.getRightElement() == -1) {
        if (prevTerm != -1 && curTerm != prevTerm) {
          // Encountered next term, so emit postings corresponding to previous term.
          if (numPostings != postings.size()) {
            throw new RuntimeException(String.format(
                "Error: actual number of postings processed is different from expected! " +
                    "expected: %d, got: %d for term %d", numPostings, postings.size(), prevTerm));
          }

          term.set(prevTerm);
          context.write(term, postings);
          context.getCounter(IndexedTerms.Unique).increment(1);

          LOG.info(String.format("Finished processing postings for term %d (num postings=%d)",
              prevTerm, postings.size()));
          postings.clear();
        }

        numPostings = 0;
        Iterator<TermPositions> iter = values.iterator();
        while (iter.hasNext()) {
          TermPositions positions = iter.next();
          numPostings += positions.getPositions()[0];
        }

        postings.setNumberOfPostings(numPostings);
        return;
      }

      Iterator<TermPositions> iter = values.iterator();
      TermPositions positions = iter.next();
      postings.add(pair.getRightElement(), positions.getTf(), positions);

      if (iter.hasNext()) {
        throw new RuntimeException(
            String.format("Error: values with the same (term, docno): docno=%d, term=%d",
                pair.getRightElement(), curTerm));
      }

      prevTerm = curTerm;

      long duration = System.currentTimeMillis() - start;
      context.getCounter(ReduceTime.Total).increment(duration);
    }

    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
      long start = System.currentTimeMillis();

      // We need to flush out the final postings list.
      if (numPostings != postings.size()) {
        throw new RuntimeException(String.format(
            "Error: actual number of postings processed is different from expected! " +
                "expected: %d, got: %d for term %d", numPostings, postings.size(), prevTerm));
      }

      term.set(prevTerm);
      context.write(term, postings);
      context.getCounter(IndexedTerms.Unique).increment(1);

      LOG.info(String.format("Finished processing postings for term %d (num postings=%d)",
          prevTerm, postings.size()));
      context.getCounter(ReduceTime.Total).increment(System.currentTimeMillis() - start);
    }
  }

  private static class MyPartitioner extends Partitioner<PairOfInts, TermPositions> {
    // Keys with the same terms should go to the same reducer.
    @Override
    public int getPartition(PairOfInts key, TermPositions value, int numReduceTasks) {
      return (key.getLeftElement() & Integer.MAX_VALUE) % numReduceTasks;
    }
  }

  /**
   * Configuration keys returned by {@link #getRequiredParameters()}: the number of reduce tasks
   * and the index path.
   */
  public static final String[] RequiredParameters = {
          Constants.NumReduceTasks, Constants.IndexPath };

  /**
   * Returns the configuration keys this tool lists as required.
   * NOTE(review): presumably consulted by PowerTool before runTool() is invoked -- confirm.
   *
   * @return the shared {@link #RequiredParameters} array itself (not a copy)
   */
  public String[] getRequiredParameters() {
    return RequiredParameters;
  }

  /**
   * Creates the tool with the given Hadoop configuration, which is passed to the superclass.
   *
   * @param conf configuration later read by {@link #runTool()} via {@code getConf()}
   */
  public BuildIPInvertedIndexDocSorted(Configuration conf) {
    super(conf);
  }

  public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    LOG.info("PowerTool: " + BuildIPInvertedIndexDocSorted.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCnt));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
    LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));

    if (!fs.exists(new Path(indexPath))) {
      fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
      LOG.info("Postings already exist: no indexing will be performed.");
      return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCnt);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    Job job = new Job(conf,
        BuildIPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildIPInvertedIndexDocSorted.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, postingsPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(TermPositions.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PostingsListDocSortedPositional.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType(PostingsListDocSortedPositional.class.getCanonicalName());

    return 0;
  }
}
TOP

Related Classes of ivory.core.index.BuildIPInvertedIndexDocSorted$MyPartitioner

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.