/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.datasalt.pangool.tuplemr.mapred;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.mapreduce.ReduceContext;
import org.apache.hadoop.mapreduce.Reducer;

import com.datasalt.pangool.io.DatumWrapper;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.ViewTuple;
import com.datasalt.pangool.tuplemr.Criteria.SortElement;
import com.datasalt.pangool.tuplemr.SerializationInfo;
import com.datasalt.pangool.tuplemr.TupleMRConfig;
import com.datasalt.pangool.tuplemr.TupleMRConfigBuilder;
import com.datasalt.pangool.tuplemr.TupleMRException;
import com.datasalt.pangool.tuplemr.TupleReducer;
import com.datasalt.pangool.tuplemr.TupleReducer.TupleMRContext;
import com.datasalt.pangool.tuplemr.TupleRollupReducer;
import com.datasalt.pangool.utils.DCUtils;

/**
*
* This {@link Reducer} implements functionality similar to {@link SimpleReducer},
* but adds a rollup feature.
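*
* <p>
* With rollup enabled, the user's {@link TupleRollupReducer} receives
* {@code onOpenGroup()} / {@code onCloseGroup()} callbacks every time one of the
* leading group-by fields (from the rollup-from field down to the last group-by
* field) changes value, so aggregates can be computed at several nesting levels
* in a single job.
* <p>
* A rough configuration sketch (not part of this file; {@code conf},
* {@code schema} and {@code MyRollupReducer} are placeholders, and input/output
* setup is omitted):
*
* <pre>
* TupleMRBuilder builder = new TupleMRBuilder(conf, "rollup-example");
* builder.addIntermediateSchema(schema);
* builder.setGroupByFields("country", "city");
* // rolling up from "country" fires group callbacks for both nesting levels
* builder.setRollupFrom("country");
* builder.setTupleReducer(new MyRollupReducer()); // extends TupleRollupReducer
* Job job = builder.createJob();
* </pre>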
*/
public class RollupReducer<OUTPUT_KEY, OUTPUT_VALUE> extends
    Reducer<DatumWrapper<ITuple>, NullWritable, OUTPUT_KEY, OUTPUT_VALUE> {

  private boolean firstRun = true;
  private TupleMRConfig tupleMRConfig;
  private SerializationInfo serInfo;
  private TupleMRContext context;
  private TupleReducer<OUTPUT_KEY, OUTPUT_VALUE>.Collector collector;
  private int minDepth, maxDepth;
  private ViewTuple groupTuple;
  private TupleIterator<OUTPUT_KEY, OUTPUT_VALUE> tupleIterator;
  private TupleRollupReducer<OUTPUT_KEY, OUTPUT_VALUE> handler;
  private boolean isMultipleSources;
  private Schema groupSchema;
  private RawComparator<?>[] customComparators;

  @Override
  public void setup(Context context) throws IOException, InterruptedException {
    try {
      this.tupleMRConfig = TupleMRConfig.get(context.getConfiguration());
      this.isMultipleSources = this.tupleMRConfig.getNumIntermediateSchemas() >= 2;
      this.serInfo = tupleMRConfig.getSerializationInfo();
      this.groupSchema = this.serInfo.getGroupSchema();
      if(!isMultipleSources) {
        /* schema id 0 is the only intermediate schema defined */
        this.groupTuple = new ViewTuple(groupSchema,
            this.serInfo.getGroupSchemaIndexTranslation(0));
      } else {
        this.groupTuple = new ViewTuple(groupSchema);
      }
      List<String> groupFields = tupleMRConfig.getGroupByFields();
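      // maxDepth is the index of the last group-by field; minDepth is the index
      // of the last rollup base field (the field rollup starts from). Group
      // open/close callbacks are fired for every depth in [minDepth, maxDepth].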
      this.maxDepth = groupFields.size() - 1;
      this.minDepth = tupleMRConfig.calculateRollupBaseFields().size() - 1;
      this.tupleIterator = new TupleIterator<OUTPUT_KEY, OUTPUT_VALUE>(context);
      initHandlerContextAndCollector(context);
      initComparators();
    } catch(TupleMRException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Initializes the custom comparators, building a quick-access array indexed
   * by the field position in the common sort criteria.
   */
  private void initComparators() {
    TupleMRConfigBuilder.initializeComparators(context.getHadoopContext()
        .getConfiguration(), tupleMRConfig);
    customComparators = new RawComparator<?>[maxDepth + 1];
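    // Comparators are indexed by their position in the common sort criteria so
    // that indexMismatch() can look them up directly by field depth.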
    for(int i = minDepth; i <= maxDepth; i++) {
      SortElement element = tupleMRConfig.getCommonCriteria().getElements().get(i);
      if(element.getCustomComparator() != null) {
        customComparators[i] = element.getCustomComparator();
      }
    }
  }

  @SuppressWarnings("unchecked")
  private void initHandlerContextAndCollector(Context context) throws IOException,
      InterruptedException, TupleMRException {
    String fileName = context.getConfiguration().get(SimpleReducer.CONF_REDUCER_HANDLER);
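    // The user's TupleRollupReducer instance was serialized and distributed via
    // the DistributedCache at job-creation time; deserialize it here.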
    handler = DCUtils.loadSerializedObjectInDC(context.getConfiguration(),
        TupleRollupReducer.class, fileName, true);
    collector = handler.new Collector(
        (ReduceContext<DatumWrapper<ITuple>, NullWritable, Object, Object>) context);
    this.context = new TupleMRContext(
        (ReduceContext<DatumWrapper<ITuple>, NullWritable, Object, Object>) context,
        tupleMRConfig);
    handler.setup(this.context, collector);
  }

  @Override
  public void cleanup(Context context) throws IOException, InterruptedException {
    try {
      handler.cleanup(this.context, collector);
      collector.close();
      super.cleanup(context);
    } catch(TupleMRException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public final void run(Context context) throws IOException, InterruptedException {
    try {
      setup(context);
      firstRun = true;
      while(context.nextKey()) {
        reduce(context.getCurrentKey(), context.getValues(), context);
        // TODO look if this matches super.run() implementation
      }

      // Close the groups that are still open for the last key, from the deepest
      // level down to the rollup base.
      for(int i = maxDepth; i >= minDepth; i--) {
        handler.onCloseGroup(i, groupSchema.getField(i).getName(), context
            .getCurrentKey().datum(), this.context, collector);
      }
      cleanup(context);
    } catch(TupleMRException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public final void reduce(DatumWrapper<ITuple> key, Iterable<NullWritable> values,
      Context context) throws IOException, InterruptedException {

    try {
      Iterator<NullWritable> iterator = values.iterator();
      tupleIterator.setIterator(iterator);
      ITuple currentTuple = key.datum();
      ITuple previousKey = key.previousDatum();
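      // indexMismatch is the depth of the first group-by field whose value
      // changed with respect to the previous tuple: groups at that depth and
      // deeper are closed (deepest first) and reopened for the new value.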
      int indexMismatch;
      if(firstRun) {
        indexMismatch = minDepth;
        firstRun = false;
      } else {
        indexMismatch = indexMismatch(previousKey, currentTuple, 0, maxDepth);
        if(indexMismatch < minDepth) {
          indexMismatch = minDepth;
        }
        for(int i = maxDepth; i >= indexMismatch; i--) {
          handler.onCloseGroup(i, groupSchema.getField(i).getName(), previousKey,
              this.context, collector);
        }
      }

      for(int i = indexMismatch; i <= maxDepth; i++) {
        handler.onOpenGroup(i, groupSchema.getField(i).getName(), currentTuple,
            this.context, collector);
      }

      // Expose only the group-by fields to the user's reduce() through a
      // ViewTuple; with multiple intermediate schemas the field indexes must be
      // translated to the group schema first.
      if(isMultipleSources) {
        int schemaId = tupleMRConfig
            .getSchemaIdByName(currentTuple.getSchema().getName());
        int[] indexTranslation = serInfo.getGroupSchemaIndexTranslation(schemaId);
        groupTuple.setContained(currentTuple, indexTranslation);
      } else {
        groupTuple.setContained(currentTuple);
      }

      handler.reduce(groupTuple, tupleIterator, this.context, collector);

      // Consume any remaining elements that reduce() didn't consume, so that the
      // last element of the group is correctly set for the next onCloseGroup()
      // call.
      while(iterator.hasNext()) {
        iterator.next();
      }
    } catch(TupleMRException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Sequentially compares the fields of two tuples and returns the index of the
   * first field in which they differ. Custom comparators are used when
   * provided; the supplied {@link RawComparator}s must implement
   * {@code compare} over deserialized objects.
   * <p>
   * Important: the contract of this method is that the tuples must always
   * differ somewhere between {@code minFieldIndex} and {@code maxFieldIndex}.
   * If they are equal, an exception is thrown.
   * </p>
   */
  private int indexMismatch(ITuple tuple1, ITuple tuple2, int minFieldIndex,
      int maxFieldIndex) {
    int schemaId1 = tupleMRConfig.getSchemaIdByName(tuple1.getSchema().getName());
    int schemaId2 = tupleMRConfig.getSchemaIdByName(tuple2.getSchema().getName());
    int[] translationTuple1 = serInfo.getGroupSchemaIndexTranslation(schemaId1);
    int[] translationTuple2 = serInfo.getGroupSchemaIndexTranslation(schemaId2);

    for(int i = minFieldIndex; i <= maxFieldIndex; i++) {
      Object obj1 = tuple1.get(translationTuple1[i]);
      Object obj2 = tuple2.get(translationTuple2[i]);
      @SuppressWarnings("unchecked")
      RawComparator<Object> customComparator = (RawComparator<Object>) customComparators[i];

      if(customComparator != null) {
        if(customComparator.compare(obj1, obj2) != 0) {
          return i;
        }
      } else {
        if(!obj1.equals(obj2)) {
          return i;
        }
      }
    }
    throw new RuntimeException("Illegal state.The tuples " + tuple1 + " and " + tuple2
        + " compare the same between indexes " + minFieldIndex + " and " + maxFieldIndex);
  }
}