Package com.datasalt.pangool.tuplemr.mapred

Source Code of com.datasalt.pangool.tuplemr.mapred.TupleHashPartitioner

/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datasalt.pangool.tuplemr.mapred;

import java.util.Arrays;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import com.datasalt.pangool.io.DatumWrapper;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Utf8;
import com.datasalt.pangool.tuplemr.SerializationInfo;
import com.datasalt.pangool.tuplemr.TupleMRConfig;
import com.datasalt.pangool.tuplemr.TupleMRException;

public class TupleHashPartitioner extends Partitioner<DatumWrapper<ITuple>, NullWritable>
    implements Configurable {

  private TupleMRConfig tupleMRConfig;
  private SerializationInfo serInfo;

  private Configuration conf;
  private final Utf8 HELPER_UTF8 = new Utf8(); // to perform hashCode of strings

  @Override
  public int getPartition(DatumWrapper<ITuple> key, NullWritable value, int numPartitions) {
    if(numPartitions == 1) {
      // in this case the schema is not checked if it's valid
      return 0;
    } else {
      ITuple tuple = key.datum();
      String sourceName = tuple.getSchema().getName();
      Integer schemaId = tupleMRConfig.getSchemaIdByName(sourceName);
      if(schemaId == null) {
        throw new RuntimeException("Schema name '" + sourceName
            + "' is unknown. Known schemas are : "
            + tupleMRConfig.getIntermediateSchemaNames());
      }
      int[] fieldsToPartition = serInfo.getPartitionFieldsIndexes().get(schemaId);
      if(fieldsToPartition.length == 0) {
        throw new RuntimeException("Fields to partition is 0. Something has been wrongly configured.");
      }
      return (partialHashCode(tuple, fieldsToPartition) & Integer.MAX_VALUE)
          % numPartitions;
    }
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    if(conf != null) {
      this.conf = conf;
      try {
        this.tupleMRConfig = TupleMRConfig.get(conf);
        this.serInfo = tupleMRConfig.getSerializationInfo();
      } catch(TupleMRException e) {
        throw new RuntimeException(e);
      }
    }
  }

  /**
   * Calculates a combinated hashCode using the specified number of fields.
   *
   */
  public int partialHashCode(ITuple tuple, int[] fields) {
    int result = 0;
    for(int field : fields) {
      Object o = tuple.get(field);
      int hashCode;
      if(o instanceof Utf8) { // since String.hashCode() != Utf8.hashCode()
        HELPER_UTF8.set((String) o);
        hashCode = HELPER_UTF8.hashCode();
      } else if(o instanceof Text) {
        HELPER_UTF8.set((Text) o);
        hashCode = HELPER_UTF8.hashCode();
      } else {
        hashCode = o.hashCode();
      }
      result = result * 31 + hashCode;
    }
    return result;
  }

}
TOP

Related Classes of com.datasalt.pangool.tuplemr.mapred.TupleHashPartitioner

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.