/**
* Copyright [2012] [Datasalt Systems S.L.]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.datasalt.pangool.tuplemr.mapred;
import static org.junit.Assert.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.junit.Assert;
import org.junit.Test;
import com.datasalt.pangool.io.DatumWrapper;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.io.Schema.Field.Type;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.tuplemr.TupleMRConfig;
import com.datasalt.pangool.tuplemr.TupleMRConfigBuilder;
import com.datasalt.pangool.tuplemr.TupleMRException;
import com.datasalt.pangool.utils.TestUtils;
import com.datasalt.pangool.utils.test.AbstractBaseTest;
/**
 * Tests for {@link TupleHashPartitioner}: checks that the partition depends only on the
 * group-by fields, that tuples spread over every partition, and that tuples containing
 * nulls can be hashed.
 */
// Raw DatumWrapper instances are handed to the partitioner, hence the suppression.
@SuppressWarnings({ "rawtypes", "unchecked" })
public class TestTupleHashPartitioner extends AbstractBaseTest {

  /** Number of random tuples pushed through the partitioner in each loop. */
  static final int MAX_ITERATIONS_OVER_ONE_SCHEMA = 100000;

  /** Number of partitions requested from the partitioner. */
  static final int N_PARTITIONS = 5;

  /**
   * Configures a partitioner with two intermediate schemas grouped by (number1, string1) and
   * checks, using tuples of the first schema, that:
   * <ul>
   * <li>changing a field outside the group-by never changes the partition, and</li>
   * <li>changing a group-by field spreads the tuples over all partitions.</li>
   * </ul>
   */
  @Test
  public void multipleSourcesTest() throws TupleMRException, IOException {
    Configuration conf = getConf();
    TupleHashPartitioner partitioner = new TupleHashPartitioner();

    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("number1", Type.INT));
    fields.add(Field.create("string1", Type.STRING));
    fields.add(Field.create("string2", Type.STRING));
    Schema schema1 = new Schema("test1", fields);

    fields = new ArrayList<Field>();
    fields.add(Field.create("number1", Type.INT));
    fields.add(Field.create("string1", Type.STRING));
    fields.add(Field.create("number2", Type.LONG));
    Schema schema2 = new Schema("test2", fields);

    TupleMRConfigBuilder builder = new TupleMRConfigBuilder();
    builder.addIntermediateSchema(schema1);
    builder.addIntermediateSchema(schema2);
    builder.setGroupByFields("number1", "string1");
    TupleMRConfig tupleMRConf = builder.buildConf();
    TupleMRConfig.set(tupleMRConf, conf);
    partitioner.setConf(conf);

    ITuple tuple = new Tuple(schema1);
    tuple.set("number1", 35);
    tuple.set("string1", "foo");

    // Check that for the same prefix (number1, string1) we obtain the same partition.
    // -1 means "no partition observed yet"; valid partition ids are never negative.
    int expectedPartitionId = -1;
    for(int i = 0; i < MAX_ITERATIONS_OVER_ONE_SCHEMA; i++) {
      tuple.set("string2", TestUtils.randomString(10));
      int thisPartitionId = partitionOf(partitioner, tuple);
      if(expectedPartitionId < 0) {
        expectedPartitionId = thisPartitionId;
      } else {
        // The returned partition must always be the same even if "string2" changes its value
        Assert.assertEquals(expectedPartitionId, thisPartitionId);
      }
    }

    // On the other hand, check that when we vary one of the group by fields, partition varies
    int[] partitionMatches = new int[N_PARTITIONS];
    for(int i = 0; i < MAX_ITERATIONS_OVER_ONE_SCHEMA; i++) {
      tuple.set("string1", TestUtils.randomString(10));
      partitionMatches[partitionOf(partitioner, tuple)]++;
    }
    assertEveryPartitionMatched(partitionMatches);
  }

  /**
   * Since pangool supports optional nulls, we should be able to hashcode tuples with nulls
   */
  @Test
  public void testNulls() throws TupleMRException, IOException {
    TupleHashPartitioner partitioner = new TupleHashPartitioner();
    Schema schema = new Schema("schema", Fields.parse("a:string, b:int, c:long, d:double"));
    Tuple tuple = new Tuple(schema);
    int[] allFields = new int[] { 0, 1, 2, 3 };

    // No field has been set yet: the partial hash code is expected to be 0.
    assertEquals(0, partitioner.partialHashCode(tuple, allFields));

    // With only "b" set, the expected hash is the hash code of that single boxed value.
    tuple.set("b", 100);
    assertEquals(Integer.valueOf(100).hashCode(), partitioner.partialHashCode(tuple, allFields));
  }

  /**
   * Basic sanity test for checking that the partitioner works for nPartitions &gt; 1: random
   * values of a single INT group-by field must end up hitting every partition.
   */
  @Test
  public void sanityTest() throws TupleMRException, IOException {
    Configuration conf = getConf();
    TupleHashPartitioner partitioner = new TupleHashPartitioner();

    List<Field> fields = new ArrayList<Field>();
    // We use one INT field - we'll put random numbers in it
    fields.add(Field.create("foo", Type.INT));
    Schema schema = new Schema("test", fields);

    TupleMRConfigBuilder builder = new TupleMRConfigBuilder();
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("foo");
    TupleMRConfig tupleMRConf = builder.buildConf();
    TupleMRConfig.set(tupleMRConf, conf);
    partitioner.setConf(conf);

    ITuple tuple = new Tuple(schema);
    int[] partitionMatches = new int[N_PARTITIONS];
    for(int i = 0; i < MAX_ITERATIONS_OVER_ONE_SCHEMA; i++) {
      tuple.set("foo", (int) (Math.random() * Integer.MAX_VALUE));
      partitionMatches[partitionOf(partitioner, tuple)]++;
    }
    assertEveryPartitionMatched(partitionMatches);
  }

  /**
   * Asks the partitioner for the partition of the given tuple (out of {@link #N_PARTITIONS})
   * and asserts that the answer lies in the range [0, N_PARTITIONS).
   *
   * @param partitioner the already configured partitioner under test
   * @param tuple the tuple to partition
   * @return the partition id returned by the partitioner
   */
  private static int partitionOf(TupleHashPartitioner partitioner, ITuple tuple) {
    int partitionId = partitioner.getPartition(new DatumWrapper(tuple), NullWritable.get(), N_PARTITIONS);
    Assert.assertTrue("Negative partition id: " + partitionId, partitionId >= 0);
    Assert.assertTrue("Partition id " + partitionId + " is not below " + N_PARTITIONS,
        partitionId < N_PARTITIONS);
    return partitionId;
  }

  /**
   * Fails if any partition received no tuples at all.
   *
   * @param partitionMatches number of tuples that were assigned to each partition, by index
   */
  private static void assertEveryPartitionMatched(int[] partitionMatches) {
    for(int i = 0; i < partitionMatches.length; i++) {
      if(partitionMatches[i] == 0) {
        throw new AssertionError("Partition matches: 0 for partition " + i + ". Seems like a bug in the Partitioner.");
      }
    }
  }
}