package com.splout.db.hadoop;
/*
* #%L
* Splout SQL Hadoop library
* %%
* Copyright (C) 2012 Datasalt Systems S.L.
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.junit.Test;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.io.TupleFile;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleInputFormat;
import com.splout.db.hadoop.TupleSampler.SamplingType;
import com.splout.db.hadoop.TupleSampler.TupleSamplerException;
/**
 * Tests for {@link TupleSampler}: RANDOM (split-based) sampling and FULL_SCAN
 * (reservoir) sampling over a {@link TupleFile} input.
 */
public class TestTupleSampler {

  public static String INPUT = "input-" + TestTupleSampler.class.getName();
  public static String OUTPUT = "output-" + TestTupleSampler.class.getName();

  /** Two-field schema used by all tests: a string id and a string payload. */
  final Schema schema = new Schema("schema", Fields.parse("id:string, foo:string"));

  /**
   * Synchronously and portably deletes a local path (recursively).
   * <p>
   * Replaces the previous {@code Runtime.getRuntime().exec("rm -rf ...")} calls,
   * which were POSIX-only and, worse, asynchronous: exec() returns before the
   * child process finishes, so cleanup could race with the file creation that
   * followed it.
   */
  private static void delete(Configuration conf, String path) throws IOException {
    FileSystem.get(conf).delete(new Path(path), true);
  }

  @Test
  public void testDefault() throws IOException, InterruptedException, TupleSamplerException {
    // Exercise RANDOM sampling with 1, 2 and 3 input splits respectively.
    testDefault(Long.MAX_VALUE, 1);
    testDefault(1024 * 100, 2);
    testDefault(1024 * 10, 3);
  }

  @Test
  public void testReservoir() throws IOException, InterruptedException, TupleSamplerException {
    Configuration conf = new Configuration();
    delete(conf, INPUT + "_r");
    delete(conf, OUTPUT + "_r");

    TupleFile.Writer writer = new TupleFile.Writer(FileSystem.get(conf), conf, new Path(INPUT + "_r"), schema);
    try {
      for(int i = 0; i < 10000; i++) {
        ITuple tuple = new Tuple(schema);
        tuple.set("id", i + "");
        // We save a number in the "foo" field which is consecutive: [0, 1, 2, ... 9999]
        tuple.set("foo", "foo" + i);
        writer.append(tuple);
      }
    } finally {
      writer.close();
    }

    // Sampling with default method should yield lower numbers
    // Default input split size so only one InputSplit
    // All sampled keys will be [0, 1, 2, ..., 9]
    TupleSampler sampler = new TupleSampler(SamplingType.RANDOM, new TupleSampler.RandomSamplingOptions(), this.getClass());
    sampler.sample(getTblSpec("r"), conf, 10, new Path(OUTPUT + "_r"));

    int nTuples = 0;
    int[] sampledKeys = new int[10];
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(conf), new Path(OUTPUT + "_r"), conf);
    try {
      Text key = new Text();
      while(reader.next(key)) {
        sampledKeys[nTuples] = Integer.parseInt(key.toString());
        nTuples++;
      }
    } finally {
      reader.close();
    }
    for(int i = 0; i < 10; i++) {
      assertEquals(i, sampledKeys[i]);
    }

    // Reservoir sampling should yield better results for this case, let's see
    sampler = new TupleSampler(SamplingType.FULL_SCAN, new TupleSampler.RandomSamplingOptions(), this.getClass());
    sampler.sample(getTblSpec("r"), conf, 10, new Path(OUTPUT + "_r"));

    nTuples = 0;
    sampledKeys = new int[10];
    reader = new SequenceFile.Reader(FileSystem.get(conf), new Path(OUTPUT + "_r"), conf);
    try {
      Text key = new Text();
      while(reader.next(key)) {
        sampledKeys[nTuples] = Integer.parseInt(key.toString());
        nTuples++;
      }
    } finally {
      reader.close();
    }

    int avgKey = 0;
    for(int i = 0; i < 10; i++) {
      avgKey += sampledKeys[i];
    }
    avgKey = avgKey / 10;
    // This assertion may fail some day... but the chances are very rare.
    // The lower bound is very low, usually the average key will be around 1/4th of the max key (10000).
    assertTrue(avgKey > 100);

    delete(conf, INPUT + "_r");
    delete(conf, OUTPUT + "_r");
  }

  /**
   * Writes 1000 random tuples, samples with RANDOM sampling using the given max
   * split size, and asserts one sample per expected input split.
   *
   * @param splitSize max input split size in bytes (controls number of splits)
   * @param iter      suffix used to keep each invocation's files separate
   */
  public void testDefault(long splitSize, int iter) throws TupleSamplerException, IOException, InterruptedException {
    Configuration conf = new Configuration();
    delete(conf, INPUT + "_" + iter);
    delete(conf, OUTPUT + "_" + iter);

    TupleFile.Writer writer = new TupleFile.Writer(FileSystem.get(conf), conf, new Path(INPUT + "_" + iter), schema);
    try {
      for(int i = 0; i < 1000; i++) {
        writer.append(randomTuple());
      }
    } finally {
      writer.close();
    }

    // Requesting as many samples as splits so one sample is needed from each split.
    FileStatus status = FileSystem.get(conf).getFileStatus(new Path(INPUT + "_" + iter));
    long expectedSplits = Math.max(1, (long) Math.ceil(((double) status.getLen()) / splitSize));
    TupleSampler.RandomSamplingOptions options = new TupleSampler.RandomSamplingOptions();
    options.setMaxInputSplitSize(splitSize);
    options.setMaxSplitsToVisit(Integer.MAX_VALUE);
    TablespaceSpec tablespaceSpec = getTblSpec(iter + "");
    TupleSampler sampler = new TupleSampler(SamplingType.RANDOM, options, this.getClass());
    sampler.sample(tablespaceSpec, conf, expectedSplits, new Path(OUTPUT + "_" + iter));

    int nKeys = 0;
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(conf), new Path(OUTPUT + "_" + iter), conf);
    try {
      Text key = new Text();
      while(reader.next(key)) {
        nKeys++;
      }
    } finally {
      reader.close();
    }
    assertEquals(expectedSplits, nKeys);

    delete(conf, INPUT + "_" + iter);
    delete(conf, OUTPUT + "_" + iter);
  }

  /** @return a tuple with random "id" and "foo" string values. */
  public ITuple randomTuple() {
    ITuple tuple = new Tuple(schema);
    tuple.set("id", "id" + (Math.random() * 1000000000));
    tuple.set("foo", "foobar" + (Math.random() * 1000000000));
    return tuple;
  }

  /**
   * Builds a single-table, single-partition {@link TablespaceSpec} reading
   * {@code INPUT + "_" + inputPostfix} with this test's schema, partitioned by "id".
   */
  public TablespaceSpec getTblSpec(String inputPostfix) {
    return new TablespaceSpec(
        Arrays.asList(
            new Table(
                Arrays.asList(
                    new TableInput[] {
                        new TableInput(
                            new TupleInputFormat(),
                            new HashMap<String, String>(),
                            schema,
                            new IdentityRecordProcessor(),
                            new Path(INPUT + "_" + inputPostfix)) }),
                new TableSpec(
                    schema,
                    schema.getField("id"))
            )),
        null, 1, null, null);
  }
}