package com.splout.db.hadoop;
/*
* #%L
* Splout SQL Hadoop library
* %%
* Copyright (C) 2012 Datasalt Systems S.L.
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.junit.Test;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.io.TupleFile;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleInputFormat;
import com.splout.db.hadoop.TupleSampler.SamplingType;
import com.splout.db.hadoop.TupleSampler.TupleSamplerException;
/**
 * Tests for {@link TupleSampler}: RANDOM (split-based) sampling and FULL_SCAN
 * (reservoir) sampling over a {@link TupleFile} input.
 */
public class TestTupleSampler {

  public static String INPUT = "input-" + TestTupleSampler.class.getName();
  public static String OUTPUT = "output-" + TestTupleSampler.class.getName();

  /** Two-field schema used by all tests: a string id and a string payload. */
  final Schema schema = new Schema("schema", Fields.parse("id:string, foo:string"));

  /**
   * Synchronously and portably deletes a local path (recursively).
   * <p>
   * Replaces the previous {@code Runtime.getRuntime().exec("rm -rf ...")} calls,
   * which were POSIX-only and, worse, asynchronous: exec() returns before the
   * child process finishes, so cleanup could race with the file creation that
   * followed it.
   */
  private static void delete(Configuration conf, String path) throws IOException {
    FileSystem.get(conf).delete(new Path(path), true);
  }

  @Test
  public void testDefault() throws IOException, InterruptedException, TupleSamplerException {
    // Exercise RANDOM sampling with 1, 2 and 3 input splits respectively.
    testDefault(Long.MAX_VALUE, 1);
    testDefault(1024 * 100, 2);
    testDefault(1024 * 10, 3);
  }

  @Test
  public void testReservoir() throws IOException, InterruptedException, TupleSamplerException {
    Configuration conf = new Configuration();
    delete(conf, INPUT + "_r");
    delete(conf, OUTPUT + "_r");

    TupleFile.Writer writer = new TupleFile.Writer(FileSystem.get(conf), conf, new Path(INPUT + "_r"), schema);
    try {
      for(int i = 0; i < 10000; i++) {
        ITuple tuple = new Tuple(schema);
        tuple.set("id", i + "");
        // We save a number in the "foo" field which is consecutive: [0, 1, 2, ... 9999]
        tuple.set("foo", "foo" + i);
        writer.append(tuple);
      }
    } finally {
      writer.close();
    }

    // Sampling with default method should yield lower numbers
    // Default input split size so only one InputSplit
    // All sampled keys will be [0, 1, 2, ..., 9]
    TupleSampler sampler = new TupleSampler(SamplingType.RANDOM, new TupleSampler.RandomSamplingOptions(), this.getClass());
    sampler.sample(getTblSpec("r"), conf, 10, new Path(OUTPUT + "_r"));

    int nTuples = 0;
    int[] sampledKeys = new int[10];
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(conf), new Path(OUTPUT + "_r"), conf);
    try {
      Text key = new Text();
      while(reader.next(key)) {
        sampledKeys[nTuples] = Integer.parseInt(key.toString());
        nTuples++;
      }
    } finally {
      reader.close();
    }
    for(int i = 0; i < 10; i++) {
      assertEquals(i, sampledKeys[i]);
    }

    // Reservoir sampling should yield better results for this case, let's see
    sampler = new TupleSampler(SamplingType.FULL_SCAN, new TupleSampler.RandomSamplingOptions(), this.getClass());
    sampler.sample(getTblSpec("r"), conf, 10, new Path(OUTPUT + "_r"));

    nTuples = 0;
    sampledKeys = new int[10];
    reader = new SequenceFile.Reader(FileSystem.get(conf), new Path(OUTPUT + "_r"), conf);
    try {
      Text key = new Text();
      while(reader.next(key)) {
        sampledKeys[nTuples] = Integer.parseInt(key.toString());
        nTuples++;
      }
    } finally {
      reader.close();
    }

    int avgKey = 0;
    for(int i = 0; i < 10; i++) {
      avgKey += sampledKeys[i];
    }
    avgKey = avgKey / 10;
    // This assertion may fail some day... but the chances are very rare.
    // The lower bound is very low, usually the average key will be around 1/4th of the max key (10000).
    assertTrue(avgKey > 100);

    delete(conf, INPUT + "_r");
    delete(conf, OUTPUT + "_r");
  }

  /**
   * Writes 1000 random tuples, samples with RANDOM sampling using the given max
   * split size, and asserts one sample per expected input split.
   *
   * @param splitSize max input split size in bytes (controls number of splits)
   * @param iter      suffix used to keep each invocation's files separate
   */
  public void testDefault(long splitSize, int iter) throws TupleSamplerException, IOException, InterruptedException {
    Configuration conf = new Configuration();
    delete(conf, INPUT + "_" + iter);
    delete(conf, OUTPUT + "_" + iter);

    TupleFile.Writer writer = new TupleFile.Writer(FileSystem.get(conf), conf, new Path(INPUT + "_" + iter), schema);
    try {
      for(int i = 0; i < 1000; i++) {
        writer.append(randomTuple());
      }
    } finally {
      writer.close();
    }

    // Requesting as many samples as splits so one sample is needed from each split.
    FileStatus status = FileSystem.get(conf).getFileStatus(new Path(INPUT + "_" + iter));
    long expectedSplits = Math.max(1, (long) Math.ceil(((double) status.getLen()) / splitSize));
    TupleSampler.RandomSamplingOptions options = new TupleSampler.RandomSamplingOptions();
    options.setMaxInputSplitSize(splitSize);
    options.setMaxSplitsToVisit(Integer.MAX_VALUE);
    TablespaceSpec tablespaceSpec = getTblSpec(iter + "");
    TupleSampler sampler = new TupleSampler(SamplingType.RANDOM, options, this.getClass());
    sampler.sample(tablespaceSpec, conf, expectedSplits, new Path(OUTPUT + "_" + iter));

    int nKeys = 0;
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(conf), new Path(OUTPUT + "_" + iter), conf);
    try {
      Text key = new Text();
      while(reader.next(key)) {
        nKeys++;
      }
    } finally {
      reader.close();
    }
    assertEquals(expectedSplits, nKeys);

    delete(conf, INPUT + "_" + iter);
    delete(conf, OUTPUT + "_" + iter);
  }

  /** @return a tuple with random "id" and "foo" string values. */
  public ITuple randomTuple() {
    ITuple tuple = new Tuple(schema);
    tuple.set("id", "id" + (Math.random() * 1000000000));
    tuple.set("foo", "foobar" + (Math.random() * 1000000000));
    return tuple;
  }

  /**
   * Builds a single-table, single-partition {@link TablespaceSpec} reading
   * {@code INPUT + "_" + inputPostfix} with this test's schema, partitioned by "id".
   */
  public TablespaceSpec getTblSpec(String inputPostfix) {
    return new TablespaceSpec(
        Arrays.asList(
            new Table(
                Arrays.asList(
                    new TableInput[] {
                        new TableInput(
                            new TupleInputFormat(),
                            new HashMap<String, String>(),
                            schema,
                            new IdentityRecordProcessor(),
                            new Path(INPUT + "_" + inputPostfix)) }),
                new TableSpec(
                    schema,
                    schema.getField("id"))
            )),
        null, 1, null, null);
  }
}