package com.twitter.elephantbird.mapreduce.input;
import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.List;
import java.util.Random;
import com.hadoop.compression.lzo.LzoIndex;
import com.hadoop.compression.lzo.LzopCodec;
import com.twitter.elephantbird.mapreduce.input.combine.DelegateCombineFileInputFormat;
import com.twitter.elephantbird.util.HadoopCompat;
import com.twitter.elephantbird.util.CoreTestUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.junit.Assume;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.*;
/**
 * Test LzoTextInputFormat: make sure it splits the file properly and
 * returns the right data.
 */
public class TestLzoTextInputFormat {
private static final Log LOG = LogFactory.getLog(TestLzoTextInputFormat.class);
private MessageDigest md5_;
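  // Name of the reducer output file; must match the reduce task id used in
  // runTest(), since TextOutputFormat names its output part-r-<task id>.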
private final String lzoFileName_ = "part-r-00001" + new LzopCodec().getDefaultExtension();
private Path outputDir_;
  // Test both a bigger multi-block output and a small single-chunk one.
  private static final int OUTPUT_BIG = 10485760; // 10 MB
  private static final int OUTPUT_SMALL = 50000;
@Before
public void setUp() throws Exception {
md5_ = MessageDigest.getInstance("MD5");
Path testBuildData = new Path(System.getProperty("test.build.data", "data"));
outputDir_ = new Path(testBuildData, "outputDir");
}
/**
* Make sure the lzo index class works as described.
*/
@Test
public void testLzoIndex() {
LzoIndex index = new LzoIndex();
assertTrue(index.isEmpty());
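    // An index with four compressed-block start offsets: 0, 5, 10 and 15.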
index = new LzoIndex(4);
index.set(0, 0);
index.set(1, 5);
index.set(2, 10);
index.set(3, 15);
assertFalse(index.isEmpty());
assertEquals(0, index.findNextPosition(-1));
assertEquals(5, index.findNextPosition(1));
assertEquals(5, index.findNextPosition(5));
assertEquals(15, index.findNextPosition(11));
assertEquals(15, index.findNextPosition(15));
assertEquals(-1, index.findNextPosition(16));
assertEquals(5, index.alignSliceStartToIndex(3, 20));
assertEquals(15, index.alignSliceStartToIndex(15, 20));
assertEquals(10, index.alignSliceEndToIndex(8, 30));
assertEquals(10, index.alignSliceEndToIndex(10, 30));
assertEquals(30, index.alignSliceEndToIndex(17, 30));
assertEquals(LzoIndex.NOT_FOUND, index.alignSliceStartToIndex(16, 20));
}
/**
* Index the file and make sure it splits properly.
*
* @throws NoSuchAlgorithmException
* @throws IOException
* @throws InterruptedException
*/
@Test
public void testWithIndex() throws NoSuchAlgorithmException, IOException,
InterruptedException {
runTest(true, OUTPUT_BIG);
runTest(true, OUTPUT_SMALL);
}
/**
* Don't index the file and make sure it can be processed anyway.
*
* @throws NoSuchAlgorithmException
* @throws IOException
* @throws InterruptedException
*/
@Test
public void testWithoutIndex() throws NoSuchAlgorithmException, IOException,
InterruptedException {
runTest(false, OUTPUT_BIG);
runTest(false, OUTPUT_SMALL);
}
@Test
public void testCombineWithIndex() throws NoSuchAlgorithmException, IOException,
InterruptedException {
runTest(true, OUTPUT_BIG, true);
runTest(true, OUTPUT_SMALL, true);
}
@Test
public void testCombineWithoutIndex() throws NoSuchAlgorithmException, IOException,
InterruptedException {
runTest(false, OUTPUT_BIG, true);
runTest(false, OUTPUT_SMALL, true);
}
private void runTest(boolean testWithIndex, int charsToOutput) throws IOException,
NoSuchAlgorithmException, InterruptedException {
runTest(testWithIndex, charsToOutput, false);
}
  /**
   * Generate random data, compress it, optionally index it, and md5 hash the
   * data. Then read it all back and md5 that too, to verify that it all
   * went ok.
   *
   * @param testWithIndex Should we index or not?
   * @param charsToOutput How many characters of random data to output.
   * @param combineSplits Should splits be combined via
   *        DelegateCombineFileInputFormat?
   * @throws IOException
   * @throws NoSuchAlgorithmException
   * @throws InterruptedException
   */
private void runTest(boolean testWithIndex, int charsToOutput, boolean combineSplits) throws IOException,
NoSuchAlgorithmException, InterruptedException {
Configuration conf = new Configuration();
    // Reduce the block size to force a split of the tiny file.
    conf.setLong("fs.local.block.size", charsToOutput / 2);
    conf.set("io.compression.codecs", LzopCodec.class.getName());
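    // Tell the combine wrapper which input format to delegate to when
    // combineSplits is enabled below.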
DelegateCombineFileInputFormat.setCombinedInputFormatDelegate(conf, LzoTextInputFormat.class);
Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf));
    FileSystem.getLocal(conf).close(); // drop any cached filesystem so a fresh one picks up the conf above
FileSystem localFs = FileSystem.getLocal(conf);
localFs.delete(outputDir_, true);
localFs.mkdirs(outputDir_);
Job job = new Job(conf);
TextOutputFormat.setCompressOutput(job, true);
TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
TextOutputFormat.setOutputPath(job, outputDir_);
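    // Fake reduce attempt; the task id r_000001 makes TextOutputFormat
    // write to part-r-00001, matching lzoFileName_.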
TaskAttemptContext attemptContext =
HadoopCompat.newTaskAttemptContext(HadoopCompat.getConfiguration(job),
new TaskAttemptID(TaskID.forName("task_201305011733_0001_r_000001"), 2));
// create some input data
byte[] expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput);
if (testWithIndex) {
Path lzoFile = new Path(outputDir_, lzoFileName_);
LzoIndex.createIndex(localFs, lzoFile);
}
    InputFormat<LongWritable, Text> inputFormat;
    if (combineSplits) {
      inputFormat = new DelegateCombineFileInputFormat<LongWritable, Text>();
    } else {
      inputFormat = new LzoTextInputFormat();
    }
TextInputFormat.setInputPaths(job, outputDir_);
List<InputSplit> is = inputFormat.getSplits(job);
    // Verify we have the right number of lzo chunks: only the indexed big
    // file is splittable (the block size is half the data size); unindexed
    // lzo files always come back as a single split, and the small file fits
    // in one chunk either way.
    if (testWithIndex && OUTPUT_BIG == charsToOutput) {
      assertEquals(3, is.size());
    } else {
      assertEquals(1, is.size());
    }
    // Read it all back and md5 the values (the keys are just byte offsets).
for (InputSplit inputSplit : is) {
RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(
inputSplit, attemptContext);
rr.initialize(inputSplit, attemptContext);
while (rr.nextKeyValue()) {
Text value = rr.getCurrentValue();
md5_.update(value.getBytes(), 0, value.getLength());
}
rr.close();
}
localFs.close();
    assertArrayEquals(expectedMd5, md5_.digest());
}
  /**
   * Creates an lzo file with random data and returns the md5 hash of what
   * was written.
   *
   * @param outputDir Output directory.
   * @param fs File system we're using.
   * @param attemptContext Task attempt context, contains task id etc.
   * @param charsToOutput How many characters of random data to output.
   * @return md5 digest of the uncompressed data.
   * @throws IOException
   * @throws InterruptedException
   */
private byte[] createTestInput(Path outputDir, FileSystem fs, TaskAttemptContext attemptContext,
int charsToOutput) throws IOException, InterruptedException {
TextOutputFormat<Text, Text> output = new TextOutputFormat<Text, Text>();
RecordWriter<Text, Text> rw = null;
md5_.reset();
try {
rw = output.getRecordWriter(attemptContext);
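      // Include a few non-ASCII characters to exercise UTF-8 handling.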
char[] chars = "abcdefghijklmnopqrstuvwxyz\u00E5\u00E4\u00F6"
.toCharArray();
Random r = new Random(System.currentTimeMillis());
Text key = new Text();
Text value = new Text();
int charsMax = chars.length - 1;
for (int i = 0; i < charsToOutput;) {
i += fillText(chars, r, charsMax, key);
i += fillText(chars, r, charsMax, value);
rw.write(key, value);
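        // Hash exactly the bytes each record will come back as: the reader
        // returns the whole "key<TAB>value" line (minus the newline) as its
        // value, keyed by byte offset.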
md5_.update(key.getBytes(), 0, key.getLength());
// text output format writes tab between the key and value
md5_.update("\t".getBytes("UTF-8"));
md5_.update(value.getBytes(), 0, value.getLength());
}
} finally {
if (rw != null) {
rw.close(attemptContext);
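        // Commit so the output is promoted from the temporary attempt
        // directory to its final location under outputDir_.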
OutputCommitter committer = output.getOutputCommitter(attemptContext);
committer.commitTask(attemptContext);
committer.commitJob(attemptContext);
}
}
byte[] result = md5_.digest();
md5_.reset();
return result;
}
private int fillText(char[] chars, Random r, int charsMax, Text text) {
StringBuilder sb = new StringBuilder();
    // Pick a reasonable random string length.
    int stringLength = r.nextInt(charsMax * 2);
    for (int j = 0; j < stringLength; j++) {
      // Sample the full alphabet, not just the first charsMax characters.
      sb.append(chars[r.nextInt(chars.length)]);
}
text.set(sb.toString());
return stringLength;
}
}