// Source Code of com.hadoop.mapreduce.TestLzoTextInputFormat

/*
 * This file is part of Hadoop-Gpl-Compression.
 *
 * Hadoop-Gpl-Compression is free software: you can redistribute it
 * and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Hadoop-Gpl-Compression is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Hadoop-Gpl-Compression.  If not, see
 * <http://www.gnu.org/licenses/>.
 */
 
package com.hadoop.mapreduce;


import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;


import junit.framework.TestCase;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;


import com.hadoop.compression.lzo.GPLNativeCodeLoader;
import com.hadoop.compression.lzo.LzoIndex;
import com.hadoop.compression.lzo.LzoInputFormatCommon;
import com.hadoop.compression.lzo.LzopCodec;
import com.hadoop.compression.lzo.util.CompatibilityUtil;


/**
 * Test the LzoTextInputFormat, make sure it splits the file properly and
 * returns the right data.
 */
public class TestLzoTextInputFormat extends TestCase {


  private static final Log LOG = LogFactory.getLog(TestLzoTextInputFormat.class
      .getName());


  private MessageDigest md5;
  private final String lzoFileName = "part-r-00001" + new LzopCodec().getDefaultExtension();
  private Path outputDir;
  
  //test both bigger outputs and small one chunk ones
  private static final int OUTPUT_BIG = 10485760;
  private static final int OUTPUT_SMALL = 50000;
  
  @Override
  protected void setUp() throws Exception {
    super.setUp();
    md5 = MessageDigest.getInstance("MD5");
    Path testBuildData = new Path(System.getProperty("test.build.data", "data"));
    outputDir = new Path(testBuildData, "outputDir");
  }


  /**
   * Make sure the lzo index class works as described.
   */
  public void testLzoIndex() {
    LzoIndex index = new LzoIndex();
    assertTrue(index.isEmpty());
    index = new LzoIndex(4);
    index.set(0, 0);
    index.set(1, 5);
    index.set(2, 10);
    index.set(3, 15);
    assertFalse(index.isEmpty());


    assertEquals(0, index.findNextPosition(-1));
    assertEquals(5, index.findNextPosition(1));
    assertEquals(5, index.findNextPosition(5));
    assertEquals(15, index.findNextPosition(11));
    assertEquals(15, index.findNextPosition(15));
    assertEquals(-1, index.findNextPosition(16));
    
    assertEquals(5, index.alignSliceStartToIndex(3, 20));
    assertEquals(15, index.alignSliceStartToIndex(15, 20));
    assertEquals(10, index.alignSliceEndToIndex(8, 30));
    assertEquals(10, index.alignSliceEndToIndex(10, 30));
    assertEquals(30, index.alignSliceEndToIndex(17, 30));
    assertEquals(LzoIndex.NOT_FOUND, index.alignSliceStartToIndex(16, 20));
  }


  /**
   * Index the file and make sure it splits properly.
   * 
   * @throws NoSuchAlgorithmException
   * @throws IOException
   * @throws InterruptedException
   */
  public void testWithIndex() throws NoSuchAlgorithmException, IOException,
      InterruptedException {
    
    runTest(true, OUTPUT_BIG);
    runTest(true, OUTPUT_SMALL);
  }


  /**
   * Don't index the file and make sure it can be processed anyway.
   * 
   * @throws NoSuchAlgorithmException
   * @throws IOException
   * @throws InterruptedException
   */
  public void testWithoutIndex() throws NoSuchAlgorithmException, IOException,
      InterruptedException {
    
    runTest(false, OUTPUT_BIG);
    runTest(false, OUTPUT_SMALL);
  }


  /**
   * Generate random data, compress it, index and md5 hash the data.
   * Then read it all back and md5 that too, to verify that it all went ok.
   * 
   * @param testWithIndex Should we index or not?
   * @param charsToOutput How many characters of random data should we output.
   * @throws IOException
   * @throws NoSuchAlgorithmException
   * @throws InterruptedException
   */
  private void runTest(boolean testWithIndex, int charsToOutput) throws IOException,
      NoSuchAlgorithmException, InterruptedException {


    if (!GPLNativeCodeLoader.isNativeCodeLoaded()) {
      LOG.warn("Cannot run this test without the native lzo libraries");
      return;
    }


    Configuration conf = new Configuration();
    conf.setLong("fs.local.block.size", charsToOutput / 2);
    // reducing block size to force a split of the tiny file
    conf.set("io.compression.codecs", LzopCodec.class.getName());
    
    FileSystem localFs = FileSystem.getLocal(conf);
    localFs.delete(outputDir, true);
    localFs.mkdirs(outputDir);


    Job job = new Job(conf);
    TextOutputFormat.setCompressOutput(job, true);
    TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
    TextOutputFormat.setOutputPath(job, outputDir);


    TaskAttemptContext attemptContext = 
        CompatibilityUtil.newTaskAttemptContext(job.getConfiguration(),
          new TaskAttemptID(TaskID.forName("task_123_0001_r_000001"), 2));


    // create some input data
    byte[] expectedMd5 = createTestInput(localFs, job, attemptContext, charsToOutput);
   
    if (testWithIndex) {
      Path lzoFile = new Path(outputDir, lzoFileName);
      LzoIndex.createIndex(localFs, lzoFile);
    }


    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
    TextInputFormat.setInputPaths(job, outputDir);
    
    List<InputSplit> is = inputFormat.getSplits(job);
    //verify we have the right number of lzo chunks
    if (testWithIndex && OUTPUT_BIG == charsToOutput) {
      assertEquals(3, is.size());
    } else {
      assertEquals(1, is.size());
    }


    // let's read it all and calculate the md5 hash
    for (InputSplit inputSplit : is) {
      RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(
          inputSplit, attemptContext);
      rr.initialize(inputSplit, attemptContext);


      while (rr.nextKeyValue()) {
        Text value = rr.getCurrentValue();


        md5.update(value.getBytes(), 0, value.getLength());
      }


      rr.close();
    }


    localFs.close();
    assertTrue(Arrays.equals(expectedMd5, md5.digest()));
  }


  /**
   * Creates an lzo file with random data.
   * 
   * @param fs File system we're using.
   * @param attemptContext Task attempt context, contains task id etc. 
   *
   * @throws IOException
   * @throws InterruptedException
   */
  private byte[] createTestInput(FileSystem fs, Job job, TaskAttemptContext attemptContext, 
      int charsToOutput) throws IOException, InterruptedException {


    TextOutputFormat<Text, Text> output = new TextOutputFormat<Text, Text>();
    OutputCommitter committer = output.getOutputCommitter(attemptContext);
    committer.setupJob(job);
    RecordWriter<Text, Text> rw = null;


    md5.reset();
   
    try {
      rw = output.getRecordWriter(attemptContext);


      char[] chars = "abcdefghijklmnopqrstuvwxyz\u00E5\u00E4\u00F6"
          .toCharArray();


      Random r = new Random(System.currentTimeMillis());
      Text key = new Text();
      Text value = new Text();
      int charsMax = chars.length - 1;
      for (int i = 0; i < charsToOutput;) {
        i += fillText(chars, r, charsMax, key);
        i += fillText(chars, r, charsMax, value);
        rw.write(key, value);
        md5.update(key.getBytes(), 0, key.getLength());
        // text output format writes tab between the key and value
        md5.update("\t".getBytes("UTF-8"));
        md5.update(value.getBytes(), 0, value.getLength());
      }
    } finally {
      if (rw != null) {
        rw.close(attemptContext);
        committer.commitTask(attemptContext);
        committer.commitJob(job);
      }
    }


    byte[] result = md5.digest();
    md5.reset();
    return result;
  }


  private int fillText(char[] chars, Random r, int charsMax, Text text) {
    StringBuilder sb = new StringBuilder();
    // get a reasonable string length
    int stringLength = r.nextInt(charsMax * 2);
    for (int j = 0; j < stringLength; j++) {
      sb.append(chars[r.nextInt(charsMax)]);
    }
    text.set(sb.toString());
    return stringLength;
  }


  public void testIgnoreNonLzoTrue()
      throws IOException, InterruptedException, NoSuchAlgorithmException {
    runTestIgnoreNonLzo(true, OUTPUT_BIG, true);
    runTestIgnoreNonLzo(true, OUTPUT_SMALL, true);
    runTestIgnoreNonLzo(false, OUTPUT_BIG, true);
    runTestIgnoreNonLzo(false, OUTPUT_SMALL, true);
  }


  public void testIgnoreNonLzoFalse()
      throws IOException, InterruptedException, NoSuchAlgorithmException {
    runTestIgnoreNonLzo(true, OUTPUT_BIG, false);
    runTestIgnoreNonLzo(true, OUTPUT_SMALL, false);
    runTestIgnoreNonLzo(false, OUTPUT_BIG, false);
    runTestIgnoreNonLzo(false, OUTPUT_SMALL, false);
  }


  private void runTestIgnoreNonLzo(boolean testWithIndex, int charsToOutput,
    boolean ignoreNonLzo) throws IOException, InterruptedException, NoSuchAlgorithmException {
    if (!GPLNativeCodeLoader.isNativeCodeLoaded()) {
      LOG.warn("Cannot run this test without the native lzo libraries");
      return;
    }


    Configuration conf = new Configuration();
    conf.setLong("fs.local.block.size", charsToOutput / 2);
    // reducing block size to force a split of the tiny file
    conf.set("io.compression.codecs", LzopCodec.class.getName());
    conf.setBoolean(LzoInputFormatCommon.IGNORE_NONLZO_KEY, ignoreNonLzo);


    FileSystem localFs = FileSystem.getLocal(conf);
    localFs.delete(outputDir, true);
    localFs.mkdirs(outputDir);


    // Create a non-LZO input file and put it alongside the LZO files.
    Path nonLzoFile = new Path(outputDir, "part-r-00001");
    localFs.createNewFile(nonLzoFile);
    FSDataOutputStream outputStream = localFs.create(nonLzoFile);
    outputStream.writeBytes("key1\tvalue1\nkey2\tvalue2\nkey3\tvalue3\n");
    outputStream.close();


    Job job = new Job(conf);
    TextOutputFormat.setCompressOutput(job, true);
    TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
    TextOutputFormat.setOutputPath(job, outputDir);


    TaskAttemptContext attemptContext =
        CompatibilityUtil.newTaskAttemptContext(job.getConfiguration(),
          new TaskAttemptID(TaskID.forName("task_123_0001_r_000001"), 2));


    // create some input data
    byte[] expectedMd5 = createTestInput(localFs, job, attemptContext, charsToOutput);


    if (testWithIndex) {
      Path lzoFile = new Path(outputDir, lzoFileName);
      LzoIndex.createIndex(localFs, lzoFile);
    }


    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
    TextInputFormat.setInputPaths(job, outputDir);


    // verify we have the right number of input splits
    List<InputSplit> is = inputFormat.getSplits(job);
    int numExpectedLzoSplits = 0;
    int numExpectedNonLzoSplits = 0;
    int numActualLzoSplits = 0;
    int numActualNonLzoSplits = 0;
    if (!ignoreNonLzo) {
      numExpectedNonLzoSplits += 1;
    }
    if (testWithIndex && OUTPUT_BIG == charsToOutput) {
      numExpectedLzoSplits += 3;
    } else {
      numExpectedLzoSplits += 1;
    }
    assertEquals(numExpectedLzoSplits + numExpectedNonLzoSplits, is.size());


    // Verify that we have the right number of each kind of split and the right
    // data inside the splits.
    List<String> expectedNonLzoLines = new ArrayList<String>();
    if (!ignoreNonLzo) {
      expectedNonLzoLines.add("key1\tvalue1");
      expectedNonLzoLines.add("key2\tvalue2");
      expectedNonLzoLines.add("key3\tvalue3");
    }
    List<String> actualNonLzoLines = new ArrayList<String>();
    for (InputSplit inputSplit : is) {
      FileSplit fileSplit = (FileSplit) inputSplit;
      Path file = fileSplit.getPath();
      RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(
          inputSplit, attemptContext);
      rr.initialize(inputSplit, attemptContext);
      if (LzoInputFormatCommon.isLzoFile(file.toString())) {
        numActualLzoSplits += 1;


        while (rr.nextKeyValue()) {
          Text value = rr.getCurrentValue();


          md5.update(value.getBytes(), 0, value.getLength());
        }


        rr.close();
      } else {
        numActualNonLzoSplits += 1;


        while (rr.nextKeyValue()) {
          actualNonLzoLines.add(rr.getCurrentValue().toString());
        }
      }
    }
    localFs.close();
    assertEquals(numExpectedLzoSplits, numActualLzoSplits);
    assertEquals(numExpectedNonLzoSplits, numActualNonLzoSplits);
    assertTrue(Arrays.equals(expectedMd5, md5.digest()));
    assertEquals(expectedNonLzoLines, actualNonLzoLines);
  }
}
// Source Code of com.hadoop.mapreduce.TestLzoTextInputFormat

// Related Classes of com.hadoop.mapreduce.TestLzoTextInputFormat