import static org.hamcrest.CoreMatchers.instanceOf;
import static;
import static org.junit.Assert.assertThat;
import java.util.Arrays;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.junit.*;
/*
 * Create a file with a 4 KB block size and 3 blocks, where each line is 1023 bytes
 * plus a 1-byte newline.
 * Make each line begin with its line number, 01 to 12.
 * Then expect to get 3 splits (one per block), and 4 records per split:
 *   - each split corresponds exactly to one block.
 *
 * If lines are 1024 * 1.5 bytes long (including the newline), then what do we get
 * for each record?
 *   - Do we lose records?
 *   - If not, then in general does the end of a record have to be read from another block?
 *
 * How does compression fit in?
 *
 * (Presumably line number and starting byte offset for 1024-byte records:)
 *   0 0
 *   1 1024
 *   2 2048
 */
public class SplitTest {
private static final Random r = new Random();
private static final String[] lines1 = new String[120];
static {
for (int i = 0; i < lines1.length; i++) {
char[] c = new char[1023];
c[0] = Integer.toHexString(i % 16).charAt(0);
for (int j = 1; j < c.length; j++) {
c[j] = (char) (r.nextInt(26) + (int) 'a');
lines1[i] = new String(c);
private static final String[] lines2 = new String[12];
static {
for (int i = 0; i < lines2.length; i++) {
char[] c = new char[1023 + 512];
c[0] = Integer.toHexString(i % 16).charAt(0);
for (int j = 1; j < c.length; j++) {
c[j] = (char) (r.nextInt(26) + (int) 'a');
lines2[i] = new String(c);
// Cluster shared by every test in this class; assigned in setUp().
private static MiniDFSCluster cluster; // use an in-process HDFS cluster for testing
// File system obtained from the cluster in setUp(); the tests and createFile() write and read through it.
private static FileSystem fs;
public static void setUp() throws IOException {
Configuration conf = new Configuration();
if (System.getProperty("") == null) {
System.setProperty("", "/tmp");
cluster = new MiniDFSCluster(conf, 1, true, null);
fs = cluster.getFileSystem();
public static void tearDown() throws IOException {
@Ignore("Needs more investigation")
public void recordsCoincideWithBlocks() throws IOException {
int recordLength = 1024;
Path input = new Path("input");
createFile(input, 12, recordLength);
JobConf job = new JobConf();
job.set("", fs.getUri().toString());
FileInputFormat.addInputPath(job, input);
InputFormat<LongWritable, Text> inputFormat = job.getInputFormat();
InputSplit[] splits = inputFormat.getSplits(job, job.getNumMapTasks());
assertThat(splits.length, is(3));
checkSplit(splits[0], 0, 4096);
checkSplit(splits[1], 4096, 4096);
checkSplit(splits[2], 8192, 4096);
checkRecordReader(inputFormat, splits[0], job, recordLength, 0, 4);
checkRecordReader(inputFormat, splits[1], job, recordLength, 4, 8);
checkRecordReader(inputFormat, splits[2], job, recordLength, 8, 12);
public void recordsDontCoincideWithBlocks() throws IOException {
int recordLength = 1024 + 512;
Path input = new Path("input");
createFile(input, 8, recordLength);
JobConf job = new JobConf();
job.set("", fs.getUri().toString());
FileInputFormat.addInputPath(job, input);
InputFormat<LongWritable, Text> inputFormat = job.getInputFormat();
InputSplit[] splits = inputFormat.getSplits(job, job.getNumMapTasks());
checkSplit(splits[0], 0, 4096);
checkSplit(splits[1], 4096, 4096);
checkSplit(splits[2], 8192, 4096);
checkRecordReader(inputFormat, splits[0], job, recordLength, 0, 3);
checkRecordReader(inputFormat, splits[1], job, recordLength, 3, 6);
checkRecordReader(inputFormat, splits[2], job, recordLength, 6, 8);
@Ignore("Needs more investigation")
public void compression() throws IOException {
int recordLength = 1024;
Path input = new Path("input.bz2");
createFile(input, 24, recordLength);
System.out.println(">>>>>>" + fs.getLength(input));
JobConf job = new JobConf();
job.set("", fs.getUri().toString());
FileInputFormat.addInputPath(job, input);
InputFormat<LongWritable, Text> inputFormat = job.getInputFormat();
InputSplit[] splits = inputFormat.getSplits(job, job.getNumMapTasks());
assertThat(splits.length, is(2));
checkSplit(splits[0], 0, 4096);
checkSplit(splits[1], 4096, 4096);
checkRecordReader(inputFormat, splits[0], job, recordLength, 0, 4);
checkRecordReader(inputFormat, splits[1], job, recordLength, 5, 12);
private void checkSplit(InputSplit split, long start, long length) {
assertThat(split, instanceOf(FileSplit.class));
FileSplit fileSplit = (FileSplit) split;
assertThat(fileSplit.getStart(), is(start));
assertThat(fileSplit.getLength(), is(length));
private void checkRecord(int record, RecordReader<LongWritable, Text> recordReader, long expectedKey, String expectedValue)
throws IOException {
LongWritable key = new LongWritable();
Text value = new Text();
assertThat(, value), is(true));
assertThat("Record " + record, value.toString(), is(expectedValue));
assertThat("Record " + record, key.get(), is(expectedKey));
private void checkRecordReader(InputFormat<LongWritable, Text> inputFormat,
InputSplit split, JobConf job, long recordLength, int startLine, int endLine) throws IOException {
RecordReader<LongWritable, Text> recordReader =
inputFormat.getRecordReader(split, job, Reporter.NULL);
for (int i = startLine; i < endLine; i++) {
checkRecord(i, recordReader, i * recordLength, line(i, recordLength));
assertThat( LongWritable(), new Text()), is(false));
private void createFile(Path input, int records, int recordLength) throws IOException {
long fileSize = 4096;
OutputStream out = fs.create(input, true, 4096, (short) 1, fileSize);
CompressionCodecFactory codecFactory = new CompressionCodecFactory(new Configuration());
CompressionCodec codec = codecFactory.getCodec(input);
if (codec != null) {
out = codec.createOutputStream(out);
Writer writer = new OutputStreamWriter(out);
try {
for (int n = 0; n < records; n++) {
writer.write(line(n, recordLength));
} finally {
private String line(int i, long recordLength) {
return recordLength == 1024 ? lines1[i] : lines2[i];