}
@Test
public void testDedupByValueWithSequenceFileInputFormat() throws Exception {
HashMap<IntWritable, Text> inputData1 = new HashMap<IntWritable, Text>();
inputData1.put(new IntWritable(1), new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
inputData1.put(new IntWritable(2), new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
inputData1.put(new IntWritable(3), new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");
HashMap<IntWritable, Text> inputData2 = new HashMap<IntWritable, Text>();
inputData2.put(new IntWritable(1), new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
inputData2.put(new IntWritable(2), new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
inputData2.put(new IntWritable(4), new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");
String[] args = new String[] {
"-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
"-inputPath", "/input1,/input2",
"-outputPath", "output",
"-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
"-inputValueClassName", "org.apache.hadoop.io.Text",
"-dedupBy", "value" };
DedupJob job = runDedupJob(args);
assertEquals(6, job.getTotalRecordsRead());
assertEquals(0, job.getBadRecords());
assertEquals(5, job.getOutput());
assertEquals(1, job.getDuplicateRecords());
FileSystem outputFS = getFileSystem();
Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
Configuration conf = new Configuration();
SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
Writable writableKey = (Writable)
ReflectionUtils.newInstance(reader.getKeyClass(), conf);
Writable writableValue = (Writable)
ReflectionUtils.newInstance(reader.getValueClass(), conf);
List<Text> expectedOutput = new ArrayList<Text>();
expectedOutput.add(new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
expectedOutput.add(new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
expectedOutput.add(new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
expectedOutput.add(new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
expectedOutput.add(new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
int count = 0;
while (reader.next(writableKey, writableValue)) {
logger.debug("key and value is: " + writableKey + ", " + writableValue);
assertTrue("Matched output " + writableValue , expectedOutput.contains(writableValue));
count++;