}
@Test
public void testDedupByLongWritableKeyWithSequenceFileInputFormat() throws Exception {
HashMap<LongWritable, Text> inputData1 = new HashMap<LongWritable, Text>();
inputData1.put(new LongWritable(1), new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
inputData1.put(new LongWritable(2), new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
inputData1.put(new LongWritable(3), new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");
HashMap<LongWritable, Text> inputData2 = new HashMap<LongWritable, Text>();
inputData2.put(new LongWritable(1), new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
inputData2.put(new LongWritable(2), new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
inputData2.put(new LongWritable(4), new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");
String[] args = new String[] {
"-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
"-inputPath", "/input1,/input2",
"-outputPath", "output",
"-inputKeyClassName", "org.apache.hadoop.io.LongWritable",
"-inputValueClassName", "org.apache.hadoop.io.Text",
"-dedupBy", "key" };
DedupJob job = runDedupJob(args);
assertEquals(6, job.getTotalRecordsRead());
assertEquals(0, job.getBadRecords());
assertEquals(4, job.getOutput());
assertEquals(2, job.getDuplicateRecords());
FileSystem outputFS = getFileSystem();
Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
Configuration conf = new Configuration();
SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
Writable writableKey = (Writable)
ReflectionUtils.newInstance(reader.getKeyClass(), conf);
Writable writableValue = (Writable)
ReflectionUtils.newInstance(reader.getValueClass(), conf);
List<LongWritable> expectedOutput = new ArrayList<LongWritable>();
expectedOutput.add(new LongWritable(1));
expectedOutput.add(new LongWritable(2));
expectedOutput.add(new LongWritable(3));
expectedOutput.add(new LongWritable(4));
int count = 0;
while (reader.next(writableKey, writableValue)) {
logger.debug("key and value is: " + writableKey + ", " + writableValue);
assertTrue("Matched output " + writableKey , expectedOutput.contains(writableKey));
count++;