checkMandatoryConfs();
} catch (HIHOException e1) {
e1.printStackTrace();
throw new Exception(e1);
}
Job job = new Job(conf);
job.setJobName("Dedup job");
job.setJarByClass(DedupJob.class);
Class inputFormatClass = Class.forName(inputFormat);
Class outputFormatClass = Class.forName(outputFormat);
Class inputKeyClass = Class.forName(inputKeyClassName);
Class inputValueClass = Class.forName(inputValueClassName);
if (dedupBy.equals("key")) {
job.setMapperClass(DedupKeyMapper.class);
job.setReducerClass(DedupKeyReducer.class);
job.setMapOutputValueClass(inputValueClass);
} else if (dedupBy.equals("value")) {
job.setMapperClass(DedupValueMapper.class);
job.setReducerClass(DedupValueReducer.class);
job.setMapOutputValueClass(inputKeyClass);
}
job.setInputFormatClass(inputFormatClass);
if (inputFormat
.equals("co.nubetech.hiho.dedup.DelimitedTextInputFormat")) {
DelimitedTextInputFormat.setProperties(job, delimiter, column);
}
job.setMapOutputKeyClass(HihoTuple.class);
job.setOutputKeyClass(inputKeyClass);
job.setOutputValueClass(inputValueClass);
job.setPartitionerClass(HihoHashPartitioner.class);
FileInputFormat.setInputPaths(job, inputPath);
job.setOutputFormatClass(outputFormatClass);
FileOutputFormat.setOutputPath(job, new Path(outputPath));
try {
logger.debug("Output format class is " + job.getOutputFormatClass());
logger.debug("Class is "
+ ReflectionUtils
.newInstance(job.getOutputFormatClass(),
job.getConfiguration()).getClass()
.getName());
job.waitForCompletion(false);
if (job.isComplete()) {
Counters counters = job.getCounters();
totalRecordsRead = counters.findCounter(
DedupRecordCounter.TOTAL_RECORDS_READ).getValue();
badRecords = counters.findCounter(
DedupRecordCounter.BAD_RECORD).getValue();
output = counters.findCounter(DedupRecordCounter.OUTPUT)