/**
* Copyright 2011 Nube Technologies
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package co.nubetech.hiho.merge;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import co.nubetech.hiho.common.HIHOConf;
import co.nubetech.hiho.common.HIHOException;
import co.nubetech.hiho.dedup.DelimitedTextInputFormat;
import co.nubetech.hiho.dedup.HihoTuple;
public class MergeJob extends Configured implements Tool {
final static Logger logger = Logger
.getLogger(co.nubetech.hiho.merge.MergeJob.class);
private String oldPath = null;
private String newPath = null;
private String mergeBy = null;
private String delimiter = ",";
private int column = 1;
private String inputFormat = null;
private String inputKeyClassName = null;
private String inputValueClassName = null;
private String outputPath = null;
private String outputFormat = null;
private long output;
private long badRecords;
private long totalRecordsNew;
private long totalRecordsOld;
public long getOutput() {
return output;
}
public void setOutput(long output) {
this.output = output;
}
public long getBadRecords() {
return badRecords;
}
public void setBadRecords(long badRecords) {
this.badRecords = badRecords;
}
public long getTotalRecordsNew() {
return totalRecordsNew;
}
public void setTotalRecordsNew(long totalRecordsNew) {
this.totalRecordsNew = totalRecordsNew;
}
public long getTotalRecordsOld() {
return totalRecordsOld;
}
public void setTotalRecordsOld(long totalRecordsOld) {
this.totalRecordsOld = totalRecordsOld;
}
public void populateConfiguration(String[] args) {
for (int i = 0; i < args.length - 1; i++) {
if ("-inputFormat".equals(args[i])) {
inputFormat = args[++i];
if (inputFormat
.equals("org.apache.hadoop.mapreduce.lib.input.TextInputFormat")) {
inputKeyClassName = "org.apache.hadoop.io.LongWritable";
inputValueClassName = "org.apache.hadoop.io.Text";
} else if (inputFormat
.equals("co.nubetech.hiho.dedup.DelimitedTextInputFormat")) {
inputKeyClassName = "org.apache.hadoop.io.Text";
inputValueClassName = "org.apache.hadoop.io.Text";
outputFormat = "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat";
} else if (inputFormat
.equals("org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat")) {
outputFormat = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat";
}
} else if ("-oldPath".equals(args[i])) {
oldPath = args[++i];
} else if ("-newPath".equals(args[i])) {
newPath = args[++i];
} else if ("-mergeBy".equals(args[i])) {
mergeBy = args[++i];
} else if ("-inputKeyClassName".equals(args[i])) {
inputKeyClassName = args[++i];
} else if ("-inputValueClassName".equals(args[i])) {
inputValueClassName = args[++i];
} else if ("-outputPath".equals(args[i])) {
outputPath = args[++i];
} else if ("-delimiter".equals(args[i])) {
delimiter = args[++i];
} else if ("-column".equals(args[i])) {
column = Integer.parseInt(args[++i]);
}else if ("-outputFormat".equals(args[i])) {
outputFormat = args[++i];
}
}
}
public void checkMandatoryConfs() throws HIHOException {
if (inputFormat == null) {
throw new HIHOException(
"The provided input format is empty, please specify inputFormat");
}
if (mergeBy == null) {
throw new HIHOException(
"The provided value of dedupBy is empty, please specify either key or value");
}
if ((!mergeBy.equals("key")) && (!mergeBy.equals("value"))) {
throw new HIHOException(
"The provided value of mergeBy is Incorrect, please specify either key or value");
}
if (inputKeyClassName == null) {
throw new HIHOException(
"The provided input key class name is empty, please specify inputKeyClassName");
}
if (inputValueClassName == null) {
throw new HIHOException(
"The provided input value class name is empty, please specify inputValueClassName");
}
if (oldPath == null) {
throw new HIHOException(
"The provided old path is empty, please specify oldPath");
}
if (newPath == null) {
throw new HIHOException(
"The provided new path is empty, please specify newPath");
}
if (outputPath == null) {
throw new HIHOException(
"The provided output path is empty, please specify outputPath");
}
if (outputFormat == null) {
System.out.println(outputFormat);
throw new HIHOException(
"The provided output format is empty, please specify outputFormat");
}
}
@Override
public int run(String[] args) throws Exception {
populateConfiguration(args);
try {
checkMandatoryConfs();
} catch (HIHOException e1) {
e1.printStackTrace();
throw new Exception(e1);
}
Class inputFormatClass = Class.forName(inputFormat);
Class outputFormatClass = Class.forName(outputFormat);
Class inputKeyClass = Class.forName(inputKeyClassName);
Class inputValueClass = Class.forName(inputValueClassName);
Configuration conf = getConf();
conf.set(HIHOConf.MERGE_OLD_PATH, oldPath);
conf.set(HIHOConf.MERGE_NEW_PATH, newPath);
Job job = new Job(conf);
job.setJobName("Merge job");
job.setJarByClass(MergeJob.class);
if (mergeBy.equals("key")) {
job.setMapperClass(MergeKeyMapper.class);
job.setReducerClass(MergeKeyReducer.class);
} else if (mergeBy.equals("value")) {
job.setMapperClass(MergeValueMapper.class);
job.setReducerClass(MergeValueReducer.class);
}
job.setInputFormatClass(inputFormatClass);
DelimitedTextInputFormat.setProperties(job, delimiter, column);
job.setMapOutputKeyClass(HihoTuple.class);
job.setMapOutputValueClass(HihoValue.class);
job.setOutputKeyClass(inputKeyClass);
job.setOutputValueClass(inputValueClass);
FileInputFormat.setInputPaths(job, oldPath + "," + newPath);
job.setOutputFormatClass(outputFormatClass);
FileOutputFormat.setOutputPath(job, new Path(outputPath));
try {
logger.debug("Output format class is " + job.getOutputFormatClass());
logger.debug("Class is "
+ ReflectionUtils
.newInstance(job.getOutputFormatClass(),
job.getConfiguration()).getClass()
.getName());
job.waitForCompletion(false);
if (job.isComplete()) {
Counters counters = job.getCounters();
totalRecordsOld = counters.findCounter(
MergeRecordCounter.TOTAL_RECORDS_OLD).getValue();
totalRecordsNew = counters.findCounter(
MergeRecordCounter.TOTAL_RECORDS_NEW).getValue();
badRecords = counters.findCounter(
MergeRecordCounter.BAD_RECORD).getValue();
output = counters.findCounter(MergeRecordCounter.OUTPUT)
.getValue();
logger.info("Total old records read are: " + totalRecordsOld);
logger.info("Total new records read are: " + totalRecordsNew);
logger.info("Bad Records are: " + badRecords);
logger.info("Output records are: " + output);
}
} catch (Exception e) {
e.printStackTrace();
}
return 0;
}
public static void main(String[] args) throws Exception {
MergeJob job = new MergeJob();
int res = ToolRunner.run(new Configuration(), job, args);
System.exit(res);
}
}