package edu.eltech;

import edu.eltech.classifier.Category;
import edu.eltech.classifier.Letter;
import edu.eltech.generator.FileGenerator;
import edu.eltech.mapreduce.classifier.MyMapper;
import edu.eltech.mapreduce.classifier.MyReducer;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.*;

import java.io.File;
import java.io.IOException;
import java.util.UUID;

public class EntryPoint {

    private JobConf jobConf = new JobConf(EntryPoint.class);

    public EntryPoint() {
        this("test-job-" + UUID.randomUUID());
    }

    public EntryPoint(String jobName) {
        this(jobName, MyMapper.class, MyReducer.class);
    }

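    /*
     * Alternative mapper/reducer implementations can be supplied through the
     * constructor below; the class names in this example are purely hypothetical:
     *
     *   new EntryPoint("custom-job", OtherMapper.class, OtherReducer.class);
     */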
    public EntryPoint(String jobName, Class<? extends Mapper> mapper,
                      Class<? extends Reducer> reducer) {
        jobConf.setJobName(jobName);
        jobConf.setJarByClass(EntryPoint.class);
        jobConf.setMapperClass(mapper);
        jobConf.setReducerClass(reducer);
        /* intermediate (map output) key/value types */
        jobConf.setMapOutputKeyClass(IntWritable.class);
        jobConf.setMapOutputValueClass(Letter.class);
        /* final (reduce output) key/value types */
        jobConf.setOutputKeyClass(Category.class);
        jobConf.setOutputValueClass(Letter.class);
    }
    public void run(String inputPath, String outputPath, boolean isTest) throws IOException {
        /* get files from the input folder and wrap them into Path objects */
        Path[] paths = null;
        // /* for testing mode - list files from the local file system */
        // if (isTest) {
        File fileFolder = new File(inputPath);
        File[] files = fileFolder.listFiles();
        if (files == null) {
            throw new IOException("Input folder does not exist or is not a directory: " + inputPath);
        }
        paths = new Path[files.length];
        for (int i = 0; i < files.length; i++) {
            /* turn the local file name into a URI-style string before wrapping it in a Path */
            String s = files[i].toString().replaceFirst(":/", "://");
            paths[i] = new Path(s);
        }
        // }
        // /* for production mode - list files uploaded to S3 */
        // else {
        //
        // }
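
        /*
         * A possible sketch of the production branch above (not part of the original
         * code): Hadoop's FileSystem API can list the objects under an S3 prefix,
         * assuming the s3/s3n credentials are configured on the JobConf:
         *
         *   org.apache.hadoop.fs.FileSystem fs =
         *           org.apache.hadoop.fs.FileSystem.get(java.net.URI.create(inputPath), jobConf);
         *   org.apache.hadoop.fs.FileStatus[] statuses = fs.listStatus(new Path(inputPath));
         *   paths = new Path[statuses.length];
         *   for (int i = 0; i < statuses.length; i++) {
         *       paths[i] = statuses[i].getPath();
         *   }
         */
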
        FileInputFormat.setInputPaths(jobConf, paths);
        FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));
        jobConf.setNumMapTasks(20);
        jobConf.setNumReduceTasks(20);
        // jobConf.setCompressMapOutput(true);
        // TextOutputFormat.setCompressOutput(jobConf, true);
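        /*
         * If the compression switches above are enabled, a codec can also be selected
         * explicitly; GzipCodec is just one possible choice, not something this job
         * configures in the original code:
         *
         *   jobConf.setMapOutputCompressorClass(org.apache.hadoop.io.compress.GzipCodec.class);
         *   FileOutputFormat.setOutputCompressorClass(jobConf, org.apache.hadoop.io.compress.GzipCodec.class);
         */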
        JobClient.runJob(jobConf);
    }

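    /*
     * Example invocation; the actual option strings are defined in CmdParser, so the
     * jar name and short flags shown here are assumptions for illustration only:
     *
     *   hadoop jar classifier-job.jar edu.eltech.EntryPoint -i /tmp/letters -o /tmp/out -t
     */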
    public static void main(String[] args) throws ParseException, IOException {
        /* read the attribute values from the command-line parameters */
        CmdParser cmdParser = new CmdParser();
        CommandLine line = cmdParser.getAttributes(args);

        String inputPath = null, outputPath = null;
        boolean isTestRun = false;
        if (line.hasOption(CmdParser.HELP) || line.hasOption(CmdParser.HELP_LONG)) {
            /* help was requested - print it and exit */
            cmdParser.printHelp();
            System.exit(0);
        } else if (line.hasOption(CmdParser.INPUT_PATH) && line.hasOption(CmdParser.OUTPUT_PATH)) {
            inputPath = line.getOptionValue(CmdParser.INPUT_PATH);
            outputPath = line.getOptionValue(CmdParser.OUTPUT_PATH);
            if (line.hasOption(CmdParser.TEST_RUN)) {
                isTestRun = true;
            }
        } else {
            /* required paths are missing - print help and exit with an error */
            cmdParser.printHelp();
            System.exit(1);
        }

        /* set up the environment (test mode only - not applicable on Amazon) */
        if (isTestRun) {
            /* generate the input data beforehand - on Amazon it would already be stored in S3 */
            FileGenerator fileGenerator = new FileGenerator();
            fileGenerator.generateData(inputPath, 100, 10000);
            /* clear the output folder so that Hadoop does not complain about it already existing */
            FileGenerator.removeFolder(outputPath);
        }

        long start = System.currentTimeMillis();
        /* initialize and run the MapReduce job */
        EntryPoint entryPoint = new EntryPoint();
        entryPoint.run(inputPath, outputPath, isTestRun);
        if (isTestRun) {
            long end = System.currentTimeMillis();
            System.out.println("Elapsed time: " + (end - start) / 1000 + " sec.");
        }
    }
}