package org.xadoop.driver;
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.xadoop.saxon.SaxonMapper;
import org.xadoop.saxon.SaxonReducer;
import org.xadoop.xquerymr.XQueryMRSaxonConfFile;
/**
 * Driver for the Saxon Hadoop binding. Runs the configured pipeline as a
 * chain of MapReduce jobs, one per stage, feeding each stage the output
 * directory of the previous one.
*
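 * Example invocation (the jar and config file names are illustrative, not
 * taken from the original source):
 *
 *   hadoop jar xadoop.jar org.xadoop.driver.SaxonDriver -conf query-conf.xml in/ out/
 *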
* @author Georg Polzer
*
*/
public class SaxonDriver {
    public static void main(String[] args) throws IOException {
        Configuration configuration = new Configuration();
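        // Let Hadoop strip its generic options (-conf, -D, -files, ...) before
        // we read our own positional arguments.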
        GenericOptionsParser parser = new GenericOptionsParser(configuration, args);
        args = parser.getRemainingArgs();
        if (args.length < 2) {
            System.err.println("Usage: SaxonDriver <input dir> <output dir>");
            System.exit(2);
        }
        final String inputDirName = args[0];
        final String outputDirName = args[1];
        final String pipeline = configuration.get(XQueryMRSaxonConfFile.PROPNAME_PIPELINE);
        if (pipeline == null) {
            System.err.println("ERROR: " + XQueryMRSaxonConfFile.PROPNAME_PIPELINE + " not set");
            System.exit(-1);
        }
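        // The pipeline property is assumed to be a comma-separated list of job
        // names, e.g. "stage1, stage2"; split on commas with optional whitespace.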
        final String[] pipelineStages = pipeline.split("\\s*,\\s*");
        final String prolog = configuration.get(XQueryMRSaxonConfFile.PROPNAME_QUERYFILE);
        if (prolog == null) {
            System.err.println("ERROR: " + XQueryMRSaxonConfFile.PROPNAME_QUERYFILE + " not set");
            System.exit(-1);
        }
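        // Ship the query file to every task node via the distributed cache so
        // mappers and reducers can read it locally.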
        DistributedCache.addCacheFile(new Path(prolog).toUri(), configuration);
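        // Hadoop 1.x property names: cap the concurrent map/reduce slots per
        // tasktracker and raise the task JVM heap.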
configuration.set("mapred.tasktracker.map.tasks.maximum", "2");
configuration.set("mapred.tasktracker.reduce.tasks.maximum", "2");
configuration.set("mapred.child.java.opts", "-Xmx8192m");
        // Arrays.toString prints the stage names; println(array) would only
        // print the array's object reference.
        System.out.println(Arrays.toString(pipelineStages));
        // Run one MapReduce job per pipeline stage, chaining outputs to inputs.
        String previousJobDirName = inputDirName;
        for (String jobName : pipelineStages) {
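            // new Job(configuration) copies the configuration, so settings made
            // on one job do not leak into the next.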
            Job job = new Job(configuration);
            job.setJobName(jobName);
            job.setJarByClass(SaxonDriver.class);
            job.setMapperClass(SaxonMapper.class);
            job.setReducerClass(SaxonReducer.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
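            // Assumption (not in the original source): SaxonReducer emits Text
            // keys and values, mirroring the map output types.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);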
            job.setInputFormatClass(TextInputFormat.class);
            // Read the previous stage's output (or the user input for the
            // first stage) and write this stage's output to its own directory.
            String currentJobDirName = outputDirName + "/" + jobName;
            FileInputFormat.setInputPaths(job, new Path(previousJobDirName));
            FileOutputFormat.setOutputPath(job, new Path(currentJobDirName));
            // Chain: this stage's output becomes the next stage's input.
            previousJobDirName = currentJobDirName;
            // Run the stage and abort the pipeline if it fails, so later
            // stages do not consume missing or partial output.
            try {
                if (!job.waitForCompletion(true)) {
                    System.err.println("ERROR: job " + jobName + " failed");
                    System.exit(1);
                }
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            } catch (ClassNotFoundException e) {
                throw new RuntimeException(e);
            }
        }
    }
}