// Build the input path list for the staged job (one path format per task).
List<String> inputPaths = new ArrayList<String>();
inputPaths.add(task.inputPathFormat);
String outputPath = task.outputPath;

// Stage output under /tmp first; presumably StagedOutputJob moves it to
// outputPath only on success — TODO confirm against StagedOutputJob.
final StagedOutputJob job = StagedOutputJob.createStagedJob(
  _props,
  _name + "-parse-jobs-" + task.id,
  inputPaths,
  "/tmp" + outputPath,
  outputPath,
  _log);

job.getConfiguration().set("jobs.output.path", _jobsOutputPathRoot);
job.getConfiguration().set("logs.cluster.name", clusterName);

// Scale reducers with input size: 1 reducer per 12 GB, but never fewer than 1.
// NOTE(review): the previous version cast Math.ceil's result to int before
// widening to long (lossy and pointless), and a zero-length input produced
// 0 reducers — a map-only job that would skip TheReducer entirely and emit
// the map-output schema through the final output format.
long numReduceTasks = Math.max(1L,
    (long) Math.ceil(((double) task.totalLength) / 1024 / 1024 / 1024 / 12));

job.setOutputKeyClass(BytesWritable.class);
job.setOutputValueClass(BytesWritable.class);
job.setInputFormatClass(CombinedTextInputFormat.class);
job.setOutputFormatClass(AvroKeyValueOutputFormat.class);

// Final (reducer) output: Avro STRING key -> LogData record value.
AvroJob.setOutputKeySchema(job, Schema.create(Type.STRING));
AvroJob.setOutputValueSchema(job, LogData.SCHEMA$);

job.setNumReduceTasks((int) numReduceTasks);

job.setMapperClass(ParseJobsFromLogs.TheMapper.class);
job.setReducerClass(ParseJobsFromLogs.TheReducer.class);

// Intermediate (map) output uses the same shapes: STRING key -> LogData value.
AvroJob.setMapOutputKeySchema(job, Schema.create(Type.STRING));
AvroJob.setMapOutputValueSchema(job, LogData.SCHEMA$);

// Additional named output "logs" alongside the default output, same schemas.
MyAvroMultipleOutputs.addNamedOutput(job, "logs", AvroKeyValueOutputFormat.class, Schema.create(Type.STRING), LogData.SCHEMA$);