}
String inputExamples = args[0];
String output = args[1];
delete(output);
TupleMRBuilder job = new TupleMRBuilder(conf, "Naive Bayes Model Generator");
job.addIntermediateSchema(INTERMEDIATE_SCHEMA);
// perform per-category word count mapping
job.addInput(new Path(inputExamples), new HadoopInputFormat(TextInputFormat.class),
new TupleMapper<LongWritable, Text>() {
ITuple tuple = new Tuple(INTERMEDIATE_SCHEMA);
/**
 * Emits one intermediate tuple per token of a training example.
 *
 * <p>The input line is split on tab characters. The first field is parsed with
 * {@code Category.valueOf}; the second field is tokenized with a default
 * {@link StringTokenizer} (whitespace delimiters). Any further tab-separated fields are
 * ignored. For every token, the mapper's reusable {@code tuple} is written with the
 * category, a count of 1 and the token after {@code normalizeWord}.
 *
 * @param toIgnore input key; not read
 * @param value one input line
 * @param context not read by this method
 * @param collector receives one tuple per token
 * @throws IOException if the line has fewer than two tab-separated fields
 */
@Override
public void map(LongWritable toIgnore, Text value, TupleMRContext context, Collector collector)
    throws IOException, InterruptedException {
  // Split once and reuse the fields instead of re-splitting the same line.
  String line = value.toString();
  String[] fields = line.split("\t");
  if(fields.length < 2) {
    // Fail with the offending record instead of an opaque ArrayIndexOutOfBoundsException.
    throw new IOException("Malformed example, expected '<category>\\t<text>' but got: '" + line + "'");
  }
  Category category = Category.valueOf(fields[0]);
  StringTokenizer itr = new StringTokenizer(fields[1]);
  // Category and count are the same for every token of this line; only "word" changes.
  tuple.set("category", category);
  tuple.set("count", 1);
  while(itr.hasMoreTokens()) {
    tuple.set("word", normalizeWord(itr.nextToken()));
    collector.write(tuple);
  }
}
});
TupleReducer countReducer = new TupleReducer<ITuple, NullWritable>() {
/**
 * Sums the "count" field over every tuple of the group and writes a single result.
 *
 * <p>The tuple written is the last one returned by the iteration, with its "count"
 * field overwritten by the sum; it is paired with {@code NullWritable.get()}.
 * The iterable is expected to yield at least one tuple: with none, the final
 * {@code set} call dereferences {@code null}.
 *
 * @param group not read by this method
 * @param tuples the tuples whose "count" fields are added up
 * @param context not read by this method
 * @param collector receives the single summed tuple
 */
public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
    throws IOException, InterruptedException, TupleMRException {
  ITuple lastSeen = null;
  int total = 0;
  for(ITuple current : tuples) {
    // Remember the most recent tuple; it is reused as the output record.
    lastSeen = current;
    total = total + (Integer) current.get("count");
  }
  lastSeen.set("count", total);
  collector.write(lastSeen, NullWritable.get());
}
};
job.setTupleCombiner(countReducer);
job.setTupleReducer(countReducer);
job.setGroupByFields("word", "category");
job.setTupleOutput(new Path(output), INTERMEDIATE_SCHEMA);
if(job.createJob().waitForCompletion(true)) {
return 1;
}
return -1;
}