job.addIntermediateSchema(INTERMEDIATE_SCHEMA);
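// INTERMEDIATE_SCHEMA is assumed to be a Pangool Schema defined elsewhere with
// (at least) the three fields used below: "word", "category" (an enum) and "count" (an int).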
// perform per-category word count mapping
job.addInput(new Path(inputExamples), new HadoopInputFormat(TextInputFormat.class),
    new TupleMapper<LongWritable, Text>() {

      // reuse a single Tuple instance instead of allocating one per call
      ITuple tuple = new Tuple(INTERMEDIATE_SCHEMA);

      @Override
      public void map(LongWritable toIgnore, Text value, TupleMRContext context, Collector collector)
          throws IOException, InterruptedException {
        // each input line is "<category>\t<text>": split it once and parse both parts
        String[] parts = value.toString().split("\t");
        Category category = Category.valueOf(parts[0]);
        StringTokenizer itr = new StringTokenizer(parts[1]);
        tuple.set("category", category);
        tuple.set("count", 1);
        // emit one (word, category, count = 1) tuple per token in the text
        while(itr.hasMoreTokens()) {
          tuple.set("word", normalizeWord(itr.nextToken()));
          collector.write(tuple);
        }
      }
    });
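// reduce phase: sum the "count" field of all tuples in each group and emit one output tuple per group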
TupleReducer<ITuple, NullWritable> countReducer = new TupleReducer<ITuple, NullWritable>() {

  @Override
  public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
      throws IOException, InterruptedException, TupleMRException {
    int count = 0;
    ITuple outputTuple = null;
    // accumulate the partial counts; keep the last tuple so it can be reused as the output tuple
    for(ITuple tuple : tuples) {
      count += (Integer) tuple.get("count");
      outputTuple = tuple;
    }
    // write a single tuple per group carrying the aggregated count
    outputTuple.set("count", count);
    collector.write(outputTuple, NullWritable.get());
  }