    failArguments("Invalid number of arguments");
    return -1;
  }
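  // Remove any previous output so the job can be re-run (delete() is a helper defined elsewhere in the example)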
  delete(args[1]);
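  // Build a Tuple MapReduce job with a single intermediate schema, grouping tuples by the "line" field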
  TupleMRBuilder job = new TupleMRBuilder(conf);
  job.addIntermediateSchema(SCHEMA);
  job.setGroupByFields("line");
  String input = args[0], output = args[1];
  FileSystem fileSystem = FileSystem.get(conf);
  for(Category category : Category.values()) { // For each Category
    String categoryString = category.toString().toLowerCase();
    // Add one input spec per (category, book title) file, processed by its associated CategoryMapper
    for(FileStatus fileStatus : fileSystem.listStatus(new Path(input + "/" + categoryString))) {
      job.addInput(fileStatus.getPath(), new HadoopInputFormat(TextInputFormat.class),
          new CategoryMapper(category, fileStatus.getPath().getName()));
    }
    // Add a named output for each category
    job.addNamedOutput(categoryString, new TupleSolrOutputFormat(new File(
        "src/test/resources/shakespeare-solr"), conf), ITuple.class, NullWritable.class);
  }
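  // Every record is written through the per-category named outputs, so the main output can be a NullOutputFormat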
  job.setOutput(new Path(output), new HadoopOutputFormat(NullOutputFormat.class), ITuple.class,
      NullWritable.class);
  // The reducer will just emit the tuple to the corresponding Category output
  job.setTupleReducer(new TupleReducer<ITuple, NullWritable>() {
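    // A single Tuple instance is reused for every record written by the reducer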
    ITuple outTuple = new Tuple(OUT_SCHEMA);
    @Override
    public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context,
        Collector collector) throws IOException, InterruptedException, TupleMRException {
      for(ITuple tuple : tuples) {
        Category category = (Category) tuple.get("category");
        outTuple.set("line", tuple.get("line"));
        outTuple.set("text", tuple.get("text"));
        outTuple.set("title", tuple.get("title"));
        collector.getNamedOutput(category.toString().toLowerCase())
            .write(outTuple, NullWritable.get());
      }
    }
  });
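  // Run the job synchronously and always clean up the builder's temporary instance files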
  try {
    Job hadoopJob = job.createJob();
    hadoopJob.waitForCompletion(true);
  } finally {
    job.cleanUpInstanceFiles();
  }
  return 0;
}