HadoopUtils.deleteIfExists(fS, outPath);
HadoopUtils.deleteIfExists(fS, outPathText);
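// First Job: read the text input with MyInputProcessor, sort by (title, content)
// and write the result as a Tuple File using the original Schema.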
Schema originalSchema = new Schema("schema", Fields.parse("title:string, content:string"));
TupleMRBuilder builder = new TupleMRBuilder(conf);
builder.addIntermediateSchema(originalSchema);
builder.setGroupByFields("title");
builder.setOrderBy(new OrderBy().add("title", Order.ASC).add("content", Order.ASC));
builder.setTupleReducer(new IdentityTupleReducer());
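// setTupleOutput writes the reducer output as a Tuple File so it can be read back
// later as the input of another Tuple MapReduce Job.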
builder.setTupleOutput(outPath, originalSchema);
builder.addInput(inPath, new HadoopInputFormat(TextInputFormat.class), new MyInputProcessor());
Job job = builder.createJob();
try {
  job.waitForCompletion(true);
} finally {
  builder.cleanUpInstanceFiles();
}
// Use the output of the first Job as the input of a new TupleMRBuilder.
// To make things nicer, we evolve the Schema and use a different Schema for reading the Tuple File:
// we remove the "title" field and add a new nullable field.
Schema evolvedSchema = new Schema("evolved", Fields.parse("content:string, new_field:string?"));
builder = new TupleMRBuilder(conf);
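// The Tuple File written by the first Job is read back with the evolved Schema;
// the new nullable field is expected to come back as null, since it is not present in the file.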
builder.addTupleInput(outPath, evolvedSchema, new IdentityTupleMapper());
builder.addIntermediateSchema(evolvedSchema);
builder.setGroupByFields("content");
builder.setTupleReducer(new MyGroupHandler());
builder.setOutput(outPathText, new HadoopOutputFormat(TextOutputFormat.class), Text.class,
    NullWritable.class);
job = builder.createJob();
try {
  assertRun(job);
} finally {
  builder.cleanUpInstanceFiles();
}
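// Verify the plain-text output produced by the second Job.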
Assert.assertEquals("bar2 foo2\nfoo1 bar1",
Files.toString(new File(OUT_TEXT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());
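
// For completeness, a minimal sketch of what "MyInputProcessor" could look like. Its implementation
// is not shown in this snippet, so the following is only an assumption: a TupleMapper that splits
// each input line into the "title" and "content" fields of the original Schema (the tab delimiter
// and the line format are hypothetical).
public static class MyInputProcessor extends TupleMapper<LongWritable, Text> {

  // Reuse a single Tuple instance built from the same Schema the builder registers as intermediate.
  private Tuple tuple = new Tuple(new Schema("schema", Fields.parse("title:string, content:string")));

  @Override
  public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
      throws IOException, InterruptedException {
    // Hypothetical line format: title <TAB> content
    String[] parts = value.toString().split("\t", 2);
    tuple.set("title", parts[0]);
    tuple.set("content", parts[1]);
    collector.write(tuple);
  }
}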