writer = new BufferedWriter(new FileWriter(INPUT2));
writer.write("4.5" + "\t" + "true" + "\n");
writer.write("4.6" + "\t" + "false" + "\n");
writer.close();
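
// Two inner tuple schemas ("tupleSchema1", "tupleSchema2") are nested as fields of
// two intermediate schemas that share a common "partitionId" field, so tuples from
// both sources end up in the same group.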
TupleMRBuilder builder = new TupleMRBuilder(getConf());
final Schema tupleSchema1 = new Schema("tupleSchema1", Fields.parse("a:string, b:int"));
final Schema tupleSchema2 = new Schema("tupleSchema2", Fields.parse("c:double, d:boolean"));
List<Field> fields = new ArrayList<Field>();
fields.add(Field.create("partitionId", Type.INT));
fields.add(Fields.createTupleField("tuple1", tupleSchema1));
final Schema schema1 = new Schema("tupleInTuple1", fields);
fields.clear();
fields.add(Field.create("partitionId", Type.INT));
fields.add(Fields.createTupleField("tuple2", tupleSchema2));
final Schema schema2 = new Schema("tupleInTuple2", fields);
builder.addIntermediateSchema(schema1);
builder.addIntermediateSchema(schema2);
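
// First source: each tab-separated line of INPUT1 is parsed into tuple1 (a:string, b:int)
// and nested inside a schema1 ("tupleInTuple1") tuple.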
builder.addInput(new Path(INPUT1), new HadoopInputFormat(TextInputFormat.class),
    new TupleMapper<LongWritable, Text>() {

      ITuple tupleInTuple1 = new Tuple(schema1);
      ITuple tuple1 = new Tuple(tupleSchema1);

      @Override
      public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
          throws IOException, InterruptedException {
        String[] split = value.toString().split("\t");
        tuple1.set("a", split[0]);
        tuple1.set("b", Integer.parseInt(split[1]));
        tupleInTuple1.set("partitionId", 0);
        tupleInTuple1.set("tuple1", tuple1);
        collector.write(tupleInTuple1);
      }
    });
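
// Second source: each tab-separated line of INPUT2 (written above) is parsed into
// tuple2 (c:double, d:boolean) and nested inside a schema2 ("tupleInTuple2") tuple.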
builder.addInput(new Path(INPUT2), new HadoopInputFormat(TextInputFormat.class),
    new TupleMapper<LongWritable, Text>() {

      ITuple tupleInTuple2 = new Tuple(schema2);
      ITuple tuple2 = new Tuple(tupleSchema2);

      @Override
      public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
          throws IOException, InterruptedException {
        String[] split = value.toString().split("\t");
        tuple2.set("c", Double.parseDouble(split[0]));
        tuple2.set("d", Boolean.parseBoolean(split[1]));
        tupleInTuple2.set("partitionId", 0);
        tupleInTuple2.set("tuple2", tuple2);
        collector.write(tupleInTuple2);
      }
    });
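
// The reducer sees a single group (partitionId = 0) containing the tuples from both
// sources; the test expects all schema1 tuples before the schema2 tuples.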
builder.setTupleReducer(new TupleReducer<Text, NullWritable>() {

  @Override
  public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
      throws IOException, InterruptedException, TupleMRException {
    Iterator<ITuple> iterator = tuples.iterator();
    ITuple currentTuple;
    assertEquals(0, group.get("partitionId"));
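
    // First data source BEGINS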
    currentTuple = iterator.next();
    assertEquals("foo1", ((ITuple) currentTuple.get("tuple1")).get("a").toString());
    assertEquals(30, ((ITuple) currentTuple.get("tuple1")).get("b"));
    currentTuple = iterator.next();
    assertEquals("foo2", ((ITuple) currentTuple.get("tuple1")).get("a").toString());
    assertEquals(20, ((ITuple) currentTuple.get("tuple1")).get("b"));
    currentTuple = iterator.next();
    assertEquals("foo3", ((ITuple) currentTuple.get("tuple1")).get("a").toString());
    assertEquals(140, ((ITuple) currentTuple.get("tuple1")).get("b"));
    currentTuple = iterator.next();
    assertEquals("foo4", ((ITuple) currentTuple.get("tuple1")).get("a").toString());
    assertEquals(110, ((ITuple) currentTuple.get("tuple1")).get("b"));
    currentTuple = iterator.next();
    assertEquals("foo5", ((ITuple) currentTuple.get("tuple1")).get("a").toString());
    assertEquals(220, ((ITuple) currentTuple.get("tuple1")).get("b"));
    currentTuple = iterator.next();
    assertEquals("foo6", ((ITuple) currentTuple.get("tuple1")).get("a").toString());
    assertEquals(260, ((ITuple) currentTuple.get("tuple1")).get("b"));

    // Second data source BEGINS
    currentTuple = iterator.next();
    assertEquals(4.5, ((ITuple) currentTuple.get("tuple2")).get("c"));
    assertEquals(true, ((ITuple) currentTuple.get("tuple2")).get("d"));
    currentTuple = iterator.next();
    assertEquals(4.6, ((ITuple) currentTuple.get("tuple2")).get("c"));
    assertEquals(false, ((ITuple) currentTuple.get("tuple2")).get("d"));
  }
});
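
// Both intermediate schemas are grouped on their common field; output is plain text.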
builder.setGroupByFields("partitionId");
builder.setOutput(new Path(OUTPUT), new HadoopOutputFormat(TextOutputFormat.class), Text.class, NullWritable.class);
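
// Run the job; the builder's instance files are cleaned up even if the job fails.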
Job job = builder.createJob();
try {
  job.waitForCompletion(true);
} finally {
  builder.cleanUpInstanceFiles();
}
trash(INPUT1, INPUT2, OUTPUT);
}