Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(
TokenProcessor.class.getName()));
tokenizerVertex.addDataSource(INPUT, dataSource);
// Use Text key and IntWritable value to bring counts for each word in the same partition
OrderedPartitionedKVEdgeConfig summationEdgeConf = OrderedPartitionedKVEdgeConfig
.newBuilder(Text.class.getName(), IntWritable.class.getName(),
HashPartitioner.class.getName()).build();
// This vertex will be reading intermediate data via an input edge and writing intermediate data
// via an output edge.
Vertex summationVertex = Vertex.create(SUMMATION, ProcessorDescriptor.create(
SumProcessor.class.getName()), numPartitions);
// Use IntWritable key and Text value to bring all words with the same count in the same
// partition. The data will be ordered by count and words grouped by count.
OrderedPartitionedKVEdgeConfig sorterEdgeConf = OrderedPartitionedKVEdgeConfig
.newBuilder(IntWritable.class.getName(), Text.class.getName(),
HashPartitioner.class.getName()).build();
// Use 1 task to bring all the data in one place for global sorted order. Essentially the number
// of partitions is 1, so the NoOpSorter can be used to produce the globally ordered output.
Vertex sorterVertex = Vertex.create(SORTER, ProcessorDescriptor.create(
NoOpSorter.class.getName()), 1);
sorterVertex.addDataSink(OUTPUT, dataSink);
// No need to add the jar containing this class; it is assumed to be part of the Tez jars.
DAG dag = DAG.create(dagName);
dag.addVertex(tokenizerVertex)
.addVertex(summationVertex)
.addVertex(sorterVertex)
.addEdge(
Edge.create(tokenizerVertex, summationVertex,
summationEdgeConf.createDefaultEdgeProperty()))
.addEdge(
Edge.create(summationVertex, sorterVertex, sorterEdgeConf.createDefaultEdgeProperty()));
return dag;
}