// We have seen how to use groupBy, but you can use a more low-level form of aggregation as well
// This example keeps track of counts, but this time it aggregates the result into a hash map
// Stream-level aggregation: opens a stream on `spout` and runs a StringCounter aggregator over it.
// NOTE(review): the string "aggregation" is passed to newStream by every example in this file —
// confirm whether the framework expects this argument to be unique per stream.
topology
.newStream("aggregation", spout)
// Reads the "location" field and emits the aggregator's output as the field "aggregated_result".
// Presumably StringCounter builds a per-location count map — verify against its implementation.
.aggregate(new Fields("location"), new StringCounter(), new Fields("aggregated_result"))
// Requests a parallelism of 3 for this part of the chain.
.parallelismHint(3)
;
// We can affect how the processing is parallelized by using "partitioning"
// Partitioned aggregation: the stream is first repartitioned on "location", then StringCounter is
// applied with partitionAggregate instead of aggregate.
topology
.newStream("aggregation", spout)
// Repartition on the "location" field before aggregating.
.partitionBy(new Fields("location"))
// Reads the "location" field and emits the aggregator's output as the field "aggregated_result".
// Presumably each partition is aggregated independently of the others — confirm against the Trident API docs.
.partitionAggregate(new Fields("location"), new StringCounter(), new Fields("aggregated_result"))
// Requests a parallelism of 3 for this part of the chain.
.parallelismHint(3)
;
// If no partitioning is specified (as in the former), a given location can be aggregated by different
// aggregators. In the latter, all input with a given location is routed to the same aggregator instance.
// This means that more summarization can be done in the latter, which makes subsequent processing more
// efficient. However, note that if your input is skewed, the workload can become skewed, too.
// Here is an example of how to deal with such skew:
topology
.newStream("aggregation", spout)
.partitionBy(new Fields("location"))
.partitionAggregate(new Fields("location"), new StringCounter(), new Fields("count_map"))
.each(new Fields("count_map"), new HasSpain())
.each(new Fields("count_map"), new Print("AFTER-HAS-SPAIN"))
.parallelismHint(3)
.shuffle()
.each(new Fields("count_map"), new TimesTen(), new Fields("count_map_times_ten"))