package tutorial.storm.trident;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.generated.StormTopology;
import backtype.storm.tuple.Fields;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import storm.trident.TridentTopology;
import storm.trident.operation.builtin.Count;
import storm.trident.spout.IBatchSpout;
import storm.trident.testing.MemoryMapState;
import tutorial.storm.trident.operations.Print;
import tutorial.storm.trident.operations.RegexFilter;
import tutorial.storm.trident.operations.ToUpperCase;
import tutorial.storm.trident.testutil.FakeTweetsBatchSpout;

import java.io.IOException;
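
// The imports below are used only by the inline sketches added throughout this class
// (hypothetical illustrative examples, not part of the original tutorial code).
import storm.trident.operation.BaseFilter;
import storm.trident.operation.BaseFunction;
import storm.trident.operation.CombinerAggregator;
import storm.trident.operation.TridentCollector;
import storm.trident.operation.TridentOperationContext;
import storm.trident.tuple.TridentTuple;
import backtype.storm.tuple.Values;

import java.util.Map;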


/**
* @author Enno Shioji (enno.shioji@peerindex.com)
*/
public class Part01_BasicPrimitives {
    private static final Logger log = LoggerFactory.getLogger(Part01_BasicPrimitives.class);

    public static void main(String[] args) throws Exception {
        // Storm can be run locally for testing purposes
        Config conf = new Config();
//        conf.put(Config.TOPOLOGY_DEBUG,true);

        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("basic_primitives", conf, basicPrimitives(new FakeTweetsBatchSpout()));

        // Keep the local cluster alive long enough to observe the output, then shut it down
        Thread.sleep(100000);
        cluster.shutdown();
    }

    public static StormTopology basicPrimitives(IBatchSpout spout) throws IOException {

        // A topology is a set of streams.
        // Under the hood, a topology is compiled into a DAG of Spouts and Bolts.
        // (In Storm there are Spouts (data producers) and Bolts (data processors).
        // Spouts create Tuples and Bolts manipulate them and possibly emit new ones.)

        // But in Trident we operate at a higher level.
        // Bolts are created and connected automatically out of higher-level constructs.
        // Also, Spouts are "batched".
        TridentTopology topology = new TridentTopology();

        // The "each" primitive allows us to apply either filters or functions to the stream
        // We always have to select the input fields.
        topology
                .newStream("filter", spout)
                .each(new Fields("actor"), new RegexFilter("pere"))
                .each(new Fields("text", "actor"), new Print());

        // Functions describe their output fields, which are always appended to the input fields.
        // As you can see, "each" operations can be chained.
        topology
                .newStream("function", spout)
                .each(new Fields("text"), new ToUpperCase(), new Fields("uppercased_text"))
                .each(new Fields("text", "uppercased_text"), new Print());

        // You can prune unnecessary fields using "project"
        topology
                .newStream("projection", spout)
                .each(new Fields("text"), new ToUpperCase(), new Fields("uppercased_text"))
                .project(new Fields("uppercased_text"))
                .each(new Fields("uppercased_text"), new Print());

        // Streams can be parallelized with "parallelismHint".
        // The parallelism hint is applied backwards towards the spout, until a partitioning operation is reached (we will see this later).
        // This topology creates 5 spouts and 5 bolts.
        // Let's verify that with TridentOperationContext.getPartitionIndex() (see the sketch after this stream).
        topology
                .newStream("parallel", spout)
                .each(new Fields("actor"), new RegexFilter("pere"))
                .parallelismHint(5)
                .each(new Fields("text", "actor"), new Print());

        // You can perform aggregations by grouping the stream and then applying an aggregation to each group.
        // Note how each actor appears more than once: we are aggregating within each small batch (a.k.a. micro-batch).
        // This is useful for pre-aggregating before storing the results in a database.
        topology
                .newStream("aggregation", spout)
                .groupBy(new Fields("actor"))
                .aggregate(new Count(), new Fields("count"))
                .each(new Fields("actor", "count"), new Print());
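
        // For reference, the builtin Count is just a CombinerAggregator. A sketch of a
        // hypothetical inline equivalent: init() maps each tuple to 1, combine() sums
        // partial counts, and zero() supplies the identity value for empty partitions.
        topology
                .newStream("inline_aggregation", spout)
                .groupBy(new Fields("actor"))
                .aggregate(new CombinerAggregator<Long>() {
                    @Override
                    public Long init(TridentTuple tuple) {
                        return 1L;
                    }

                    @Override
                    public Long combine(Long val1, Long val2) {
                        return val1 + val2;
                    }

                    @Override
                    public Long zero() {
                        return 0L;
                    }
                }, new Fields("count"))
                .each(new Fields("actor", "count"), new Print());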

        // In order to aggregate across batches, we need persistentAggregate.
        // This example increments a count in the database, using the result of these micro-batch aggregations
        // (here we are simply using an in-memory hash map as the "database"; see the sketch after this stream).
        topology
                .newStream("aggregation", spout)
                .groupBy(new Fields("actor"))
                .persistentAggregate(new MemoryMapState.Factory(),new Count(),new Fields("count"))
        ;
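
        // A sketch (assuming TridentState.newValuesStream(), which emits the updated
        // key/value pairs after each batch): this lets us watch the persistent counts
        // grow across batches, unlike the per-batch counts above.
        topology
                .newStream("persistent_aggregation_debug", spout)
                .groupBy(new Fields("actor"))
                .persistentAggregate(new MemoryMapState.Factory(), new Count(), new Fields("count"))
                .newValuesStream()
                .each(new Fields("actor", "count"), new Print());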

        return topology.build();
    }

}