Package tutorial.storm.trident

Source Code of tutorial.storm.trident.Part02_AdvancedPrimitives1$HasSpain

package tutorial.storm.trident;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.generated.StormTopology;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import storm.trident.TridentTopology;
import storm.trident.operation.BaseFilter;
import storm.trident.operation.BaseFunction;
import storm.trident.operation.TridentCollector;
import storm.trident.tuple.TridentTuple;
import tutorial.storm.trident.operations.Print;
import tutorial.storm.trident.operations.StringCounter;
import tutorial.storm.trident.testutil.FakeTweetsBatchSpout;

import java.util.HashMap;
import java.util.Map;


/**
* @author Enno Shioji (enno.shioji@peerindex.com)
*/
public class Part02_AdvancedPrimitives1 {
    private static final Logger log = LoggerFactory.getLogger(Part02_AdvancedPrimitives1.class);

    public static void main(String[] args) throws Exception {
        Config conf = new Config();
//        conf.put(Config.TOPOLOGY_DEBUG,true);
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("advanced_primitives", conf, advancedPrimitives(new FakeTweetsBatchSpout(1000)));
        Thread.sleep(30000);
        cluster.shutdown();
    }

    private static StormTopology advancedPrimitives(FakeTweetsBatchSpout spout) {

        TridentTopology topology = new TridentTopology();

        // We have seen how to use groupBy, but you can use a more low-level form of aggregation as well
        // This example keeps track of counts, but this time it aggregates the result into a hash map
        topology
                .newStream("aggregation", spout)
                .aggregate(new Fields("location"), new StringCounter(), new Fields("aggregated_result"))
                .parallelismHint(3)
                ;

        // We can affect how the processing is parallelized by using "partitioning"
        topology
                .newStream("aggregation", spout)
                .partitionBy(new Fields("location"))
                .partitionAggregate(new Fields("location"), new StringCounter(), new Fields("aggregated_result"))
                .parallelismHint(3)
        ;

        // If no partitioning is specified (as in the former), a given location can be aggregated in different
        // aggregators. In the later, all input with a given location are routed to the same instance of aggregation.
        // This means that, more summarization can be done in the later, which would make subsequent processing more
        // efficient. However, note that if your input is skewed, the workload can become skewed, too

        // Here is an example how to deal with such skews
        topology
                .newStream("aggregation", spout)
                .partitionBy(new Fields("location"))
                .partitionAggregate(new Fields("location"), new StringCounter(), new Fields("count_map"))
                .each(new Fields("count_map"), new HasSpain())
                .each(new Fields("count_map"), new Print("AFTER-HAS-SPAIN"))
                .parallelismHint(3)
                .shuffle()
                .each(new Fields("count_map"), new TimesTen(), new Fields("count_map_times_ten"))
                .each(new Fields("count_map_times_ten"), new Print("AFTER-TIMES-TEN"))
                .parallelismHint(3)
        ;

        // Without the "shuffle" partitioning, only a single partition will be executing the "TimesTen" function,
        // i.e. the workload will not be distributed. With the "shuffle" partitioning, the skew is corrected and
        // the workload will be distributed again.
        // Note the need for two parallelismHints, as parallelismHints apply downwards up until a partitioning operation

        // There are several other partitioning operations.
        // Here is an example that uses the "global" partitining, which sends all tuples to the same partition
        // This means however, that the processing can't be distributed -- something you want to avoid
        topology
                .newStream("aggregation", spout)
                .global()
                .each(new Fields("actor"), new Print())
                .parallelismHint(3)
        ;

        //


        return topology.build();
    }

    public static class HasSpain extends BaseFilter {
        @Override
        public boolean isKeep(TridentTuple tuple) {
            Map<String,Integer> val = (Map<String,Integer>)tuple.get(0);
            return val != null && val.keySet().contains("Spain");
        }
    }


    private static class TimesTen extends BaseFunction {
        @Override
        public void execute(TridentTuple tuple, TridentCollector collector) {
            Map<String,Integer> val = (Map<String,Integer>)tuple.get(0);
            Map<String,Integer> ret = new HashMap<String, Integer>();
            for (Map.Entry<String, Integer> e : val.entrySet()) {
                ret.put(e.getKey(), e.getValue() * 10);
            }
            collector.emit(new Values(ret));
        }
    }
}
TOP

Related Classes of tutorial.storm.trident.Part02_AdvancedPrimitives1$HasSpain

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.