Package storm.trident

Examples of storm.trident.TridentTopology


public class RealTimeTextSearch {

    public static StormTopology buildTopology(TransactionalTridentKafkaSpout spout)
            throws IOException {

        TridentTopology topology = new TridentTopology();
        /**
         * As a first thing, we need a stream of tweets which we can parse and extract
         * only the text and its id. As you will notice, we're going to store the stream
         * using the {@link ElasticSearchState} implementation using its {@link StateUpdater}.
         * Check their implementations for details.
         */
        topology
                .newStream("tweets", spout)
                .each(new Fields("str"), new ParseTweet(), new Fields("text", "content", "user"))
                .each(new Fields("text", "content"), new TweetIdExtractor(), new Fields("tweetId"))
                .project(new Fields("tweetId", "text"))
                .each(new Fields("tweetId", "text"), new Print())
                .partitionPersist(new ElasticSearchStateFactory(), new Fields("tweetId", "text"), new ElasticSearchStateUpdater());

        /**
         * Now we need a DRPC stream to query the state where the tweets are stored.
         * To do that, as shown below, we need an implementation of {@link QueryFunction} to
         * access our {@link ElasticSearchState}.
         */
        TridentState elasticSearchState = topology.newStaticState(new ElasticSearchStateFactory());
        topology
                .newDRPCStream("search")
                .each(new Fields("args"), new Split(" "), new Fields("keywords")) // let's split the arguments
                .stateQuery(elasticSearchState, new Fields("keywords"), new TweetQuery(), new Fields("ids")) // and pass them as query parameters
                .project(new Fields("ids"));
        return topology.build();
    }
View Full Code Here


*/
public class TopHashtagByCountry {

    public static StormTopology buildTopology(TransactionalTridentKafkaSpout spout) throws IOException {

        TridentTopology topology = new TridentTopology();
        TridentState count =
        topology
                .newStream("tweets", spout)
                .each(new Fields("str"), new ParseTweet(), new Fields("status", "content", "user"))
                .project(new Fields("content", "user", "status"))
                .each(new Fields("content"), new OnlyHashtags())
                .each(new Fields("status"), new OnlyGeo())
                .each(new Fields("status", "content"), new ExtractLocation(), new Fields("country", "contentName"))
                .groupBy(new Fields("country", "contentName"))
                .persistentAggregate(new MemoryMapState.Factory(), new Count(), new Fields("count"))
        ;


        topology
                .newDRPCStream("location_hashtag_count")
                .stateQuery(count, new TupleCollectionGet(), new Fields("country", "contentName"))
                .stateQuery(count, new Fields("country", "contentName"), new MapGet(), new Fields("count"))
                .groupBy(new Fields("country"))
                .aggregate(new Fields("contentName", "count"), new FirstN.FirstNSortedAgg(3,"count", true), new Fields("contentName", "count"))
        ;

        return topology.build();
    }
View Full Code Here

public class JoinExample {

    public static StormTopology buildTopology(TransactionalTridentKafkaSpout spout)
            throws IOException {

        TridentTopology topology = new TridentTopology();

        /**
         * First, grab the tweets stream. We're going to use it in two different places
         * and then, we'll going to join them.
         *
         */
        Stream contents = topology
                .newStream("tweets", spout)
                .each(new Fields("str"), new ParseTweet(), new Fields("text", "content", "user"));

        /**
         * Now, let's select and project only hashtags for each tweet.
         * This stream is basically a list of couples (tweetId, hashtag).
         *
         */
        Stream hashtags = contents
                .each(new Fields("content"), new OnlyHashtags())
                .each(new Fields("content"), new TweetIdExtractor(), new Fields("tweetId"))
                .each(new Fields("content"), new GetContentName(), new Fields("hashtag"))
                .project(new Fields("hashtag", "tweetId"));
                //.each(new Fields("content", "tweetId"), new DebugFilter());

        /**
         * And let's do the same for urls, obtaining a stream of couples
         * like (tweetId, url).
         *
         */
        Stream urls = contents
                .each(new Fields("content"), new OnlyUrls())
                .each(new Fields("content"), new TweetIdExtractor(), new Fields("tweetId"))
                .each(new Fields("content"), new GetContentName(), new Fields("url"))
                .project(new Fields("url", "tweetId"));
                //.each(new Fields("content", "tweetId"), new DebugFilter());

        /**
         * Now is time to join on the tweetId to get a stream of triples (tweetId, hashtag, url).
         *
         */
        topology.join(hashtags, new Fields("tweetId"), urls, new Fields("tweetId"), new Fields("tweetId", "hashtag", "url"))
                .each(new Fields("tweetId", "hashtag", "url"), new Print());

        return topology.build();

    }
View Full Code Here

*/
public class TopHashtagByFollowerClass {

    public static StormTopology buildTopology(TransactionalTridentKafkaSpout spout) throws IOException {

        TridentTopology topology = new TridentTopology();
        TridentState count =
        topology
                .newStream("tweets", spout)
                .each(new Fields("str"), new ParseTweet(), new Fields("text", "content", "user"))
                .project(new Fields("content", "user"))
                .each(new Fields("content"), new OnlyHashtags())
                .each(new Fields("user"), new OnlyEnglish())
                .each(new Fields("content", "user"), new ExtractFollowerClassAndContentName(), new Fields("followerClass", "contentName"))
                .groupBy(new Fields("followerClass", "contentName"))
                .persistentAggregate(new MemoryMapState.Factory(), new Count(), new Fields("count"))
        ;


        topology
                .newDRPCStream("hashtag_count")
                .stateQuery(count, new TupleCollectionGet(), new Fields("followerClass", "contentName"))
                .stateQuery(count, new Fields("followerClass", "contentName"), new MapGet(), new Fields("count"))
                .groupBy(new Fields("followerClass"))
                .aggregate(new Fields("contentName", "count"), new FirstN.FirstNSortedAgg(1,"count", true), new Fields("contentName", "count"))
        ;

        return topology.build();
    }
View Full Code Here

* @author Enno Shioji (enno.shioji@peerindex.com)
*/
public class Skeleton {
    private static final Logger log = LoggerFactory.getLogger(Skeleton.class);
    public static StormTopology buildTopology(TransactionalTridentKafkaSpout spout) throws IOException {
        TridentTopology topology = new TridentTopology();
        topology
                .newStream("tweets", spout)
                .each(new Fields("str"), new Print())
        ;

        topology
                .newDRPCStream("ping");

        return topology.build();
    }
View Full Code Here

        String topologyName = args[0];
        conf.setNumWorkers(8); // Our Vagrant environment has 8 workers

        FakeTweetsBatchSpout fakeTweets = new FakeTweetsBatchSpout(10);

        TridentTopology topology = new TridentTopology();
        TridentState countState =
                topology
                        .newStream("spout", fakeTweets)
                        .groupBy(new Fields("actor"))
                        .persistentAggregate(new MemoryMapState.Factory(), new Count(), new Fields("count"));

        topology
                .newDRPCStream("count_per_actor")
                .stateQuery(countState, new Fields("args"), new MapGet(), new Fields("count"));

        StormSubmitter.submitTopology(topologyName, conf, topology.build());

    }
View Full Code Here

        testSpout.feed(ImmutableList.of(new Values("rose", "Shanghai", 32), new Values("mary", "Shanghai", 51), new Values("pere", "Jakarta", 65), new Values("Tom", "Jakarta", 10)));
    }

    private static StormTopology advancedPrimitives(FeederBatchSpout spout) throws IOException {

        TridentTopology topology = new TridentTopology();

        // What if we want more than one aggregation? For that, we can use "chained" aggregations.
        // Note how we calculate count and sum.
        // The aggregated values can then be processed further, in this case into mean
        topology
                .newStream("aggregation", spout)
                .groupBy(new Fields("city"))
                .chainedAgg()
                .aggregate(new Count(), new Fields("count"))
                .aggregate(new Fields("age"), new Sum(), new Fields("age_sum"))
                .chainEnd()
                .each(new Fields("age_sum", "count"), new DivideAsDouble(), new Fields("mean_age"))
                .each(new Fields("city", "mean_age"), new Print())
        ;

        // What if we want to persist results of an aggregation, but want to further process these
        // results? You can use "newValuesStream" for that
        topology
                .newStream("further",spout)
                .groupBy(new Fields("city"))
                .persistentAggregate(new MemoryMapState.Factory(), new Count(), new Fields("count"))
                .newValuesStream()
                .each(new Fields("city", "count"), new Print());

        return topology.build();
    }
View Full Code Here

    }



    private static StormTopology basicStateAndDRPC(LocalDRPC drpc, FeederBatchSpout spout) throws IOException {
        TridentTopology topology = new TridentTopology();

        // persistentAggregate persists the result of aggregation into data stores,
        // which you can use from other applications.
        // You can also use it in other topologies by using the TridentState object returned.
        //
        // The state is commonly backed by a data store like memcache, cassandra etc.
        // Here we are simply using a hash map
        TridentState countState =
                topology
                        .newStream("spout", spout)
                        .groupBy(new Fields("actor"))
                        .persistentAggregate(new MemoryMapState.Factory(), new Count(), new Fields("count"));

        // There are a few ready-made state libraries that you can use
        // Below is an example to use memcached
//        List<InetSocketAddress> memcachedServerLocations = ImmutableList.of(new InetSocketAddress("some.memcached.server",12000));
//        TridentState countStateMemcached =
//                topology
//                        .newStream("spout", spout)
//                        .groupBy(new Fields("actor"))
//                        .persistentAggregate(MemcachedState.transactional(memcachedServerLocations), new Count(), new Fields("count"));



        // DRPC stands for Distributed Remote Procedure Call
        // You can issue calls using the DRPC client library
        // A DRPC call takes two Strings, function name and function arguments
        //
        // In order to call the DRPC defined below, you'd use "count_per_actor" as the function name
        // The function arguments will be available as "args"

        /*
        topology
                .newDRPCStream("ping", drpc)
                .each(new Fields("args"), new Split(" "), new Fields("reply"))
                .each(new Fields("reply"), new RegexFilter("ping"))
                .project(new Fields("reply"));

        // You can apply usual processing primitives to DRPC streams as well
        topology
                .newDRPCStream("count", drpc)
                .each(new Fields("args"), new Split(" "), new Fields("split"))
                .each(new Fields("split"), new RegexFilter("a.*"))
                .groupBy(new Fields("split"))
                .aggregate(new Count(), new Fields("count"));   */


        // More usefully, you can query the state you created earlier
        topology
                .newDRPCStream("count_per_actor", drpc)
                .stateQuery(countState, new Fields("args"), new MapGet(), new Fields("count"));


        // Here is a more complex example
        topology
                .newDRPCStream("count_per_actors", drpc)
                .each(new Fields("args"), new Split(" "), new Fields("actor"))
                .groupBy(new Fields("actor"))
                .stateQuery(countState, new Fields("actor"), new MapGet(), new Fields("individual_count"))
                .each(new Fields("individual_count"), new FilterNull())
                .aggregate(new Fields("individual_count"), new Sum(), new Fields("count"));

        // For how to call DRPC calls, go back to the main method

        return topology.build();
    }
View Full Code Here

        // Spouts create Tuples and Bolts manipulate them and possibly emit new ones.)

        // But in Trident we operate at a higher level.
        // Bolts are created and connected automatically out of higher-level constructs.
        // Also, Spouts are "batched".
        TridentTopology topology = new TridentTopology();

        // The "each" primitive allows us to apply either filters or functions to the stream
        // We always have to select the input fields.
        topology
                .newStream("filter", spout)
                .each(new Fields("actor"), new RegexFilter("pere"))
                .each(new Fields("text", "actor"), new Print());

        // Functions describe their output fields, which are always appended to the input fields.
        // As you see, Each operations can be chained.
        topology
                .newStream("function", spout)
                .each(new Fields("text"), new ToUpperCase(), new Fields("uppercased_text"))
                .each(new Fields("text", "uppercased_text"), new Print());

        // You can prune unnecessary fields using "project"
        topology
                .newStream("projection", spout)
                .each(new Fields("text"), new ToUpperCase(), new Fields("uppercased_text"))
                .project(new Fields("uppercased_text"))
                .each(new Fields("uppercased_text"), new Print());

        // Stream can be parallelized with "parallelismHint"
        // Parallelism hint is applied downwards until a partitioning operation (we will see this later).
        // This topology creates 5 spouts and 5 bolts:
        // Let's debug that with TridentOperationContext.partitionIndex !
        topology
                .newStream("parallel", spout)
                .each(new Fields("actor"), new RegexFilter("pere"))
                .parallelismHint(5)
                .each(new Fields("text", "actor"), new Print());

        // You can perform aggregations by grouping the stream and then applying an aggregation
        // Note how each actor appears more than once. We are aggregating inside small batches (aka micro batches)
        // This is useful for pre-processing before storing the result to databases
        topology
                .newStream("aggregation", spout)
                .groupBy(new Fields("actor"))
                .aggregate(new Count(),new Fields("count"))
                .each(new Fields("actor", "count"),new Print())
        ;

        // In order to aggregate across batches, we need persistentAggregate.
        // This example is incrementing a count in the DB, using the result of these micro batch aggregations
        // (here we are simply using a hash map for the "database")
        topology
                .newStream("aggregation", spout)
                .groupBy(new Fields("actor"))
                .persistentAggregate(new MemoryMapState.Factory(),new Count(),new Fields("count"))
        ;

        return topology.build();
    }
View Full Code Here

        cluster.shutdown();
    }

    private static StormTopology advancedPrimitives(FakeTweetsBatchSpout spout) {

        TridentTopology topology = new TridentTopology();

        // We have seen how to use groupBy, but you can use a more low-level form of aggregation as well
        // This example keeps track of counts, but this time it aggregates the result into a hash map
        topology
                .newStream("aggregation", spout)
                .aggregate(new Fields("location"), new StringCounter(), new Fields("aggregated_result"))
                .parallelismHint(3)
                ;

        // We can affect how the processing is parallelized by using "partitioning"
        topology
                .newStream("aggregation", spout)
                .partitionBy(new Fields("location"))
                .partitionAggregate(new Fields("location"), new StringCounter(), new Fields("aggregated_result"))
                .parallelismHint(3)
        ;

        // If no partitioning is specified (as in the former), a given location can be aggregated in different
        // aggregators. In the later, all input with a given location are routed to the same instance of aggregation.
        // This means that, more summarization can be done in the later, which would make subsequent processing more
        // efficient. However, note that if your input is skewed, the workload can become skewed, too

        // Here is an example how to deal with such skews
        topology
                .newStream("aggregation", spout)
                .partitionBy(new Fields("location"))
                .partitionAggregate(new Fields("location"), new StringCounter(), new Fields("count_map"))
                .each(new Fields("count_map"), new HasSpain())
                .each(new Fields("count_map"), new Print("AFTER-HAS-SPAIN"))
                .parallelismHint(3)
                .shuffle()
                .each(new Fields("count_map"), new TimesTen(), new Fields("count_map_times_ten"))
                .each(new Fields("count_map_times_ten"), new Print("AFTER-TIMES-TEN"))
                .parallelismHint(3)
        ;

        // Without the "shuffle" partitioning, only a single partition will be executing the "TimesTen" function,
        // i.e. the workload will not be distributed. With the "shuffle" partitioning, the skew is corrected and
        // the workload will be distributed again.
        // Note the need for two parallelismHints, as parallelismHints apply downwards up until a partitioning operation

        // There are several other partitioning operations.
        // Here is an example that uses the "global" partitining, which sends all tuples to the same partition
        // This means however, that the processing can't be distributed -- something you want to avoid
        topology
                .newStream("aggregation", spout)
                .global()
                .each(new Fields("actor"), new Print())
                .parallelismHint(3)
        ;

        //


        return topology.build();
    }
View Full Code Here

TOP

Related Classes of storm.trident.TridentTopology

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.