Package cascading.pipe.assembly

Examples of cascading.pipe.assembly.Retain
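
Retain is a SubAssembly that keeps only the fields named in its field selector and discards every other field in the tuple stream; it is the complement of Discard, which removes the named fields. A minimal sketch of its use, assuming a hypothetical incoming stream carrying "doc_id", "text", and "tmp" fields:

    Pipe docPipe = new Pipe( "docs" );
    // keep only "doc_id" and "text"; the "tmp" field is dropped from the stream
    docPipe = new Retain( docPipe, new Fields( "doc_id", "text" ) );

Internally, Retain is planned as an Each applying the Identity function with Fields.RESULTS as the output selector, as the examples below show in the context of larger flows.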


    // create PIPEs for left join on the stop words
    Pipe stopPipe = new Pipe( "stop" ); // name branch
    Pipe joinPipe = new HashJoin( tweetPipe, new Fields( "token" ), stopPipe, new Fields( "stop" ), new LeftJoin() );
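    // after the left join, a null/empty "stop" field means the token matched no stop word; keep only those tuples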
    joinPipe = new Each( joinPipe, new Fields( "stop" ), new RegexFilter( "^$" ) );

    // Retain keeps just "uid" and "token"; all other fields are discarded
    joinPipe = new Retain( joinPipe, new Fields( "uid", "token" ) );

    /*
    flow part #2
    create SINK tap to measure token frequency, which will need to be used to adjust
    stop words -- based on an R script
    */


    // perform a left join to remove stop words, discarding the rows
    // which joined with stop words, i.e., were non-null after left join
    Pipe stopPipe = new Pipe( "stop" );
    Pipe tokenPipe = new HashJoin( docPipe, token, stopPipe, stop, new LeftJoin() );
    tokenPipe = new Each( tokenPipe, stop, new RegexFilter( "^$" ) );
    tokenPipe = new Retain( tokenPipe, fieldSelector );

    // one branch of the flow tallies the token counts for term frequency (TF)
    Pipe tfPipe = new Pipe( "TF", tokenPipe );
    Fields tf_count = new Fields( "tf_count" );
    tfPipe = new CountBy( tfPipe, new Fields( "doc_id", "token" ), tf_count );

    Fields tf_token = new Fields( "tf_token" );
    tfPipe = new Rename( tfPipe, token, tf_token );

    // one branch counts the number of documents (D)
    Fields doc_id = new Fields( "doc_id" );
    Fields tally = new Fields( "tally" );
    Fields rhs_join = new Fields( "rhs_join" );
    Fields n_docs = new Fields( "n_docs" );
    Pipe dPipe = new Unique( "D", tokenPipe, doc_id );
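    // insert constant 1s: "tally" will be summed to count the docs; "rhs_join" serves as a constant join key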
    dPipe = new Each( dPipe, new Insert( tally, 1 ), Fields.ALL );
    dPipe = new Each( dPipe, new Insert( rhs_join, 1 ), Fields.ALL );
    dPipe = new SumBy( dPipe, rhs_join, tally, n_docs, long.class );

    // one branch tallies the token counts for document frequency (DF)
    Pipe dfPipe = new Unique( "DF", tokenPipe, Fields.ALL );
    Fields df_count = new Fields( "df_count" );
    dfPipe = new CountBy( dfPipe, token, df_count );

    Fields df_token = new Fields( "df_token" );
    Fields lhs_join = new Fields( "lhs_join" );
    dfPipe = new Rename( dfPipe, token, df_token );
    dfPipe = new Each( dfPipe, new Insert( lhs_join, 1 ), Fields.ALL );

    // join to bring together all the components for calculating TF-IDF
    // the D side of the join is smaller, so it goes on the RHS
    Pipe idfPipe = new HashJoin( dfPipe, lhs_join, dPipe, rhs_join );

    // the IDF side of the join is smaller, so it goes on the RHS
    Pipe tfidfPipe = new CoGroup( tfPipe, tf_token, idfPipe, df_token );

    // calculate the TF-IDF weights, per token, per document
    Fields tfidf = new Fields( "tfidf" );
    String expression = "(double) tf_count * Math.log( (double) n_docs / ( 1.0 + df_count ) )";
    ExpressionFunction tfidfExpression = new ExpressionFunction( tfidf, expression, Double.class );
    Fields tfidfArguments = new Fields( "tf_count", "df_count", "n_docs" );
    tfidfPipe = new Each( tfidfPipe, tfidfArguments, tfidfExpression, Fields.ALL );

    fieldSelector = new Fields( "tf_token", "doc_id", "tfidf" );
    tfidfPipe = new Retain( tfidfPipe, fieldSelector );
    tfidfPipe = new Rename( tfidfPipe, tf_token, token );

    // keep track of the word counts, which are useful for QA
    Pipe wcPipe = new Pipe( "wc", tfPipe );


    if( pipe == null )
      tail = new Pipe( getBranchName() );
    else
      tail = new Pipe( getBranchName(), pipe );

    if( getRetainIncomingFields() != null )
      tail = new Retain( tail, getRetainIncomingFields() );

    for( Model model : getPMMLModel().getModels() )
      {
      if( !model.isScorable() )
        {

    // determine a tree geohash
    Fields geohashArguments = new Fields( "tree_lat", "tree_lng" );
    treePipe = new Each( treePipe, geohashArguments, new GeoHashFunction( new Fields( "tree_geohash" ), 6 ), Fields.ALL );

    // trim the tree branch down to the parsed tree fields plus the new geohash
    Fields fieldSelector = new Fields( "tree_name", "priv", "tree_id", "situs", "tree_site", "species", "wikipedia", "calflora", "min_height", "max_height", "tree_lat", "tree_lng", "tree_alt", "tree_geohash" );
    treePipe = new Retain( treePipe, fieldSelector );

    // parse the "road" output
    Pipe roadPipe = new Pipe( "road", tsvCheck );
    regex = "^\\s+Sequence\\:.*\\s+Year Constructed\\:\\s+(\\d+)\\s+Traffic Count\\:\\s+(\\d+)\\s+Traffic Index\\:\\s+(\\w.*\\w)\\s+Traffic Class\\:\\s+(\\w.*\\w)\\s+Traffic Date.*\\s+Paving Length\\:\\s+(\\d+)\\s+Paving Width\\:\\s+(\\d+)\\s+Paving Area\\:\\s+(\\d+)\\s+Surface Type\\:\\s+(\\w.*\\w)\\s+Surface Thickness.*\\s+Bike Lane\\:\\s+(\\w+)\\s+Bus Route\\:\\s+(\\w+)\\s+Truck Route\\:\\s+(\\w+)\\s+Remediation.*$";
    roadPipe = new Each( roadPipe, new Fields( "misc" ), new RegexFilter( regex ) );
    Fields roadFields = new Fields( "year_construct", "traffic_count", "traffic_index", "traffic_class", "paving_length", "paving_width", "paving_area", "surface_type", "bike_lane", "bus_route", "truck_route" );
    int[] roadGroups = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
    parser = new RegexParser( roadFields, regex, roadGroups );
    roadPipe = new Each( roadPipe, new Fields( "misc" ), parser, Fields.ALL );

    // join with road metadata
    Pipe metaRoadPipe = new Pipe( "meta_road" );
    roadPipe = new HashJoin( roadPipe, new Fields( "surface_type" ), metaRoadPipe, new Fields( "pavement_type" ), new InnerJoin() );
    roadPipe = new Rename( roadPipe, new Fields( "blurb" ), new Fields( "road_name" ) );

    // estimate albedo based on the road surface age and pavement type
    Fields albedoArguments = new Fields( "year_construct", "albedo_new", "albedo_worn" );
    roadPipe = new Each( roadPipe, albedoArguments, new AlbedoFunction( new Fields( "albedo" ), 2002 ), Fields.ALL );

    // generate road segments, with midpoint, y=mx+b, and road_geohash for each
    Fields segmentArguments = new Fields( "geo" );
    Fields segmentResults = new Fields( "lat0", "lng0", "alt0", "lat1", "lng1", "alt1", "lat_mid", "lng_mid" );
    roadPipe = new Each( roadPipe, segmentArguments, new RoadSegmentFunction( segmentResults ), Fields.ALL );

    geohashArguments = new Fields( "lat_mid", "lng_mid" );
    roadPipe = new Each( roadPipe, geohashArguments, new GeoHashFunction( new Fields( "road_geohash" ), 6 ), Fields.ALL );

    fieldSelector = new Fields( "road_name", "year_construct", "traffic_count", "traffic_index", "traffic_class", "paving_length", "paving_width", "paving_area", "surface_type", "bike_lane", "bus_route", "truck_route", "albedo", "lat0", "lng0", "alt0", "lat1", "lng1", "alt1", "road_geohash" );
    roadPipe = new Retain( roadPipe, fieldSelector );

    // join the tree and road pipes to estimate shade
    Pipe shadePipe = new Pipe( "shade", roadPipe );
    shadePipe = new CoGroup( shadePipe, new Fields( "road_geohash" ), treePipe, new Fields( "tree_geohash" ), new InnerJoin() );

    // calculate a rough estimate for distance from tree to road, then filter for "< ~1 block"
    Fields treeDistArguments = new Fields( "tree_lat", "tree_lng", "lat0", "lng0", "lat1", "lng1" );
    Fields tree_dist = new Fields( "tree_dist" );
    shadePipe = new Each( shadePipe, treeDistArguments, new TreeDistanceFunction( tree_dist ), Fields.ALL );

    ExpressionFilter distFilter = new ExpressionFilter( "tree_dist > 25.0", Double.class );
    shadePipe = new Each( shadePipe, tree_dist, distFilter );

    // checkpoint this (big) calculation too
    fieldSelector = new Fields( "road_name", "year_construct", "traffic_count", "traffic_index", "traffic_class", "paving_length", "paving_width", "paving_area", "surface_type", "bike_lane", "bus_route", "truck_route", "albedo", "lat0", "lng0", "lat1", "lng1", "tree_name", "priv", "tree_id", "situs", "tree_site", "species", "wikipedia", "calflora", "min_height", "max_height", "tree_lat", "tree_lng", "tree_alt", "tree_dist", "tree_geohash" );
    shadePipe = new Retain( shadePipe, fieldSelector );
    shadePipe = new GroupBy( shadePipe, new Fields( "tree_name" ), new Fields( "tree_dist" ) );

    Checkpoint shadeCheck = new Checkpoint( "shade", shadePipe );

    // determine the geohash for GPS tracks log events

    // the "lower" branch is assumed to be defined earlier in the test, symmetric with the branches below
    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char2" ), " " ) );

    Pipe splice = new HashJoin( pipeLower, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );

    splice = new Retain( splice, new Fields( "num1", "char1" ) );

    splice = new Merge( "merge", splice, pipeUpper );

    if( streamed )
      splice = new HashJoin( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );
    else
      splice = new HashJoin( pipeOffset, new Fields( "num2" ), splice, new Fields( "num1" ) );

    // the "lower" branch is assumed to be defined earlier in the test, symmetric with the branches below
    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char2" ), " " ) );

    Pipe splice = new HashJoin( pipeLower, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );

    splice = new Retain( splice, new Fields( "num1", "char1" ) );

    splice = new Merge( "merge1", splice, pipeUpper );

    if( firstStreamed )
      splice = new HashJoin( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );
    else
      splice = new HashJoin( pipeOffset, new Fields( "num2" ), splice, new Fields( "num1" ) );

    splice = new Retain( splice, new Fields( "num1", "char1" ) );

    if( interMerge )
      splice = new Merge( "merge2", splice, pipeUpper );

    if( secondStreamed )
      splice = new HashJoin( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );
    else
      splice = new HashJoin( pipeOffset, new Fields( "num2" ), splice, new Fields( "num1" ) );

    // the "lower" branch is assumed to be defined earlier in the test, symmetric with the branches below
    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char1" ), " " ) );
    Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char2" ), " " ) );

    Pipe splice = new HashJoin( pipeLower, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );

    splice = new Retain( splice, new Fields( "num1", "char1" ) );

    splice = new Merge( "merge1", splice, pipeUpper );

    splice = new HashJoin( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );

    splice = new Retain( splice, new Fields( "num1", "char1" ) );

    splice = new Merge( "merge2", splice, pipeUpper );

    splice = new HashJoin( splice, new Fields( "num1" ), pipeOffset, new Fields( "num2" ) );

      for (int i = 0; i < combinerDefinitions.size(); i++) {
        CombinerDefinition definition = combinerDefinitions.get(i);
        Pipe output = new Pipe(definition.getName() + "-intermediate", pipe);
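        // keep only the tuples tagged with this combiner's id, then trim to its group and output fields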
        output = new Each(output, new Fields(ID_FIELD), new MultiCombinerFilter(definition.getId()));
        output = new Retain(output, definition.getGroupFields().append(definition.getOutputFields()));
        tails[i] = new Pipe(definition.getName(), output);
      }

      setTails(tails);
    } else {

    // apply the multi-buffer operation to each group
    grouped = new Every(grouped,
        new MultiBufferOperation(groupRename, operation),
        outputFields
    );

    // retain only the declared output fields
    grouped = new Retain(grouped,
        outputFields
    );

    setTails(grouped);
  }

    Map<String, Tap> sources = new HashMap<String, Tap>();
    sources.put("s1", source1);
    sources.put("s2", source2);

    // trim each source branch to just the fields the buffer consumes
    Pipe s1 = new Pipe("s1");
    s1 = new Retain(s1, new Fields("key", "num"));

    Pipe s2 = new Pipe("s2");
    s2 = new Retain(s2, new Fields("key", "num1", "num2"));

    // group both streams on "key" and apply a custom buffer across the co-grouped tuples
    Pipe results = new MultiGroupBy(s1, new Fields("key"), s2, new Fields("key"),
        new Fields("key-rename"), new CustomBuffer(new Fields("result", "result1", "result2", "result3", "result4", "result5")));

    CascadingUtil.get().getFlowConnector().connect(sources, sink, results).complete();
