Package cascading.pipe

Examples of cascading.pipe.CoGroup
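
Before the real-world fragments below, here is a minimal, self-contained sketch of the most common CoGroup form: two named pipes joined on their grouping fields with an InnerJoin. The field names, paths, and taps are hypothetical placeholders, not taken from any of the examples that follow.

    import cascading.flow.FlowDef;
    import cascading.pipe.CoGroup;
    import cascading.pipe.Pipe;
    import cascading.pipe.joiner.InnerJoin;
    import cascading.scheme.hadoop.TextDelimited;
    import cascading.tap.SinkMode;
    import cascading.tap.Tap;
    import cascading.tap.hadoop.Hfs;
    import cascading.tuple.Fields;

    // two head pipes, one per source tap
    Pipe usersPipe = new Pipe( "users" );
    Pipe ordersPipe = new Pipe( "orders" );

    // group each stream on its key field and inner-join them; the result carries
    // all fields from both sides, so every field name must be unique across the two streams
    Pipe joined = new CoGroup( usersPipe, new Fields( "user_id" ),
                               ordersPipe, new Fields( "order_user_id" ),
                               new InnerJoin() );

    // hypothetical tab-delimited taps; substitute whatever schemes and paths your platform uses
    Tap usersTap = new Hfs( new TextDelimited( new Fields( "user_id", "name" ), "\t" ), "users.tsv" );
    Tap ordersTap = new Hfs( new TextDelimited( new Fields( "order_user_id", "amount" ), "\t" ), "orders.tsv" );
    Tap joinedTap = new Hfs( new TextDelimited( true, "\t" ), "joined", SinkMode.REPLACE );

    FlowDef flowDef = FlowDef.flowDef()
        .setName( "cogroup-minimal" )
        .addSource( usersPipe, usersTap )
        .addSource( ordersPipe, ordersTap )
        .addTailSink( joined, joinedTap );
    // run with e.g. new HadoopFlowConnector( properties ).connect( flowDef ).complete();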


        Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
        resultsPipe = new Each(resultsPipe, new CreateResultsFunction());
       
        // Group the finished datums, the skipped datums, status, outlinks
        Pipe updatePipe = new CoGroup("update pipe", Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
                        Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                                        new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)), null, new OuterJoin());
        updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

       
        // output : loop dir specific crawldb
        BasePath outCrawlDbPath = platform.makePath(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        Tap crawlDbSink = platform.makeTap(platform.makeTextScheme(), outCrawlDbPath, SinkMode.REPLACE);
        // Status
        BasePath statusDirPath = platform.makePath(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
        Tap statusSink = platform.makeTap(platform.makeTextScheme(), statusDirPath);
        // Content
        BasePath contentDirPath = platform.makePath(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
        Tap contentSink = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), contentDirPath);
       
        // PageResults
        BasePath resultsDirPath = platform.makePath(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
        Tap resultsSink = platform.makeTap(platform.makeTextScheme(), resultsDirPath);

        // Create the output map that connects each tail pipe to the appropriate sink.
        Map<String, Tap> sinkMap = new HashMap<String, Tap>();
        sinkMap.put(updatePipe.getName(), crawlDbSink);
        sinkMap.put(statusPipe.getName(), statusSink);
        sinkMap.put(contentPipe.getName(), contentSink);
        sinkMap.put(resultsPipe.getName(), resultsSink);

        FlowConnector flowConnector = platform.makeFlowConnector();


        MultiSourceTap dRM1Source = getTaps(dRM1InputPath, inFieldsDRM1);
        MultiSourceTap dRM2Source = getTaps(dRM2InputPath, inFieldsDRM2);

        Pipe lhs = new Pipe("DRM1");
        Pipe rhs = new Pipe("DRM2");
        Pipe groupByItemIDPipe = new CoGroup(lhs, common, rhs, common, grouped, new InnerJoin());
        groupByItemIDPipe = new Each(groupByItemIDPipe, new VectorsToCSVFunction(joinedOutFields));
        //the DRMs (Mahout Distributed Row Matrices) have row and item indexes in two dictionary BiHashMaps;
        //pass these to the output function so the strings from the indexes can be written instead of the
        //binary values of the Keys and Vectors in the DRMs
        groupByItemIDPipe.getStepConfigDef().setProperty("itemIndexPath", itemIndexPath.toString());
        // for these matrices the group by key is the id from the Mahout row key
        groupByItemIDPipe.getStepConfigDef().setProperty("rowIndexPath", iDIndexPath.toString());
        groupByItemIDPipe.getStepConfigDef().setProperty("joining", "true");

        Tap groupedOutputSink = new Hfs(new TextDelimited(true,","), groupedCSVOutputPath.toString());

        FlowDef flowDef = new FlowDef()
            .setName("group-DRMs-by-key")
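
The consuming side of those ConfigDef properties is not shown in this fragment. As a hedged sketch only (the real VectorsToCSVFunction is not part of the snippet and may load its dictionaries differently; the class body below is hypothetical), a custom Function can read step-level properties back in prepare() through the FlowProcess:

    // sketch: reading properties set via pipe.getStepConfigDef().setProperty( ... )
    public class VectorsToCSVFunction extends BaseOperation<Void> implements Function<Void>
      {
      public VectorsToCSVFunction( Fields fieldDeclaration )
        {
        super( fieldDeclaration );
        }

      @Override
      public void prepare( FlowProcess flowProcess, OperationCall<Void> operationCall )
        {
        // step-level properties surface through the flow process configuration
        String itemIndexPath = (String) flowProcess.getProperty( "itemIndexPath" );
        String rowIndexPath = (String) flowProcess.getProperty( "rowIndexPath" );
        boolean joining = Boolean.parseBoolean( (String) flowProcess.getProperty( "joining" ) );
        // load the BiHashMap dictionaries from those paths here (loading code omitted)
        }

      @Override
      public void operate( FlowProcess flowProcess, FunctionCall<Void> functionCall )
        {
        // translate row/item keys and vector entries to strings via the loaded
        // dictionaries, then emit CSV text tuples (details omitted)
        }
      }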

    /*
    flow part #3
    generate an inverted index for ((uid1,uid2), token) to avoid having to perform
    a cross-product, which would impose a bottleneck in the parallelism
    */

    Pipe invertPipe = new Pipe( "inverted index", joinPipe );
    invertPipe = new CoGroup( invertPipe, new Fields( "token" ), 1, new Fields( "uid1", "ignore", "uid2", "token" ) );

    Fields filterArguments = new Fields( "uid1", "uid2" );
    String uidFilter = "uid1.compareToIgnoreCase( uid2 ) >= 0";
    invertPipe = new Each( invertPipe, filterArguments, new ExpressionFilter( uidFilter, String.class ) );
    Fields ignore = new Fields( "ignore" );
    invertPipe = new Discard( invertPipe, ignore );

    /*
    flow part #4
    count the number of tokens in common for each uid pair and apply a threshold
    */

    Pipe commonPipe = new GroupBy( new Pipe( "uid common", invertPipe ), new Fields( "uid1", "uid2" ) );
    commonPipe = new Every( commonPipe, Fields.ALL, new Count( new Fields( "common" ) ), Fields.ALL );

    String commonFilter = String.format( "common < %d", MIN_COMMON_TOKENS );
    commonPipe = new Each( commonPipe, new Fields( "common" ), new ExpressionFilter( commonFilter, Integer.TYPE ) );

    /*
    flow part #5
    count the number of tokens overall for each uid, then join to calculate
    the vector length for uid1
    */

    Fields tokenCount = new Fields( "token_count" );
    Pipe countPipe = new GroupBy( "count", joinPipe, new Fields( "uid" ) );
    countPipe = new Every( countPipe, Fields.ALL, new Count( tokenCount ), Fields.ALL );

    joinPipe = new CoGroup( countPipe, new Fields( "uid" ), commonPipe, new Fields( "uid1" ) );
    joinPipe = new Pipe( "common", joinPipe );
    joinPipe = new Discard( joinPipe, new Fields( "uid" ) );

    joinPipe = new Rename( joinPipe, tokenCount, new Fields( "token_count1" ) );

    /*
    flow part #6 join to be able to calculate the vector length for
    uid2, remove instances where one uid merely retweets another,
    then calculate an Ochiai similarity metric to find the nearest
    "neighbors" for each uid -- as recommended users to "follow"
    */

    joinPipe = new CoGroup( "similarity", countPipe, new Fields( "uid" ), joinPipe, new Fields( "uid2" ) );

    joinPipe = new Rename( joinPipe, tokenCount, new Fields( "token_count2" ) );

    // use a DEBUG to check the values in the tuple stream; turn off in the FLOWDEF below
    joinPipe = new Each( joinPipe, DebugLevel.VERBOSE, new Debug( true ) );

    // join to bring together all the components for calculating TF-IDF
    // the D side of the join is smaller, so it goes on the RHS
    Pipe idfPipe = new HashJoin( dfPipe, lhs_join, dPipe, rhs_join );

    // the IDF side of the join is smaller, so it goes on the RHS
    Pipe tfidfPipe = new CoGroup( tfPipe, tf_token, idfPipe, df_token );

    // calculate the TF-IDF weights, per token, per document
    Fields tfidf = new Fields( "tfidf" );
    String expression = "(double) tf_count * Math.log( (double) n_docs / ( 1.0 + df_count ) )";
    ExpressionFunction tfidfExpression = new ExpressionFunction( tfidf, expression, Double.class );
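
The fragment ends before the function is wired into the stream. As a sketch of the usual next step (argument field names assumed from the expression above), an ExpressionFunction is applied per tuple with an Each:

    // evaluate the expression against each tuple; "tf_count", "df_count" and "n_docs"
    // must be present in the incoming fields, and the new "tfidf" field is appended
    tfidfPipe = new Each( tfidfPipe, new Fields( "tf_count", "df_count", "n_docs" ), tfidfExpression, Fields.ALL );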

    while (fake.size() < pipeFieldsSum) {
      fake = fake.append(new Fields("__" + i));
      i++;
    }
    Pipe result =
        new CoGroup(pipes, groupFields, fake, new MultiGroupJoiner(pipeFieldsSum, operation));
    result = new Each(result, resultFields, new Identity());
    setTails(result);
  }

    fieldSelector = new Fields( "road_name", "year_construct", "traffic_count", "traffic_index", "traffic_class", "paving_length", "paving_width", "paving_area", "surface_type", "bike_lane", "bus_route", "truck_route", "albedo", "lat0", "lng0", "alt0", "lat1", "lng1", "alt1", "road_geohash" );
    roadPipe = new Retain( roadPipe, fieldSelector );

    // join the tree and road pipes to estimate shade
    Pipe shadePipe = new Pipe( "shade", roadPipe );
    shadePipe = new CoGroup( shadePipe, new Fields( "road_geohash" ), treePipe, new Fields( "tree_geohash" ), new InnerJoin() );

    // calculate a rough estimate for distance from tree to road, then filter for "< ~1 block"
    Fields treeDistArguments = new Fields( "tree_lat", "tree_lng", "lat0", "lng0", "lat1", "lng1" );
    Fields tree_dist = new Fields( "tree_dist" );
    shadePipe = new Each( shadePipe, treeDistArguments, new TreeDistanceFunction( tree_dist ), Fields.ALL );

    ExpressionFilter distFilter = new ExpressionFilter( "tree_dist > 25.0", Double.class );
    shadePipe = new Each( shadePipe, tree_dist, distFilter );

    // checkpoint this (big) calculation too
    fieldSelector = new Fields( "road_name", "year_construct", "traffic_count", "traffic_index", "traffic_class", "paving_length", "paving_width", "paving_area", "surface_type", "bike_lane", "bus_route", "truck_route", "albedo", "lat0", "lng0", "lat1", "lng1", "tree_name", "priv", "tree_id", "situs", "tree_site", "species", "wikipedia", "calflora", "min_height", "max_height", "tree_lat", "tree_lng", "tree_alt", "tree_dist", "tree_geohash" );
    shadePipe = new Retain( shadePipe, fieldSelector );
    shadePipe = new GroupBy( shadePipe, new Fields( "tree_name" ), new Fields( "tree_dist" ) );

    Checkpoint shadeCheck = new Checkpoint( "shade", shadePipe );

    // determine the geohash for GPS tracks log events
    Pipe logsPipe = new Pipe( "logs" );
    geohashArguments = new Fields( "lat", "lng" );
    logsPipe = new Each( logsPipe, geohashArguments, new GeoHashFunction( new Fields( "gps_geohash" ), 6 ), Fields.ALL );

    // prepare data for recommendations
    // NB: RHS is large given the sample data, but in practice the logs on the LHS could be much larger
    Pipe recoPipe = new Pipe( "reco", logsPipe );
    recoPipe = new CoGroup( recoPipe, new Fields( "gps_geohash" ), shadeCheck, new Fields( "tree_geohash" ), new InnerJoin() );

    // connect the taps, pipes, etc., into a flow
    FlowDef flowDef = FlowDef.flowDef()
     .setName( "copa" )
     .addSource( gisPipe, gisTap )

    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cross" ), SinkMode.REPLACE );

    Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) );
    Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) );

    Pipe cross = new CoGroup( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, cross );

    flow.complete();

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

    Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new InnerJoin( Fields.size( 4 ) ) );

    Map<Object, Object> properties = getProperties();

    // make sure hasher is getting called, but does nothing special
    FlowProps.setDefaultTupleElementComparator( properties, getPlatform().getStringComparator( false ).getClass().getCanonicalName() );
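
When both sides declare the same field names (here each has "num" and "char"), the joined result needs unique field names; Fields.size( 4 ) above declares four positional result fields. As an alternative sketch, the declared fields can be named explicitly through the declaredFields constructor (the names below are arbitrary):

    // same join, but with explicitly named result fields instead of positional ones
    Fields declared = new Fields( "numLHS", "charLHS", "numRHS", "charRHS" );
    Pipe spliceNamed = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), declared, new InnerJoin() );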

    pipeUpper = new Pipe( "right", pipeUpper );

//    pipeLower = new Each( pipeLower, new Debug( true ) );
//    pipeUpper = new Each( pipeUpper, new Debug( true ) );

    Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );

//    splice = new Each( splice, new Debug( true ) );
    splice = new Pipe( "splice", splice );
    splice = new Pipe( "tail", splice );

    Function splitter = new RegexSplitter( Fields.UNKNOWN, " " );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

    Pipe splice = new CoGroup( pipeLower, new Fields( 0 ), pipeUpper, new Fields( 0 ), Fields.size( 4 ) );

    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );

    flow.complete();
