Package cascading.flow

Examples of cascading.flow.FlowConnector
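
The excerpts below come from several different projects, and most begin mid-method, so the basic pattern can be hard to see in isolation. As a point of reference, here is a minimal, self-contained sketch of the canonical usage -- assuming the Cascading 2.x Hadoop platform classes (HadoopFlowConnector, Hfs, TextLine); the class name CopyExample and the input/output paths are placeholders. Several excerpts further down use the older 1.x style, where FlowConnector is instantiated directly.

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;

public class CopyExample
  {
  public static void main( String[] args )
    {
    String inputPath = args[ 0 ];
    String outputPath = args[ 1 ];

    // identify the application jar so the planner can ship it to the cluster
    Properties properties = new Properties();
    AppProps.setApplicationJarClass( properties, CopyExample.class );

    // a FlowConnector plans taps and pipe assemblies into an executable Flow
    FlowConnector flowConnector = new HadoopFlowConnector( properties );

    // source and sink taps; REPLACE deletes any previous output
    Tap sourceTap = new Hfs( new TextLine(), inputPath );
    Tap sinkTap = new Hfs( new TextLine(), outputPath, SinkMode.REPLACE );

    // a simple pass-through pipe assembly
    Pipe copyPipe = new Pipe( "copy" );

    // plan the flow, then block until it completes
    Flow flow = flowConnector.connect( sourceTap, sinkTap, copyPipe );
    flow.complete();
    }
  }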


            // Create the output (sink tap)
            Tap sinkTap = platform.makeTap(platform.makeTextScheme(),
                            platform.makePath(outputDirName), SinkMode.REPLACE);
           
            // Finally we can run it.
            FlowConnector flowConnector = platform.makeFlowConnector();
            Flow flow = flowConnector.connect(sourceTap, sinkTap, pipe);
            flow.complete();
        } catch (Throwable t) {
            System.err.println("Exception running AnalyzeMbox: " + t.getMessage());
            t.printStackTrace(System.err);
            System.exit(-1);


        sinkMap.put(updatePipe.getName(), crawlDbSink);
        sinkMap.put(statusPipe.getName(), statusSink);
        sinkMap.put(contentPipe.getName(), contentSink);
        sinkMap.put(resultsPipe.getName(), resultsSink);

        FlowConnector flowConnector = platform.makeFlowConnector();
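        // this connect(...) overload takes a map of tail-pipe name to sink Tap, one entry per tail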
        Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

        return flow;
    }

    String tokenPath = args[ 2 ];
    String similarityPath = args[ 3 ];

    Properties properties = new Properties();
    AppProps.setApplicationJarClass( properties, Main.class );
    FlowConnector flowConnector = new HadoopFlowConnector( properties );

    // create SOURCE taps, and read from local file system if inputs are not URLs
    Tap tweetTap = makeTap( tweetPath, new TextDelimited( true, "\t" ) );

    Tap stopTap = makeTap( stopWords, new TextDelimited( new Fields( "stop" ), true, "\t" ) );

    // create SINK taps, replacing previous output if needed
    Tap tokenTap = new Hfs( new TextDelimited( true, "\t" ), tokenPath, SinkMode.REPLACE );
    Tap similarityTap = new Hfs( new TextDelimited( true, "\t" ), similarityPath, SinkMode.REPLACE );

    /*
    flow part #1
    generate a bipartite map of (uid, token), while filtering out stop-words
    */

    // create a STREAM ASSERTION to validate the input data
    Pipe tweetPipe = new Pipe( "tweet" ); // name branch
    AssertMatches assertMatches = new AssertMatches( ".{6,150}" );
    tweetPipe = new Each( tweetPipe, AssertionLevel.STRICT, assertMatches );

    // create an OPERATION to split the text into a token stream
    RegexSplitGenerator splitter = new RegexSplitGenerator( new Fields( "token" ), " " );
    Fields outputSelector = new Fields( "uid", "token" );
    tweetPipe = new Each( tweetPipe, new Fields( "text" ), splitter, outputSelector );

    tweetPipe = new Unique( tweetPipe, Fields.ALL );

    RegexFilter filter = new RegexFilter( "^\\S\\S+$" );
    tweetPipe = new Each( tweetPipe, new Fields( "token" ), filter );

    // create PIPEs for left join on the stop words
    Pipe stopPipe = new Pipe( "stop" ); // name branch
    Pipe joinPipe = new HashJoin( tweetPipe, new Fields( "token" ), stopPipe, new Fields( "stop" ), new LeftJoin() );
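    // a LeftJoin leaves the "stop" field empty when a token has no stop-word match;
    // the RegexFilter below keeps only those tuples, discarding stop words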
    joinPipe = new Each( joinPipe, new Fields( "stop" ), new RegexFilter( "^$" ) );

    joinPipe = new Retain( joinPipe, new Fields( "uid", "token" ) );

    /*
    flow part #2
    create a SINK tap to measure token frequency; these counts can later be
    used to adjust the stop-words list -- for example, via an R script
    */

    Pipe tokenPipe = new Pipe( "token", joinPipe ); // name branch
    tokenPipe = new GroupBy( tokenPipe, new Fields( "token" ) );
    tokenPipe = new Every( tokenPipe, Fields.ALL, new Count(), Fields.ALL );

    /*
    flow part #3
    generate an inverted index for ((uid1,uid2), token) to avoid having to perform
    a cross-product, which would impose a bottleneck in the parallelism
    */

    Pipe invertPipe = new Pipe( "inverted index", joinPipe );
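    // a CoGroup self-join (numSelfJoins = 1) on "token" pairs every two uids that share a token;
    // the declared fields name the output of both sides of the join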
    invertPipe = new CoGroup( invertPipe, new Fields( "token" ), 1, new Fields( "uid1", "ignore", "uid2", "token" ) );

    Fields filterArguments = new Fields( "uid1", "uid2" );
    String uidFilter = "uid1.compareToIgnoreCase( uid2 ) >= 0";
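    // ExpressionFilter removes tuples for which the expression is true,
    // so only pairs with uid1 < uid2 survive (no self-pairs, no duplicate orderings)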
    invertPipe = new Each( invertPipe, filterArguments, new ExpressionFilter( uidFilter, String.class ) );
    Fields ignore = new Fields( "ignore" );
    invertPipe = new Discard( invertPipe, ignore );

    /*
    flow part #4
    count the number of tokens in common for each uid pair and apply a threshold
    */

    Pipe commonPipe = new GroupBy( new Pipe( "uid common", invertPipe ), new Fields( "uid1", "uid2" ) );
    commonPipe = new Every( commonPipe, Fields.ALL, new Count( new Fields( "common" ) ), Fields.ALL );

    String commonFilter = String.format( "common < %d", MIN_COMMON_TOKENS );
    commonPipe = new Each( commonPipe, new Fields( "common" ), new ExpressionFilter( commonFilter, Integer.TYPE ) );

    /*
    flow part #5
    count the number of tokens overall for each uid, then join to calculate
    the vector length for uid1
    */

    Fields tokenCount = new Fields( "token_count" );
    Pipe countPipe = new GroupBy( "count", joinPipe, new Fields( "uid" ) );
    countPipe = new Every( countPipe, Fields.ALL, new Count( tokenCount ), Fields.ALL );

    joinPipe = new CoGroup( countPipe, new Fields( "uid" ), commonPipe, new Fields( "uid1" ) );
    joinPipe = new Pipe( "common", joinPipe );
    joinPipe = new Discard( joinPipe, new Fields( "uid" ) );

    joinPipe = new Rename( joinPipe, tokenCount, new Fields( "token_count1" ) );

    /*
    flow part #6
    join to be able to calculate the vector length for uid2, remove
    instances where one uid merely retweets another, then calculate an
    Ochiai similarity metric to find the nearest "neighbors" for each
    uid -- as recommended users to "follow"
    */

    joinPipe = new CoGroup( "similarity", countPipe, new Fields( "uid" ), joinPipe, new Fields( "uid2" ) );

    joinPipe = new Rename( joinPipe, tokenCount, new Fields( "token_count2" ) );

    // use a DEBUG to check the values in the tuple stream; turn off in the FLOWDEF below
    joinPipe = new Each( joinPipe, DebugLevel.VERBOSE, new Debug( true ) );

    Fields expressionArguments = new Fields( "token_count1", "token_count2", "common" );
    commonFilter = "( token_count1 == common ) || ( token_count2 == common )";
    joinPipe = new Each( joinPipe, expressionArguments, new ExpressionFilter( commonFilter, Integer.TYPE ) );

    Fields ochiaiArguments = new Fields( "uid1", "token_count1", "uid2", "token_count2", "common" );
    Fields resultFields = new Fields( "uid", "recommend_uid", "similarity" );
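    // OchiaiFunction is a custom Function defined elsewhere in this project; presumably it computes
    // the Ochiai coefficient: similarity = common / sqrt( token_count1 * token_count2 )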
    joinPipe = new Each( joinPipe, ochiaiArguments, new OchiaiFunction( resultFields ), Fields.RESULTS );

    /*
    flow part #7
    apply thresholds to filter out poor recommendations
    */

    Fields similarityArguments = new Fields( "similarity" );
    commonFilter = String.format(Locale.US, "similarity < %f || similarity > %f", MIN_SIMILARITY, MAX_SIMILARITY );
    joinPipe = new Each( joinPipe, similarityArguments, new ExpressionFilter( commonFilter, Double.TYPE ) );

    /*
    connect up the whole assembly, write a flow diagram (DOT file), then run the flow.
    Results for recommended users are stored in the "similarityPath" sink tap.
    */

    FlowDef flowDef = FlowDef.flowDef().setName( "similarity" );
    flowDef.addSource( tweetPipe, tweetTap );
    flowDef.addSource( stopPipe, stopTap );
    flowDef.addTailSink( tokenPipe, tokenTap );
    flowDef.addTailSink( joinPipe, similarityTap );

    // set to DebugLevel.VERBOSE for trace, or DebugLevel.NONE in production
    flowDef.setDebugLevel( DebugLevel.VERBOSE );

    // set to AssertionLevel.STRICT for all assertions, or AssertionLevel.NONE in production
    flowDef.setAssertionLevel( AssertionLevel.STRICT );

    Flow similarityFlow = flowConnector.connect( flowDef );
    similarityFlow.writeDOT( "dot/similarity.dot" );
    similarityFlow.complete();
    }

        new SerializingHBaseScheme(keyFields, valueFields,
                                       new Class<?>[]{String.class, String.class},
          false, SerializingHBaseScheme.Direction.FOR_WRITE),
        SinkMode.REPLACE);

    Flow flow = new FlowConnector(PROPERTIES).connect(source, hBaseTap, pipe);
    flow.complete();

    mHelper.expectResult(flow, expected);
  }

   * @param properties - map of job configuration properties to run with.
   * Usually this is obtained from the cascading ClusterTestCase's getProperties() method.
   */
  public ClusterTestHelper(Map<Object, Object> properties) throws IOException {
    mPathsToManage = new ArrayList<String>();
    mFlowConnector = new FlowConnector(properties);
  }

    String[] primaryKeys = {"num", "lower"};
    TableDesc tableDesc = new TableDesc( tableName, columnNames, columnDefs, primaryKeys );

    Tap replaceTap = new JDBCTap( url, driver, tableDesc, new JDBCScheme( columnNames ), SinkMode.REPLACE );

    Flow parseFlow = new FlowConnector( getProperties() ).connect( source, replaceTap, parsePipe );

    parseFlow.complete();

    verifySink( parseFlow, 13 );

    // READ DATA FROM TABLE INTO TEXT FILE

    // create a flow to read from the table and save to a local file
    Tap sink = new Lfs( new TextLine(), "build/test/jdbc", SinkMode.REPLACE );

    Pipe copyPipe = new Each( "read", new Identity() );

    Flow copyFlow = new FlowConnector( getProperties() ).connect( replaceTap, sink, copyPipe );

    copyFlow.complete();

    verifySink( copyFlow, 13 );

    // READ DATA FROM TEXT FILE AND UPDATE TABLE

    JDBCScheme jdbcScheme = new JDBCScheme( columnNames, null, new String[]{"num", "lower"} );
    Tap updateTap = new JDBCTap( url, driver, tableDesc, jdbcScheme, SinkMode.APPEND );

    Flow updateFlow = new FlowConnector( getProperties() ).connect( sink, updateTap, parsePipe );

    updateFlow.complete();

    verifySink( updateFlow, 13 );

    // READ DATA FROM TABLE INTO TEXT FILE, USING CUSTOM QUERY

    Tap sourceTap = new JDBCTap( url, driver, new JDBCScheme( columnNames, "select num, lower, upper from testingtable as testingtable", "select count(*) from testingtable" ) );

    Pipe readPipe = new Each( "read", new Identity() );

    Flow readFlow = new FlowConnector( getProperties() ).connect( sourceTap, sink, readPipe );

    readFlow.complete();

    verifySink( readFlow, 13 );
    }

    String[] primaryKeys = {"db_num", "db_lower"};
    TableDesc tableDesc = new TableDesc( tableName, columnNames, columnDefs, primaryKeys );

    Tap replaceTap = new JDBCTap( url, driver, tableDesc, new JDBCScheme( columnFields, columnNames ), SinkMode.REPLACE );

    Flow parseFlow = new FlowConnector( getProperties() ).connect( source, replaceTap, parsePipe );

    parseFlow.complete();

    verifySink( parseFlow, 13 );

    // READ DATA FROM TABLE INTO TEXT FILE

    // create a flow to read from the table and save to a local file
    Tap sink = new Lfs( new TextLine(), "build/test/jdbc", SinkMode.REPLACE );

    Pipe copyPipe = new Each( "read", new Identity() );

    Flow copyFlow = new FlowConnector( getProperties() ).connect( replaceTap, sink, copyPipe );

    copyFlow.complete();

    verifySink( copyFlow, 13 );

    // READ DATA FROM TEXT FILE AND UPDATE TABLE

    Fields updateByFields = new Fields( "num", "lower" );
    String[] updateBy = {"db_num", "db_lower"};
    JDBCScheme jdbcScheme = new JDBCScheme( columnFields, columnNames, null, updateByFields, updateBy );
    Tap updateTap = new JDBCTap( url, driver, tableDesc, jdbcScheme, SinkMode.APPEND );

    Flow updateFlow = new FlowConnector( getProperties() ).connect( sink, updateTap, parsePipe );

    updateFlow.complete();

    verifySink( updateFlow, 13 );

    // READ DATA FROM TABLE INTO TEXT FILE, USING CUSTOM QUERY

    Tap sourceTap = new JDBCTap( url, driver, new JDBCScheme( columnFields, columnNames, "select db_num, db_lower, db_upper from testingtablealias as testingtablealias", "select count(*) from testingtablealias" ) );

    Pipe readPipe = new Each( "read", new Identity() );

    Flow readFlow = new FlowConnector( getProperties() ).connect( sourceTap, sink, readPipe );

    readFlow.complete();

    verifySink( readFlow, 13 );
    }

    // set the current job jar
    Properties properties = new Properties();
    FlowConnector.setApplicationJarClass(properties, PopularLogResources.class);

    // connect the assembly to the SOURCE and SINK taps
    Flow parsedLogFlow = new FlowConnector(properties).connect(logTap, remoteLogTap, pipeline);

    // start execution of the flow (either locally or on the cluster)
    parsedLogFlow.start();

    // block until the flow completes
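    parsedLogFlow.complete();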

    {
    // set the current job jar
    Properties properties = new Properties();
    FlowConnector.setApplicationJarClass( properties, Main.class );

    FlowConnector flowConnector = new FlowConnector( properties );
    CascadeConnector cascadeConnector = new CascadeConnector();

    String inputPath = args[ 0 ];
    String logsPath = args[ 1 ] + "/logs/";
    String arrivalRatePath = args[ 1 ] + "/arrivalrate/";
    String arrivalRateSecPath = arrivalRatePath + "sec";
    String arrivalRateMinPath = arrivalRatePath + "min";

    // create an assembly to import an Apache log file and store on DFS
    // declares: "ip", "time", "method", "event", "status", "size"
    Fields apacheFields = new Fields( "ip", "time", "method", "event", "status", "size" );
    String apacheRegex = "^([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] +\\\"([^ ]*) ([^ ]*) [^ ]*\\\" ([^ ]*) ([^ ]*).*$";
    int[] apacheGroups = {1, 2, 3, 4, 5, 6};
    RegexParser parser = new RegexParser( apacheFields, apacheRegex, apacheGroups );
    Pipe importPipe = new Each( "import", new Fields( "line" ), parser );

    // create a tap to read a resource from the local file system, unless the input path is a URL for an external resource
    // Lfs allows for relative paths
    Tap logTap =
      inputPath.matches( "^[^:]+://.*" ) ? new Hfs( new TextLine(), inputPath ) : new Lfs( new TextLine(), inputPath );
    // create a tap to read/write from the default filesystem
    Tap parsedLogTap = new Hfs( apacheFields, logsPath );

    // connect the assembly to source and sink taps
    Flow importLogFlow = flowConnector.connect( logTap, parsedLogTap, importPipe );

    // create an assembly to parse out the time field into a timestamp
    // then count the number of requests per second and per minute

    // apply a text parser to create a timestamp with 'second' granularity
    // declares field "ts"
    DateParser dateParser = new DateParser( new Fields( "ts" ), "dd/MMM/yyyy:HH:mm:ss Z" );
    Pipe tsPipe = new Each( "arrival rate", new Fields( "time" ), dateParser, Fields.RESULTS );

    // name the per second assembly and split on tsPipe
    Pipe tsCountPipe = new Pipe( "tsCount", tsPipe );
    tsCountPipe = new GroupBy( tsCountPipe, new Fields( "ts" ) );
    tsCountPipe = new Every( tsCountPipe, Fields.GROUP, new Count() );

    // apply expression to create a timestamp with 'minute' granularity
    // declares field "tm"
    Pipe tmPipe = new Each( tsPipe, new ExpressionFunction( new Fields( "tm" ), "ts - (ts % (60 * 1000))", long.class ) );

    // name the per minute assembly and split on tmPipe
    Pipe tmCountPipe = new Pipe( "tmCount", tmPipe );
    tmCountPipe = new GroupBy( tmCountPipe, new Fields( "tm" ) );
    tmCountPipe = new Every( tmCountPipe, Fields.GROUP, new Count() );

    // create taps to write the results to the default filesystem, using the given fields
    Tap tsSinkTap = new Hfs( new TextLine(), arrivalRateSecPath );
    Tap tmSinkTap = new Hfs( new TextLine(), arrivalRateMinPath );

    // a convenience method for binding taps and pipes, order is significant
    Map<String, Tap> sinks = Cascades.tapsMap( Pipe.pipes( tsCountPipe, tmCountPipe ), Tap.taps( tsSinkTap, tmSinkTap ) );

    // connect the assembly to the source and sink taps
    Flow arrivalRateFlow = flowConnector.connect( parsedLogTap, sinks, tsCountPipe, tmCountPipe );

    // optionally print out the arrivalRateFlow to a graph file for import into a graphics package
    //arrivalRateFlow.writeDOT( "arrivalrate.dot" );

    // connect the flows by their dependencies, order is not significant
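    Cascade cascade = cascadeConnector.connect( importLogFlow, arrivalRateFlow );

    // complete the cascade, which in turn executes each flow in dependency order
    cascade.complete();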

  public static void main( String[] args )
    {
    // set the current job jar
    Properties properties = new Properties();
    FlowConnector.setApplicationJarClass( properties, Main.class );
    FlowConnector flowConnector = new FlowConnector( properties );

    String inputPath = args[ 0 ];
    String pagesPath = args[ 1 ] + "/pages/";
    String urlsPath = args[ 1 ] + "/urls/";
    String wordsPath = args[ 1 ] + "/words/";
    String localUrlsPath = args[ 2 ] + "/urls/";
    String localWordsPath = args[ 2 ] + "/words/";

    // import a text file with crawled pages from the local filesystem into a Hadoop distributed filesystem
    // the imported file will be a native Hadoop sequence file with the fields "url" and "page"
    // note this example stores crawl pages as a tab-delimited file, with the first field being the "url"
    // and the second being the "raw" document, with all newline chars ("\n") converted to the text ":nl:".

    // a predefined pipe assembly that returns fields named "url" and "page"
    Pipe importPipe = new ImportCrawlDataAssembly( "import pipe" );

    // create the tap instances
    Tap localPagesSource = new Lfs( new TextLine(), inputPath );
    Tap importedPages = new Hfs( new SequenceFile( new Fields( "url", "page" ) ), pagesPath );

    // connect the pipe assembly to the tap instances
    Flow importPagesFlow = flowConnector.connect( "import pages", localPagesSource, importedPages, importPipe );

    // a predefined pipe assembly that splits the stream into two named "url pipe" and "word pipe"
    // these pipes could be retrieved via the getTails() method and added to new pipe instances
    SubAssembly wordCountPipe = new WordCountSplitAssembly( "wordcount pipe", "url pipe", "word pipe" );

    // create Hadoop sequence files to store the results of the counts
    Tap sinkUrl = new Hfs( new SequenceFile( new Fields( "url", "word", "count" ) ), urlsPath );
    Tap sinkWord = new Hfs( new SequenceFile( new Fields( "word", "count" ) ), wordsPath );

    // convenience method to bind multiple pipes and taps
    Map<String, Tap> sinks = Cascades.tapsMap( new String[]{"url pipe", "word pipe"}, Tap.taps( sinkUrl, sinkWord ) );

    // wordCountPipe will be recognized as an assembly and handled appropriately
    Flow count = flowConnector.connect( importedPages, sinks, wordCountPipe );

    // create an assembly to export the Hadoop sequence file to local text files
    Pipe exportPipe = new Each( "export pipe", new Identity() );

    Tap localSinkUrl = new Lfs( new TextLine(), localUrlsPath );
    Tap localSinkWord = new Lfs( new TextLine(), localWordsPath );

    // connect up both sinks using the same exportPipe assembly
    Flow exportFromUrl = flowConnector.connect( "export url", sinkUrl, localSinkUrl, exportPipe );
    Flow exportFromWord = flowConnector.connect( "export word", sinkWord, localSinkWord, exportPipe );

    // connect up all the flows, order is not significant
    Cascade cascade = new CascadeConnector().connect( importPagesFlow, count, exportFromUrl, exportFromWord );

    // run the cascade to completion
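    cascade.complete();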
