Package cascading.tap

Examples of cascading.tap.Tap


    Fields discardFields = trainingFields.appendSelector( predictorFields );

    if( !discardFields.isNone() )
      pipe = new Discard( pipe, discardFields );

    Tap source = getPlatform().getDelimitedFile( "\t", "\"", planner.getFieldTypeResolver(), DATA_PATH + testModel + ".tsv", SinkMode.KEEP );
    Tap sink = getPlatform().getDelimitedFile( "\t", "\"", null, getResultPath(), SinkMode.REPLACE );

    FlowDef flowDef = FlowDef.flowDef()
      .addSource( "head", source )
      .addSink( "tail", sink )
      .addTail( pipe )
      .addAssemblyPlanner( planner );

    Flow flow;

    try
      {
      flow = getPlatform().getFlowConnector().connect( flowDef );
      }
    catch( PlannerException exception )
      {
      exception.writeDOT( getFlowPlanPath() + "/plan.dot" );

      throw exception;
      }

    flow.writeDOT( getFlowPlanPath() + "/plan.dot" );

    flow.complete();

    LOG.debug( "source = {}", source.getSourceFields().printVerbose() );
    LOG.debug( "sink   = {}", sink.getSinkFields().printVerbose() );

    Fields sourceSelector = source.getSourceFields().subtract( trainingFields );
    Fields sinkSelector = sink.getSinkFields();

    LOG.debug( "source select = {}", sourceSelector.printVerbose() );
    LOG.debug( "sink select   = {}", sinkSelector.printVerbose() );

    List<Tuple> sourceTuples = asList( flow, source, sourceSelector );
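The asList helper used above is not shown in this excerpt. A minimal sketch of what it plausibly does, assuming the Cascading 2.x Flow API (openTapForRead and TupleEntry.selectTuple), is:

  public static List<Tuple> asList( Flow flow, Tap tap, Fields selector ) throws IOException
    {
    // collect every tuple the tap produces, narrowed to the selector fields
    List<Tuple> tuples = new ArrayList<Tuple>();

    TupleEntryIterator iterator = flow.openTapForRead( tap );

    while( iterator.hasNext() )
      tuples.add( iterator.next().selectTuple( selector ) );

    iterator.close();

    return tuples;
    }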


    Pipe pipe = new Pipe( "head" );
    pipe = new Discard( pipe, predictedFields );
    pipe = new ParallelEnsembleAssembly( pipe, ensembleSpec );
    pipe = new Pipe( "tail", pipe );

    Tap source = getPlatform().getDelimitedFile( expectedFields.append( predictedFields ), true, ",", "\"", DATA_PATH + inputData, SinkMode.KEEP );
    Tap sink = getPlatform().getDelimitedFile( Fields.ALL, true, ",", "\"", getResultPath(), SinkMode.REPLACE );

    FlowDef flowDef = FlowDef.flowDef()
      .addSource( "head", source )
      .addSink( "tail", sink )
      .addTail( pipe );

    Flow flow = getPlatform().getFlowConnector().connect( flowDef );

    flow.writeDOT( getFlowPlanPath() + "/plan.dot" );

    flow.complete();

    Fields sourceSelector = source.getSourceFields();
    Fields sinkSelector = sink.getSinkFields();

    LOG.debug( "source select = {}", sourceSelector.printVerbose() );
    LOG.debug( "sink select   = {}", sinkSelector.printVerbose() );

    List<Tuple> sourceTuples = asList( flow, source, sourceSelector );
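getDelimitedFile() is a test-platform convenience. On the Hadoop platform the two calls above roughly amount to constructing delimited taps directly; a sketch, assuming cascading.scheme.hadoop.TextDelimited and cascading.tap.hadoop.Hfs:

    // roughly what the platform helper builds: a comma-delimited, quoted,
    // headered source and sink on the default filesystem
    Tap source = new Hfs( new TextDelimited( expectedFields.append( predictedFields ), true, ",", "\"" ), DATA_PATH + inputData, SinkMode.KEEP );
    Tap sink = new Hfs( new TextDelimited( Fields.ALL, true, ",", "\"" ), getResultPath(), SinkMode.REPLACE );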

  public void testJDBC() throws IOException
    {

    // CREATE NEW TABLE FROM SOURCE

    Tap source = new Lfs( new TextLine(), inputFile );

    Pipe parsePipe = new Each( "insert", new Fields( "line" ), new RegexSplitter( new Fields( "num", "lower", "upper" ), "\\s" ) );

    String url = "jdbc:hsqldb:hsql://localhost/testing";
    String driver = "org.hsqldb.jdbcDriver";
    String tableName = "testingtable";
    String[] columnNames = {"num", "lower", "upper"};
    String[] columnDefs = {"VARCHAR(100) NOT NULL", "VARCHAR(100) NOT NULL", "VARCHAR(100) NOT NULL"};
    String[] primaryKeys = {"num", "lower"};
    TableDesc tableDesc = new TableDesc( tableName, columnNames, columnDefs, primaryKeys );

    Tap replaceTap = new JDBCTap( url, driver, tableDesc, new JDBCScheme( columnNames ), SinkMode.REPLACE );

    Flow parseFlow = new FlowConnector( getProperties() ).connect( source, replaceTap, parsePipe );

    parseFlow.complete();

    verifySink( parseFlow, 13 );

    // READ DATA FROM TABLE INTO TEXT FILE

    // create a flow to read from the JDBC table and save to a local text file
    Tap sink = new Lfs( new TextLine(), "build/test/jdbc", SinkMode.REPLACE );

    Pipe copyPipe = new Each( "read", new Identity() );

    Flow copyFlow = new FlowConnector( getProperties() ).connect( replaceTap, sink, copyPipe );

    copyFlow.complete();

    verifySink( copyFlow, 13 );

    // READ DATA FROM TEXT FILE AND UPDATE TABLE

    JDBCScheme jdbcScheme = new JDBCScheme( columnNames, null, new String[]{"num", "lower"} );
    Tap updateTap = new JDBCTap( url, driver, tableDesc, jdbcScheme, SinkMode.APPEND );

    Flow updateFlow = new FlowConnector( getProperties() ).connect( sink, updateTap, parsePipe );

    updateFlow.complete();

    verifySink( updateFlow, 13 );

    // READ DATA FROM TABLE INTO TEXT FILE, USING CUSTOM QUERY

    Tap sourceTap = new JDBCTap( url, driver, new JDBCScheme( columnNames, "select num, lower, upper from testingtable as testingtable", "select count(*) from testingtable" ) );

    Pipe readPipe = new Each( "read", new Identity() );

    Flow readFlow = new FlowConnector( getProperties() ).connect( sourceTap, sink, readPipe );

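verifySink() is defined elsewhere in the test class. A plausible implementation simply counts the tuples in the flow's sink and asserts the expected number:

  private void verifySink( Flow flow, int expects ) throws IOException
    {
    int count = 0;

    // openSink() returns a TupleEntryIterator over the flow's sink tap
    TupleEntryIterator iterator = flow.openSink();

    while( iterator.hasNext() )
      {
      iterator.next();
      count++;
      }

    iterator.close();

    assertEquals( "wrong number of rows in sink", expects, count );
    }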

  public void testJDBCAliased() throws IOException
    {

    // CREATE NEW TABLE FROM SOURCE

    Tap source = new Lfs( new TextLine(), inputFile );

    Fields columnFields = new Fields( "num", "lower", "upper" );
    Pipe parsePipe = new Each( "insert", new Fields( "line" ), new RegexSplitter( columnFields, "\\s" ) );

    String url = "jdbc:hsqldb:hsql://localhost/testing";
    String driver = "org.hsqldb.jdbcDriver";
    String tableName = "testingtablealias";
    String[] columnNames = {"db_num", "db_lower", "db_upper"};
    String[] columnDefs = {"VARCHAR(100) NOT NULL", "VARCHAR(100) NOT NULL", "VARCHAR(100) NOT NULL"};
    String[] primaryKeys = {"db_num", "db_lower"};
    TableDesc tableDesc = new TableDesc( tableName, columnNames, columnDefs, primaryKeys );

    Tap replaceTap = new JDBCTap( url, driver, tableDesc, new JDBCScheme( columnFields, columnNames ), SinkMode.REPLACE );

    Flow parseFlow = new FlowConnector( getProperties() ).connect( source, replaceTap, parsePipe );

    parseFlow.complete();

    verifySink( parseFlow, 13 );

    // READ DATA FROM TABLE INTO TEXT FILE

    // create a flow to read from the JDBC table and save to a local text file
    Tap sink = new Lfs( new TextLine(), "build/test/jdbc", SinkMode.REPLACE );

    Pipe copyPipe = new Each( "read", new Identity() );

    Flow copyFlow = new FlowConnector( getProperties() ).connect( replaceTap, sink, copyPipe );

    copyFlow.complete();

    verifySink( copyFlow, 13 );

    // READ DATA FROM TEXT FILE AND UPDATE TABLE

    Fields updateByFields = new Fields( "num", "lower" );
    String[] updateBy = {"db_num", "db_lower"};
    JDBCScheme jdbcScheme = new JDBCScheme( columnFields, columnNames, null, updateByFields, updateBy );
    Tap updateTap = new JDBCTap( url, driver, tableDesc, jdbcScheme, SinkMode.APPEND );

    Flow updateFlow = new FlowConnector( getProperties() ).connect( sink, updateTap, parsePipe );

    updateFlow.complete();

    verifySink( updateFlow, 13 );

    // READ DATA FROM TABLE INTO TEXT FILE, USING CUSTOM QUERY

    Tap sourceTap = new JDBCTap( url, driver, new JDBCScheme( columnFields, columnNames, "select db_num, db_lower, db_upper from testingtablealias as testingtablealias", "select count(*) from testingtablealias" ) );

    Pipe readPipe = new Each( "read", new Identity() );

    Flow readFlow = new FlowConnector( getProperties() ).connect( sourceTap, sink, readPipe );

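Both JDBC tests are truncated just after readFlow is created; the likely remaining steps mirror the earlier flows in each test:

    // run the custom-query read and verify the row count
    readFlow.complete();

    verifySink( readFlow, 13 );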

    // define what the input file looks like; "offset" is the byte offset from the beginning of the file
    TextLine input = new TextLine(new Fields("offset", "line"));

    // create SOURCE tap to read a resource from HDFS
    Tap logTap = new Hfs(input, inputPath);

    // create an assembly to parse an Apache log file and store on an HDFS cluster

    // declare the field names we will parse out of the log file
    Fields apacheFields = new Fields("resource");

    // define the regular expression to parse the log file with
    String apacheRegex = "^([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] +\\\"([^ ]*) ([^ ]*) [^ ]*\\\" ([^ ]*) ([^ ]*).*$";

    // declare the groups from the above regex we want to keep. each regex group will be given
    // a field name from 'apacheFields', above, respectively
    int[] allGroups = {4};

    // create the parser
    RegexParser parser = new RegexParser(apacheFields, apacheRegex, allGroups);

    // create the import pipe element, with the name 'import', and with the input argument named "line"
    // replace the incoming tuple with the parser results
    // "line" -> parser -> "ts"
    Pipe pipeline = new Each("import", new Fields("line"), parser, Fields.RESULTS);


    // group the Tuple stream by the "resource" value
    pipeline = new GroupBy(pipeline, new Fields("resource"));

    // For every Tuple group
    // count the number of occurrences of "resource" and store the result in
    // a field named "count"
    Aggregator count = new Count(new Fields("resource"));
    pipeline = new Every(pipeline, count);


    // create a SINK tap to write to the default filesystem
    // by default, TextLine writes all fields out
    Tap remoteLogTap = new Hfs(new TextLine(), outputPath, SinkMode.REPLACE);

    // set the current job jar
    Properties properties = new Properties();
    FlowConnector.setApplicationJarClass(properties, PopularLogResources.class);

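The excerpt stops before the flow is connected. A plausible completion, using the properties configured above:

    // connect the source and sink taps to the assembly, then run it
    Flow logFlow = new FlowConnector( properties ).connect( logTap, remoteLogTap, pipeline );

    logFlow.complete();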

    RegexParser parser = new RegexParser( apacheFields, apacheRegex, apacheGroups );
    Pipe importPipe = new Each( "import", new Fields( "line" ), parser );

    // create a tap to read from the local file system, unless the path is a URL naming an external resource
    // Lfs allows for relative paths
    Tap logTap =
      inputPath.matches( "^[^:]+://.*" ) ? new Hfs( new TextLine(), inputPath ) : new Lfs( new TextLine(), inputPath );
    // create a tap to read/write from the default filesystem
    Tap parsedLogTap = new Hfs( apacheFields, logsPath );

    // connect the assembly to source and sink taps
    Flow importLogFlow = flowConnector.connect( logTap, parsedLogTap, importPipe );

    // create an assembly to parse out the time field into a timestamp
    // then count the number of requests per second and per minute

    // apply a text parser to create a timestamp with 'second' granularity
    // declares field "ts"
    DateParser dateParser = new DateParser( new Fields( "ts" ), "dd/MMM/yyyy:HH:mm:ss Z" );
    Pipe tsPipe = new Each( "arrival rate", new Fields( "time" ), dateParser, Fields.RESULTS );

    // name the per second assembly and split on tsPipe
    Pipe tsCountPipe = new Pipe( "tsCount", tsPipe );
    tsCountPipe = new GroupBy( tsCountPipe, new Fields( "ts" ) );
    tsCountPipe = new Every( tsCountPipe, Fields.GROUP, new Count() );

    // apply expression to create a timestamp with 'minute' granularity
    // declares field "tm"
    Pipe tmPipe = new Each( tsPipe, new ExpressionFunction( new Fields( "tm" ), "ts - (ts % (60 * 1000))", long.class ) );

    // name the per minute assembly and split on tmPipe
    Pipe tmCountPipe = new Pipe( "tmCount", tmPipe );
    tmCountPipe = new GroupBy( tmCountPipe, new Fields( "tm" ) );
    tmCountPipe = new Every( tmCountPipe, Fields.GROUP, new Count() );

    // create taps to write the results to the default filesystem, using the given fields
    Tap tsSinkTap = new Hfs( new TextLine(), arrivalRateSecPath );
    Tap tmSinkTap = new Hfs( new TextLine(), arrivalRateMinPath );

    // a convenience method for binding taps and pipes, order is significant
    Map<String, Tap> sinks = Cascades.tapsMap( Pipe.pipes( tsCountPipe, tmCountPipe ), Tap.taps( tsSinkTap, tmSinkTap ) );

    // connect the assembly to the source and sink taps
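The excerpt ends at the connect step. A plausible completion binds the shared source and both sinks to the two tails at once:

    // the order of tails matches the tapsMap binding above
    Flow arrivalRateFlow = flowConnector.connect( parsedLogTap, sinks, tsCountPipe, tmCountPipe );

    arrivalRateFlow.complete();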

    // a predefined pipe assembly that returns fields named "url" and "page"
    Pipe importPipe = new ImportCrawlDataAssembly( "import pipe" );

    // create the tap instances
    Tap localPagesSource = new Lfs( new TextLine(), inputPath );
    Tap importedPages = new Hfs( new SequenceFile( new Fields( "url", "page" ) ), pagesPath );

    // connect the pipe assembly to the tap instances
    Flow importPagesFlow = flowConnector.connect( "import pages", localPagesSource, importedPages, importPipe );

    // a predefined pipe assembly that splits the stream into two named "url pipe" and "word pipe"
    // these pipes could be retrieved via the getTails() method and added to new pipe instances
    SubAssembly wordCountPipe = new WordCountSplitAssembly( "wordcount pipe", "url pipe", "word pipe" );

    // create Hadoop sequence files to store the results of the counts
    Tap sinkUrl = new Hfs( new SequenceFile( new Fields( "url", "word", "count" ) ), urlsPath );
    Tap sinkWord = new Hfs( new SequenceFile( new Fields( "word", "count" ) ), wordsPath );

    // convenience method to bind multiple pipes and taps
    Map<String, Tap> sinks = Cascades.tapsMap( new String[]{"url pipe", "word pipe"}, Tap.taps( sinkUrl, sinkWord ) );

    // wordCountPipe will be recognized as an assembly and handled appropriately
    Flow count = flowConnector.connect( importedPages, sinks, wordCountPipe );

    // create an assembly to export the Hadoop sequence file to local text files
    Pipe exportPipe = new Each( "export pipe", new Identity() );

    Tap localSinkUrl = new Lfs( new TextLine(), localUrlsPath );
    Tap localSinkWord = new Lfs( new TextLine(), localWordsPath );

    // connect up both sinks using the same exportPipe assembly
    Flow exportFromUrl = flowConnector.connect( "export url", sinkUrl, localSinkUrl, exportPipe );
    Flow exportFromWord = flowConnector.connect( "export word", sinkWord, localSinkWord, exportPipe );
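Because these four flows share taps, they can be handed to a CascadeConnector, which orders them topologically by their tap dependencies; a sketch:

    // the connector infers execution order: import, then count, then both exports
    Cascade cascade = new CascadeConnector().connect( importPagesFlow, count, exportFromUrl, exportFromWord );

    cascade.complete();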

    // define what the input file looks like; "offset" is the byte offset from the beginning of the file
    TextLine scheme = new TextLine( new Fields( "offset", "line" ) );

    // create a SOURCE tap to read a resource from the local file system, if the input path is not a URL
    Tap logTap = inputPath.matches( "^[^:]+://.*" ) ? new Hfs( scheme, inputPath ) : new Lfs( scheme, inputPath );

    // create an assembly to parse an Apache log file and store on an HDFS cluster

    // declare the field names we will parse out of the log file
    Fields apacheFields = new Fields( "ip", "time", "method", "event", "status", "size" );

    // define the regular expression to parse the log file with
    String apacheRegex = "^([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] +\\\"([^ ]*) ([^ ]*) [^ ]*\\\" ([^ ]*) ([^ ]*).*$";

    // declare the groups from the above regex we want to keep. each regex group will be given
    // a field name from 'apacheFields', above, respectively
    int[] allGroups = {1, 2, 3, 4, 5, 6};

    // create the parser
    RegexParser parser = new RegexParser( apacheFields, apacheRegex, allGroups );

    // create the import pipe element, with the name 'import', and with the input argument named "line"
    // replace the incoming tuple with the parser results
    // "line" -> parser -> "ts"
    Pipe importPipe = new Each( "import", new Fields( "line" ), parser, Fields.RESULTS );

    // create a SINK tap to write to the default filesystem
    // by default, TextLine writes all fields out
    Tap remoteLogTap = new Hfs( new TextLine(), outputPath, SinkMode.REPLACE );

    // set the current job jar
    Properties properties = new Properties();
    FlowConnector.setApplicationJarClass( properties, Main.class );

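Again the excerpt stops before the connect step. A plausible completion, this time also writing the flow plan as a DOT file for inspection (the file name is illustrative):

    Flow importLogFlow = new FlowConnector( properties ).connect( logTap, remoteLogTap, importPipe );

    // hypothetical path; writeDOT renders the flow plan for debugging
    importLogFlow.writeDOT( "import-log-flow.dot" );

    importLogFlow.complete();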

    Properties properties = new Properties();
    AppProps.setApplicationJarClass( properties, Main.class );
    HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );

    // create taps for sources, sinks, traps
    Tap gisTap = new Hfs( new TextLine( new Fields( "line" ) ), gisPath );
    Tap metaTreeTap = new Hfs( new TextDelimited( true, "\t" ), metaTreePath );
    Tap metaRoadTap = new Hfs( new TextDelimited( true, "\t" ), metaRoadPath );
    Tap logsTap = new Hfs( new TextDelimited( true, "," ), logsPath );
    Tap trapTap = new Hfs( new TextDelimited( true, "\t" ), trapPath );
    Tap tsvTap = new Hfs( new TextDelimited( true, "\t" ), tsvPath );
    Tap treeTap = new Hfs( new TextDelimited( true, "\t" ), treePath );
    Tap roadTap = new Hfs( new TextDelimited( true, "\t" ), roadPath );
    Tap parkTap = new Hfs( new TextDelimited( true, "\t" ), parkPath );
    Tap shadeTap = new Hfs( new TextDelimited( true, "\t" ), shadePath );
    Tap recoTap = new Hfs( new TextDelimited( true, "\t" ), recoPath );

    // specify a regex to split the GIS dump into known fields
    Fields fieldDeclaration = new Fields( "blurb", "misc", "geo", "kind" );
    String regex =  "^\"(.*)\",\"(.*)\",\"(.*)\",\"(.*)\"$";
    int[] gisGroups = { 1, 2, 3, 4 };
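The excerpt creates trapTap but is cut off before any pipe is bound to it. A hypothetical continuation showing how a trap is typically attached through FlowDef (the pipe name and the choice of tsvTap as sink are assumptions):

    // parse the GIS dump; tuples the regex rejects are diverted to the trap
    RegexParser gisParser = new RegexParser( fieldDeclaration, regex, gisGroups );
    Pipe gisPipe = new Each( new Pipe( "gis" ), new Fields( "line" ), gisParser, Fields.RESULTS );

    FlowDef flowDef = FlowDef.flowDef()
      .addSource( gisPipe, gisTap )
      .addTrap( gisPipe, trapTap )
      .addTailSink( gisPipe, tsvTap );

    flowConnector.connect( flowDef ).complete();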

  public static FlowDef
  createFlowDef( String docPath, String wcPath )
    {
    // create source and sink taps
    Tap docTap = new Hfs( new TextDelimited( true, "\t" ), docPath );
    Tap wcTap = new Hfs( new TextDelimited( true, "\t" ), wcPath );

    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields( "token" );
    Fields text = new Fields( "text" );
    RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );
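The method is truncated after the splitter is created. The canonical word-count completion (a sketch following the usual Cascading pattern) tokenizes the text, groups by token, counts each group, and returns the assembled FlowDef:

    // tokenize the "text" field into a "token" stream, replacing the input tuple
    Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS );

    // group by token and count each group
    Pipe wcPipe = new Pipe( "wc", docPipe );
    wcPipe = new GroupBy( wcPipe, token );
    wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL );

    // bind taps and pipes into the flow definition
    return FlowDef.flowDef()
      .setName( "wc" )
      .addSource( docPipe, docTap )
      .addTailSink( wcPipe, wcTap );
    }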
