Package cascading.tap

Examples of cascading.tap.Tap


    Fields discardFields = trainingFields.appendSelector( predictorFields );

    if( !discardFields.isNone() )
      pipe = new Discard( pipe, discardFields );

    Tap source = getPlatform().getDelimitedFile( "\t", "\"", planner.getFieldTypeResolver(), DATA_PATH + testModel + ".tsv", SinkMode.KEEP );
    Tap sink = getPlatform().getDelimitedFile( "\t", "\"", null, getResultPath(), SinkMode.REPLACE );

    FlowDef flowDef = FlowDef.flowDef()
      .addSource( "head", source )
      .addSink( "tail", sink )
      .addTail( pipe )
      .addAssemblyPlanner( planner );

    Flow flow;

    try
      {
      flow = getPlatform().getFlowConnector().connect( flowDef );
      }
    catch( PlannerException exception )
      {
      exception.writeDOT( getFlowPlanPath() + "/plan.dot" );

      throw exception;
      }

    flow.writeDOT( getFlowPlanPath() + "/plan.dot" );

    flow.complete();

    LOG.debug( "source = {}", source.getSourceFields().printVerbose() );
    LOG.debug( "sink   = {}", sink.getSinkFields().printVerbose() );

    Fields sourceSelector = source.getSourceFields().subtract( trainingFields );
    Fields sinkSelector = sink.getSinkFields();

    LOG.debug( "source select = {}", sourceSelector.printVerbose() );
    LOG.debug( "sink select   = {}", sinkSelector.printVerbose() );

    List<Tuple> sourceTuples = asList( flow, source, sourceSelector );
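The asList helper used above is not shown in this excerpt. A minimal sketch of what it plausibly does, assuming the Cascading 2.x Flow API (openTapForRead and TupleEntry.selectTuple), is:

  public static List<Tuple> asList( Flow flow, Tap tap, Fields selector ) throws IOException
    {
    // collect every tuple the tap produces, narrowed to the selector fields
    List<Tuple> tuples = new ArrayList<Tuple>();

    TupleEntryIterator iterator = flow.openTapForRead( tap );

    while( iterator.hasNext() )
      tuples.add( iterator.next().selectTuple( selector ) );

    iterator.close();

    return tuples;
    }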


    Pipe pipe = new Pipe( "head" );
    pipe = new Discard( pipe, predictedFields );
    pipe = new ParallelEnsembleAssembly( pipe, ensembleSpec );
    pipe = new Pipe( "tail", pipe );

    Tap source = getPlatform().getDelimitedFile( expectedFields.append( predictedFields ), true, ",", "\"", DATA_PATH + inputData, SinkMode.KEEP );
    Tap sink = getPlatform().getDelimitedFile( Fields.ALL, true, ",", "\"", getResultPath(), SinkMode.REPLACE );

    FlowDef flowDef = FlowDef.flowDef()
      .addSource( "head", source )
      .addSink( "tail", sink )
      .addTail( pipe );

    Flow flow = getPlatform().getFlowConnector().connect( flowDef );

    flow.writeDOT( getFlowPlanPath() + "/plan.dot" );

    flow.complete();

    Fields sourceSelector = source.getSourceFields();
    Fields sinkSelector = sink.getSinkFields();

    LOG.debug( "source select = {}", sourceSelector.printVerbose() );
    LOG.debug( "sink select   = {}", sinkSelector.printVerbose() );

    List<Tuple> sourceTuples = asList( flow, source, sourceSelector );
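getDelimitedFile() is a test-platform convenience. On the Hadoop platform the two calls above roughly amount to constructing delimited taps directly; a sketch, assuming cascading.scheme.hadoop.TextDelimited and cascading.tap.hadoop.Hfs:

    // roughly what the platform helper builds: a comma-delimited, quoted,
    // headered source and sink on the default filesystem
    Tap source = new Hfs( new TextDelimited( expectedFields.append( predictedFields ), true, ",", "\"" ), DATA_PATH + inputData, SinkMode.KEEP );
    Tap sink = new Hfs( new TextDelimited( Fields.ALL, true, ",", "\"" ), getResultPath(), SinkMode.REPLACE );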

  public void testJDBC() throws IOException
    {

    // CREATE NEW TABLE FROM SOURCE

    Tap source = new Lfs( new TextLine(), inputFile );

    Pipe parsePipe = new Each( "insert", new Fields( "line" ), new RegexSplitter( new Fields( "num", "lower", "upper" ), "\\s" ) );

    String url = "jdbc:hsqldb:hsql://localhost/testing";
    String driver = "org.hsqldb.jdbcDriver";
    String tableName = "testingtable";
    String[] columnNames = {"num", "lower", "upper"};
    String[] columnDefs = {"VARCHAR(100) NOT NULL", "VARCHAR(100) NOT NULL", "VARCHAR(100) NOT NULL"};
    String[] primaryKeys = {"num", "lower"};
    TableDesc tableDesc = new TableDesc( tableName, columnNames, columnDefs, primaryKeys );

    Tap replaceTap = new JDBCTap( url, driver, tableDesc, new JDBCScheme( columnNames ), SinkMode.REPLACE );

    Flow parseFlow = new FlowConnector( getProperties() ).connect( source, replaceTap, parsePipe );

    parseFlow.complete();

    verifySink( parseFlow, 13 );

    // READ DATA FROM TABLE INTO TEXT FILE

    // create a flow to read from the JDBC table and save to a local text file
    Tap sink = new Lfs( new TextLine(), "build/test/jdbc", SinkMode.REPLACE );

    Pipe copyPipe = new Each( "read", new Identity() );

    Flow copyFlow = new FlowConnector( getProperties() ).connect( replaceTap, sink, copyPipe );

    copyFlow.complete();

    verifySink( copyFlow, 13 );

    // READ DATA FROM TEXT FILE AND UPDATE TABLE

    JDBCScheme jdbcScheme = new JDBCScheme( columnNames, null, new String[]{"num", "lower"} );
    Tap updateTap = new JDBCTap( url, driver, tableDesc, jdbcScheme, SinkMode.APPEND );

    Flow updateFlow = new FlowConnector( getProperties() ).connect( sink, updateTap, parsePipe );

    updateFlow.complete();

    verifySink( updateFlow, 13 );

    // READ DATA FROM TABLE INTO TEXT FILE, USING CUSTOM QUERY

    Tap sourceTap = new JDBCTap( url, driver, new JDBCScheme( columnNames, "select num, lower, upper from testingtable as testingtable", "select count(*) from testingtable" ) );

    Pipe readPipe = new Each( "read", new Identity() );

    Flow readFlow = new FlowConnector( getProperties() ).connect( sourceTap, sink, readPipe );

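verifySink() is defined elsewhere in the test class. A plausible implementation simply counts the tuples in the flow's sink and asserts the expected number:

  private void verifySink( Flow flow, int expects ) throws IOException
    {
    int count = 0;

    // openSink() returns a TupleEntryIterator over the flow's sink tap
    TupleEntryIterator iterator = flow.openSink();

    while( iterator.hasNext() )
      {
      iterator.next();
      count++;
      }

    iterator.close();

    assertEquals( "wrong number of rows in sink", expects, count );
    }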

  public void testJDBCAliased() throws IOException
    {

    // CREATE NEW TABLE FROM SOURCE

    Tap source = new Lfs( new TextLine(), inputFile );

    Fields columnFields = new Fields( "num", "lower", "upper" );
    Pipe parsePipe = new Each( "insert", new Fields( "line" ), new RegexSplitter( columnFields, "\\s" ) );

    String url = "jdbc:hsqldb:hsql://localhost/testing";
    String driver = "org.hsqldb.jdbcDriver";
    String tableName = "testingtablealias";
    String[] columnNames = {"db_num", "db_lower", "db_upper"};
    String[] columnDefs = {"VARCHAR(100) NOT NULL", "VARCHAR(100) NOT NULL", "VARCHAR(100) NOT NULL"};
    String[] primaryKeys = {"db_num", "db_lower"};
    TableDesc tableDesc = new TableDesc( tableName, columnNames, columnDefs, primaryKeys );

    Tap replaceTap = new JDBCTap( url, driver, tableDesc, new JDBCScheme( columnFields, columnNames ), SinkMode.REPLACE );

    Flow parseFlow = new FlowConnector( getProperties() ).connect( source, replaceTap, parsePipe );

    parseFlow.complete();

    verifySink( parseFlow, 13 );

    // READ DATA FROM TABLE INTO TEXT FILE

    // create a flow to read from the JDBC table and save to a local text file
    Tap sink = new Lfs( new TextLine(), "build/test/jdbc", SinkMode.REPLACE );

    Pipe copyPipe = new Each( "read", new Identity() );

    Flow copyFlow = new FlowConnector( getProperties() ).connect( replaceTap, sink, copyPipe );

    copyFlow.complete();

    verifySink( copyFlow, 13 );

    // READ DATA FROM TEXT FILE AND UPDATE TABLE

    Fields updateByFields = new Fields( "num", "lower" );
    String[] updateBy = {"db_num", "db_lower"};
    JDBCScheme jdbcScheme = new JDBCScheme( columnFields, columnNames, null, updateByFields, updateBy );
    Tap updateTap = new JDBCTap( url, driver, tableDesc, jdbcScheme, SinkMode.APPEND );

    Flow updateFlow = new FlowConnector( getProperties() ).connect( sink, updateTap, parsePipe );

    updateFlow.complete();

    verifySink( updateFlow, 13 );

    // READ DATA FROM TABLE INTO TEXT FILE, USING CUSTOM QUERY

    Tap sourceTap = new JDBCTap( url, driver, new JDBCScheme( columnFields, columnNames, "select db_num, db_lower, db_upper from testingtablealias as testingtablealias", "select count(*) from testingtablealias" ) );

    Pipe readPipe = new Each( "read", new Identity() );

    Flow readFlow = new FlowConnector( getProperties() ).connect( sourceTap, sink, readPipe );

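Both JDBC tests are truncated just after readFlow is created; the likely remaining steps mirror the earlier flows in each test:

    // run the custom-query read and verify the row count
    readFlow.complete();

    verifySink( readFlow, 13 );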

    // define what the input file looks like; "offset" is the byte offset from the beginning of the file
    TextLine input = new TextLine(new Fields("offset", "line"));

    // create SOURCE tap to read a resource from HDFS
    Tap logTap = new Hfs(input, inputPath);

    // create an assembly to parse an Apache log file and store on an HDFS cluster

    // declare the field names we will parse out of the log file
    Fields apacheFields = new Fields("resource");

    // define the regular expression to parse the log file with
    String apacheRegex = "^([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] +\\\"([^ ]*) ([^ ]*) [^ ]*\\\" ([^ ]*) ([^ ]*).*$";

    // declare the groups from the above regex we want to keep. each regex group will be given
    // a field name from 'apacheFields', above, respectively
    int[] allGroups = {4};

    // create the parser
    RegexParser parser = new RegexParser(apacheFields, apacheRegex, allGroups);

    // create the import pipe element, with the name 'import', and with the input argument named "line"
    // replace the incoming tuple with the parser results
    // "line" -> parser -> "ts"
    Pipe pipeline = new Each("import", new Fields("line"), parser, Fields.RESULTS);


    // group the Tuple stream by the "resource" value
    pipeline = new GroupBy(pipeline, new Fields("resource"));

    // For every Tuple group
    // count the number of occurrences of "resource" and store the result in
    // a field named "count"
    Aggregator count = new Count(new Fields("resource"));
    pipeline = new Every(pipeline, count);


    // create a SINK tap to write to the default filesystem
    // by default, TextLine writes all fields out
    Tap remoteLogTap = new Hfs(new TextLine(), outputPath, SinkMode.REPLACE);

    // set the current job jar
    Properties properties = new Properties();
    FlowConnector.setApplicationJarClass(properties, PopularLogResources.class);

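The excerpt stops before the flow is connected. A plausible completion, using the properties configured above:

    // connect the source and sink taps to the assembly, then run it
    Flow logFlow = new FlowConnector( properties ).connect( logTap, remoteLogTap, pipeline );

    logFlow.complete();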

    RegexParser parser = new RegexParser( apacheFields, apacheRegex, apacheGroups );
    Pipe importPipe = new Each( "import", new Fields( "line" ), parser );

    // create a tap to read from the local file system, unless the path is a URL naming an external resource
    // Lfs allows for relative paths
    Tap logTap =
      inputPath.matches( "^[^:]+://.*" ) ? new Hfs( new TextLine(), inputPath ) : new Lfs( new TextLine(), inputPath );
    // create a tap to read/write from the default filesystem
    Tap parsedLogTap = new Hfs( apacheFields, logsPath );

    // connect the assembly to source and sink taps
    Flow importLogFlow = flowConnector.connect( logTap, parsedLogTap, importPipe );

    // create an assembly to parse out the time field into a timestamp
    // then count the number of requests per second and per minute

    // apply a text parser to create a timestamp with 'second' granularity
    // declares field "ts"
    DateParser dateParser = new DateParser( new Fields( "ts" ), "dd/MMM/yyyy:HH:mm:ss Z" );
    Pipe tsPipe = new Each( "arrival rate", new Fields( "time" ), dateParser, Fields.RESULTS );

    // name the per second assembly and split on tsPipe
    Pipe tsCountPipe = new Pipe( "tsCount", tsPipe );
    tsCountPipe = new GroupBy( tsCountPipe, new Fields( "ts" ) );
    tsCountPipe = new Every( tsCountPipe, Fields.GROUP, new Count() );

    // apply expression to create a timestamp with 'minute' granularity
    // declares field "tm"
    Pipe tmPipe = new Each( tsPipe, new ExpressionFunction( new Fields( "tm" ), "ts - (ts % (60 * 1000))", long.class ) );

    // name the per minute assembly and split on tmPipe
    Pipe tmCountPipe = new Pipe( "tmCount", tmPipe );
    tmCountPipe = new GroupBy( tmCountPipe, new Fields( "tm" ) );
    tmCountPipe = new Every( tmCountPipe, Fields.GROUP, new Count() );

    // create taps to write the results to the default filesystem, using the given fields
    Tap tsSinkTap = new Hfs( new TextLine(), arrivalRateSecPath );
    Tap tmSinkTap = new Hfs( new TextLine(), arrivalRateMinPath );

    // a convenience method for binding taps and pipes, order is significant
    Map<String, Tap> sinks = Cascades.tapsMap( Pipe.pipes( tsCountPipe, tmCountPipe ), Tap.taps( tsSinkTap, tmSinkTap ) );

    // connect the assembly to the source and sink taps
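The excerpt ends at the connect step. A plausible completion binds the shared source and both sinks to the two tails at once:

    // the order of tails matches the tapsMap binding above
    Flow arrivalRateFlow = flowConnector.connect( parsedLogTap, sinks, tsCountPipe, tmCountPipe );

    arrivalRateFlow.complete();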

    // a predefined pipe assembly that returns fields named "url" and "page"
    Pipe importPipe = new ImportCrawlDataAssembly( "import pipe" );

    // create the tap instances
    Tap localPagesSource = new Lfs( new TextLine(), inputPath );
    Tap importedPages = new Hfs( new SequenceFile( new Fields( "url", "page" ) ), pagesPath );

    // connect the pipe assembly to the tap instances
    Flow importPagesFlow = flowConnector.connect( "import pages", localPagesSource, importedPages, importPipe );

    // a predefined pipe assembly that splits the stream into two named "url pipe" and "word pipe"
    // these pipes could be retrieved via the getTails() method and added to new pipe instances
    SubAssembly wordCountPipe = new WordCountSplitAssembly( "wordcount pipe", "url pipe", "word pipe" );

    // create Hadoop sequence files to store the results of the counts
    Tap sinkUrl = new Hfs( new SequenceFile( new Fields( "url", "word", "count" ) ), urlsPath );
    Tap sinkWord = new Hfs( new SequenceFile( new Fields( "word", "count" ) ), wordsPath );

    // convenience method to bind multiple pipes and taps
    Map<String, Tap> sinks = Cascades.tapsMap( new String[]{"url pipe", "word pipe"}, Tap.taps( sinkUrl, sinkWord ) );

    // wordCountPipe will be recognized as an assembly and handled appropriately
    Flow count = flowConnector.connect( importedPages, sinks, wordCountPipe );

    // create an assembly to export the Hadoop sequence file to local text files
    Pipe exportPipe = new Each( "export pipe", new Identity() );

    Tap localSinkUrl = new Lfs( new TextLine(), localUrlsPath );
    Tap localSinkWord = new Lfs( new TextLine(), localWordsPath );

    // connect up both sinks using the same exportPipe assembly
    Flow exportFromUrl = flowConnector.connect( "export url", sinkUrl, localSinkUrl, exportPipe );
    Flow exportFromWord = flowConnector.connect( "export word", sinkWord, localSinkWord, exportPipe );
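Because these four flows share taps, they can be handed to a CascadeConnector, which orders them topologically by their tap dependencies; a sketch:

    // the connector infers execution order: import, then count, then both exports
    Cascade cascade = new CascadeConnector().connect( importPagesFlow, count, exportFromUrl, exportFromWord );

    cascade.complete();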

    // define what the input file looks like; "offset" is the byte offset from the beginning of the file
    TextLine scheme = new TextLine( new Fields( "offset", "line" ) );

    // create a SOURCE tap to read a resource from the local file system, if the input path is not a URL
    Tap logTap = inputPath.matches( "^[^:]+://.*" ) ? new Hfs( scheme, inputPath ) : new Lfs( scheme, inputPath );

    // create an assembly to parse an Apache log file and store on an HDFS cluster

    // declare the field names we will parse out of the log file
    Fields apacheFields = new Fields( "ip", "time", "method", "event", "status", "size" );

    // define the regular expression to parse the log file with
    String apacheRegex = "^([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] +\\\"([^ ]*) ([^ ]*) [^ ]*\\\" ([^ ]*) ([^ ]*).*$";

    // declare the groups from the above regex we want to keep. each regex group will be given
    // a field name from 'apacheFields', above, respectively
    int[] allGroups = {1, 2, 3, 4, 5, 6};

    // create the parser
    RegexParser parser = new RegexParser( apacheFields, apacheRegex, allGroups );

    // create the import pipe element, with the name 'import', and with the input argument named "line"
    // replace the incoming tuple with the parser results
    // "line" -> parser -> "ts"
    Pipe importPipe = new Each( "import", new Fields( "line" ), parser, Fields.RESULTS );

    // create a SINK tap to write to the default filesystem
    // by default, TextLine writes all fields out
    Tap remoteLogTap = new Hfs( new TextLine(), outputPath, SinkMode.REPLACE );

    // set the current job jar
    Properties properties = new Properties();
    FlowConnector.setApplicationJarClass( properties, Main.class );

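Again the excerpt stops before the connect step. A plausible completion, this time also writing the flow plan as a DOT file for inspection (the file name is illustrative):

    Flow importLogFlow = new FlowConnector( properties ).connect( logTap, remoteLogTap, importPipe );

    // hypothetical path; writeDOT renders the flow plan for debugging
    importLogFlow.writeDOT( "import-log-flow.dot" );

    importLogFlow.complete();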

    Properties properties = new Properties();
    AppProps.setApplicationJarClass( properties, Main.class );
    HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );

    // create taps for sources, sinks, traps
    Tap gisTap = new Hfs( new TextLine( new Fields( "line" ) ), gisPath );
    Tap metaTreeTap = new Hfs( new TextDelimited( true, "\t" ), metaTreePath );
    Tap metaRoadTap = new Hfs( new TextDelimited( true, "\t" ), metaRoadPath );
    Tap logsTap = new Hfs( new TextDelimited( true, "," ), logsPath );
    Tap trapTap = new Hfs( new TextDelimited( true, "\t" ), trapPath );
    Tap tsvTap = new Hfs( new TextDelimited( true, "\t" ), tsvPath );
    Tap treeTap = new Hfs( new TextDelimited( true, "\t" ), treePath );
    Tap roadTap = new Hfs( new TextDelimited( true, "\t" ), roadPath );
    Tap parkTap = new Hfs( new TextDelimited( true, "\t" ), parkPath );
    Tap shadeTap = new Hfs( new TextDelimited( true, "\t" ), shadePath );
    Tap recoTap = new Hfs( new TextDelimited( true, "\t" ), recoPath );

    // specify a regex to split the GIS dump into known fields
    Fields fieldDeclaration = new Fields( "blurb", "misc", "geo", "kind" );
    String regex =  "^\"(.*)\",\"(.*)\",\"(.*)\",\"(.*)\"$";
    int[] gisGroups = { 1, 2, 3, 4 };
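The excerpt creates trapTap but is cut off before any pipe is bound to it. A hypothetical continuation showing how a trap is typically attached through FlowDef (the pipe name and the choice of tsvTap as sink are assumptions):

    // parse the GIS dump; tuples the regex rejects are diverted to the trap
    RegexParser gisParser = new RegexParser( fieldDeclaration, regex, gisGroups );
    Pipe gisPipe = new Each( new Pipe( "gis" ), new Fields( "line" ), gisParser, Fields.RESULTS );

    FlowDef flowDef = FlowDef.flowDef()
      .addSource( gisPipe, gisTap )
      .addTrap( gisPipe, trapTap )
      .addTailSink( gisPipe, tsvTap );

    flowConnector.connect( flowDef ).complete();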

  public static FlowDef
  createFlowDef( String docPath, String wcPath )
    {
    // create source and sink taps
    Tap docTap = new Hfs( new TextDelimited( true, "\t" ), docPath );
    Tap wcTap = new Hfs( new TextDelimited( true, "\t" ), wcPath );

    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields( "token" );
    Fields text = new Fields( "text" );
    RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );
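The method is truncated after the splitter is created. The canonical word-count completion (a sketch following the usual Cascading pattern) tokenizes the text, groups by token, counts each group, and returns the assembled FlowDef:

    // tokenize the "text" field into a "token" stream, replacing the input tuple
    Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS );

    // group by token and count each group
    Pipe wcPipe = new Pipe( "wc", docPipe );
    wcPipe = new GroupBy( wcPipe, token );
    wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL );

    // bind taps and pipes into the flow definition
    return FlowDef.flowDef()
      .setName( "wc" )
      .addSource( docPipe, docTap )
      .addTailSink( wcPipe, wcTap );
    }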
