Package cascading.tap

Examples of cascading.tap.Tap
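A Tap binds a Scheme (the data format) to a resource such as an HDFS path, and acts as a source or sink for a Flow. A minimal sketch, assuming Cascading 2.x on Hadoop; the paths and names here are hypothetical:

        // hypothetical paths; TextDelimited( true, "\t" ) reads/writes TSV with a header line
        Tap inTap = new Hfs( new TextDelimited( true, "\t" ), "input/data.tsv" );
        Tap outTap = new Hfs( new TextDelimited( true, "\t" ), "output/copy", SinkMode.REPLACE );

        Pipe copyPipe = new Pipe( "copy" );
        FlowDef flowDef = FlowDef.flowDef()
            .setName( "copy" )
            .addSource( copyPipe, inTap )
            .addTailSink( copyPipe, outTap );

        new HadoopFlowConnector( new Properties() ).connect( flowDef ).complete();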


        // pass these to the output function so the strings from the indexes can be
        // written instead of the binary values of the Keys and Vectors in the DRMs
        dRM1.getStepConfigDef().setProperty("itemIndexPath", itemIndexPath.toString());
        dRM1.getStepConfigDef().setProperty("rowIndexPath", iDIndexPath.toString());
        dRM1.getStepConfigDef().setProperty("joining", "false");
        Tap outputSink = new Hfs(new TextDelimited(true, ","), cSVOutputPath.toString());

        FlowDef flowDef = new FlowDef()
            .setName("convert-to-CSV")
            .addSource(dRM1, dRM1Source)
            .addTailSink(dRM1, outputSink);
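
The snippet stops before the flow is run; a hedged continuation, assuming a HadoopFlowConnector since the full example is truncated:

        // assumed continuation: hand the FlowDef to a connector and run it
        Flow flow = new HadoopFlowConnector( new Properties() ).connect( flowDef );
        flow.complete();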


                if(s.getPath().getName().matches("^part.*")){// found a part-xxxxx file
                    Path filePath = s.getPath();
                    Tap t = new Hfs( new WritableSequenceFile( f, LongWritable.class, VectorWritable.class ), filePath.toString() );
                    if( s.getLen() != 0 ){// only keep non-empty part files
                        all.add(t);
                    }
                }
            }
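
The taps collected in all are typically merged into one logical source; a sketch, assuming all is a java.util.List<Tap>:

            // merge the collected part-file taps into a single source tap
            Tap combined = new MultiSourceTap( all.toArray( new Tap[ all.size() ] ) );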

        final String in = getTestDir() + "testSimpleIndexing/in";
        final String out = getTestDir() + "testSimpleIndexing/out";

        byte[] imageData = new byte[] {0, 1, 2, 3, 5};
       
        Tap source = makeSourceTap(testFields, in);
        TupleEntryCollector write = source.openForWrite(makeFlowProcess());
        Tuple t = new Tuple();
        t.add(1);
        t.add("TurboWriter 2.3");
        t.add(395.50f);
        t.add(new Tuple("wordprocessor", "Japanese"));
        t.add(true);
        t.add(imageData);
        write.add(t);
       
        t = new Tuple();
        t.add(2);
        t.add("Shasta 1.0");
        t.add(95.00f);
        t.add("Chinese");
        t.add(false);
       
        BytesWritable bw = new BytesWritable(imageData);
        bw.setCapacity(imageData.length + 10);
        t.add(bw);
        write.add(t);
        write.close();

        // Now read from the results, and write to a Solr index.
        Pipe writePipe = new Pipe("tuples to Solr");

        Tap solrSink = makeSolrSink(testFields, out);
        Flow flow = makeFlowConnector().connect(source, solrSink, writePipe);
        flow.complete();

        // Open up the Solr index, and do some searches.
        System.setProperty("solr.data.dir", out + "/part-00000");

    Properties properties = new Properties();
    AppProps.setApplicationJarClass( properties, Main.class );
    FlowConnector flowConnector = new HadoopFlowConnector( properties );

    // create SOURCE taps, and read from local file system if inputs are not URLs
    Tap tweetTap = makeTap( tweetPath, new TextDelimited( true, "\t" ) );

    Tap stopTap = makeTap( stopWords, new TextDelimited( new Fields( "stop" ), true, "\t" ) );

    // create SINK taps, replacing previous output if needed
    Tap tokenTap = new Hfs( new TextDelimited( true, "\t" ), tokenPath, SinkMode.REPLACE );
    Tap similarityTap = new Hfs( new TextDelimited( true, "\t" ), similarityPath, SinkMode.REPLACE );
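
SinkMode controls what happens when a sink resource already exists; a short sketch with hypothetical output paths:

    // KEEP (the default) fails if the output already exists,
    // REPLACE deletes any existing output before writing,
    // UPDATE lets schemes that support it modify the resource in place
    Tap keepTap = new Hfs( new TextDelimited( true, "\t" ), "out/keep", SinkMode.KEEP );
    Tap updateTap = new Hfs( new TextDelimited( true, "\t" ), "out/update", SinkMode.UPDATE );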

    /*
    flow part #1
    generate a bipartite map of (uid, token), while filtering out stop-words
    */
 

    Properties properties = new Properties();
    AppProps.setApplicationJarClass( properties, Main.class );
    HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );

    // create source and sink taps
    Tap docTap = new Hfs( new TextDelimited( true, "\t" ), docPath );
    Tap wcTap = new Hfs( new TextDelimited( true, "\t" ), wcPath );

    Fields stop = new Fields( "stop" );
    Tap stopTap = new Hfs( new TextDelimited( stop, true, "\t" ), stopPath );
    Tap tfidfTap = new Hfs( new TextDelimited( true, "\t" ), tfidfPath );

    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields( "token" );
    Fields text = new Fields( "text" );
    RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );
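
The splitter is then applied with an Each, so every incoming "text" line yields one tuple per token; a sketch in the style of the Cascading tutorials:

    // emit one tuple per token extracted from each "text" line
    Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS );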

      input.add(t);
    }
    input.close();

    // Create flow to read from local file and insert into HBase.
    Tap source = new Hfs(new SequenceFile(inputFields), mHelper.manageTemporaryPath("input"));

    Pipe pipe = new Pipe("values");
    Fields keyFields = new Fields("num");
    Fields valueFields = new Fields("lower", "upper");
    Tap hBaseTap = new HBaseTap("testTable",
        new SerializingHBaseScheme(keyFields, valueFields,
            new Class<?>[]{String.class, String.class},
            false, SerializingHBaseScheme.Direction.FOR_WRITE),
        SinkMode.REPLACE);
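
The sink still has to be wired to the source; a hedged continuation using the names above (the connector choice is an assumption, since the surrounding test helper is not shown):

    // assumed continuation: connect the Hfs source to the HBase sink and run the flow
    Flow flow = new HadoopFlowConnector( new Properties() ).connect( source, hBaseTap, pipe );
    flow.complete();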

    // guard: refuse to overwrite an existing input file
    File inputFile = new File(inputPath);
    if (inputFile.exists()) {
      throw new CascadingException("Input file " + inputPath + " already exists.");
    }
    Tap inputTap = new Hfs(new SequenceFile(fields), inputPath, SinkMode.REPLACE);
    TupleEntryCollector collector = inputTap.openForWrite(getJobConf());
    return collector;
  }
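
A usage sketch for the helper above; the method name createInputCollector is hypothetical, since the enclosing signature is truncated:

    // hypothetical usage: write a couple of tuples, then close the collector
    TupleEntryCollector collector = createInputCollector( new Fields( "num", "lower" ), "/tmp/input" );
    collector.add( new Tuple( 1, "a" ) );
    collector.add( new Tuple( 2, "b" ) );
    collector.close();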

   */
  public Flow runFlow(Pipe[] tails, Map<String, Fields> inputs, Fields[] outputs,
                      String[] outputPaths) {
    Map<String, Tap> sources = new HashMap<String, Tap>();
    for (Map.Entry<String, Fields> input : inputs.entrySet()) {
      Tap tap = new Hfs(new SequenceFile(input.getValue()),
                        manageTemporaryPath(input.getKey()));
      sources.put(input.getKey(), tap);
    }

    if (tails.length != outputs.length) {
      System.err.println("Number of tails must equal number of outputs.");
      return null;
    }

    int i = 0;
    Map<String, Tap> sinks = new HashMap<String, Tap>();
    for (Fields output : outputs) {
      String sinkName = tails[i].getName();
      Tap sink = new Hfs(new SequenceFile(output), outputPaths[i]);
      sinks.put(sinkName, sink);
      i++;
    }

    Flow f = mFlowConnector.connect(sources, sinks, tails);
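
A usage sketch for runFlow; the single-pipe assembly here is hypothetical:

    // hypothetical call: one source named "lines", one tail, one output path;
    // the pipe name must match its source key, and the sink is keyed by tail name
    Map<String, Fields> inputs = new HashMap<String, Fields>();
    inputs.put( "lines", new Fields( "line" ) );
    Pipe tail = new Pipe( "lines" );
    Flow flow = runFlow( new Pipe[]{ tail }, inputs,
                         new Fields[]{ new Fields( "line" ) },
                         new String[]{ "/tmp/out" } );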

    Properties properties = new Properties();
    AppProps.setApplicationJarClass( properties, Main.class );
    HadoopFlowConnector flowConnector = new HadoopFlowConnector( properties );

    // create source and sink taps
    Tap inputTap = new Hfs( new TextDelimited( true, "\t" ), inputPath );
    Tap classifyTap = new Hfs( new TextDelimited( true, "\t" ), classifyPath );

    // handle command line options
    OptionParser optParser = new OptionParser();
    optParser.accepts( "pmml" ).withRequiredArg();
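
A hedged continuation parsing the arguments with the jopt-simple API:

    // assumed continuation: parse args and read the --pmml value
    OptionSet options = optParser.parse( args );
    String pmmlPath = (String) options.valueOf( "pmml" );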

    new RegressionFlowExample().run();
    }

  public void run() throws IOException
    {
    Tap irisTap = new FileTap( new TextDelimited( true, "\t", "\"" ), "data/iris.lm_p.tsv", SinkMode.KEEP );

    Tap resultsTap = new FileTap( new TextDelimited( true, "\t", "\"" ), "build/test/output/flow/results.tsv", SinkMode.REPLACE );

    FlowDef flowDef = FlowDef.flowDef()
      .setName( "pmml flow" )
      .addSource( "iris", irisTap )
      .addSink( "results", resultsTap );

    PMMLPlanner pmmlPlanner = new PMMLPlanner()
      .setPMMLInput( new File( "data/iris.lm_p.xml" ) )
      .retainOnlyActiveIncomingFields();

    flowDef.addAssemblyPlanner( pmmlPlanner );

    Flow flow = new LocalFlowConnector().connect( flowDef );

    flow.complete();

    TupleEntryIterator iterator = resultsTap.openForRead( flow.getFlowProcess() );

    while( iterator.hasNext() )
      System.out.println( iterator.next() );

    iterator.close();
