Examples of cascading.tuple.TupleEntryCollector

cascading.tuple.TupleEntryCollector
Interface TupleEntryCollector is used to allow {@link cascading.operation.BaseOperation} instances to emit one or more result {@link Tuple} values.
The general rule in Cascading is if you are handed a Tuple, you cannot change or cache it. Attempts at modifying such a Tuple will result in an Exception. Preventing caching is harder, see below.
If you create the Tuple, you can re-use or modify it.
When calling {@link #add(Tuple)} or {@link #add(TupleEntry)}, you are passing a Tuple to the down stream pipes and operations. Since no downstream operation may modify or cache the Tuple instance, it is safe to re-use the Tuple instance when {@code add()} returns.
That said, Tuple copies do get cached in order to perform specific operations in the underlying platforms. Currently only a shallow copy is made (via the {@link Tuple} copy constructor). Thus, any mutable type or collection placed inside a Tuple will not be copied, but will likely be cached if a copy of the Tuple passed downstream is made.
So any subsequent changes to that nested type or collection will be reflected in the cached copy, a likely source of hard-to-find errors.
There is currently no way to specify that a deep copy must be performed when making a Tuple copy.


    @Override
    public void flush( FlowProcess flowProcess, OperationCall<CompositeFunction.Context> operationCall )
      {
      // need to drain context
      TupleEntryCollector collector = ( (FunctionCall) operationCall ).getOutputCollector();


      Tuple result = operationCall.getContext().result;
      LinkedHashMap<Tuple, Tuple[]> context = operationCall.getContext().lru;


      for( Map.Entry<Tuple, Tuple[]> entry : context.entrySet() )

View Full Code Here

  public void testAsGroupByValue() throws Exception {
    FileSystem.get(new Configuration()).delete(new Path("/tmp/input"), true);
    FileSystem.get(new Configuration()).delete(new Path("/tmp/output"), true);


    Tap t = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/input");
    TupleEntryCollector tec = t.openForWrite(new HadoopFlowProcess(new JobConf()));


    HashSet<Tuple> expectedTuples = new HashSet<Tuple>(){{
      add(new Tuple(Example.Person.newBuilder().setName("bryan").setId(1).build()));
      add(new Tuple(Example.Person.newBuilder().setName("lucas").setId(2).build()));
    }};


    for (Tuple tuple : expectedTuples) {
      tec.add(tuple);
    }


    tec.close();


    Pipe inPipe = new Pipe("input");
    Pipe injectedPipe = new Each(inPipe, Fields.NONE, new Insert(new Fields("key"), 7), new Fields("key", "value"));
    Pipe groupByPipe = new GroupBy(injectedPipe, new Fields("key"));

View Full Code Here

  public void testAsGroupByKey() throws Exception {
    FileSystem.get(new Configuration()).delete(new Path("/tmp/input"), true);
    FileSystem.get(new Configuration()).delete(new Path("/tmp/output"), true);


    Tap t = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/input");
    TupleEntryCollector tec = t.openForWrite(new HadoopFlowProcess(new JobConf()));


    HashSet<Tuple> expectedTuples = new HashSet<Tuple>(){{
      add(new Tuple(Example.Person.newBuilder().setName("bryan").setId(1).build()));
      add(new Tuple(Example.Person.newBuilder().setName("lucas").setId(2).build()));
    }};


    for (Tuple tuple : expectedTuples) {
      tec.add(tuple);
    }


    tec.close();


    Pipe inPipe = new Pipe("input");
    Pipe groupByPipe = new GroupBy(inPipe, new Fields("value"));


    Hfs sink = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/output");

View Full Code Here

    expected.add(fixture("bryan", "bryan.duxbury@mail.com", 1));
    expected.add(fixture("lucas", "lucas@mail.com", 2));
    expected.add(fixture("vida", null, 3));


    Tap inputTap = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/input");
    TupleEntryCollector tec = inputTap.openForWrite(new HadoopFlowProcess(), null);


    for (Tuple t : expected) {
      tec.add(new TupleEntry(new Fields("value"), t));
    }
    tec.close();


    // read results back out
    Tap outputTap = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/input");
    TupleEntryIterator iter = outputTap.openForRead(new HadoopFlowProcess(), null);
    List<Tuple> tuples = new ArrayList<Tuple>();

View Full Code Here

      @Override public Fields getDeclaredFields() {
        return null;
      }


      @Override public TupleEntryCollector getOutputCollector() {
        return new TupleEntryCollector() {
          @Override protected void collect(TupleEntry tupleEntry) throws IOException {
            output.set(tupleEntry.getTuple());
          }
        };
      }

View Full Code Here

  public void testInFlow() throws Exception {
    FileSystem.get(new Configuration()).delete(new Path("/tmp/input"), true);
    FileSystem.get(new Configuration()).delete(new Path("/tmp/output"), true);


    Hfs inTap = new Hfs(new ProtobufScheme("value", Example.Person.class), "/tmp/input");
    TupleEntryCollector collector = inTap.openForWrite(new HadoopFlowProcess());
    collector.add(new TupleEntry(new Fields("value"), new Tuple(BRYAN.build())));
    collector.add(new TupleEntry(new Fields("value"), new Tuple(LUCAS.build())));
    collector.close();


    Pipe inPipe = new Pipe("in");
    Pipe p = new Each(inPipe, new Fields("value"), new ExpandProto(Example.Person.class), new Fields("id", "name", "email", "position"));


    Hfs sink = new Hfs(new TextLine(), "/tmp/output");

View Full Code Here

      @Override public Fields getDeclaredFields() {
        return null;  //To change body of implemented methods use File | Settings | File Templates.
      }


      @Override public TupleEntryCollector getOutputCollector() {
        return new TupleEntryCollector() {
          @Override protected void collect(TupleEntry tupleEntry) throws IOException {
            results.add(tupleEntry.getTuple());
          }
        };
      }

View Full Code Here


      try {
        LOG.info("HLL counter found " + approxCounter.cardinality() + " distinct keys");


        Hfs tap = new Hfs(new SequenceFile(new Fields("bytes")), BloomProps.getApproxCountsDir(conf));
        TupleEntryCollector out = tap.openForWrite(new HadoopFlowProcess(conf));
        out.add(new Tuple(new BytesWritable(approxCounter.getBytes())));
        out.close();


      } catch (IOException e) {
        throw new RuntimeException("couldn't write approximate counts to side bucket", e);
      }
    }

View Full Code Here


  @Test
  public void testSimpleCombiner() throws IOException {


    Hfs source = new Hfs(new SequenceFile(new Fields("key", "value")), INPUT_PATH);
    TupleEntryCollector tc = source.openForWrite(CascadingUtil.get().getFlowProcess());
    tc.add(new Tuple("k0", 1));
    tc.add(new Tuple("k0", 2));
    tc.add(new Tuple("k1", 1));
    tc.add(new Tuple("k1", -3));
    tc.add(new Tuple("k1", 10));
    tc.close();


    Tap sink = new Hfs(new SequenceFile(new Fields("key", "sum")), OUTPUT_PATH);


    Pipe pipe = new Pipe("pipe");
    pipe = Combiner.assembly(pipe, new SimpleAggregator(), new Fields("key"), new Fields("value"), new Fields("sum"));

View Full Code Here


  @Test
  public void testSimpleCombinerWithMemoryLimit() throws IOException {


    Hfs source = new Hfs(new SequenceFile(new Fields("key", "value")), INPUT_PATH);
    TupleEntryCollector tc = source.openForWrite(CascadingUtil.get().getFlowProcess());
    tc.add(new Tuple("key0", 1));
    tc.add(new Tuple("key0", 2));
    tc.add(new Tuple("key1", 1));
    tc.add(new Tuple("key1", -3));
    tc.add(new Tuple("key0", 10));
    tc.close();


    Tap sink = new Hfs(new SequenceFile(new Fields("key", "sum")), OUTPUT_PATH);


    Pipe pipe = new Pipe("pipe");
    pipe = new Each(pipe, Combiner.function(new SimpleAggregator(), new Fields("key"), new Fields("value"), new Fields("sum"), MemoryBoundLruHashMap.UNLIMITED_ITEM_CAPACITY, 100, new SimpleTupleMemoryUsageEstimator(), new LongMemoryUsageEstimator(), false));

View Full Code Here

0 1 2 3 4 5

TOP

Related Classes of cascading.tuple.TupleEntryCollector

bixo.datum.ScoredUrlDatumTest

bixo.examples.crawl.CreateUrlDatumFromOutlinksFunction

bixo.examples.crawl.DemoCrawlTool

bixo.examples.crawl.LatestUrlDatumBufferTest

bixo.examples.webmining.CreateLinkDatumFromOutlinksFunction

bixo.examples.webmining.CreateResultsFunction

bixo.examples.webmining.DemoWebMiningWorkflow

bixo.operations.FilterAndScoreByUrlAndRobotsTest

bixo.operations.MakeFetchSetsBuffer

bixo.pipes.AbstractFetchPipeTest

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.