Source Code of com.liveramp.cascading_ext.assembly.BloomAssembly$GetSerializedTuple

/**
 *  Copyright 2012 LiveRamp
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */


package com.liveramp.cascading_ext.assembly;


import cascading.flow.FlowProcess;
import cascading.flow.planner.Scope;
import cascading.operation.*;
import cascading.pipe.*;
import cascading.pipe.assembly.Discard;
import cascading.pipe.joiner.InnerJoin;
import cascading.pipe.joiner.Joiner;
import cascading.pipe.joiner.LeftJoin;
import cascading.pipe.joiner.RightJoin;
import cascading.property.ConfigDef;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import com.liveramp.cascading_ext.FileSystemHelper;
import com.liveramp.cascading_ext.TupleSerializationUtil;
import com.liveramp.cascading_ext.bloom.BloomProps;
import com.liveramp.cascading_ext.bloom.operation.BloomJoinFilter;
import com.liveramp.cascading_ext.joiner.LimitJoin;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.JobConf;


import java.io.IOException;
import java.util.Collections;
import java.util.Set;
import java.util.UUID;


/**
 * This SubAssembly is used by BloomJoin and BloomFilter. It builds a bloom filter from the RHS, filters
 * the LHS using the bloom filter and, depending on parameters, does a CoGroup for exactness.
 */
public abstract class BloomAssembly extends SubAssembly {


  protected enum Mode {
    FILTER_EXACT, FILTER_INEXACT, JOIN
  }


  public static enum CoGroupOrder {
    LARGE_LHS, LARGE_RHS
  }


  /**
   * See BloomJoin(...) for other argument usages.
   *
   * @param operationType whether the operation is a join, filter, or inexact filter (passed in by subclasses)
   */
  protected BloomAssembly(Pipe largePipe, Fields largeJoinFields,
                          Pipe smallPipe, Fields smallJoinFields,
                          Fields renameFields, Mode operationType,
                          Joiner joiner, CoGroupOrder coGroupOrder) {


    try {
      //  If it's a filter, we care about nothing except the join keys on the RHS -- remove the rest
      if (operationType != Mode.JOIN) {
        smallPipe = new Each(smallPipe, smallJoinFields, new Identity());
      }


      Pipe filterPipe;
      Pipe rhsOrig = new Pipe("smallPipe-orig", smallPipe);


      if (shouldApplyBloomFilter(operationType, joiner, coGroupOrder)) {
        filterPipe = getBloomFilterPipe(largePipe, largeJoinFields, smallPipe, smallJoinFields);
      } else {
        // Fall back to a regular CoGroup. TODO: We could try to optimize this case by splitting the large
        // side into relevant/not-relevant using the bloom filter, applying CoGroup only to the relevant part,
        // and then merging the not-relevant part back in to honor the joiner.
        filterPipe = largePipe;
        if (coGroupOrder == CoGroupOrder.LARGE_RHS) {
          rhsOrig = largePipe;
          filterPipe = smallPipe;
        }
      }


      if (operationType == Mode.FILTER_EXACT) {
        // We don't actually care about the fields on the RHS (the user just expects the LHS fields), so we can
        // rename them to junk to avoid conflicts with field names on the LHS
        Fields newKeyFields = new Fields();
        for (int i = 0; i < smallJoinFields.size(); i++) {
          newKeyFields = newKeyFields.append(new Fields("__bloom_join_tmp_" + i));
        }
        rhsOrig = new Each(rhsOrig, smallJoinFields, new Identity(newKeyFields), newKeyFields);
        filterPipe = getCoGroup(filterPipe, largeJoinFields, rhsOrig, newKeyFields, renameFields, joiner, coGroupOrder, operationType);


        filterPipe = new Discard(filterPipe, newKeyFields);
      } else if (operationType == Mode.JOIN) {
        filterPipe = getCoGroup(filterPipe, largeJoinFields, rhsOrig, smallJoinFields, renameFields, joiner, coGroupOrder, operationType);
      }


      setTails(filterPipe);


    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }


  private Pipe getBloomFilterPipe(Pipe largePipe, Fields largeJoinFields, Pipe smallPipe, Fields smallJoinFields) throws IOException {
    String bloomJobID = UUID.randomUUID().toString();
    Path bloomTempDir = FileSystemHelper.getRandomTemporaryPath("/tmp/bloom_tmp/");
    String bloomPartsDir = bloomTempDir + "/parts";
    String bloomFinalFilter = bloomTempDir + "/filter.bloomfilter";
    String approxCountPartsDir = bloomTempDir + "/approx_distinct_keys_parts/";


    Pipe filterPipe;
    smallPipe = new Each(smallPipe, smallJoinFields, new GetSerializedTuple());
    smallPipe = new CreateBloomFilter(smallPipe, bloomJobID, approxCountPartsDir, bloomPartsDir, "serialized-tuple-key");


    // This is a bit of a hack to:
    //  1) Force a dependency on the operations performed on RHS above (can't continue until they're done)
    //  2) Bind RHS to the flow, which wouldn't happen otherwise.
    // Note that RHS has no output, so there shouldn't be any danger in doing this.
    filterPipe = new NaiveMerge(largePipe.getName(), largePipe, smallPipe);


    // Load the bloom filter into memory and apply it to the LHS.
    filterPipe = new Each(filterPipe, largeJoinFields, new BloomJoinFilter(bloomJobID, false));


    ConfigDef config = filterPipe.getStepConfigDef();  // tell BloomAssemblyStrategy which bloom filter to expect
    config.setProperty(BloomProps.SOURCE_BLOOM_FILTER_ID, bloomJobID);
    config.setProperty(BloomProps.REQUIRED_BLOOM_FILTER_PATH, bloomFinalFilter);
    return filterPipe;
  }


  private boolean shouldApplyBloomFilter(Mode operationType, Joiner joiner, CoGroupOrder coGroupOrder) {
    return operationType != Mode.JOIN || (operationType == Mode.JOIN && (
            joiner == null || joiner instanceof InnerJoin ||
            (joiner instanceof LeftJoin && coGroupOrder == CoGroupOrder.LARGE_RHS) ||
            (joiner instanceof RightJoin && coGroupOrder == CoGroupOrder.LARGE_LHS)));
  }


  private Pipe getCoGroup(Pipe filtered, Fields largeJoinFields,
                          Pipe keysOrig, Fields keyFields,
                          Fields renameFields,
                          Joiner joinerInput,
                          CoGroupOrder coGroupOrder,
                          Mode mode) {
    Pipe lhs, rhs;
    Fields lhsFields, rhsFields;
    Joiner joiner = new InnerJoin();


    if (coGroupOrder == CoGroupOrder.LARGE_LHS) {
      lhs = filtered;
      lhsFields = largeJoinFields;
      rhs = keysOrig;
      rhsFields = keyFields;
      if (mode == Mode.FILTER_EXACT) {
        // Limit join takes the LHS exactly once if and only if there is a match on the RHS.
        // Won't duplicate tuples on the LHS if there are duplicates on the RHS.
        joiner = new LimitJoin(new long[]{Long.MAX_VALUE, 1l});
      } else {
        if (joinerInput != null) {
          joiner = joinerInput;
        }
      }
    } else {
      lhs = keysOrig;
      lhsFields = keyFields;
      rhs = filtered;
      rhsFields = largeJoinFields;
      if (mode == Mode.FILTER_EXACT) {
        joiner = new LimitJoin(new long[]{1l, Long.MAX_VALUE});
      } else {
        if (joinerInput != null) {
          joiner = joinerInput;
        }
      }
    }


    if (renameFields != null) {
      return new CoGroup(lhs, lhsFields, rhs, rhsFields, renameFields, joiner);
    } else {
      return new CoGroup(lhs, lhsFields, rhs, rhsFields, joiner);
    }
  }


  private static class NaiveMerge extends Merge {
    String toAccept;


    public NaiveMerge(String toAccept, Pipe... pipes) {
      super(null, pipes);
      this.toAccept = toAccept;
    }


    @Override
    public Scope outgoingScopeFor(Set<Scope> incomingScopes) {
      Scope toUse = null;
      for (Scope s : incomingScopes) {
        if (s.getName().equals(toAccept)) {
          toUse = s;
        }
      }
      return new Scope(super.outgoingScopeFor(Collections.singleton(toUse)));
    }


    /**
     * Scope mangling will cause NPE when trying to printInternal -- override with simple name
     */
    @Override
    protected void printInternal( StringBuffer buffer, Scope scope ) {
      buffer.append( getClass().getSimpleName() ).append( "('" ).append( getName() ).append( "')" );
    }
  }


  private static class GetSerializedTuple extends BaseOperation implements Function {


    private transient TupleSerializationUtil tupleSerializationUtil;


    public GetSerializedTuple() {
      super(new Fields("serialized-tuple-key"));
    }


    @Override
    public void prepare(FlowProcess flowProcess, OperationCall operationCall) {
      tupleSerializationUtil = new TupleSerializationUtil((JobConf) flowProcess.getConfigCopy());
    }


    @Override
    public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
      try {
        Tuple tuple = functionCall.getArguments().getTuple();
        byte[] tupleSerialized = tupleSerializationUtil.serialize(tuple);
        functionCall.getOutputCollector().add(new Tuple(new BytesWritable(tupleSerialized)));
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }
}
Source Code of com.liveramp.cascading_ext.assembly.BloomAssembly$GetSerializedTuple

Related Classes of com.liveramp.cascading_ext.assembly.BloomAssembly$GetSerializedTuple