Package com.liveramp.cascading_ext.bloom

Source Code of com.liveramp.cascading_ext.bloom.BloomAssemblyStrategy

/**
*  Copyright 2012 LiveRamp
*
*  Licensed under the Apache License, Version 2.0 (the "License");
*  you may not use this file except in compliance with the License.
*  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/

package com.liveramp.cascading_ext.bloom;

import cascading.flow.Flow;
import cascading.flow.FlowStep;
import cascading.flow.FlowStepStrategy;
import cascading.flow.planner.BaseFlowStep;
import cascading.stats.FlowStepStats;
import com.liveramp.cascading_ext.assembly.CreateBloomFilter;
import com.liveramp.cascading_ext.counters.Counters;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;

import java.util.List;
import java.util.Map;

/**
* Does any configuration necessary for a job that involves stuff from BloomAssembly
*/
public class BloomAssemblyStrategy implements FlowStepStrategy<JobConf> {

  private static Logger LOG = Logger.getLogger(BloomAssemblyStrategy.class);

  @Override
  public void apply(Flow<JobConf> flow, List<FlowStep<JobConf>> predecessorSteps, FlowStep<JobConf> flowStep) {
    JobConf conf = flowStep.getConfig();

    String targetBloomID = conf.get(BloomProps.TARGET_BLOOM_FILTER_ID);
    if (targetBloomID != null) {
      prepareBloomFilterBuilder(flowStep);
    }
    //  the job is the filter which needs to use the bloom filter
    String sourceBloomID = conf.get(BloomProps.SOURCE_BLOOM_FILTER_ID);
    if (sourceBloomID != null) {
      buildBloomfilter(sourceBloomID, flowStep, predecessorSteps);
    }

  }

  private void prepareBloomFilterBuilder(FlowStep<JobConf> currentStep) {
    JobConf currentStepConf = currentStep.getConfig();
    currentStepConf.set("mapred.reduce.tasks", Integer.toString(BloomProps.getNumSplits(currentStepConf)));
    currentStepConf.set("io.sort.record.percent", Double.toString(BloomProps.getIOSortPercent(currentStepConf)));
  }

  /**
   * Merges bloom filter parts created across multiple splits of the keys and put the result in the distributed cache.
   */
  private void buildBloomfilter(String bloomID, FlowStep<JobConf> currentStep, List<FlowStep<JobConf>> predecessorSteps) {
    try {
      JobConf currentStepConf = currentStep.getConfig();
      currentStepConf.set("io.sort.mb", Integer.toString(BloomProps.getBufferSize(currentStepConf)));
      currentStepConf.set("mapred.job.reuse.jvm.num.tasks", "-1");

      String requiredBloomPath = currentStepConf.get(BloomProps.REQUIRED_BLOOM_FILTER_PATH);

      for (FlowStep<JobConf> step : predecessorSteps) {
        JobConf prevStepConf = step.getConfig();
        String targetBloomID = prevStepConf.get(BloomProps.TARGET_BLOOM_FILTER_ID);

        if (bloomID.equals(targetBloomID)) {
          LOG.info("Found step generating required bloom filter: " + targetBloomID);

          // Extract the counters from the previous job to approximate the average key/tuple size
          FlowStepStats stats = ((BaseFlowStep) step).getFlowStepStats();

          // Collect some of the stats gathered. This will help configure the bloom filter
          long numSampled = Counters.get(stats, CreateBloomFilter.StatsCounters.TOTAL_SAMPLED_TUPLES);
          long keySizeSum = Counters.get(stats, CreateBloomFilter.StatsCounters.KEY_SIZE_SUM);
          long matchSizeSum = Counters.get(stats, CreateBloomFilter.StatsCounters.TUPLE_SIZE_SUM);

          int avgKeySize = 0;
          int avgMatchSize = 0;

          if (numSampled != 0) {
            avgKeySize = (int) (keySizeSum / numSampled);
            avgMatchSize = (int) (matchSizeSum / numSampled);
          }

          LOG.info("Avg key size ~= " + avgKeySize);
          LOG.info("Avg match size ~= " + avgMatchSize);
          for (Map.Entry<String, String> entry : BloomUtil.getPropertiesForBloomFilter(avgMatchSize, avgKeySize).entrySet()) {
            currentStepConf.set(entry.getKey(), entry.getValue());
          }

          // Put merged result in distributed cache
          LOG.info("Adding dist cache properties to config:");
          for (Map.Entry<String, String> prop : BloomUtil.getPropertiesForDistCache(requiredBloomPath).entrySet()) {
            LOG.info(prop.getKey() + " = " + prop.getValue());
            String previousProperty = currentStepConf.get(prop.getKey());
            if (previousProperty != null) {
              LOG.info("found already existing value for key: " + prop.getKey() + ", found " + previousProperty + ".  Appending.");
              currentStepConf.set(prop.getKey(), previousProperty + "," + prop.getValue());
            } else {
              currentStepConf.set(prop.getKey(), prop.getValue());
            }
          }

          BloomUtil.writeFilterToHdfs(prevStepConf, requiredBloomPath);
        }
      }
    } catch (Exception e) {
      throw new RuntimeException("Failed to create bloom filter!", e);
    }
  }
}
TOP

Related Classes of com.liveramp.cascading_ext.bloom.BloomAssemblyStrategy

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.