Package org.kitesdk.morphline.stdlib

Source Code of org.kitesdk.morphline.stdlib.SampleBuilder$Sample

/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.morphline.stdlib;

import java.security.SecureRandom;
import java.util.Collection;
import java.util.Collections;
import java.util.Random;

import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineCompilationException;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.AbstractCommand;
import org.kitesdk.morphline.shaded.org.apache.commons.math3.random.RandomGenerator;
import org.kitesdk.morphline.shaded.org.apache.commons.math3.random.Well19937c;

import com.typesafe.config.Config;

/**
* Command that forwards each input record with a given probability to its child command, and
* silently ignores all other input records. Sampling is based on a random number generator. This
* can be helpful to easily test a morphline with a random subset of records from a large dataset.
*/
public final class SampleBuilder implements CommandBuilder {

  @Override
  public Collection<String> getNames() {
    return Collections.singletonList("sample");
  }

  @Override
  public Command build(Config config, Command parent, Command child, MorphlineContext context) {
    return new Sample(this, config, parent, child, context);
  }
 
 
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  private static final class Sample extends AbstractCommand {

    private final double probability;
    private final RandomGenerator prng;
    private long count = 0;
   
    public Sample(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
      super(builder, config, parent, child, context);   
      this.probability = getConfigs().getDouble(config, "probability", 1.0);
      if (probability < 0.0) {
        throw new MorphlineCompilationException("Probability must not be negative: " + probability, config);
      }
      if (probability >= 1.0) {
        this.prng = null;
      } else {
        if (config.hasPath("seed")) {
          long seed = getConfigs().getLong(config, "seed");
          this.prng = new Well19937c(seed); // non-secure & fast
        } else {
          Random rand = new SecureRandom();
          int[] seed = new int[624];
          for (int i = 0; i < seed.length; i++) {
            seed[i] = rand.nextInt();
          }
          this.prng = new Well19937c(seed); // non-secure & fast
        }
      }
      validateArguments();
    }
       
    @Override
    protected boolean doProcess(Record record) {     
      if (prng != null && prng.nextDouble() > probability) {
        return true; // silently ignore this record
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("sampleCount: {}", count);
      }
      count++;
     
      // pass record to next command in chain:
      return super.doProcess(record);
    }
   
  }
 
}
TOP

Related Classes of org.kitesdk.morphline.stdlib.SampleBuilder$Sample

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.