Package org.apache.crunch.lib

Source Code of org.apache.crunch.lib.Distinct

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch.lib;

import java.util.Set;

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.types.PType;
import org.apache.crunch.types.PTypeFamily;

import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;

/**
* Functions for computing the distinct elements of a {@code PCollection}.
*/
public final class Distinct {

  private static final int DEFAULT_FLUSH_EVERY = 50000;
 
  /**
   * Construct a new {@code PCollection} that contains the unique elements of a
   * given input {@code PCollection}.
   *
   * @param input The input {@code PCollection}
   * @return A new {@code PCollection} that contains the unique elements of the input
   */
  public static <S> PCollection<S> distinct(PCollection<S> input) {
    return distinct(input, DEFAULT_FLUSH_EVERY);
  }
 
  /**
   * A {@code PTable<K, V>} analogue of the {@code distinct} function.
   */
  public static <K, V> PTable<K, V> distinct(PTable<K, V> input) {
    return PTables.asPTable(distinct((PCollection<Pair<K, V>>) input));
  }
 
  /**
   * A {@code distinct} operation that gives the client more control over how frequently
   * elements are flushed to disk in order to allow control over performance or
   * memory consumption.
   *
   * @param input The input {@code PCollection}
   * @param flushEvery Flush the elements to disk whenever we encounter this many unique values
   * @return A new {@code PCollection} that contains the unique elements of the input
   */
  public static <S> PCollection<S> distinct(PCollection<S> input, int flushEvery) {
    Preconditions.checkArgument(flushEvery > 0);
    PType<S> pt = input.getPType();
    PTypeFamily ptf = pt.getFamily();
    return input
        .parallelDo("pre-distinct", new PreDistinctFn<S>(flushEvery, pt), ptf.tableOf(pt, ptf.nulls()))
        .groupByKey()
        .parallelDo("post-distinct", new PostDistinctFn<S>(), pt);
  }
 
  /**
   * A {@code PTable<K, V>} analogue of the {@code distinct} function.
   */
  public static <K, V> PTable<K, V> distinct(PTable<K, V> input, int flushEvery) {
    return PTables.asPTable(distinct((PCollection<Pair<K, V>>) input, flushEvery));
  }
 
  private static class PreDistinctFn<S> extends DoFn<S, Pair<S, Void>> {
    private final Set<S> values = Sets.newHashSet();
    private final int flushEvery;
    private final PType<S> ptype;
   
    PreDistinctFn(int flushEvery, PType<S> ptype) {
      this.flushEvery = flushEvery;
      this.ptype = ptype;
    }
   
    @Override
    public void initialize() {
      super.initialize();
      ptype.initialize(getConfiguration());
    }
   
    @Override
    public void process(S input, Emitter<Pair<S, Void>> emitter) {
      values.add(ptype.getDetachedValue(input));
      if (values.size() > flushEvery) {
        cleanup(emitter);
      }
    }
   
    @Override
    public void cleanup(Emitter<Pair<S, Void>> emitter) {
      for (S in : values) {
        emitter.emit(Pair.<S, Void>of(in, null));
      }
      values.clear();
    }
  }
 
  private static class PostDistinctFn<S> extends DoFn<Pair<S, Iterable<Void>>, S> {
    @Override
    public void process(Pair<S, Iterable<Void>> input, Emitter<S> emitter) {
      emitter.emit(input.first());
    }
  }
 
  // No instantiation
  private Distinct() {}
}
TOP

Related Classes of org.apache.crunch.lib.Distinct

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.