/*
* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cascading.pipe.assembly;

import java.beans.ConstructorProperties;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import cascading.flow.FlowProcess;
import cascading.management.annotation.Property;
import cascading.management.annotation.PropertyConfigured;
import cascading.management.annotation.PropertyDescription;
import cascading.management.annotation.Visibility;
import cascading.operation.Aggregator;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.util.TupleHasher;
import cascading.tuple.util.TupleViews;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Class AggregateBy is a {@link SubAssembly} that serves two roles for handling aggregate operations.
* <p/>
* The first role is as a base class for composable aggregate operations that have a MapReduce Map side optimization for the
* Reduce side aggregation. For example 'summing' a value within a grouping can be performed partially Map side and
* completed Reduce side. Summing is associative and commutative.
* <p/>
* AggregateBy also supports operations that are not associative/commutative, like 'counting'. Counting
* is performed by counting value occurrences Map side and summing those counts Reduce side. (Yes, counting can be
* transposed to summing on both the Map and Reduce sides by emitting 1's before the first sum, but that is three
* operations instead of two, and a hack.)
* <p/>
* Think of this mechanism as a MapReduce Combiner, but more efficient: no values are serialized, deserialized,
* saved to disk, or multi-pass sorted in the process. The trade is a modest amount of memory for reduced CPU
* and little or no IO.
* <p/>
* Further, Combiners are limited to only associative/commutative operations.
* <p/>
* Additionally, the Cascading planner can move the Map side optimization into the previous Reduce operation,
* further increasing IO performance across the boundary between the preceding Reduce and the current Map phase,
* which crosses HDFS.
* <p/>
* The second role of the AggregateBy class is to allow for composition of AggregateBy
* sub-classes. That is, {@link SumBy} and {@link CountBy} AggregateBy sub-classes can be performed
* in parallel on the same grouping keys.
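* <p/>
* For example, a minimal sketch composing two aggregations over one grouping (pipe and field names are hypothetical):
* <pre>{@code
* Pipe assembly = new Pipe( "sales" );
*
* // sum the "amount" values and count the occurrences per "customerId", sharing one GroupBy
* SumBy sumBy = new SumBy( new Fields( "amount" ), new Fields( "totalAmount" ), double.class );
* CountBy countBy = new CountBy( new Fields( "numOrders" ) );
*
* assembly = new AggregateBy( assembly, new Fields( "customerId" ), sumBy, countBy );
* }</pre>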
* <p/>
* Custom AggregateBy classes can be created by sub-classing this class and implementing a special
* {@link Functor} for use on the Map side. Multiple Functor instances are managed by the {@link CompositeFunction}
* class allowing them all to share the same LRU value map for more efficiency.
* <p/>
* AggregateBy instances return {@code argumentFields}, which are used internally to control the values passed to
* internal Functor instances. If any argumentFields also have {@link java.util.Comparator}s, they will be used
* for secondary sorting (see {@link GroupBy} {@code sortFields}). This feature is used by {@link FirstBy} to
* control which Tuple is seen first for a grouping.
* <p/>
* To tune the LRU, set the {@code threshold} value high enough to utilize available memory, or set a
* default value via the {@link #AGGREGATE_BY_THRESHOLD} property. The current default ({@link CompositeFunction#DEFAULT_THRESHOLD})
* is {@code 10,000} unique keys. Note "flushes" from the LRU will be logged in threshold increments along with memory
* information.
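* <p/>
* For example, the default threshold may be raised through the property before connecting a flow (a sketch, assuming
* the Hadoop planner):
* <pre>{@code
* Properties properties = new Properties();
*
* properties.setProperty( AggregateBy.AGGREGATE_BY_THRESHOLD, "100000" );
*
* FlowConnector flowConnector = new HadoopFlowConnector( properties );
* }</pre>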
* <p/>
* Note that using an AggregateBy instance automatically inserts a {@link GroupBy} into the resulting {@link cascading.flow.Flow}.
* And passing multiple AggregateBy instances to a parent AggregateBy instance still results in one GroupBy.
* <p/>
* Also note that {@link Unique} is not an AggregateBy sub-class and is slightly more optimized internally.
* <p/>
* As of Cascading 2.6 AggregateBy honors the {@link cascading.tuple.Hasher} interface for storing keys in the cache.
*
* @see SumBy
* @see CountBy
* @see Unique
*/
public class AggregateBy extends SubAssembly
  {
  private static final Logger LOG = LoggerFactory.getLogger( AggregateBy.class );

  public static final int USE_DEFAULT_THRESHOLD = 0;
  public static final int DEFAULT_THRESHOLD = CompositeFunction.DEFAULT_THRESHOLD;
  public static final String AGGREGATE_BY_THRESHOLD = "cascading.aggregateby.threshold";

  private String name;
  private int threshold;
  private Fields groupingFields;
  private Fields[] argumentFields;
  private Functor[] functors;
  private Aggregator[] aggregators;
  private transient GroupBy groupBy;

  public enum Cache
    {
      Num_Keys_Flushed,
      Num_Keys_Hit,
      Num_Keys_Missed
    }

  @Deprecated
  public enum Flush
    {
      Num_Keys_Flushed
    }

  /**
   * Interface Functor provides a means to create a simple function for use with the {@link CompositeFunction} class.
   * <p/>
   * Note the {@link FlowProcess} argument provides access to the underlying properties and counter APIs.
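   * <p/>
   * A minimal sketch of a summing Functor, assuming a single numeric argument (names are illustrative):
   * <pre>{@code
   * public class SumFunctor implements AggregateBy.Functor
   *   {
   *   public Fields getDeclaredFields()
   *     {
   *     return new Fields( "sum" );
   *     }
   *
   *   public Tuple aggregate( FlowProcess flowProcess, TupleEntry args, Tuple context )
   *     {
   *     if( context == null ) // first invocation for this grouping key
   *       return new Tuple( args.getTuple().getDouble( 0 ) );
   *
   *     context.set( 0, context.getDouble( 0 ) + args.getTuple().getDouble( 0 ) );
   *
   *     return context;
   *     }
   *
   *   public Tuple complete( FlowProcess flowProcess, Tuple context )
   *     {
   *     return context; // safe to return the context as the result
   *     }
   *   }
   * }</pre>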
   */
  public interface Functor extends Serializable
    {
    /**
     * Method getDeclaredFields returns the declaredFields of this Functor object.
     *
     * @return the declaredFields (type Fields) of this Functor object.
     */
    Fields getDeclaredFields();

    /**
     * Method aggregate operates on the given args in tandem (optionally) with the given context values.
     * <p/>
     * The context argument is the result of the previous call to this method. Use it to store values between aggregate
     * calls (the current count, or sum of the args).
     * <p/>
     * On the very first invocation of aggregate for a given grouping key, context will be {@code null}. All subsequent
     * invocations context will be the value returned on the previous invocation.
     *
     * @param flowProcess of type FlowProcess
     * @param args        of type TupleEntry
     * @param context     of type Tuple
     * @return Tuple
     */
    Tuple aggregate( FlowProcess flowProcess, TupleEntry args, Tuple context );

    /**
     * Method complete allows the final aggregate computation to be performed before the return value is collected.
     * <p/>
     * The number of values in the returned {@link Tuple} instance must match the number of declaredFields.
     * <p/>
     * It is safe to return the context object as the result value.
     *
     * @param flowProcess of type FlowProcess
     * @param context     of type Tuple
     * @return Tuple
     */
    Tuple complete( FlowProcess flowProcess, Tuple context );
    }

  /**
   * Class CompositeFunction takes multiple Functor instances and manages them as a single {@link Function}.
   *
   * @see Functor
   */
  public static class CompositeFunction extends BaseOperation<CompositeFunction.Context> implements Function<CompositeFunction.Context>
    {
    public static final int DEFAULT_THRESHOLD = 10000;

    private int threshold = 0;
    private final Fields groupingFields;
    private final Fields[] argumentFields;
    private final Fields[] functorFields;
    private final Functor[] functors;
    private final TupleHasher tupleHasher;

    public static class Context
      {
      LinkedHashMap<Tuple, Tuple[]> lru;
      TupleEntry[] arguments;
      Tuple result;
      }

    /**
     * Constructor CompositeFunction creates a new CompositeFunction instance.
     *
     * @param groupingFields of type Fields
     * @param argumentFields of type Fields
     * @param functor        of type Functor
     * @param threshold      of type int
     */
    public CompositeFunction( Fields groupingFields, Fields argumentFields, Functor functor, int threshold )
      {
      this( groupingFields, Fields.fields( argumentFields ), new Functor[]{functor}, threshold );
      }

    /**
     * Constructor CompositeFunction creates a new CompositeFunction instance.
     *
     * @param groupingFields of type Fields
     * @param argumentFields of type Fields[]
     * @param functors       of type Functor[]
     * @param threshold      of type int
     */
    public CompositeFunction( Fields groupingFields, Fields[] argumentFields, Functor[] functors, int threshold )
      {
      super( getFields( groupingFields, functors ) ); // todo: groupingFields should lookup incoming type information
      this.groupingFields = groupingFields;
      this.argumentFields = argumentFields;
      this.functors = functors;
      this.threshold = threshold;

      this.functorFields = new Fields[ functors.length ];

      for( int i = 0; i < functors.length; i++ )
        this.functorFields[ i ] = functors[ i ].getDeclaredFields();

      Comparator[] hashers = TupleHasher.merge( functorFields );
      if( !TupleHasher.isNull( hashers ) )
        this.tupleHasher = new TupleHasher( null, hashers );
      else
        this.tupleHasher = null;
      }

    private static Fields getFields( Fields groupingFields, Functor[] functors )
      {
      Fields fields = groupingFields;

      for( Functor functor : functors )
        fields = fields.append( functor.getDeclaredFields() );

      return fields;
      }

    @Override
    public void prepare( final FlowProcess flowProcess, final OperationCall<CompositeFunction.Context> operationCall )
      {
      if( threshold == 0 )
        {
        Integer value = flowProcess.getIntegerProperty( AGGREGATE_BY_THRESHOLD );

        if( value != null && value > 0 )
          threshold = value;
        else
          threshold = DEFAULT_THRESHOLD;
        }

      LOG.info( "using threshold value: {}", threshold );

      Fields[] fields = new Fields[ functors.length + 1 ];

      fields[ 0 ] = groupingFields;

      for( int i = 0; i < functors.length; i++ )
        fields[ i + 1 ] = functors[ i ].getDeclaredFields();

      final Context context = new Context();

      context.arguments = new TupleEntry[ functors.length ];

      for( int i = 0; i < context.arguments.length; i++ )
        {
        Fields resolvedArgumentFields = operationCall.getArgumentFields();

        int[] pos;

        if( argumentFields[ i ].isAll() )
          pos = resolvedArgumentFields.getPos();
        else
          pos = resolvedArgumentFields.getPos( argumentFields[ i ] ); // returns null if selector is ALL

        Tuple narrow = TupleViews.createNarrow( pos );

        Fields currentFields;

        if( this.argumentFields[ i ].isSubstitution() )
          currentFields = resolvedArgumentFields.select( this.argumentFields[ i ] ); // attempt to retain comparator
        else
          currentFields = Fields.asDeclaration( this.argumentFields[ i ] );

        context.arguments[ i ] = new TupleEntry( currentFields, narrow );
        }

      context.result = TupleViews.createComposite( fields );

      context.lru = new LinkedHashMap<Tuple, Tuple[]>( threshold, 0.75f, true )
      {
      long flushes = 0;

      @Override
      protected boolean removeEldestEntry( Map.Entry<Tuple, Tuple[]> eldest )
        {
        boolean doRemove = size() > threshold;

        if( doRemove )
          {
          completeFunctors( flowProcess, ( (FunctionCall) operationCall ).getOutputCollector(), context.result, eldest );
          flowProcess.increment( Cache.Num_Keys_Flushed, 1 );
          flowProcess.increment( Flush.Num_Keys_Flushed, 1 );

          if( flushes % threshold == 0 ) // every multiple, write out data
            {
            Runtime runtime = Runtime.getRuntime();
            long freeMem = runtime.freeMemory() / 1024 / 1024;
            long maxMem = runtime.maxMemory() / 1024 / 1024;
            long totalMem = runtime.totalMemory() / 1024 / 1024;

            LOG.info( "flushed keys num times: {}, with threshold: {}", flushes + 1, threshold );
            LOG.info( "mem on flush (mb), free: " + freeMem + ", total: " + totalMem + ", max: " + maxMem );

            float percent = (float) totalMem / (float) maxMem;

            if( percent < 0.80F )
              LOG.info( "total mem is {}% of max mem, to better utilize unused memory consider increasing current LRU threshold with system property \"{}\"", (int) ( percent * 100.0F ), AGGREGATE_BY_THRESHOLD );
            }

          flushes++;
          }

        return doRemove;
        }
      };

      operationCall.setContext( context );
      }

    @Override
    public void operate( FlowProcess flowProcess, FunctionCall<CompositeFunction.Context> functionCall )
      {
      TupleEntry arguments = functionCall.getArguments();
      Tuple key = TupleHasher.wrapTuple( this.tupleHasher, arguments.selectTupleCopy( groupingFields ) );

      Context context = functionCall.getContext();
      Tuple[] functorContext = context.lru.get( key );

      if( functorContext == null )
        {
        functorContext = new Tuple[ functors.length ];
        context.lru.put( key, functorContext );
        flowProcess.increment( Cache.Num_Keys_Missed, 1 );
        }
      else
        {
        flowProcess.increment( Cache.Num_Keys_Hit, 1 );
        }

      for( int i = 0; i < functors.length; i++ )
        {
        TupleViews.reset( context.arguments[ i ].getTuple(), arguments.getTuple() );
        functorContext[ i ] = functors[ i ].aggregate( flowProcess, context.arguments[ i ], functorContext[ i ] );
        }
      }

    @Override
    public void flush( FlowProcess flowProcess, OperationCall<CompositeFunction.Context> operationCall )
      {
      // need to drain context
      TupleEntryCollector collector = ( (FunctionCall) operationCall ).getOutputCollector();

      Tuple result = operationCall.getContext().result;
      LinkedHashMap<Tuple, Tuple[]> context = operationCall.getContext().lru;

      for( Map.Entry<Tuple, Tuple[]> entry : context.entrySet() )
        completeFunctors( flowProcess, collector, result, entry );

      operationCall.setContext( null );
      }

    private void completeFunctors( FlowProcess flowProcess, TupleEntryCollector outputCollector, Tuple result, Map.Entry<Tuple, Tuple[]> entry )
      {
      Tuple[] results = new Tuple[ functors.length + 1 ];

      results[ 0 ] = entry.getKey();

      Tuple[] values = entry.getValue();

      for( int i = 0; i < functors.length; i++ )
        results[ i + 1 ] = functors[ i ].complete( flowProcess, values[ i ] );

      TupleViews.reset( result, results );

      outputCollector.add( result );
      }

    @Override
    public boolean equals( Object object )
      {
      if( this == object )
        return true;
      if( !( object instanceof CompositeFunction ) )
        return false;
      if( !super.equals( object ) )
        return false;

      CompositeFunction that = (CompositeFunction) object;

      if( !Arrays.equals( argumentFields, that.argumentFields ) )
        return false;
      if( !Arrays.equals( functorFields, that.functorFields ) )
        return false;
      if( !Arrays.equals( functors, that.functors ) )
        return false;
      if( groupingFields != null ? !groupingFields.equals( that.groupingFields ) : that.groupingFields != null )
        return false;

      return true;
      }

    @Override
    public int hashCode()
      {
      int result = super.hashCode();
      result = 31 * result + ( groupingFields != null ? groupingFields.hashCode() : 0 );
      result = 31 * result + ( argumentFields != null ? Arrays.hashCode( argumentFields ) : 0 );
      result = 31 * result + ( functorFields != null ? Arrays.hashCode( functorFields ) : 0 );
      result = 31 * result + ( functors != null ? Arrays.hashCode( functors ) : 0 );
      return result;
      }
    }

  /**
   * Constructor AggregateBy creates a new AggregateBy instance.
   *
   * @param name      of type String
   * @param threshold of type int
   */
  protected AggregateBy( String name, int threshold )
    {
    this.name = name;
    this.threshold = threshold;
    }

  /**
   * Constructor AggregateBy creates a new AggregateBy instance.
   *
   * @param argumentFields of type Fields
   * @param functor        of type Functor
   * @param aggregator     of type Aggregator
   */
  protected AggregateBy( Fields argumentFields, Functor functor, Aggregator aggregator )
    {
    this.argumentFields = Fields.fields( argumentFields );
    this.functors = new Functor[]{functor};
    this.aggregators = new Aggregator[]{aggregator};
    }

  /**
   * Constructor AggregateBy creates a new AggregateBy instance.
   *
   * @param pipe           of type Pipe
   * @param groupingFields of type Fields
   * @param assemblies     of type AggregateBy...
   */
  @ConstructorProperties({"pipe", "groupingFields", "assemblies"})
  public AggregateBy( Pipe pipe, Fields groupingFields, AggregateBy... assemblies )
    {
    this( null, Pipe.pipes( pipe ), groupingFields, 0, assemblies );
    }

  /**
   * Constructor AggregateBy creates a new AggregateBy instance.
   *
   * @param pipe           of type Pipe
   * @param groupingFields of type Fields
   * @param threshold      of type int
   * @param assemblies     of type AggregateBy...
   */
  @ConstructorProperties({"pipe", "groupingFields", "threshold", "assemblies"})
  public AggregateBy( Pipe pipe, Fields groupingFields, int threshold, AggregateBy... assemblies )
    {
    this( null, Pipe.pipes( pipe ), groupingFields, threshold, assemblies );
    }

  /**
   * Constructor AggregateBy creates a new AggregateBy instance.
   *
   * @param name           of type String
   * @param pipe           of type Pipe
   * @param groupingFields of type Fields
   * @param threshold      of type int
   * @param assemblies     of type AggregateBy...
   */
  @ConstructorProperties({"name", "pipe", "groupingFields", "threshold", "assemblies"})
  public AggregateBy( String name, Pipe pipe, Fields groupingFields, int threshold, AggregateBy... assemblies )
    {
    this( name, Pipe.pipes( pipe ), groupingFields, threshold, assemblies );
    }

  /**
   * Constructor AggregateBy creates a new AggregateBy instance.
   *
   * @param name           of type String
   * @param pipes          of type Pipe[]
   * @param groupingFields of type Fields
   * @param assemblies     of type AggregateBy...
   */
  @ConstructorProperties({"name", "pipes", "groupingFields", "assemblies"})
  public AggregateBy( String name, Pipe[] pipes, Fields groupingFields, AggregateBy... assemblies )
    {
    this( name, pipes, groupingFields, 0, assemblies );
    }

  /**
   * Constructor AggregateBy creates a new AggregateBy instance.
   *
   * @param name           of type String
   * @param pipes          of type Pipe[]
   * @param groupingFields of type Fields
   * @param threshold      of type int
   * @param assemblies     of type AggregateBy...
   */
  @ConstructorProperties({"name", "pipes", "groupingFields", "threshold", "assemblies"})
  public AggregateBy( String name, Pipe[] pipes, Fields groupingFields, int threshold, AggregateBy... assemblies )
    {
    this( name, threshold );

    List<Fields> arguments = new ArrayList<Fields>();
    List<Functor> functors = new ArrayList<Functor>();
    List<Aggregator> aggregators = new ArrayList<Aggregator>();

    for( int i = 0; i < assemblies.length; i++ )
      {
      AggregateBy assembly = assemblies[ i ];

      Collections.addAll( arguments, assembly.getArgumentFields() );
      Collections.addAll( functors, assembly.getFunctors() );
      Collections.addAll( aggregators, assembly.getAggregators() );
      }

    initialize( groupingFields, pipes, arguments.toArray( new Fields[ arguments.size() ] ), functors.toArray( new Functor[ functors.size() ] ), aggregators.toArray( new Aggregator[ aggregators.size() ] ) );
    }

  protected AggregateBy( String name, Pipe[] pipes, Fields groupingFields, Fields argumentFields, Functor functor, Aggregator aggregator, int threshold )
    {
    this( name, threshold );
    initialize( groupingFields, pipes, argumentFields, functor, aggregator );
    }

  protected void initialize( Fields groupingFields, Pipe[] pipes, Fields argumentFields, Functor functor, Aggregator aggregator )
    {
    initialize( groupingFields, pipes, Fields.fields( argumentFields ),
      new Functor[]{functor},
      new Aggregator[]{aggregator} );
    }

  protected void initialize( Fields groupingFields, Pipe[] pipes, Fields[] argumentFields, Functor[] functors, Aggregator[] aggregators )
    {
    setPrevious( pipes );

    this.groupingFields = groupingFields;
    this.argumentFields = argumentFields;
    this.functors = functors;
    this.aggregators = aggregators;

    verify();

    Fields sortFields = Fields.copyComparators( Fields.merge( this.argumentFields ), this.argumentFields );
    Fields argumentSelector = Fields.merge( this.groupingFields, sortFields );

    if( argumentSelector.equals( Fields.NONE ) )
      argumentSelector = Fields.ALL;

    Pipe[] functions = new Pipe[ pipes.length ];

    CompositeFunction function = new CompositeFunction( this.groupingFields, this.argumentFields, this.functors, threshold );

    for( int i = 0; i < functions.length; i++ )
      functions[ i ] = new Each( pipes[ i ], argumentSelector, function, Fields.RESULTS );

    groupBy = new GroupBy( name, functions, this.groupingFields, sortFields.hasComparators() ? sortFields : null );

    Pipe pipe = groupBy;

    for( int i = 0; i < aggregators.length; i++ )
      pipe = new Every( pipe, this.functors[ i ].getDeclaredFields(), this.aggregators[ i ], Fields.ALL );

    setTails( pipe );
    }

  /** Method verify should be overridden by sub-classes if any values must be tested before the calling constructor returns. */
  protected void verify()
    {

    }

  /**
   * Method getGroupingFields returns the Fields this instance will be grouping against.
   *
   * @return the current grouping fields
   */
  public Fields getGroupingFields()
    {
    return groupingFields;
    }

  /**
   * Method getFieldDeclarations returns an array of Fields where each Field element in the array corresponds to the
   * field declaration of the given Aggregator operations.
   * <p/>
   * Note the actual Fields values are returned, not planner resolved Fields.
   *
   * @return an array of Fields
   */
  public Fields[] getFieldDeclarations()
    {
    Fields[] fields = new Fields[ this.aggregators.length ];

    for( int i = 0; i < aggregators.length; i++ )
      fields[ i ] = aggregators[ i ].getFieldDeclaration();

    return fields;
    }

  protected Fields[] getArgumentFields()
    {
    return argumentFields;
    }

  protected Functor[] getFunctors()
    {
    return functors;
    }

  protected Aggregator[] getAggregators()
    {
    return aggregators;
    }

  /**
   * Method getGroupBy returns the internal {@link GroupBy} instance so that any custom properties
   * can be set on it via {@link cascading.pipe.Pipe#getStepConfigDef()}.
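   * <p/>
   * For example, a sketch overriding a step property on the underlying GroupBy (the property name and value are
   * illustrative):
   * <pre>{@code
   * AggregateBy aggregate = new AggregateBy( assembly, new Fields( "customerId" ), new CountBy( new Fields( "count" ) ) );
   *
   * aggregate.getGroupBy().getStepConfigDef().setProperty( ConfigDef.Mode.REPLACE, "mapred.reduce.tasks", "20" );
   * }</pre>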
   *
   * @return GroupBy type
   */
  public GroupBy getGroupBy()
    {
    return groupBy;
    }

  @Property(name = "threshold", visibility = Visibility.PUBLIC)
  @PropertyDescription("Threshold of the aggregation.")
  @PropertyConfigured(value = AGGREGATE_BY_THRESHOLD, defaultValue = "10000")
  public int getThreshold()
    {
    return threshold;
    }
  }