Package org.apache.crunch.lib

Source Code of org.apache.crunch.lib.SecondarySort$SSFormatFn

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch.lib;

import java.util.Collection;

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.GroupingOptions;
import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.PGroupedTable;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.lib.join.JoinUtils;
import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
import org.apache.crunch.types.PTypeFamily;
import org.apache.hadoop.conf.Configuration;

/**
* Utilities for performing a secondary sort on a {@code PTable<K, Pair<V1, V2>>} collection.
* <p>
* Secondary sorts are usually performed during sessionization: given a collection
* of events, we want to group them by a key (such as a user ID), then sort the grouped
* records by an auxillary key (such as a timestamp), and then perform some additional
* processing on the sorted records.
*/
public class SecondarySort {
 
  /**
   * Perform a secondary sort on the given {@code PTable} instance and then apply a
   * {@code DoFn} to the resulting sorted data to yield an output {@code PCollection<T>}.
   */
  public static <K, V1, V2, T> PCollection<T> sortAndApply(
      PTable<K, Pair<V1, V2>> input,
      DoFn<Pair<K, Iterable<Pair<V1, V2>>>, T> doFn,
      PType<T> ptype) {
    return sortAndApply(input, doFn, ptype, -1);
  }
 
  /**
   * Perform a secondary sort on the given {@code PTable} instance and then apply a
   * {@code DoFn} to the resulting sorted data to yield an output {@code PCollection<T>}, using
   * the given number of reducers.
   */
  public static <K, V1, V2, T> PCollection<T> sortAndApply(
      PTable<K, Pair<V1, V2>> input,
      DoFn<Pair<K, Iterable<Pair<V1, V2>>>, T> doFn,
      PType<T> ptype,
      int numReducers) {
    return prepare(input, numReducers)
        .parallelDo("SecondarySort.apply", new SSWrapFn<K, V1, V2, T>(doFn), ptype);
  }
  /**
   * Perform a secondary sort on the given {@code PTable} instance and then apply a
   * {@code DoFn} to the resulting sorted data to yield an output {@code PTable<U, V>}.
   */
  public static <K, V1, V2, U, V> PTable<U, V> sortAndApply(
      PTable<K, Pair<V1, V2>> input,
      DoFn<Pair<K, Iterable<Pair<V1, V2>>>, Pair<U, V>> doFn,
      PTableType<U, V> ptype) {
    return sortAndApply(input, doFn, ptype, -1);
  }
 
  /**
   * Perform a secondary sort on the given {@code PTable} instance and then apply a
   * {@code DoFn} to the resulting sorted data to yield an output {@code PTable<U, V>}, using
   * the given number of reducers.
   */
  public static <K, V1, V2, U, V> PTable<U, V> sortAndApply(
      PTable<K, Pair<V1, V2>> input,
      DoFn<Pair<K, Iterable<Pair<V1, V2>>>, Pair<U, V>> doFn,
      PTableType<U, V> ptype,
      int numReducers) {
    return prepare(input, numReducers)
        .parallelDo("SecondarySort.apply", new SSWrapFn<K, V1, V2, Pair<U, V>>(doFn), ptype);
  }
 
  private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare(
      PTable<K, Pair<V1, V2>> input, int numReducers) {
    PTypeFamily ptf = input.getTypeFamily();
    PType<Pair<V1, V2>> valueType = input.getValueType();
    PTableType<Pair<K, V1>, Pair<V1, V2>> inter = ptf.tableOf(
        ptf.pairs(input.getKeyType(), valueType.getSubTypes().get(0)),
        valueType);
    PTableType<K, Collection<Pair<V1, V2>>> out = ptf.tableOf(input.getKeyType(),
        ptf.collections(input.getValueType()));
    GroupingOptions.Builder gob = GroupingOptions.builder()
        .groupingComparatorClass(JoinUtils.getGroupingComparator(ptf))
        .partitionerClass(JoinUtils.getPartitionerClass(ptf));
    if (numReducers > 0) {
      gob.numReducers(numReducers);
    }
    return input.parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), inter)
        .groupByKey(gob.build());
  }
 
  private static class SSFormatFn<K, V1, V2> extends MapFn<Pair<K, Pair<V1, V2>>, Pair<Pair<K, V1>, Pair<V1, V2>>> {
    @Override
    public Pair<Pair<K, V1>, Pair<V1, V2>> map(Pair<K, Pair<V1, V2>> input) {
      return Pair.of(Pair.of(input.first(), input.second().first()), input.second());
    }
  } 

  private static class SSWrapFn<K, V1, V2, T> extends DoFn<Pair<Pair<K, V1>, Iterable<Pair<V1, V2>>>, T> {
    private final DoFn<Pair<K, Iterable<Pair<V1, V2>>>, T> intern;
   
    public SSWrapFn(DoFn<Pair<K, Iterable<Pair<V1, V2>>>, T> intern) {
      this.intern = intern;
    }

    @Override
    public void configure(Configuration conf) {
      intern.configure(conf);
    }

    @Override
    public void initialize() {
      intern.setContext(getContext());
      intern.initialize();
    }
   
    @Override
    public void process(Pair<Pair<K, V1>, Iterable<Pair<V1, V2>>> input, Emitter<T> emitter) {
      intern.process(Pair.of(input.first().first(), input.second()), emitter);
    }
   
    @Override
    public void cleanup(Emitter<T> emitter) {
      intern.cleanup(emitter);
    }
  } 
}
TOP

Related Classes of org.apache.crunch.lib.SecondarySort$SSFormatFn

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.