Package org.apache.crunch.types

Examples of org.apache.crunch.types.PTypeFamily


   */
  public static <K, U, V> PTable<K, Pair<Collection<U>, Collection<V>>> cogroup(
      int numReducers,
      PTable<K, U> left,
      PTable<K, V> right) {
    PTypeFamily tf = left.getTypeFamily();
    return cogroup(
        tf.pairs(tf.collections(left.getValueType()),
                 tf.collections(right.getValueType())),
        TupleFactory.PAIR,
        numReducers,
        left, right);
  }
View Full Code Here


   */
  public static <K, U, V> PTable<K, TupleN> cogroup(
      int numReducers,
      PTable<K, ?> first,
      PTable<K, ?>... rest) {
    PTypeFamily tf = first.getTypeFamily();
    PType[] components = new PType[1 + rest.length];
    components[0] = tf.collections(first.getValueType());
    for (int i = 0; i < rest.length; i++) {
      components[i + 1] = rest[i].getValueType();
    }
    return cogroup(
        tf.tuples(components),
        TupleFactory.TUPLEN,
        numReducers,
        first, rest);
  }
View Full Code Here

  private static <K, T extends Tuple> PTable<K, T> cogroup(
      PType<T> outputType,
      TupleFactory tupleFactory,
      int numReducers,
      PTable<K, ?> first, PTable<K, ?>... rest) {
    PTypeFamily ptf = first.getTypeFamily();
    PType[] ptypes = new PType[1 + rest.length];
    ptypes[0] = first.getValueType();
    for (int i = 0; i < rest.length; i++) {
      ptypes[i + 1] = rest[i].getValueType();
    }
    PType<TupleN> itype = ptf.tuples(ptypes);
   
    PTable<K, TupleN> firstInter = first.mapValues("coGroupTag1",
        new CogroupFn(0, 1 + rest.length),
        itype);
    PTable<K, TupleN>[] inter = new PTable[rest.length];
View Full Code Here

   */
  public static <T> PCollection<T> reservoirSample(
      PCollection<T> input,
      int sampleSize,
      Long seed) {
    PTypeFamily ptf = input.getTypeFamily();
    PType<Pair<T, Integer>> ptype = ptf.pairs(input.getPType(), ptf.ints());
    return weightedReservoirSample(
        input.parallelDo("Map to pairs for reservoir sampling", new MapFn<T, Pair<T, Integer>>() {
          @Override
          public Pair<T, Integer> map(T t) { return Pair.of(t, 1); }
        }, ptype),
View Full Code Here

   */
  public static <T, N extends Number> PCollection<T> weightedReservoirSample(
      PCollection<Pair<T, N>> input,
      int sampleSize,
      Long seed) {
    PTypeFamily ptf = input.getTypeFamily();
    PTable<Integer, Pair<T, N>> groupedIn = input.parallelDo(
        new MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>>() {
          @Override
          public Pair<Integer, Pair<T, N>> map(Pair<T, N> p) {
            return Pair.of(0, p);
          }
        }, ptf.tableOf(ptf.ints(), input.getPType()));
    int[] ss = { sampleSize };
    return groupedWeightedReservoirSample(groupedIn, ss, seed)
        .parallelDo("Extract sampled value from pair", new MapFn<Pair<Integer, T>, T>() {
          @Override
          public T map(Pair<Integer, T> p) {
View Full Code Here

   */
  public static <T, N extends Number> PCollection<Pair<Integer, T>> groupedWeightedReservoirSample(
      PTable<Integer, Pair<T, N>> input,
      int[] sampleSizes,
      Long seed) {
    PTypeFamily ptf = input.getTypeFamily();
    PType<T> ttype = (PType<T>) input.getPTableType().getValueType().getSubTypes().get(0);
    PTableType<Integer, Pair<Double, T>> ptt = ptf.tableOf(ptf.ints(),
        ptf.pairs(ptf.doubles(), ttype));
   
    return input.parallelDo("Initial reservoir sampling", new ReservoirSampleFn<T, N>(sampleSizes, seed, ttype), ptt)
        .groupByKey(1)
        .combineValues(new WRSCombineFn<T>(sampleSizes, ttype))
        .parallelDo("Extract sampled values", new MapFn<Pair<Integer, Pair<Double, T>>, Pair<Integer, T>>() {
          @Override
          public Pair<Integer, T> map(Pair<Integer, Pair<Double, T>> p) {
            return Pair.of(p.first(), p.second().second());
          }
        }, ptf.pairs(ptf.ints(), ttype));
  }
View Full Code Here

   * @param keyType The {@code PType} for the key of the SequenceFile entry
   * @param valueType The {@code PType} for the value of the SequenceFile entry
   * @return A new {@code TableSource<K, V>} instance
   */
  public static <K, V> TableSource<K, V> sequenceFile(Path path, PType<K> keyType, PType<V> valueType) {
    PTypeFamily ptf = keyType.getFamily();
    return new SeqFileTableSource<K, V>(path, ptf.tableOf(keyType, valueType));
  }
View Full Code Here

   * @param keyType The {@code PType} for the key of the SequenceFile entry
   * @param valueType The {@code PType} for the value of the SequenceFile entry
   * @return A new {@code TableSource<K, V>} instance
   */
  public static <K, V> TableSource<K, V> sequenceFile(List<Path> paths, PType<K> keyType, PType<V> valueType) {
    PTypeFamily ptf = keyType.getFamily();
    return new SeqFileTableSource<K, V>(paths, ptf.tableOf(keyType, valueType));
  }
View Full Code Here

        .parallelDo("SecondarySort.apply", new SSWrapFn<K, V1, V2, Pair<U, V>>(doFn), ptype);
  }
 
  private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare(
      PTable<K, Pair<V1, V2>> input, int numReducers) {
    PTypeFamily ptf = input.getTypeFamily();
    PType<Pair<V1, V2>> valueType = input.getValueType();
    PTableType<Pair<K, V1>, Pair<V1, V2>> inter = ptf.tableOf(
        ptf.pairs(input.getKeyType(), valueType.getSubTypes().get(0)),
        valueType);
    GroupingOptions.Builder gob = GroupingOptions.builder()
        .requireSortedKeys()
        .groupingComparatorClass(JoinUtils.getGroupingComparator(ptf))
        .partitionerClass(JoinUtils.getPartitionerClass(ptf));
View Full Code Here

  /**
   * Returns a {@code PTable} that contains the unique elements of this collection mapped to a count
   * of their occurrences.
   */
  public static <S> PTable<S, Long> count(PCollection<S> collect, int numPartitions) {
    PTypeFamily tf = collect.getTypeFamily();
    return collect.parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
      public Pair<S, Long> map(S input) {
        return Pair.of(input, 1L);
      }
    }, tf.tableOf(collect.getPType(), tf.longs()))
        .groupByKey(numPartitions)
        .combineValues(Aggregators.SUM_LONGS());
  }
View Full Code Here

TOP

Related Classes of org.apache.crunch.types.PTypeFamily

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.