Package org.apache.crunch.types

Examples of org.apache.crunch.types.PTypeFamily


        }));
    return new FirstElementPObject<S>(minCollect);
  }

  public static <K, V> PTable<K, Collection<V>> collectValues(PTable<K, V> collect) {
    PTypeFamily tf = collect.getTypeFamily();
    final PType<V> valueType = collect.getValueType();
    return collect.groupByKey().mapValues("collect",
        new MapFn<Iterable<V>, Collection<V>>() {
          @Override
          public void initialize() {
            valueType.initialize(getConfiguration());
          }

          public Collection<V> map(Iterable<V> values) {
            List<V> collected = Lists.newArrayList();
            for (V value : values) {
              collected.add(valueType.getDetachedValue(value));
            }
            return collected;
          }
        }, tf.collections(collect.getValueType()));
  }
View Full Code Here


          }
        }, tf.collections(collect.getValueType()));
  }
 
  public static <S> PCollection<S> aggregate(PCollection<S> collect, Aggregator<S> aggregator) {
    PTypeFamily tf = collect.getTypeFamily();
    return collect.parallelDo("Aggregate.aggregator", new MapFn<S, Pair<Void, S>>() {
      public Pair<Void, S> map(S input) {
        return Pair.of(null, input);
      }
    }, tf.tableOf(tf.nulls(), collect.getPType()))
    .groupByKey(1)
    .combineValues(aggregator)
    .values();
  }
View Full Code Here

    File transformedOutput = tmpDir.getFile("transformed-output");
    File tfOutput = tmpDir.getFile("tf-output");

    PCollection<String> docs = pipeline.readTextFile(input);

    PTypeFamily ptf = docs.getTypeFamily();

    /*
     * Input: String Input title text
     *
     * Output: PTable<Pair<String, String>, Long> Pair<Pair<word, title>, count
     * in title>
     */
    PTable<Pair<String, String>, Long> tf = Aggregate.count(docs.parallelDo("term document frequency",
        new DoFn<String, Pair<String, String>>() {
          @Override
          public void process(String doc, Emitter<Pair<String, String>> emitter) {
            String[] kv = doc.split("\t");
            String title = kv[0];
            String text = kv[1];
            for (String word : text.split("\\W+")) {
              if (!word.isEmpty()) {
                Pair<String, String> pair = Pair.of(word.toLowerCase(Locale.ENGLISH), title);
                emitter.emit(pair);
              }
            }
          }
        }, ptf.pairs(ptf.strings(), ptf.strings())));

    if (transformTF) {
      /*
       * Input: Pair<Pair<String, String>, Long> Pair<Pair<word, title>, count
       * in title>
       *
       * Output: PTable<String, Pair<String, Long>> PTable<word, Pair<title,
       * count in title>>
       */
      PTable<String, Pair<String, Long>> wordDocumentCountPair = tf.parallelDo("transform wordDocumentPairCount",
          new MapFn<Pair<Pair<String, String>, Long>, Pair<String, Pair<String, Long>>>() {
            @Override
            public Pair<String, Pair<String, Long>> map(Pair<Pair<String, String>, Long> input) {
              Pair<String, String> wordDocumentPair = input.first();
              return Pair.of(wordDocumentPair.first(), Pair.of(wordDocumentPair.second(), input.second()));
            }
          }, ptf.tableOf(ptf.strings(), ptf.pairs(ptf.strings(), ptf.longs())));

      pipeline.writeTextFile(wordDocumentCountPair, transformedOutput.getAbsolutePath());
    }

    SourceTarget<String> st = At.textFile(tfOutput.getAbsolutePath());
View Full Code Here

  // Temporary path rule; the tests use it to copy input resources and to obtain a configuration.
  @Rule
  public TemporaryPath tmpDir = TemporaryPaths.create();

  @Test
  public void testAvroReflect() throws Exception {
    PTypeFamily tf = AvroTypeFamily.getInstance();
    PType<PageRankData> prType = Avros.reflects(PageRankData.class);
    String urlInput = tmpDir.copyResourceFileName("urls.txt");
    run(new MRPipeline(PageRankIT.class, tmpDir.getDefaultConfiguration()),
        urlInput, prType, tf);
  }
View Full Code Here

        urlInput, prType, tf);
  }

  @Test
  public void testAvroMReflectInMemory() throws Exception {
    PTypeFamily tf = AvroTypeFamily.getInstance();
    PType<PageRankData> prType = Avros.reflects(PageRankData.class);
    String urlInput = tmpDir.copyResourceFileName("urls.txt");
    run(MemPipeline.getInstance(), urlInput, prType, tf);
  }
View Full Code Here

    run(MemPipeline.getInstance(), urlInput, prType, tf);
  }

  @Test
  public void testAvroJSON() throws Exception {
    PTypeFamily tf = AvroTypeFamily.getInstance();
    PType<PageRankData> prType = PTypes.jsonString(PageRankData.class, tf);
    String urlInput = tmpDir.copyResourceFileName("urls.txt");
    run(new MRPipeline(PageRankIT.class, tmpDir.getDefaultConfiguration()),
        urlInput, prType, tf);
  }
View Full Code Here

        urlInput, prType, tf);
  }

  @Test
  public void testWritablesJSON() throws Exception {
    PTypeFamily tf = WritableTypeFamily.getInstance();
    PType<PageRankData> prType = PTypes.jsonString(PageRankData.class, tf);
    String urlInput = tmpDir.copyResourceFileName("urls.txt");
    run(new MRPipeline(PageRankIT.class, tmpDir.getDefaultConfiguration()),
        urlInput, prType, tf);
  }
View Full Code Here

    run(new MRPipeline(PageRankIT.class, tmpDir.getDefaultConfiguration()),
        urlInput, prType, tf);
  }

  public static PTable<String, PageRankData> pageRank(PTable<String, PageRankData> input, final float d) {
    PTypeFamily ptf = input.getTypeFamily();
    PTable<String, Float> outbound = input.parallelDo(new DoFn<Pair<String, PageRankData>, Pair<String, Float>>() {
      @Override
      public void process(Pair<String, PageRankData> input, Emitter<Pair<String, Float>> emitter) {
        PageRankData prd = input.second();
        for (String link : prd.urls) {
          emitter.emit(Pair.of(link, prd.propagatedScore()));
        }
      }
    }, ptf.tableOf(ptf.strings(), ptf.floats()));

    return input.cogroup(outbound).mapValues(
        new MapFn<Pair<Collection<PageRankData>, Collection<Float>>, PageRankData>() {
          @Override
          public PageRankData map(Pair<Collection<PageRankData>, Collection<Float>> input) {
View Full Code Here

  public static class Collect<V1, V2, V3> extends Tuple3<Collection<V1>, Collection<V2>, Collection<V3>> {

    public static <V1, V2, V3> PType<Tuple3.Collect<V1, V2, V3>> derived(PType<V1> first,
        PType<V2> second, PType<V3> third) {
      PTypeFamily tf = first.getFamily();
      PType<Tuple3<Collection<V1>, Collection<V2>, Collection<V3>>> pt =
          tf.triples(
              tf.collections(first),
              tf.collections(second),
              tf.collections(third));
      Object clazz = Tuple3.Collect.class;
      return tf.derived((Class<Tuple3.Collect<V1, V2, V3>>) clazz,
          new MapFn<Tuple3<Collection<V1>, Collection<V2>, Collection<V3>>, Collect<V1, V2, V3>>() {
        @Override
        public Collect<V1, V2, V3> map(
            Tuple3<Collection<V1>, Collection<V2>, Collection<V3>> in) {
          return new Collect<V1, V2, V3>(in.first(), in.second(), in.third());
View Full Code Here

   * @param keyType The {@code PType} for the key of the SequenceFile entry
   * @param valueType The {@code PType} for the value of the SequenceFile entry
   * @return A new {@code TableSourceTarget<K, V>} instance
   */
  public static <K, V> TableSourceTarget<K, V> sequenceFile(Path path, PType<K> keyType, PType<V> valueType) {
    PTypeFamily ptf = keyType.getFamily();
    return new SeqFileTableSourceTarget<K, V>(path, ptf.tableOf(keyType, valueType));
  }
View Full Code Here

TOP

Related Classes of org.apache.crunch.types.PTypeFamily

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.