Examples of MRPipeline


Examples of org.apache.crunch.impl.mr.MRPipeline

                                  new MapsideJoinStrategy<Integer, String, String>(false));
  }

  @Test
  public void testMapsideJoin_RightOuterJoin_Materialized() throws IOException {
    runMapsideRightOuterJoin(new MRPipeline(MapsideJoinStrategyIT.class, tmpDir.getDefaultConfiguration()),
                             false, true, MapsideJoinStrategy.<Integer, String, String>create(true));
  }
View Full Code Here

Examples of org.apache.crunch.impl.mr.MRPipeline

                             false, true, MapsideJoinStrategy.<Integer, String, String>create(true));
  }

  @Test
  public void testLegacyMapsideJoin_LeftOuterJoin_Materialized() throws IOException {
    runLegacyMapsideLeftOuterJoin(new MRPipeline(MapsideJoinStrategyIT.class, tmpDir.getDefaultConfiguration()),
                                  false, true,
                                  new MapsideJoinStrategy<Integer, String, String>(true));
  }
View Full Code Here
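The two snippets above build the same strategy in two ways: the legacy public constructor and the static create() factory. A minimal sketch contrasting them; the reading of the boolean flag (whether the in-memory join side is materialized before loading) is an assumption, not stated in the snippets:

    // Legacy construction, exercised by the testLegacyMapsideJoin_* tests:
    JoinStrategy<Integer, String, String> legacy =
        new MapsideJoinStrategy<Integer, String, String>(true);

    // Static factory, exercised by the non-legacy tests:
    JoinStrategy<Integer, String, String> viaFactory =
        MapsideJoinStrategy.<Integer, String, String>create(true);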

Examples of org.apache.crunch.impl.mr.MRPipeline

    out1.flush();
    out1.close();
    out2.flush();
    out2.close();

    final MRPipeline pipeline = new MRPipeline(MapsideJoinStrategyIT.class, tmpDir.getDefaultConfiguration());

    final PCollection<String> values1 = pipeline.readTextFile(path1.toString());
    final PCollection<String> values2 = pipeline.readTextFile(path2.toString());

    final PTable<Text, Text> convertedValues1 = convertStringToText(values1);
    final PTable<Text, Text> convertedValues2 = convertStringToText(values2);

    // Strategy for the map-side join
    final MapsideJoinStrategy<Text, Text, Text> mapSideJoinStrategy = MapsideJoinStrategy.<Text, Text, Text>create();

    final PTable<Text, Pair<Text, Text>> updatedJoinedRows = mapSideJoinStrategy.join(convertedValues1, convertedValues2, JoinType.INNER_JOIN);
    pipeline.run();

    // The materialized join result should contain 4 entries,
    // drawn from both contentBytes1 and contentBytes2
    assertEquals(4, updatedJoinedRows.materializeToMap().size());
  }
View Full Code Here
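The convertStringToText helper used in this snippet is not shown. A minimal sketch, assuming each input line carries a tab-separated key and value (the delimiter is an assumption):

    private static PTable<Text, Text> convertStringToText(PCollection<String> values) {
      return values.parallelDo(new MapFn<String, Pair<Text, Text>>() {
        @Override
        public Pair<Text, Text> map(String input) {
          // Assumed line format: key<TAB>value
          String[] parts = input.split("\t", 2);
          return Pair.of(new Text(parts[0]), new Text(parts[1]));
        }
      }, Writables.tableOf(Writables.writables(Text.class), Writables.writables(Text.class)));
    }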

Examples of org.apache.crunch.impl.mr.MRPipeline

    fs.delete(TEMP_DIR, true);
  }

  @Test
  public void testHFileTarget() throws Exception {
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, HBASE_TEST_UTILITY.getConfiguration());
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath = getTempPathOnHDFS("out");

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PTable<String, Long> wordCounts = words.count();
    pipeline.write(convertToKeyValues(wordCounts), ToHBase.hfile(outputPath));

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    FileSystem fs = FileSystem.get(HBASE_TEST_UTILITY.getConfiguration());
    KeyValue kv = readFromHFiles(fs, outputPath, "and");
    assertEquals(427L, Bytes.toLong(kv.getValue()));
View Full Code Here
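The split helper is referenced but not defined in the snippet. A minimal sketch that tokenizes each line on the supplied regular expression (the empty-token guard is an assumption):

    private static PCollection<String> split(PCollection<String> in, final String regex) {
      return in.parallelDo(new DoFn<String, String>() {
        @Override
        public void process(String input, Emitter<String> emitter) {
          for (String word : input.split(regex)) {
            if (!word.isEmpty()) {
              emitter.emit(word); // one output element per token
            }
          }
        }
      }, Writables.strings());
    }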

Examples of org.apache.crunch.impl.mr.MRPipeline

    assertEquals(427L, Bytes.toLong(kv.getValue()));
  }

  @Test
  public void testBulkLoad() throws Exception {
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, HBASE_TEST_UTILITY.getConfiguration());
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath = getTempPathOnHDFS("out");
    HTable testTable = createTable(26);

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PTable<String, Long> wordCounts = words.count();
    PCollection<Put> wordCountPuts = convertToPuts(wordCounts);
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        wordCountPuts,
        testTable,
        outputPath);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    new LoadIncrementalHFiles(HBASE_TEST_UTILITY.getConfiguration())
        .doBulkLoad(outputPath, testTable);
View Full Code Here
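convertToPuts is likewise defined outside the snippet. A minimal sketch emitting one Put per word, with the count stored as a long; TEST_QUALIFIER and the exact cell layout are assumptions (TEST_FAMILY does appear elsewhere in this test class):

    private static PCollection<Put> convertToPuts(PTable<String, Long> wordCounts) {
      return wordCounts.parallelDo(new MapFn<Pair<String, Long>, Put>() {
        @Override
        public Put map(Pair<String, Long> input) {
          Put put = new Put(Bytes.toBytes(input.first()));
          // TEST_QUALIFIER is a hypothetical column qualifier
          put.add(TEST_FAMILY, TEST_QUALIFIER, Bytes.toBytes(input.second()));
          return put;
        }
      }, HBaseTypes.puts());
    }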

Examples of org.apache.crunch.impl.mr.MRPipeline

  }

  /** See CRUNCH-251 */
  @Test
  public void testMultipleHFileTargets() throws Exception {
    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, HBASE_TEST_UTILITY.getConfiguration());
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath1 = getTempPathOnHDFS("out1");
    Path outputPath2 = getTempPathOnHDFS("out2");
    HTable table1 = createTable(26);
    HTable table2 = createTable(26);
    LoadIncrementalHFiles loader = new LoadIncrementalHFiles(HBASE_TEST_UTILITY.getConfiguration());

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PCollection<String> shortWords = words.filter(SHORT_WORD_FILTER);
    PCollection<String> longWords = words.filter(FilterFns.not(SHORT_WORD_FILTER));
    PTable<String, Long> shortWordCounts = shortWords.count();
    PTable<String, Long> longWordCounts = longWords.count();
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        convertToPuts(shortWordCounts),
        table1,
        outputPath1);
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        convertToPuts(longWordCounts),
        table2,
        outputPath2);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    loader.doBulkLoad(outputPath1, table1);
    loader.doBulkLoad(outputPath2, table2);

View Full Code Here
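SHORT_WORD_FILTER is declared outside the snippet. A minimal sketch, assuming a word counts as "short" at four characters or fewer (the cutoff is an assumption):

    private static final FilterFn<String> SHORT_WORD_FILTER = new FilterFn<String>() {
      @Override
      public boolean accept(String input) {
        return input.length() <= 4; // assumed cutoff for a "short" word
      }
    };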

Examples of org.apache.crunch.impl.mr.MRPipeline

  @Test
  public void testHFileUsesFamilyConfig() throws Exception {
    DataBlockEncoding newBlockEncoding = DataBlockEncoding.PREFIX;
    assertNotSame(newBlockEncoding, DataBlockEncoding.valueOf(HColumnDescriptor.DEFAULT_DATA_BLOCK_ENCODING));

    Pipeline pipeline = new MRPipeline(HFileTargetIT.class, HBASE_TEST_UTILITY.getConfiguration());
    Path inputPath = copyResourceFileToHDFS("shakes.txt");
    Path outputPath = getTempPathOnHDFS("out");
    HColumnDescriptor hcol = new HColumnDescriptor(TEST_FAMILY);
    hcol.setDataBlockEncoding(newBlockEncoding);
    HTable testTable = createTable(26, hcol);

    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, Writables.strings()));
    PCollection<String> words = split(shakespeare, "\\s+");
    PTable<String, Long> wordCounts = words.count();
    PCollection<Put> wordCountPuts = convertToPuts(wordCounts);
    HFileUtils.writePutsToHFilesForIncrementalLoad(
        wordCountPuts,
        testTable,
        outputPath);

    PipelineResult result = pipeline.run();
    assertTrue(result.succeeded());

    int hfilesCount = 0;
    Configuration conf = HBASE_TEST_UTILITY.getConfiguration();
    FileSystem fs = outputPath.getFileSystem(conf);
View Full Code Here

Examples of org.apache.crunch.impl.mr.MRPipeline

    createTable(configuration, TABLE_TARGET, Bytes.toString(COLUMN_FAMILY_TARGET));

    putInHbase(putList, configuration);

    // We create the pipeline which will handle most of the job.
    Pipeline pipeline = new MRPipeline(WordAggregationHBase.class, HBaseConfiguration.create());

    // The scan that will retrieve the data from the source table in HBase.
    Scan scan = new Scan();
    scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_PLAY);
    scan.addColumn(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_QUOTE);

    // Our HBase source
    HBaseSourceTarget source = new HBaseSourceTarget(TABLE_SOURCE, scan);

    // Our source, in a format which can be used by Crunch
    PTable<ImmutableBytesWritable, Result> rawText = pipeline.read(source);

    // We process the data from the source HTable, then concatenate all data
    // with the same row key
    PTable<String, String> textExtracted = extractText(rawText);
    PTable<String, String> result = textExtracted.groupByKey()
        .combineValues(Aggregators.STRING_CONCAT(" ", true));

    // We create the collection of puts from the concatenated data
    PCollection<Put> resultPut = createPut(result);

    // We write the puts to HBase, into the target table
    pipeline.write(resultPut, new HBaseTarget(TABLE_TARGET));

    pipeline.done();
    return 0;
  }
View Full Code Here
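The extractText helper is not included above. A minimal sketch that reads the play and quote cells from each Result and keys the output for the later groupByKey; keying by play name is an assumption:

    private static PTable<String, String> extractText(PTable<ImmutableBytesWritable, Result> raw) {
      return raw.parallelDo(new MapFn<Pair<ImmutableBytesWritable, Result>, Pair<String, String>>() {
        @Override
        public Pair<String, String> map(Pair<ImmutableBytesWritable, Result> input) {
          Result row = input.second();
          String play = Bytes.toString(row.getValue(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_PLAY));
          String quote = Bytes.toString(row.getValue(COLUMN_FAMILY_SOURCE, COLUMN_QUALIFIER_SOURCE_QUOTE));
          return Pair.of(play, quote);
        }
      }, Writables.tableOf(Writables.strings(), Writables.strings()));
    }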

Examples of org.apache.crunch.impl.mr.MRPipeline

 
  private <T> void testSourceTarget(PType<T> ptype, T expected) {
    Path inputPath = new Path(tempPath, "input.orc");
    Path outputPath = new Path(tempPath, "output");
   
    Pipeline pipeline = new MRPipeline(OrcFileSourceTargetIT.class, conf);
    OrcFileSource<T> source = new OrcFileSource<T>(inputPath, ptype);
    PCollection<T> rows = pipeline.read(source);
    List<T> result = Lists.newArrayList(rows.materialize());
   
    assertEquals(Lists.newArrayList(expected), result);
   
    OrcFileTarget target = new OrcFileTarget(outputPath);
    pipeline.write(rows, target);
   
    assertTrue(pipeline.done().succeeded());
   
    OrcFileReaderFactory<T> reader = new OrcFileReaderFactory<T>(ptype);
    List<T> newResult = Lists.newArrayList(reader.read(fs, inputPath));
   
    assertEquals(Lists.newArrayList(expected), newResult);
View Full Code Here
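A hypothetical invocation of the generic helper above, round-tripping a single reflected record (the Person POJO is sketched after the next snippet):

    testSourceTarget(Orcs.reflects(Person.class), new Person("Alice", 23, null));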

Examples of org.apache.crunch.impl.mr.MRPipeline

 
  @Test
  public void testColumnPruning() throws IOException {
    generateInputData();
   
    Pipeline pipeline = new MRPipeline(OrcFileSourceTargetIT.class, conf);
    int[] readColumns = {0, 1};
    OrcFileSource<Person> source = new OrcFileSource<Person>(new Path(tempPath, "input.orc"),
        Orcs.reflects(Person.class), readColumns);
    PCollection<Person> rows = pipeline.read(source);
    List<Person> result = Lists.newArrayList(rows.materialize());
   
    Person expected = new Person("Alice", 23, null);
    assertEquals(Lists.newArrayList(expected), result);
  }
View Full Code Here
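Both ORC snippets depend on a Person value class that is not shown. A minimal sketch of a reflection-friendly POJO matching the three-argument construction above; the field names are assumptions:

    public static class Person {
      String name;
      int age;
      List<String> addresses; // comes back null when pruned away, as asserted above

      Person() { } // no-arg constructor, needed for reflection-based serialization

      Person(String name, int age, List<String> addresses) {
        this.name = name;
        this.age = age;
        this.addresses = addresses;
      }

      // Value equality over all fields, so assertEquals can compare records
      @Override
      public boolean equals(Object o) {
        if (!(o instanceof Person)) {
          return false;
        }
        Person p = (Person) o;
        return Objects.equals(name, p.name) && age == p.age
            && Objects.equals(addresses, p.addresses);
      }

      @Override
      public int hashCode() {
        return Objects.hash(name, age, addresses);
      }
    }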