/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch.io.hbase;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Random;
import java.util.jar.JarEntry;
import java.util.jar.JarOutputStream;
import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.lib.Aggregate;
import org.apache.crunch.test.TemporaryPath;
import org.apache.crunch.test.TemporaryPaths;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import com.google.common.io.ByteStreams;
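
/**
 * Integration test that runs a Crunch word-count pipeline against HBase
 * mini clusters: words are read from one HBase table, counted, and each
 * (word, count) pair is written back to a second table as a {@link Put}.
 */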
public class WordCountHBaseIT {
@Rule
public TemporaryPath tmpDir = TemporaryPaths.create();
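
  // Both the input and the output table use a single column family named "cf".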
private static final byte[] COUNTS_COLFAM = Bytes.toBytes("cf");
private static final byte[] WORD_COLFAM = Bytes.toBytes("cf");
private HBaseTestingUtility hbaseTestUtil = new HBaseTestingUtility();
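
  /**
   * Builds the word-count pipeline: extracts the word stored under
   * WORD_COLFAM from each input Result, counts the occurrences of each
   * word, and converts every (word, count) pair into a Put against
   * COUNTS_COLFAM.
   */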
@SuppressWarnings("serial")
public static PCollection<Put> wordCount(PTable<ImmutableBytesWritable, Result> words) {
PTable<String, Long> counts = Aggregate.count(words.parallelDo(
new DoFn<Pair<ImmutableBytesWritable, Result>, String>() {
@Override
public void process(Pair<ImmutableBytesWritable, Result> row, Emitter<String> emitter) {
byte[] word = row.second().getValue(WORD_COLFAM, null);
if (word != null) {
emitter.emit(Bytes.toString(word));
}
}
}, words.getTypeFamily().strings()));
return counts.parallelDo("convert to put", new DoFn<Pair<String, Long>, Put>() {
@Override
public void process(Pair<String, Long> input, Emitter<Put> emitter) {
Put put = new Put(Bytes.toBytes(input.first()));
put.add(COUNTS_COLFAM, null, Bytes.toBytes(input.second()));
emitter.emit(put);
}
}, Writables.writables(Put.class));
}
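
  /**
   * Starts the ZooKeeper, HBase, and MapReduce mini clusters, working around
   * HBASE-5711 and the extra classpath setup that Hadoop 2 requires.
   */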
@Before
public void setUp() throws Exception {
Configuration conf = hbaseTestUtil.getConfiguration();
conf.set("hadoop.log.dir", tmpDir.getFileName("logs"));
conf.set("hadoop.tmp.dir", tmpDir.getFileName("hadoop-tmp"));
conf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, "/1");
conf.setInt("hbase.master.info.port", -1);
conf.setInt("hbase.regionserver.info.port", -1);
    // Workaround for HBASE-5711: we need to set the config value dfs.datanode.data.dir.perm
// equal to the permissions of the temp dirs on the filesystem. These temp dirs were
// probably created using this process' umask. So we guess the temp dir permissions as
// 0777 & ~umask, and use that to set the config value.
    try {
      Process process = Runtime.getRuntime().exec("/bin/sh -c umask");
      BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream()));
      int rc = process.waitFor();
      if (rc == 0) {
        String umask = br.readLine();
        int umaskBits = Integer.parseInt(umask, 8);
        int permBits = 0777 & ~umaskBits;
        String perms = Integer.toString(permBits, 8);
        conf.set("dfs.datanode.data.dir.perm", perms);
      }
      br.close();
    } catch (Exception e) {
      // Ignore errors; we might not be running on POSIX, or "sh" might not be on the path.
    }
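    // Bring up the mini clusters: ZooKeeper first, then HDFS/HBase, then MapReduce.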
hbaseTestUtil.startMiniZKCluster();
hbaseTestUtil.startMiniCluster();
hbaseTestUtil.startMiniMapReduceCluster(1);
    // For Hadoop 2, we have to do a bit more work: TaskAttemptContext became an
    // interface (it is a class in Hadoop 1), which is how we detect the version,
    // and the dependency jars plus this test's own classes must be shipped to the
    // MR cluster through the DistributedCache.
if (TaskAttemptContext.class.isInterface()) {
conf = hbaseTestUtil.getConfiguration();
FileSystem fs = FileSystem.get(conf);
Path tmpPath = new Path("target", "WordCountHBaseTest-tmpDir");
FileSystem localFS = FileSystem.getLocal(conf);
for (FileStatus jarFile : localFS.listStatus(new Path("target/lib/"))) {
Path target = new Path(tmpPath, jarFile.getPath().getName());
fs.copyFromLocalFile(jarFile.getPath(), target);
DistributedCache.addFileToClassPath(target, conf, fs);
}
      // Package this test's classes into a jar so the MapReduce tasks can load them.
JarOutputStream jos = new JarOutputStream(new FileOutputStream("WordCountHBaseIT.jar"));
File baseDir = new File("target/test-classes");
String prefix = "org/apache/crunch/io/hbase/";
jarUp(jos, baseDir, prefix + "WordCountHBaseIT.class");
jarUp(jos, baseDir, prefix + "WordCountHBaseIT$1.class");
jarUp(jos, baseDir, prefix + "WordCountHBaseIT$2.class");
jos.close();
Path target = new Path(tmpPath, "WordCountHBaseIT.jar");
fs.copyFromLocalFile(true, new Path("WordCountHBaseIT.jar"), target);
DistributedCache.addFileToClassPath(target, conf, fs);
}
}
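
  /** Adds the single class file found under baseDir at classDir to the jar. */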
private void jarUp(JarOutputStream jos, File baseDir, String classDir) throws IOException {
File file = new File(baseDir, classDir);
JarEntry e = new JarEntry(classDir);
e.setTime(file.lastModified());
jos.putNextEntry(e);
    // Close the input stream explicitly; ByteStreams.copy does not close its arguments.
    FileInputStream fis = new FileInputStream(file);
    try {
      ByteStreams.copy(fis, jos);
    } finally {
      fis.close();
    }
jos.closeEntry();
}
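
  /** Runs the word-count pipeline end to end against the mini clusters. */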
@Test
public void testWordCount() throws IOException {
run(new MRPipeline(WordCountHBaseIT.class, hbaseTestUtil.getConfiguration()));
}
@After
public void tearDown() throws Exception {
hbaseTestUtil.shutdownMiniMapReduceCluster();
hbaseTestUtil.shutdownMiniCluster();
hbaseTestUtil.shutdownMiniZKCluster();
}
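
  /**
   * Seeds a fresh input table with a few words, runs the pipeline, and
   * verifies the counts written to a fresh output table.
   */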
  public void run(Pipeline pipeline) throws IOException {
    Random rand = new Random();
    // Bound the nextInt call rather than using Math.abs, which returns a
    // negative value for Integer.MIN_VALUE.
    int postFix = rand.nextInt(Integer.MAX_VALUE);
    String inputTableName = "crunch_words_" + postFix;
    String outputTableName = "crunch_counts_" + postFix;
    HTable inputTable = null;
    HTable outputTable = null;
    try {
      inputTable = hbaseTestUtil.createTable(Bytes.toBytes(inputTableName), WORD_COLFAM);
      outputTable = hbaseTestUtil.createTable(Bytes.toBytes(outputTableName), COUNTS_COLFAM);
      int key = 0;
      key = put(inputTable, key, "cat");
      key = put(inputTable, key, "cat");
      key = put(inputTable, key, "dog");
      Scan scan = new Scan();
      scan.addColumn(WORD_COLFAM, null);
      HBaseSourceTarget source = new HBaseSourceTarget(inputTableName, scan);
      PTable<ImmutableBytesWritable, Result> words = pipeline.read(source);
      pipeline.write(wordCount(words), new HBaseTarget(outputTableName));
      pipeline.done();
      assertIsLong(outputTable, "cat", 2);
      assertIsLong(outputTable, "dog", 1);
    } finally {
      // Close the client-side table handles; the mini clusters themselves are
      // shut down in tearDown().
      if (inputTable != null) {
        inputTable.close();
      }
      if (outputTable != null) {
        outputTable.close();
      }
    }
}
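
  /** Writes value under WORD_COLFAM at the given integer row key and returns the next key. */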
protected int put(HTable table, int key, String value) throws IOException {
Put put = new Put(Bytes.toBytes(key));
put.add(WORD_COLFAM, null, Bytes.toBytes(value));
table.put(put);
return key + 1;
}
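
  /** Asserts that the counts table holds the expected long value for the given word. */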
protected void assertIsLong(HTable table, String key, long i) throws IOException {
Get get = new Get(Bytes.toBytes(key));
get.addColumn(COUNTS_COLFAM, null);
Result result = table.get(get);
byte[] rawCount = result.getValue(COUNTS_COLFAM, null);
    assertNotNull(rawCount);
    assertEquals(i, Bytes.toLong(rawCount));
}
}