Package org.apache.hadoop.hive.ql.io

Source Code of org.apache.hadoop.hive.ql.io.PerformTestRCFileAndSeqFile

package org.apache.hadoop.hive.ql.io;

import java.io.IOException;
import java.util.Random;

import junit.framework.TestCase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;

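/**
* PerformTestRCFileAndSeqFile.
*
* Benchmark that writes the same randomly generated string columns to an RCFile
* and to a block-compressed SequenceFile, then times reading them back and
* prints the results to standard output. Run it through main().
*/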
public class PerformTestRCFileAndSeqFile extends TestCase {

  // Hadoop configuration shared by every reader and writer created in this class.
  private final Configuration conf = new Configuration();

  // Locations of the two files being compared; assigned in the constructor.
  private Path testRCFile;
  private Path testSeqFile;

  // File system that holds the test files; chosen in the constructor.
  private FileSystem fs;

  // Exclusive upper bound on the byte length of each generated column value.
  int columnMaxSize = 30;

  // Source of the characters that fill each column value (fixed seed).
  Random randomCharGenerator = new Random(3);

  // Source of the length of each column value (fixed seed).
  Random randColLenGenerator = new Random(20);

  public PerformTestRCFileAndSeqFile(boolean local, String file)
      throws IOException {
    if (local) {
      fs = FileSystem.getLocal(conf);
    } else {
      fs = FileSystem.get(conf);
    }
    conf.setInt(RCFile.Writer.COLUMNS_BUFFER_SIZE_CONF_STR, 1 * 1024 * 1024);
    if (file == null) {
      Path dir = new Path(System.getProperty("test.data.dir", ".") + "/mapred");
      testRCFile = new Path(dir, "test_rcfile");
      testSeqFile = new Path(dir, "test_seqfile");
    } else {
      testRCFile = new Path(file + "-rcfile");
      testSeqFile = new Path(file + "-seqfile");
    }
    fs.delete(testRCFile, true);
    fs.delete(testSeqFile, true);
    System.out.println("RCFile:" + testRCFile.toString());
    System.out.println("SequenceFile:" + testSeqFile.toString());
  }

  private void writeSeqenceFileTest(FileSystem fs, int rowCount, Path file,
      int columnNum, CompressionCodec codec) throws IOException {

    byte[][] columnRandom;

    resetRandomGenerators();

    BytesRefArrayWritable bytes = new BytesRefArrayWritable(columnNum);
    columnRandom = new byte[columnNum][];
    for (int i = 0; i < columnNum; i++) {
      BytesRefWritable cu = new BytesRefWritable();
      bytes.set(i, cu);
    }

    // zero length key is not allowed by block compress writer, so we use a byte
    // writable
    ByteWritable key = new ByteWritable();
    SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, file,
        ByteWritable.class, BytesRefArrayWritable.class, CompressionType.BLOCK,
        codec);

    for (int i = 0; i < rowCount; i++) {
      nextRandomRow(columnRandom, bytes);
      seqWriter.append(key, bytes);
    }
    seqWriter.close();
  }

  private void resetRandomGenerators() {
    randomCharGenerator = new Random(3);
    randColLenGenerator = new Random(20);
  }

  private void writeRCFileTest(FileSystem fs, int rowCount, Path file,
      int columnNum, CompressionCodec codec) throws IOException {
    fs.delete(file, true);

    resetRandomGenerators();

    RCFileOutputFormat.setColumnNumber(conf, columnNum);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, codec);

    byte[][] columnRandom;

    BytesRefArrayWritable bytes = new BytesRefArrayWritable(columnNum);
    columnRandom = new byte[columnNum][];
    for (int i = 0; i < columnNum; i++) {
      BytesRefWritable cu = new BytesRefWritable();
      bytes.set(i, cu);
    }

    for (int i = 0; i < rowCount; i++) {
      nextRandomRow(columnRandom, bytes);
      writer.append(bytes);
    }
    writer.close();
  }

  private void nextRandomRow(byte[][] row, BytesRefArrayWritable bytes) {
    bytes.resetValid(row.length);
    for (int i = 0; i < row.length; i++) {
      int len = Math.abs(randColLenGenerator.nextInt(columnMaxSize));
      row[i] = new byte[len];
      for (int j = 0; j < len; j++) {
        row[i][j] = getRandomChar(randomCharGenerator);
      }
      bytes.get(i).set(row[i], 0, len);
    }
  }

  private static int CHAR_END = 122 - 7;

  private byte getRandomChar(Random random) {
    byte b = 0;
    do {
      b = (byte) random.nextInt(CHAR_END);
    } while ((b < 65));
    if (b > 90) {
      b += 7;
    }
    return b;
  }

  public static void main(String[] args) throws Exception {
    int count = 1000;
    String file = null;

    try {
      for (int i = 0; i < args.length; ++i) { // parse command line
        if (args[i] == null) {
          continue;
        } else if (args[i].equals("-count")) {
          count = Integer.parseInt(args[++i]);
        } else {
          // file is required parameter
          file = args[i];
        }
      }

      // change it to choose the appropriate file system
      boolean isLocalFS = true;

      PerformTestRCFileAndSeqFile testcase = new PerformTestRCFileAndSeqFile(
          isLocalFS, file);

      // change these parameters
      boolean checkCorrect = true;
      CompressionCodec codec = new DefaultCodec();
      testcase.columnMaxSize = 30;

      // testcase.testWithColumnNumber(count, 2, checkCorrect, codec);
      // testcase.testWithColumnNumber(count, 10, checkCorrect, codec);
      // testcase.testWithColumnNumber(count, 25, checkCorrect, codec);
      testcase.testWithColumnNumber(count, 40, checkCorrect, codec);
      // testcase.testWithColumnNumber(count, 50, checkCorrect, codec);
      // testcase.testWithColumnNumber(count, 80, checkCorrect, codec);

    } finally {
    }
  }

  private void testWithColumnNumber(int rowCount, int columnNum,
      boolean checkCorrect, CompressionCodec codec) throws IOException {
    // rcfile

    // rcfile write
    long start = System.currentTimeMillis();
    writeRCFileTest(fs, rowCount, testRCFile, columnNum, codec);
    long cost = System.currentTimeMillis() - start;
    long fileLen = fs.getFileStatus(testRCFile).getLen();
    System.out.println("Write RCFile with " + columnNum
        + " random string columns and " + rowCount + " rows cost " + cost
        + " milliseconds. And the file's on disk size is " + fileLen);

    // sequence file write
    start = System.currentTimeMillis();
    writeSeqenceFileTest(fs, rowCount, testSeqFile, columnNum, codec);
    cost = System.currentTimeMillis() - start;
    fileLen = fs.getFileStatus(testSeqFile).getLen();
    System.out.println("Write SequenceFile with " + columnNum
        + " random string columns and " + rowCount + " rows cost " + cost
        + " milliseconds. And the file's on disk size is " + fileLen);

    // rcfile read
    start = System.currentTimeMillis();
    int readRows = performRCFileReadFirstColumnTest(fs, testRCFile, columnNum,
        checkCorrect);
    cost = System.currentTimeMillis() - start;
    System.out.println("Read only one column of a RCFile with " + columnNum
        + " random string columns and " + rowCount + " rows cost " + cost
        + " milliseconds.");
    if (rowCount != readRows) {
      throw new IllegalStateException("Compare read and write row count error.");
    }
    assertEquals("", rowCount, readRows);

    if (isLocalFileSystem() && !checkCorrect) {
      // make some noisy to avoid disk caches data.
      performSequenceFileRead(fs, rowCount, testSeqFile);
    }

    start = System.currentTimeMillis();
    readRows = performRCFileReadFirstAndLastColumnTest(fs, testRCFile,
        columnNum, checkCorrect);
    cost = System.currentTimeMillis() - start;
    System.out.println("Read only first and last columns of a RCFile with "
        + columnNum + " random string columns and " + rowCount + " rows cost "
        + cost + " milliseconds.");
    if (rowCount != readRows) {
      throw new IllegalStateException("Compare read and write row count error.");
    }
    assertEquals("", rowCount, readRows);

    if (isLocalFileSystem() && !checkCorrect) {
      // make some noisy to avoid disk caches data.
      performSequenceFileRead(fs, rowCount, testSeqFile);
    }

    start = System.currentTimeMillis();
    performRCFileFullyReadColumnTest(fs, testRCFile, columnNum, checkCorrect);
    cost = System.currentTimeMillis() - start;
    System.out.println("Read all columns of a RCFile with " + columnNum
        + " random string columns and " + rowCount + " rows cost " + cost
        + " milliseconds.");
    if (rowCount != readRows) {
      throw new IllegalStateException("Compare read and write row count error.");
    }
    assertEquals("", rowCount, readRows);

    // sequence file read
    start = System.currentTimeMillis();
    performSequenceFileRead(fs, rowCount, testSeqFile);
    cost = System.currentTimeMillis() - start;
    System.out.println("Read SequenceFile with " + columnNum
        + "  random string columns and " + rowCount + " rows cost " + cost
        + " milliseconds.");
  }

  public boolean isLocalFileSystem() {
    return fs.getUri().toString().startsWith("file://");
  }

  public void performSequenceFileRead(FileSystem fs, int count, Path file) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
    ByteWritable key = new ByteWritable();
    BytesRefArrayWritable val = new BytesRefArrayWritable();
    for (int i = 0; i < count; i++) {
      reader.next(key, val);
    }
  }

  public int performRCFileReadFirstColumnTest(FileSystem fs, Path file,
      int allColumnsNumber, boolean chechCorrect) throws IOException {

    byte[][] checkBytes = null;
    BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
    if (chechCorrect) {
      resetRandomGenerators();
      checkBytes = new byte[allColumnsNumber][];
    }

    int actualReadCount = 0;

    java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
    readCols.add(Integer.valueOf(0));
    ColumnProjectionUtils.setReadColumnIDs(conf, readCols);
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);

    LongWritable rowID = new LongWritable();
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
      reader.getCurrentRow(cols);
      boolean ok = true;
      if (chechCorrect) {
        nextRandomRow(checkBytes, checkRow);
        ok = ok && (checkRow.get(0).equals(cols.get(0)));
      }
      if (!ok) {
        throw new IllegalStateException("Compare read and write error.");
      }
      actualReadCount++;
    }
    return actualReadCount;
  }

  public int performRCFileReadFirstAndLastColumnTest(FileSystem fs, Path file,
      int allColumnsNumber, boolean chechCorrect) throws IOException {

    byte[][] checkBytes = null;
    BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
    if (chechCorrect) {
      resetRandomGenerators();
      checkBytes = new byte[allColumnsNumber][];
    }

    int actualReadCount = 0;

    java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
    readCols.add(Integer.valueOf(0));
    readCols.add(Integer.valueOf(allColumnsNumber - 1));
    ColumnProjectionUtils.setReadColumnIDs(conf, readCols);
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);

    LongWritable rowID = new LongWritable();
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
      reader.getCurrentRow(cols);
      boolean ok = true;
      if (chechCorrect) {
        nextRandomRow(checkBytes, checkRow);
        ok = ok && (checkRow.get(0).equals(cols.get(0)));
        ok = ok
            && checkRow.get(allColumnsNumber - 1).equals(
            cols.get(allColumnsNumber - 1));
      }
      if (!ok) {
        throw new IllegalStateException("Compare read and write error.");
      }
      actualReadCount++;
    }
    return actualReadCount;
  }

  public int performRCFileFullyReadColumnTest(FileSystem fs, Path file,
      int allColumnsNumber, boolean chechCorrect) throws IOException {

    byte[][] checkBytes = null;
    BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
    if (chechCorrect) {
      resetRandomGenerators();
      checkBytes = new byte[allColumnsNumber][];
    }

    int actualReadCount = 0;

    ColumnProjectionUtils.setFullyReadColumns(conf);
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);

    LongWritable rowID = new LongWritable();
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
      reader.getCurrentRow(cols);
      boolean ok = true;
      if (chechCorrect) {
        nextRandomRow(checkBytes, checkRow);
        ok = ok && checkRow.equals(cols);
      }
      if (!ok) {
        throw new IllegalStateException("Compare read and write error.");
      }
      actualReadCount++;
    }
    return actualReadCount;
  }

}
TOP

Related Classes of org.apache.hadoop.hive.ql.io.PerformTestRCFileAndSeqFile

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.