Source Code of org.apache.hadoop.mapred.TestConcatenatedCompressedInput

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.mapred;

import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.Inflater;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.ReflectionUtils;

import org.junit.Ignore;
import org.junit.Test;
import static org.junit.Assert.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

@Ignore
public class TestConcatenatedCompressedInput {
  private static final Log LOG =
    LogFactory.getLog(TestConcatenatedCompressedInput.class.getName());
  private static int MAX_LENGTH = 10000;
  private static JobConf defaultConf = new JobConf();
  private static FileSystem localFs = null;

  // from ~roelofs/ss30b-colors.hh -- ANSI terminal color escapes, used only
  // to colorize the tests' console output
  final static String COLOR_RED        = "";     // background doesn't matter...  ""
  final static String COLOR_GREEN      = "";     // background doesn't matter...  ""
  final static String COLOR_YELLOW     = "";     // DO force black background     ""
  final static String COLOR_BLUE       = "";     // do NOT force black background ""
  final static String COLOR_MAGENTA    = "";     // background doesn't matter...  ""
  final static String COLOR_CYAN       = "";     // background doesn't matter...  ""
  final static String COLOR_WHITE      = "";     // DO force black background     ""
  final static String COLOR_BR_RED     = "";     // background doesn't matter...  ""
  final static String COLOR_BR_GREEN   = "";     // background doesn't matter...  ""
  final static String COLOR_BR_YELLOW  = "";     // DO force black background     ""
  final static String COLOR_BR_BLUE    = "";     // do NOT force black background ""
  final static String COLOR_BR_MAGENTA = "";     // background doesn't matter...  ""
  final static String COLOR_BR_CYAN    = "";     // background doesn't matter...  ""
  final static String COLOR_BR_WHITE   = "";     // DO force black background     ""
  final static String COLOR_NORMAL     = "";

  static {
    try {
      defaultConf.set("fs.defaultFS", "file:///");
      localFs = FileSystem.getLocal(defaultConf);
    } catch (IOException e) {
      throw new RuntimeException("init failure", e);
    }
  }

  private static Path workDir =
    new Path(new Path(System.getProperty("test.build.data", "/tmp")),
             "TestConcatenatedCompressedInput").makeQualified(localFs);

  private static LineReader makeStream(String str) throws IOException {
    return new LineReader(new ByteArrayInputStream(str.getBytes("UTF-8")),
                          defaultConf);
  }

  private static void writeFile(FileSystem fs, Path name,
                                CompressionCodec codec, String contents)
  throws IOException {
    OutputStream stm;
    if (codec == null) {
      stm = fs.create(name);
    } else {
      stm = codec.createOutputStream(fs.create(name));
    }
    stm.write(contents.getBytes());
    stm.close();
  }

  private static final Reporter voidReporter = Reporter.NULL;

  private static List<Text> readSplit(TextInputFormat format,
                                      InputSplit split, JobConf jobConf)
  throws IOException {
    List<Text> result = new ArrayList<Text>();
    RecordReader<LongWritable, Text> reader =
      format.getRecordReader(split, jobConf, voidReporter);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    while (reader.next(key, value)) {
      result.add(value);
      // record readers reuse the value object they are handed, so create a
      // fresh Text for the next call rather than letting every list entry
      // alias the same instance
      value = reader.createValue();
    }
    reader.close();
    return result;
  }


  /**
   * Test using Hadoop's original, native-zlib gzip codec for reading.
   */
  @Test
  public void testGzip() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);

    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, jobConf);
    localFs.delete(workDir, true);

    // preferred, but not compatible with Apache/trunk instance of Hudson:
/*
    assertFalse("[native (C/C++) codec]",
      (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
       gzip.getDecompressorType()) );
    System.out.println(COLOR_BR_RED +
      "testGzip() using native-zlib Decompressor (" +
      gzip.getDecompressorType() + ")" + COLOR_NORMAL);
*/

    // alternative:
    if (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
        gzip.getDecompressorType()) {
      System.out.println(COLOR_BR_RED +
        "testGzip() using native-zlib Decompressor (" +
        gzip.getDecompressorType() + ")" + COLOR_NORMAL);
    } else {
      LOG.warn("testGzip() skipped:  native (C/C++) libs not loaded");
      return;
    }

/*
*      // THIS IS BUGGY: omits 2nd/3rd gzip headers; screws up 2nd/3rd CRCs--
*      //                see https://issues.apache.org/jira/browse/HADOOP-6799
*  Path fnHDFS = new Path(workDir, "concat" + gzip.getDefaultExtension());
*  //OutputStream out = localFs.create(fnHDFS);
*  //GzipCodec.GzipOutputStream gzOStm = new GzipCodec.GzipOutputStream(out);
*      // can just combine those two lines, probably
*  //GzipCodec.GzipOutputStream gzOStm =
*  //  new GzipCodec.GzipOutputStream(localFs.create(fnHDFS));
*      // oops, no:  this is a protected helper class; need to access
*      //   it via createOutputStream() instead:
*  OutputStream out = localFs.create(fnHDFS);
*  Compressor gzCmp = gzip.createCompressor();
*  CompressionOutputStream gzOStm = gzip.createOutputStream(out, gzCmp);
*      // this SHOULD be going to HDFS:  got out from localFs == HDFS
*      //   ...yup, works
*  gzOStm.write("first gzip concat\n member\nwith three lines\n".getBytes());
*  gzOStm.finish();
*  gzOStm.resetState();
*  gzOStm.write("2nd gzip concat member\n".getBytes());
*  gzOStm.finish();
*  gzOStm.resetState();
*  gzOStm.write("gzip concat\nmember #3\n".getBytes());
*  gzOStm.close();
*      //
*  String fn = "hdfs-to-local-concat" + gzip.getDefaultExtension();
*  Path fnLocal = new Path(System.getProperty("test.concat.data","/tmp"), fn);
*  localFs.copyToLocalFile(fnHDFS, fnLocal);
*/
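
    // Per RFC 1952, a valid multi-member gzip file is just the byte-wise
    // concatenation of complete gzip streams, so a correct concat.gz can also
    // be produced outside Hadoop (e.g. "gzip -c file1 file2 > concat.gz",
    // which compresses each input separately and appends the members); rather
    // than regenerate it here, the prebuilt, known-good file is copied in below.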

    // copy prebuilt (correct!) version of concat.gz to HDFS
    final String fn = "concat" + gzip.getDefaultExtension();
    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS  = new Path(workDir, fn);
    localFs.copyFromLocalFile(fnLocal, fnHDFS);

    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
              "this is a test\nof gzip\n");
    FileInputFormat.setInputPaths(jobConf, workDir);
    TextInputFormat format = new TextInputFormat();
    format.configure(jobConf);

    InputSplit[] splits = format.getSplits(jobConf, 100);
    assertEquals("compressed splits == 2", 2, splits.length);
    FileSplit tmp = (FileSplit) splits[0];
    if (tmp.getPath().getName().equals("part2.txt.gz")) {
      splits[0] = splits[1];
      splits[1] = tmp;
    }

    List<Text> results = readSplit(format, splits[0], jobConf);
    assertEquals("splits[0] num lines", 6, results.size());
    assertEquals("splits[0][5]", "member #3",
                 results.get(5).toString());

    results = readSplit(format, splits[1], jobConf);
    assertEquals("splits[1] num lines", 2, results.size());
    assertEquals("splits[1][0]", "this is a test",
                 results.get(0).toString());
    assertEquals("splits[1][1]", "of gzip",
                 results.get(1).toString());
  }

  /**
   * Test manually parsing the gzip header and using a raw java.util.zip.Inflater
   * to decode the first member of a concatenated gzip file.
   */
  @Test
  public void testPrototypeInflaterGzip() throws IOException {
    CompressionCodec gzip = new GzipCodec();   // used only for file extension
    localFs.delete(workDir, true);            // localFs = FileSystem instance

    System.out.println(COLOR_BR_BLUE + "testPrototypeInflaterGzip() using " +
      "non-native/Java Inflater and manual gzip header/trailer parsing" +
      COLOR_NORMAL);

    // copy prebuilt (correct!) version of concat.gz to HDFS
    final String fn = "concat" + gzip.getDefaultExtension();
    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS  = new Path(workDir, fn);
    localFs.copyFromLocalFile(fnLocal, fnHDFS);

    final FileInputStream in = new FileInputStream(fnLocal.toString());
    assertEquals("concat bytes available", 148, in.available());

    // should wrap all of this header-reading stuff in a running-CRC wrapper
    // (did so in BuiltInGzipDecompressor; see below)
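    //
    // RFC 1952 fixed header layout (10 bytes), which the checks below rely on:
    //   bytes 0-1: magic 0x1f 0x8b    byte 2: compression method (8 = deflate)
    //   byte 3:    FLG -- 0x02 FHCRC, 0x04 FEXTRA, 0x08 FNAME, 0x10 FCOMMENT,
    //              0xe0 reserved
    //   bytes 4-7: MTIME    byte 8: XFL    byte 9: OS
    // optional FEXTRA, FNAME, FCOMMENT, and FHCRC fields follow, in that order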

    byte[] compressedBuf = new byte[256];
    int numBytesRead = in.read(compressedBuf, 0, 10);
    assertEquals("header bytes read", 10, numBytesRead);
    assertEquals("1st byte", 0x1f, compressedBuf[0] & 0xff);
    assertEquals("2nd byte", 0x8b, compressedBuf[1] & 0xff);
    assertEquals("3rd byte (compression method)", 8, compressedBuf[2] & 0xff);

    byte flags = (byte)(compressedBuf[3] & 0xff);
    if ((flags & 0x04) != 0) {   // FEXTRA
      numBytesRead = in.read(compressedBuf, 0, 2);
      assertEquals("XLEN bytes read", 2, numBytesRead);
      int xlen = ((compressedBuf[1] << 8) | compressedBuf[0]) & 0xffff;
      in.skip(xlen);
    }
    if ((flags & 0x08) != 0) {   // FNAME
      while ((numBytesRead = in.read()) != 0) {
        assertFalse("unexpected end-of-file while reading filename",
                    numBytesRead == -1);
      }
    }
    if ((flags & 0x10) != 0) {   // FCOMMENT
      while ((numBytesRead = in.read()) != 0) {
        assertFalse("unexpected end-of-file while reading comment",
                    numBytesRead == -1);
      }
    }
    if ((flags & 0xe0) != 0) {   // reserved
      assertTrue("reserved bits are set??", (flags & 0xe0) == 0);
    }
    if ((flags & 0x02) != 0) {   // FHCRC
      numBytesRead = in.read(compressedBuf, 0, 2);
      assertEquals("CRC16 bytes read", 2, numBytesRead);
      // the header CRC16 is read (to advance past it) but not verified here
      int crc16 = ((compressedBuf[1] << 8) | compressedBuf[0]) & 0xffff;
    }

    // ready to go!  next bytes should be start of deflated stream, suitable
    // for Inflater
    numBytesRead = in.read(compressedBuf);

    // the Inflater(nowrap=true) docs ask for an extra "dummy" byte after the
    // compressed data (required by native zlib for certain optimizations),
    // but decoding appears to work fine here without one
    byte[] uncompressedBuf = new byte[256];
    Inflater inflater = new Inflater(true);

    inflater.setInput(compressedBuf, 0, numBytesRead);
    try {
      int numBytesUncompressed = inflater.inflate(uncompressedBuf);
      String outString =
        new String(uncompressedBuf, 0, numBytesUncompressed, "UTF-8");
      System.out.println("uncompressed data of first gzip member = [" +
                         outString + "]");
    } catch (java.util.zip.DataFormatException ex) {
      throw new IOException(ex.getMessage());
    }
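    // Only the first gzip member is decoded above.  To continue (as
    // BuiltInGzipDecompressor does), note that the deflate stream ends at
    // offset (numBytesRead - inflater.getRemaining()) in compressedBuf, the
    // 8-byte CRC-32/ISIZE trailer follows, and the next member's header
    // begins right after that; reset the Inflater and repeat the header
    // parsing for each remaining member.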

    in.close();
  }

  /**
   * Test using the pure-Java BuiltInGzipDecompressor for reading
   * concatenated gzip files.
   */
  // NOTE:  This fails on RHEL4 with "java.io.IOException: header crc mismatch"
  //        due to the buggy zlib version (1.2.1.2) shipped with it.
  @Test
  public void testBuiltInGzipDecompressor() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setBoolean("io.native.lib.available", false);

    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, jobConf);
    localFs.delete(workDir, true);

    assertEquals("[non-native (Java) codec]",
      org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class,
      gzip.getDecompressorType());
    System.out.println(COLOR_BR_YELLOW + "testBuiltInGzipDecompressor() using" +
      " non-native (Java Inflater) Decompressor (" + gzip.getDecompressorType()
      + ")" + COLOR_NORMAL);

    // copy single-member test file to HDFS
    String fn1 = "testConcatThenCompress.txt" + gzip.getDefaultExtension();
    Path fnLocal1 = new Path(System.getProperty("test.concat.data","/tmp"),fn1);
    Path fnHDFS1  = new Path(workDir, fn1);
    localFs.copyFromLocalFile(fnLocal1, fnHDFS1);

    // copy multiple-member test file to HDFS
    // (actually in "seekable gzip" format, a la JIRA PIG-42)
    String fn2 = "testCompressThenConcat.txt" + gzip.getDefaultExtension();
    Path fnLocal2 = new Path(System.getProperty("test.concat.data","/tmp"),fn2);
    Path fnHDFS2  = new Path(workDir, fn2);
    localFs.copyFromLocalFile(fnLocal2, fnHDFS2);

    FileInputFormat.setInputPaths(jobConf, workDir);

    // here's first pair of DecompressorStreams:
    final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
    final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
    assertEquals("concat bytes available", 2734, in1.available());
    assertEquals("concat bytes available", 3413, in2.available()); // w/hdr CRC

    CompressionInputStream cin2 = gzip.createInputStream(in2);
    LineReader in = new LineReader(cin2);
    Text out = new Text();

    int numBytes, totalBytes=0, lineNum=0;
    while ((numBytes = in.readLine(out)) > 0) {
      ++lineNum;
      totalBytes += numBytes;
    }
    in.close();
    assertEquals("total uncompressed bytes in concatenated test file",
                 5346, totalBytes);
    assertEquals("total uncompressed lines in concatenated test file",
                 84, lineNum);

    // test BuiltInGzipDecompressor with lots of different input-buffer sizes
    doMultipleGzipBufferSizes(jobConf, false);

    // test GzipZlibDecompressor (native), just to be sure
    // (FIXME?  could move this call to testGzip(), but would need filename
    // setup above) (alternatively, maybe just nuke testGzip() and extend this?)
    doMultipleGzipBufferSizes(jobConf, true);
  }

  // this tests either the native or the non-native gzip decoder with 43
  // input-buffer sizes in order to try to catch any parser/state-machine
  // errors at buffer boundaries
  private static void doMultipleGzipBufferSizes(JobConf jConf,
                                                boolean useNative)
  throws IOException {
    System.out.println(COLOR_YELLOW + "doMultipleGzipBufferSizes() using " +
      (useNative? "GzipZlibDecompressor" : "BuiltInGzipDecompressor") +
      COLOR_NORMAL);

    jConf.setBoolean("io.native.lib.available", useNative);

    int bufferSize;

    // ideally would add some offsets/shifts in here (e.g., via extra fields
    // of various sizes), but...significant work to hand-generate each header
    for (bufferSize = 1; bufferSize < 34; ++bufferSize) {
      jConf.setInt("io.file.buffer.size", bufferSize);
      doSingleGzipBufferSize(jConf);
    }

    bufferSize = 512;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleGzipBufferSize(jConf);

    bufferSize = 1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleGzipBufferSize(jConf);

    bufferSize = 2*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleGzipBufferSize(jConf);

    bufferSize = 4*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleGzipBufferSize(jConf);

    bufferSize = 63*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleGzipBufferSize(jConf);

    bufferSize = 64*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleGzipBufferSize(jConf);

    bufferSize = 65*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleGzipBufferSize(jConf);

    bufferSize = 127*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleGzipBufferSize(jConf);

    bufferSize = 128*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleGzipBufferSize(jConf);

    bufferSize = 129*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleGzipBufferSize(jConf);
  }

  // this tests both files (testCompressThenConcat, testConcatThenCompress);
  // all should work with either native zlib or new Inflater-based decoder
  private static void doSingleGzipBufferSize(JobConf jConf) throws IOException {

    TextInputFormat format = new TextInputFormat();
    format.configure(jConf);

    // here's Nth pair of DecompressorStreams:
    InputSplit[] splits = format.getSplits(jConf, 100);
    assertEquals("compressed splits == 2", 2, splits.length);
    FileSplit tmp = (FileSplit) splits[0];
    if (tmp.getPath().getName().equals("testCompressThenConcat.txt.gz")) {
      System.out.println("  (swapping)");
      splits[0] = splits[1];
      splits[1] = tmp;
    }

    List<Text> results = readSplit(format, splits[0], jConf);
    assertEquals("splits[0] length (num lines)", 84, results.size());
    assertEquals("splits[0][0]",
      "Call me Ishmael. Some years ago--never mind how long precisely--having",
      results.get(0).toString());
    assertEquals("splits[0][42]",
      "Tell me, does the magnetic virtue of the needles of the compasses of",
      results.get(42).toString());

    results = readSplit(format, splits[1], jConf);
    assertEquals("splits[1] length (num lines)", 84, results.size());
    assertEquals("splits[1][0]",
      "Call me Ishmael. Some years ago--never mind how long precisely--having",
      results.get(0).toString());
    assertEquals("splits[1][42]",
      "Tell me, does the magnetic virtue of the needles of the compasses of",
      results.get(42).toString());
  }

  /**
   * Test using the bzip2 codec for reading.
   */
  @Test
  public void testBzip2() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);

    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, jobConf);
    localFs.delete(workDir, true);

    System.out.println(COLOR_BR_CYAN +
      "testBzip2() using non-native CBZip2InputStream (presumably)" +
      COLOR_NORMAL);

    // copy prebuilt (correct!) version of concat.bz2 to HDFS
    final String fn = "concat" + bzip2.getDefaultExtension();
    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS  = new Path(workDir, fn);
    localFs.copyFromLocalFile(fnLocal, fnHDFS);

    writeFile(localFs, new Path(workDir, "part2.txt.bz2"), bzip2,
              "this is a test\nof bzip2\n");
    FileInputFormat.setInputPaths(jobConf, workDir);
    TextInputFormat format = new TextInputFormat();  // extends FileInputFormat
    format.configure(jobConf);
    format.setMinSplitSize(256);  // work around 2-byte splits issue
    // [135 splits for a 208-byte file and a 62-byte file(!)]

    InputSplit[] splits = format.getSplits(jobConf, 100);
    assertEquals("compressed splits == 2", 2, splits.length);
    FileSplit tmp = (FileSplit) splits[0];
    if (tmp.getPath().getName().equals("part2.txt.bz2")) {
      splits[0] = splits[1];
      splits[1] = tmp;
    }

    List<Text> results = readSplit(format, splits[0], jobConf);
    assertEquals("splits[0] num lines", 6, results.size());
    assertEquals("splits[0][5]", "member #3",
                 results.get(5).toString());

    results = readSplit(format, splits[1], jobConf);
    assertEquals("splits[1] num lines", 2, results.size());
    assertEquals("splits[1][0]", "this is a test",
                 results.get(0).toString());
    assertEquals("splits[1][1]", "of bzip2",
                 results.get(1).toString());
  }

  /**
   * Extended bzip2 test, similar to BuiltInGzipDecompressor test above.
   */
  @Test
  public void testMoreBzip2() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);

    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, jobConf);
    localFs.delete(workDir, true);

    System.out.println(COLOR_BR_MAGENTA +
      "testMoreBzip2() using non-native CBZip2InputStream (presumably)" +
      COLOR_NORMAL);

    // copy single-member test file to HDFS
    String fn1 = "testConcatThenCompress.txt" + bzip2.getDefaultExtension();
    Path fnLocal1 = new Path(System.getProperty("test.concat.data","/tmp"),fn1);
    Path fnHDFS1  = new Path(workDir, fn1);
    localFs.copyFromLocalFile(fnLocal1, fnHDFS1);

    // copy multiple-member test file to HDFS
    String fn2 = "testCompressThenConcat.txt" + bzip2.getDefaultExtension();
    Path fnLocal2 = new Path(System.getProperty("test.concat.data","/tmp"),fn2);
    Path fnHDFS2  = new Path(workDir, fn2);
    localFs.copyFromLocalFile(fnLocal2, fnHDFS2);

    FileInputFormat.setInputPaths(jobConf, workDir);

    // here's first pair of BlockDecompressorStreams:
    final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
    final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
    assertEquals("concat bytes available", 2567, in1.available());
    assertEquals("concat bytes available", 3056, in2.available());

/*
    // FIXME
    // The while-loop below dies at the beginning of the 2nd concatenated
    // member (after 17 lines successfully read) with:
    //
    //   java.io.IOException: bad block header
    //   at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.initBlock(
    //   CBZip2InputStream.java:527)
    //
    // It is not critical to concatenated-gzip support, HADOOP-6835, so it's
    // simply commented out for now (and HADOOP-6852 filed).  If and when the
    // latter issue is resolved--perhaps by fixing an error here--this code
    // should be reenabled.  Note that the doMultipleBzip2BufferSizes() test
    // below uses the same testCompressThenConcat.txt.bz2 file but works fine.

    CompressionInputStream cin2 = bzip2.createInputStream(in2);
    LineReader in = new LineReader(cin2);
    Text out = new Text();

    int numBytes, totalBytes=0, lineNum=0;
    while ((numBytes = in.readLine(out)) > 0) {
      ++lineNum;
      totalBytes += numBytes;
    }
    in.close();
    assertEquals("total uncompressed bytes in concatenated test file",
                 5346, totalBytes);
    assertEquals("total uncompressed lines in concatenated test file",
                 84, lineNum);
*/

    // test CBZip2InputStream with lots of different input-buffer sizes
    doMultipleBzip2BufferSizes(jobConf, false);

    // no native version of bzip2 codec (yet?)
    //doMultipleBzip2BufferSizes(jobConf, true);
  }

  // this tests the (currently Java-only, non-native) bzip2 decoder with more
  // than three dozen input-buffer sizes in order to try to catch any parser/
  // state-machine errors at buffer boundaries
  private static void doMultipleBzip2BufferSizes(JobConf jConf,
                                                boolean useNative)
  throws IOException {
    System.out.println(COLOR_MAGENTA + "doMultipleBzip2BufferSizes() using " +
      "default bzip2 decompressor" + COLOR_NORMAL);

    jConf.setBoolean("io.native.lib.available", useNative);

    int bufferSize;

    // ideally would add some offsets/shifts in here (e.g., via extra header
    // data?), but...significant work to hand-generate each header, and no
    // bzip2 spec for reference
    for (bufferSize = 1; bufferSize < 34; ++bufferSize) {
      jConf.setInt("io.file.buffer.size", bufferSize);
      doSingleBzip2BufferSize(jConf);
    }

    bufferSize = 512;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleBzip2BufferSize(jConf);

    bufferSize = 1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleBzip2BufferSize(jConf);

    bufferSize = 2*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleBzip2BufferSize(jConf);

    bufferSize = 4*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleBzip2BufferSize(jConf);

    bufferSize = 63*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleBzip2BufferSize(jConf);

    bufferSize = 64*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleBzip2BufferSize(jConf);

    bufferSize = 65*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleBzip2BufferSize(jConf);

    bufferSize = 127*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleBzip2BufferSize(jConf);

    bufferSize = 128*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleBzip2BufferSize(jConf);

    bufferSize = 129*1024;
    jConf.setInt("io.file.buffer.size", bufferSize);
    doSingleBzip2BufferSize(jConf);
  }

  // this tests both files (testCompressThenConcat, testConcatThenCompress); all
  // should work with existing Java bzip2 decoder and any future native version
  private static void doSingleBzip2BufferSize(JobConf jConf) throws IOException {
    TextInputFormat format = new TextInputFormat();
    format.configure(jConf);
    format.setMinSplitSize(5500);  // work around 256-byte/22-splits issue

    // here's Nth pair of DecompressorStreams:
    InputSplit[] splits = format.getSplits(jConf, 100);
    assertEquals("compressed splits == 2", 2, splits.length);
    FileSplit tmp = (FileSplit) splits[0];
    if (tmp.getPath().getName().equals("testCompressThenConcat.txt.bz2")) {
      System.out.println("  (swapping)");
      splits[0] = splits[1];
      splits[1] = tmp;
    }

    // testConcatThenCompress (single)
    List<Text> results = readSplit(format, splits[0], jConf);
    assertEquals("splits[0] length (num lines)", 84, results.size());
    assertEquals("splits[0][0]",
      "Call me Ishmael. Some years ago--never mind how long precisely--having",
      results.get(0).toString());
    assertEquals("splits[0][42]",
      "Tell me, does the magnetic virtue of the needles of the compasses of",
      results.get(42).toString());

    // testCompressThenConcat (multi)
    results = readSplit(format, splits[1], jConf);
    assertEquals("splits[1] length (num lines)", 84, results.size());
    assertEquals("splits[1][0]",
      "Call me Ishmael. Some years ago--never mind how long precisely--having",
      results.get(0).toString());
    assertEquals("splits[1][42]",
      "Tell me, does the magnetic virtue of the needles of the compasses of",
      results.get(42).toString());
  }

  private static String unquote(String in) {
    StringBuffer result = new StringBuffer();
    for(int i=0; i < in.length(); ++i) {
      char ch = in.charAt(i);
      if (ch == '\\') {
        ch = in.charAt(++i);
        switch (ch) {
        case 'n':
          result.append('\n');
          break;
        case 'r':
          result.append('\r');
          break;
        default:
          result.append(ch);
          break;
        }
      } else {
        result.append(ch);
      }
    }
    return result.toString();
  }
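
  // For example (assuming this class and its dependencies are on the
  // classpath), running
  //   java org.apache.hadoop.mapred.TestConcatenatedCompressedInput "one\ntwo"
  // prints "Working on one\ntwo" followed by "Got: one" and "Got: two",
  // because unquote() (above) turns the literal "\n" into a real newline
  // before main() (below) splits the argument into lines with a LineReader.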

  /**
   * Parse the command line arguments into lines and display the result.
   * @param args
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {
    for(String arg: args) {
      System.out.println("Working on " + arg);
      LineReader reader = makeStream(unquote(arg));
      Text line = new Text();
      int size = reader.readLine(line);
      while (size > 0) {
        System.out.println("Got: " + line.toString());
        size = reader.readLine(line);
      }
      reader.close();
    }
  }
}