Package org.apache.hadoop.io.simpleseekableformat

Source Code of org.apache.hadoop.io.simpleseekableformat.SimpleSeekableFormatInputStream

package org.apache.hadoop.io.simpleseekableformat;

import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.SortedMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CodecPrematureEOFException;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.simpleseekableformat.DataSegmentReader.EmptyDataSegmentException;

/**
* The reader for the Seekable File Format.
*
* This class extends CompressionInputStream because instances of it are
* returned by SimpleSeekableFormatCodec.
*
* See {@link SimpleSeekableFormat}
*/
public class SimpleSeekableFormatInputStream extends CompressionInputStream {

  private final InterleavedInputStream interleavedIn;
  private final DataInputStream dataIn;
  private InputStream dataSegmentIn;

  // Stores the latest metaData block
  private final SimpleSeekableFormat.MetaData metaData;

  private final HashMap<Text, Decompressor> decompressorCache
    = new HashMap<Text, Decompressor>();

  private final Configuration conf = new Configuration();

  public SimpleSeekableFormatInputStream(InputStream in) {
    // we don't use the inherited field "in" at all:
    super(null);
    metaData = new SimpleSeekableFormat.MetaData();
    interleavedIn = createInterleavedInputStream(in,
        SimpleSeekableFormat.METADATA_BLOCK_LENGTH,
        SimpleSeekableFormat.DATA_BLOCK_LENGTH,
        new SimpleSeekableFormat.MetaDataConsumer(metaData));
    this.dataIn = new DataInputStream(interleavedIn);
  }

  /**
   * This factory method can be overridden by a subclass to provide different behavior.
   * It is only called from the constructor.
   */
  protected InterleavedInputStream createInterleavedInputStream(InputStream in,
      int metaDataBlockLength, int dataBlockLength,
      SimpleSeekableFormat.MetaDataConsumer consumer) {
    return new InterleavedInputStream(in, metaDataBlockLength, dataBlockLength, consumer);
  }
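
  // Example (not in the original source): a subclass might override the factory
  // method above to change how the underlying stream is read, e.g. by buffering
  // the raw input before interleaved reading.  The BufferedInputStream wrapping
  // below is only an illustrative sketch.
  //
  //   @Override
  //   protected InterleavedInputStream createInterleavedInputStream(InputStream in,
  //       int metaDataBlockLength, int dataBlockLength,
  //       SimpleSeekableFormat.MetaDataConsumer consumer) {
  //     return new InterleavedInputStream(new java.io.BufferedInputStream(in),
  //         metaDataBlockLength, dataBlockLength, consumer);
  //   }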

  protected InterleavedInputStream getInterleavedIn() {
    return interleavedIn;
  }

  protected SimpleSeekableFormat.MetaData getMetaData() {
    return metaData;
  }

  @Override
  public int read() throws IOException {
    if (dataSegmentIn == null) {
      if (!moveToNextDataSegment()) {
        return -1;
      }
    }
    do {
      int result = dataSegmentIn.read();
      if (result != -1) {
        return result;
      }
      if (!moveToNextDataSegment()) {
        return -1;
      }
    } while (true);
  }

  @Override
  public int read(byte[] b, int start, int length) throws IOException {

    if (dataSegmentIn == null) {
      if (!moveToNextDataSegment()) {
        return -1;
      }
    }
    do {
      int result = dataSegmentIn.read(b, start, length);
      if (result != -1) {
        return result;
      }
      if (!moveToNextDataSegment()) {
        return -1;
      }
    } while (true);
  }

  @Override
  public void close() throws IOException {
    clearDataSegment();
    dataIn.close();
  }

  /**
   * This method relies on the underlying dataSegmentIn.available() returning 0
   * only at EOF.  If that does not hold, this method breaks, because it skips
   * over any dataSegmentIn whose available() == 0.
   */
  @Override
  public int available() throws IOException {
    if (dataSegmentIn == null) {
      if (!moveToNextDataSegment()) {
        return 0;
      }
    }
    do {
      int result = dataSegmentIn.available();
      if (result != 0) {
        return result;
      }
      if (!moveToNextDataSegment()) {
        return 0;
      }
    } while (true);
  }

  /**
   * Returns false if there are no more data segments.
   */
  private boolean moveToNextDataSegment() throws IOException {
    try {
      clearDataSegment();
      DataSegmentReader dataSegmentReader =
          new DataSegmentReader(dataIn, conf, decompressorCache);
      dataSegmentIn = dataSegmentReader.getInputStream();
    } catch (EmptyDataSegmentException e){
      // no data available
      return false;
    } catch (EOFException e) {
      // EOFException is thrown when the underlying data stream is truncated (e.g. a truncated file).
      // Truncation is considered a normal case, so surface it as a dedicated exception type.
      throw new CodecPrematureEOFException("Truncated .SSF file detected.");
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
    return true;
  }

  /**
   * Called by subclasses to clear out the current dataSegmentIn.
   */
  protected void clearDataSegment() throws IOException {
    if (dataSegmentIn != null) {
      dataSegmentIn.close();
      dataSegmentIn = null;
    }
  }

  @Override
  public void resetState() throws IOException {
    throw new RuntimeException("SeekableFileInputFormat does not support resetState()");
  }

  /**
   * This method seeks forward using all "available" bytes.
   * It returns the uncompressed data offset after the seek.
   *
   * This method throws EOFException if no complete metaDataBlock is available,
   * or if the metaDataBlock points to a position past the end of the file (e.g. truncated files).
   */
  public long seekForward() throws IOException {
    // Try to read the last metadata block
    interleavedIn.skipToLastAvailableMetaDataBlock();
    if (!interleavedIn.readMetaDataIfNeeded()) {
      throw new EOFException("Cannot get a complete metadata block");
    }

    // Move the interleavedIn to the beginning of a dataSegment
    SortedMap<Long, Long> offsetPairs = metaData.getOffsetPairs();
    // The last key in the offsetPair points to the farthest position that we can seek to.
    long uncompressedDataOffset = offsetPairs.lastKey();
    long compressedDataOffset = offsetPairs.get(uncompressedDataOffset);
    long toSkip = compressedDataOffset - interleavedIn.getDataOffset();
    if (toSkip < 0) {
      throw new CorruptedDataException("SSF format error: The last offset pair is before the current position in InterleaveStream!");
    }
    try {
      interleavedIn.skipExactly(toSkip);
    } catch (EOFException e) {
      // Ignore this exception: this is the PTail use case, and we don't care
      // about this CodecPrematureEOFException here.
    }

    clearDataSegment();
    return uncompressedDataOffset;
  }
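
  // Example (not in the original source): a sketch of using seekForward() to
  // tail a file that is still being appended to, skipping to the farthest data
  // segment recorded in the metadata.  The file name and stream setup are
  // hypothetical.
  //
  //   SimpleSeekableFormatInputStream ssfIn =
  //       new SimpleSeekableFormatInputStream(new java.io.FileInputStream("events.ssf"));
  //   long uncompressedOffset = ssfIn.seekForward();
  //   // Subsequent read() calls return data starting at uncompressedOffset.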

}
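
A minimal usage sketch (not part of the class above): it wraps a local file in a SimpleSeekableFormatInputStream and copies the decoded payload to standard output. The file name is hypothetical, and in practice the stream may instead be obtained through SimpleSeekableFormatCodec.

import java.io.FileInputStream;
import java.io.IOException;

import org.apache.hadoop.io.simpleseekableformat.SimpleSeekableFormatInputStream;

public class SsfCatExample {
  public static void main(String[] args) throws IOException {
    // Hypothetical input file written in the Simple Seekable Format.
    try (SimpleSeekableFormatInputStream ssfIn =
             new SimpleSeekableFormatInputStream(new FileInputStream("example.ssf"))) {
      byte[] buffer = new byte[4096];
      int n;
      // read() transparently moves from one data segment to the next and
      // returns -1 only after the last segment is exhausted.
      while ((n = ssfIn.read(buffer, 0, buffer.length)) != -1) {
        System.out.write(buffer, 0, n);
      }
      System.out.flush();
    }
  }
}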