Source Code of org.apache.hadoop.io.simpleseekableformat.SimpleSeekableFormatOutputStream

package org.apache.hadoop.io.simpleseekableformat;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.simpleseekableformat.SimpleSeekableFormat.MetaData;
import org.apache.hadoop.io.simpleseekableformat.SimpleSeekableFormat.OffsetPair;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * Writes data in the Simple Seekable Format.
 * Data from a single write call will always be contained in a single data segment.
 *
 * See {@link SimpleSeekableFormat}
 */
public class SimpleSeekableFormatOutputStream extends CompressionOutputStream implements Configurable {

  /**
   * This is a hint.  The actual maximum can exceed this number if a lot of data is
   * sent via a single write call, since a single write always stays in the same data segment.
   */
  private static final int DEFAULT_MAX_UNCOMPRESSED_SEGMENT_LENGTH = 1024 * 1024;

  /**
   * dataSegmentOut is a wrapper stream that automatically inserts MetaDataBlocks
   * while writing out data segments.
   */
  final InterleavedOutputStream dataSegmentOut;
  /**
   * dataSegmentDataOut is a DataOutputStream wrapping dataSegmentOut.
   */
  private final DataOutputStream dataSegmentDataOut;

  private final MetaData metadata;

  private Configuration conf;
  private Class<? extends CompressionCodec> codecClass;
  private CompressionCodec codec;
  private Compressor codecCompressor;
  private int thresholdUncompressedSegmentLength;


  private final SimpleSeekableFormat.Buffer currentDataSegmentBuffer = new SimpleSeekableFormat.Buffer();

  public SimpleSeekableFormatOutputStream(OutputStream out) {
    this(new DataOutputStream(out));
  }

  /**
   * A DataOutputStream allows easy writing of integers, strings, etc.
   */
  protected SimpleSeekableFormatOutputStream(DataOutputStream out) {
    // We don't use the inherited field "out" at all.
    super(null);

    metadata = new MetaData();
    SortedMap<Long, Long> offsetPairs = new TreeMap<Long, Long>();
    offsetPairs.put(0L, 0L);
    metadata.setOffsetPairs(offsetPairs);

    this.dataSegmentOut =
        new InterleavedOutputStream(out,
            SimpleSeekableFormat.METADATA_BLOCK_LENGTH,
            SimpleSeekableFormat.DATA_BLOCK_LENGTH,
            new SimpleSeekableFormat.MetaDataProducer(metadata)
          );
    this.dataSegmentDataOut = new DataOutputStream(dataSegmentOut);
  }


  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
    // Set the codec
    codecClass = conf.getClass(SimpleSeekableFormat.FILEFORMAT_SSF_CODEC_CONF, null,
        CompressionCodec.class);
    if (codecClass == null) {
      codec = null;
    } else {
      codec = ReflectionUtils.newInstance(codecClass, conf);
      codecCompressor = codec.createCompressor();
    }
    // Set the max segment length
    thresholdUncompressedSegmentLength = conf.getInt(
        SimpleSeekableFormat.FILEFORMAT_SSF_MAX_UNCOMPRESSED_SEGMENT_LENGTH,
        DEFAULT_MAX_UNCOMPRESSED_SEGMENT_LENGTH);
  }

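  /**
   * Buffers a single byte into the current data segment buffer.  A data segment is
   * written out once the buffered size reaches the configured threshold.
   */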
  @Override
  public void write(int b) throws IOException {
    currentDataSegmentBuffer.write(b);
    flushIfNeeded();
  }

  /**
   * Makes sure that the whole byte range from a single write call ends up in the same data segment.
   */
  @Override
  public void write(byte[] b, int start, int length) throws IOException {
    currentDataSegmentBuffer.write(b, start, length);
    flushIfNeeded();
  }

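  /**
   * Writes any remaining buffered data as a final data segment, then closes the
   * underlying stream.
   */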
  @Override
  public void close() throws IOException {
    if (currentDataSegmentBuffer.size() > 0) {
      flush();
    }
    dataSegmentDataOut.close();
  }

  private void flushIfNeeded() throws IOException {
    if (currentDataSegmentBuffer.size() >= thresholdUncompressedSegmentLength) {
      flush();
    }
  }


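  /**
   * Advances the single (uncompressedOffset -> compressedOffset) pair kept in the
   * metadata by the sizes of the data segment that is about to be written.
   */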
  private void updateMetadata(long uncompressedSegmentSize,
      long compressedSegmentSize) {
    SortedMap<Long, Long> offsetPairs = metadata.getOffsetPairs();
    long lastUncompressedOffset = offsetPairs.firstKey();
    long lastCompressedOffset = offsetPairs.get(lastUncompressedOffset);
    long uncompressedOffset = lastUncompressedOffset + uncompressedSegmentSize;
    long compressedOffset = lastCompressedOffset + compressedSegmentSize;
    offsetPairs.clear();
    offsetPairs.put(uncompressedOffset, compressedOffset);
  }

  /**
   * Takes the current data segment, optionally compresses it,
   * calculates the CRC32, and then writes it out.
   *
   * The method advances the metadata offsets to the end of the file before it starts
   * writing.  That means the offsets recorded in a MetaDataBlock always point past the
   * end of the current data block.
   */
  @Override
  public void flush() throws IOException {

    // Do not do anything if no data has been written
    if (currentDataSegmentBuffer.size() == 0) {
      return;
    }

    // Create the current DataSegment
    DataSegmentWriter currentDataSegment =
        new DataSegmentWriter(currentDataSegmentBuffer, codec, codecCompressor);

    // Update the metadata
    updateMetadata(currentDataSegmentBuffer.size(), currentDataSegment.size());

    // Write out the DataSegment
    currentDataSegment.writeTo(dataSegmentDataOut);

    // Clear out the current buffer. Note that this has to be done after
    // currentDataSegment.writeTo(...), because currentDataSegment can
    // keep a reference to the currentDataSegmentBuffer.
    currentDataSegmentBuffer.reset();

    // Flush out the underlying stream
    dataSegmentDataOut.flush();
  }

  @Override
  public void finish() throws IOException {
    // we don't need to do anything for finish().
  }

  @Override
  public void resetState() throws IOException {
    // we don't need to do anything for resetState().
  }

}
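
A minimal usage sketch (not part of the original source): it assumes the example class lives in the same package so that the SimpleSeekableFormat configuration constants are visible, and that GzipCodec is available on the classpath; the output file name and payload are illustrative only.

package org.apache.hadoop.io.simpleseekableformat;

import java.io.FileOutputStream;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;

public class SsfWriteExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Choose a codec for the data segments; leaving it unset writes them uncompressed.
    conf.setClass(SimpleSeekableFormat.FILEFORMAT_SSF_CODEC_CONF,
        GzipCodec.class, CompressionCodec.class);
    // Optional: lower the uncompressed segment length threshold (it is only a hint).
    conf.setInt(SimpleSeekableFormat.FILEFORMAT_SSF_MAX_UNCOMPRESSED_SEGMENT_LENGTH,
        256 * 1024);

    SimpleSeekableFormatOutputStream out =
        new SimpleSeekableFormatOutputStream(new FileOutputStream("example.ssf"));
    out.setConf(conf);  // must be called before writing so the codec is picked up
    try {
      // Data from a single write call always stays within one data segment.
      out.write("hello, seekable world".getBytes(StandardCharsets.UTF_8));
    } finally {
      out.close();  // flushes the last buffered data segment and closes the stream
    }
  }
}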