Package com.hadoop.compression.lzo

Source Code of com.hadoop.compression.lzo.LzoIndex

/*
* This file is part of Hadoop-Gpl-Compression.
*
* Hadoop-Gpl-Compression is free software: you can redistribute it
* and/or modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* Hadoop-Gpl-Compression is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Hadoop-Gpl-Compression.  If not, see
* <http://www.gnu.org/licenses/>.
*/

package com.hadoop.compression.lzo;

import java.io.EOFException;
import java.io.IOException;
import java.util.Arrays;
import org.anarres.lzo.LzopInputStream;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

/**
* Represents the lzo index.
*/
public class LzoIndex {
  public static final String LZO_INDEX_SUFFIX = ".index";
  public static final String LZO_TMP_INDEX_SUFFIX = ".index.tmp";
  public static final long NOT_FOUND = -1;

  private long[] blockPositions_;

  /**
   * Create an empty index, typically indicating no index file exists.
   */
  public LzoIndex() { }

  /**
   * Create an index specifying the number of LZO blocks in the file.
   * @param blocks The number of blocks in the LZO file the index is representing.
   */
  public LzoIndex(int blocks) {
    blockPositions_ = new long[blocks];
  }

  /**
   * Set the position for the block.
   *
   * @param blockNumber Block to set pos for.
   * @param pos Position.
   */
  public void set(int blockNumber, long pos) {
    blockPositions_[blockNumber] = pos;
  }

  /**
   * Get the total number of blocks in the index file.
   */
  public int getNumberOfBlocks() {
    return blockPositions_.length;
  }

  /**
   * Get the block offset for a given block.
   * @param block
   * @return the byte offset into the file where this block starts.  It is the developer's
   * responsibility to call getNumberOfBlocks() to know appropriate bounds on the parameter.
   * The argument block should satisfy 0 <= block < getNumberOfBlocks().
   */
  public long getPosition(int block) {
    return blockPositions_[block];
  }

  /**
   * Find the next lzo block start from the given position.
   *
   * @param pos The position to start looking from.
   * @return Either the start position of the block or -1 if it couldn't be found.
   */
  public long findNextPosition(long pos) {
    int block = Arrays.binarySearch(blockPositions_, pos);

    if (block >= 0) {
      // direct hit on a block start position
      return blockPositions_[block];
    } else {
      block = Math.abs(block) - 1;
      if (block > blockPositions_.length - 1) {
        return NOT_FOUND;
      }
      return blockPositions_[block];
    }
  }

  /**
   * Return true if the index has no blocks set.
   *
   * @return true if the index has no blocks set.
   */
  public boolean isEmpty() {
    return blockPositions_ == null || blockPositions_.length == 0;
  }

  /**
   * Nudge a given file slice start to the nearest LZO block start no earlier than
   * the current slice start.
   *
   * @param start The current slice start
   * @param end The current slice end
   * @return The smallest block offset in the index between [start, end), or
   *         NOT_FOUND if there is none such.
   */
  public long alignSliceStartToIndex(long start, long end) {
    if (start != 0) {
      // find the next block position from
      // the start of the split
      long newStart = findNextPosition(start);
      if (newStart == NOT_FOUND || newStart >= end) {
        return NOT_FOUND;
      }
      start = newStart;
    }
    return start;
  }

  /**
   * Nudge a given file slice end to the nearest LZO block end no earlier than
   * the current slice end.
   *
   * @param end The current slice end
   * @param fileSize The size of the file, i.e. the max end position.
   * @return The smallest block offset in the index between [end, fileSize].
   */
  public long alignSliceEndToIndex(long end, long fileSize) {
    long newEnd = findNextPosition(end);
    if (newEnd != NOT_FOUND) {
      end = newEnd;
    } else {
      // didn't find the next position
      // we have hit the end of the file
      end = fileSize;
    }
    return end;
  }

  /**
   * Read the index of the lzo file.

   * @param fs The index file is on this file system.
   * @param lzoFile the file whose index we are reading -- NOT the index file itself.  That is,
   * pass in filename.lzo, not filename.lzo.index, for this parameter.
   * @throws IOException
   */
  public static LzoIndex readIndex(FileSystem fs, Path lzoFile) throws IOException {
    FSDataInputStream indexIn = null;
    try {
      Path indexFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
      if (!fs.exists(indexFile)) {
        // return empty index, fall back to the unsplittable mode
        return new LzoIndex();
      }

      long indexLen = fs.getFileStatus(indexFile).getLen();
      int blocks = (int) (indexLen / 8);
      LzoIndex index = new LzoIndex(blocks);
      indexIn = fs.open(indexFile);
      for (int i = 0; i < blocks; i++) {
        index.set(i, indexIn.readLong());
      }
      return index;
    } finally {
      if (indexIn != null) {
        indexIn.close();
      }
    }
  }

  /**
   * Index an lzo file to allow the input format to split them into separate map
   * jobs.
   *
   * @param fs File system that contains the file.
   * @param lzoFile the lzo file to index.  For filename.lzo, the created index file will be
   * filename.lzo.index.
   * @throws IOException
   */
  public static void createIndex(FileSystem fs, Path lzoFile)
  throws IOException {

    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    if (null == codec) {
      throw new IOException("Could not find codec for file " + lzoFile +
        " - you may need to add the LZO codec to your io.compression.codecs " +
        "configuration in core-site.xml");
    }
    ((Configurable) codec).setConf(conf);

    FSDataInputStream is = null;
    FSDataOutputStream os = null;
    Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
    Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX);

    // Track whether an exception was thrown or not, so we know to either
    // delete the tmp index file on failure, or rename it to the new index file on success.
    boolean indexingSucceeded = false;
    try {
      is = fs.open(lzoFile);
      os = fs.create(tmpOutputFile);
      // Solely for reading the header
      LzopInputStream lzis = new LzopInputStream(is);
      int numCompressedChecksums = lzis.getCompressedChecksumCount();
      int numDecompressedChecksums = lzis.getUncompressedChecksumCount();

      while (true) {
        // read and ignore, we just want to get to the next int
        int uncompressedBlockSize = is.readInt();
        if (uncompressedBlockSize == 0) {
          break;
        } else if (uncompressedBlockSize < 0) {
          throw new EOFException();
        }

        int compressedBlockSize = is.readInt();
        if (compressedBlockSize <= 0) {
          throw new IOException("Could not read compressed block size");
        }

        // See LzopInputStream.getCompressedData
        boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize);
        int numChecksumsToSkip = isUncompressedBlock ?
            numDecompressedChecksums : numDecompressedChecksums + numCompressedChecksums;
        long pos = is.getPos();
        // write the pos of the block start
        os.writeLong(pos - 8);
        // seek to the start of the next block, skip any checksums
        is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
      }
      // If we're here, indexing was successful.
      indexingSucceeded = true;
    } finally {
      // Close any open streams.
      if (is != null) {
        is.close();
      }

      if (os != null) {
        os.close();
      }

      if (!indexingSucceeded) {
        // If indexing didn't succeed (i.e. an exception was thrown), clean up after ourselves.
        fs.delete(tmpOutputFile, false);
      } else {
        // Otherwise, rename filename.lzo.index.tmp to filename.lzo.index.
        fs.rename(tmpOutputFile, outputFile);
      }
    }
  }
}
TOP

Related Classes of com.hadoop.compression.lzo.LzoIndex

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.