Source Code of com.hadoop.compression.lzo.LzoIndex

/*
 * This file is part of Hadoop-Gpl-Compression.
 *
 * Hadoop-Gpl-Compression is free software: you can redistribute it
 * and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Hadoop-Gpl-Compression is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Hadoop-Gpl-Compression.  If not, see
 * <http://www.gnu.org/licenses/>.
 */


package com.hadoop.compression.lzo;


import java.io.EOFException;
import java.io.IOException;
import java.util.Arrays;
import org.anarres.lzo.LzopInputStream;


import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;


/**
 * Represents the lzo index.
 */
public class LzoIndex {
  public static final String LZO_INDEX_SUFFIX = ".index";
  public static final String LZO_TMP_INDEX_SUFFIX = ".index.tmp";
  public static final long NOT_FOUND = -1;


  private long[] blockPositions_;


  /**
   * Create an empty index, typically indicating no index file exists.
   */
  public LzoIndex() { }


  /**
   * Create an index specifying the number of LZO blocks in the file.
   * @param blocks The number of blocks in the LZO file the index is representing.
   */
  public LzoIndex(int blocks) {
    blockPositions_ = new long[blocks];
  }


  /**
   * Set the position for the block.
   *
   * @param blockNumber Block to set pos for.
   * @param pos Position.
   */
  public void set(int blockNumber, long pos) {
    blockPositions_[blockNumber] = pos;
  }


  /**
   * Get the total number of blocks in the index file.
   */
  public int getNumberOfBlocks() {
    return blockPositions_.length;
  }


  /**
   * Get the block offset for a given block.
   * @param block
   * @return the byte offset into the file where this block starts.  It is the developer's
   * responsibility to call getNumberOfBlocks() to know appropriate bounds on the parameter.
   * The argument block should satisfy 0 <= block < getNumberOfBlocks().
   */
  public long getPosition(int block) {
    return blockPositions_[block];
  }


  /**
   * Find the next lzo block start from the given position.
   *
   * @param pos The position to start looking from.
   * @return Either the start position of the block or -1 if it couldn't be found.
   */
  public long findNextPosition(long pos) {
    int block = Arrays.binarySearch(blockPositions_, pos);


    if (block >= 0) {
      // direct hit on a block start position
      return blockPositions_[block];
    } else {
      block = Math.abs(block) - 1;
      if (block > blockPositions_.length - 1) {
        return NOT_FOUND;
      }
      return blockPositions_[block];
    }
  }


  /**
   * Return true if the index has no blocks set.
   *
   * @return true if the index has no blocks set.
   */
  public boolean isEmpty() {
    return blockPositions_ == null || blockPositions_.length == 0;
  }


  /**
   * Nudge a given file slice start to the nearest LZO block start no earlier than
   * the current slice start.
   *
   * @param start The current slice start
   * @param end The current slice end
   * @return The smallest block offset in the index between [start, end), or
   *         NOT_FOUND if there is none such.
   */
  public long alignSliceStartToIndex(long start, long end) {
    if (start != 0) {
      // find the next block position from
      // the start of the split
      long newStart = findNextPosition(start);
      if (newStart == NOT_FOUND || newStart >= end) {
        return NOT_FOUND;
      }
      start = newStart;
    }
    return start;
  }


  /**
   * Nudge a given file slice end to the nearest LZO block end no earlier than
   * the current slice end.
   *
   * @param end The current slice end
   * @param fileSize The size of the file, i.e. the max end position.
   * @return The smallest block offset in the index between [end, fileSize].
   */
  public long alignSliceEndToIndex(long end, long fileSize) {
    long newEnd = findNextPosition(end);
    if (newEnd != NOT_FOUND) {
      end = newEnd;
    } else {
      // didn't find the next position
      // we have hit the end of the file
      end = fileSize;
    }
    return end;
  }


  /**
   * Read the index of the lzo file.


   * @param fs The index file is on this file system.
   * @param lzoFile the file whose index we are reading -- NOT the index file itself.  That is,
   * pass in filename.lzo, not filename.lzo.index, for this parameter.
   * @throws IOException
   */
  public static LzoIndex readIndex(FileSystem fs, Path lzoFile) throws IOException {
    FSDataInputStream indexIn = null;
    try {
      Path indexFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
      if (!fs.exists(indexFile)) {
        // return empty index, fall back to the unsplittable mode
        return new LzoIndex();
      }


      long indexLen = fs.getFileStatus(indexFile).getLen();
      int blocks = (int) (indexLen / 8);
      LzoIndex index = new LzoIndex(blocks);
      indexIn = fs.open(indexFile);
      for (int i = 0; i < blocks; i++) {
        index.set(i, indexIn.readLong());
      }
      return index;
    } finally {
      if (indexIn != null) {
        indexIn.close();
      }
    }
  }


  /**
   * Index an lzo file to allow the input format to split them into separate map
   * jobs.
   *
   * @param fs File system that contains the file.
   * @param lzoFile the lzo file to index.  For filename.lzo, the created index file will be
   * filename.lzo.index.
   * @throws IOException
   */
  public static void createIndex(FileSystem fs, Path lzoFile)
  throws IOException {


    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    if (null == codec) {
      throw new IOException("Could not find codec for file " + lzoFile +
        " - you may need to add the LZO codec to your io.compression.codecs " +
        "configuration in core-site.xml");
    }
    ((Configurable) codec).setConf(conf);


    FSDataInputStream is = null;
    FSDataOutputStream os = null;
    Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
    Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX);


    // Track whether an exception was thrown or not, so we know to either
    // delete the tmp index file on failure, or rename it to the new index file on success.
    boolean indexingSucceeded = false;
    try {
      is = fs.open(lzoFile);
      os = fs.create(tmpOutputFile);
      // Solely for reading the header
      LzopInputStream lzis = new LzopInputStream(is);
      int numCompressedChecksums = lzis.getCompressedChecksumCount();
      int numDecompressedChecksums = lzis.getUncompressedChecksumCount();


      while (true) {
        // read and ignore, we just want to get to the next int
        int uncompressedBlockSize = is.readInt();
        if (uncompressedBlockSize == 0) {
          break;
        } else if (uncompressedBlockSize < 0) {
          throw new EOFException();
        }


        int compressedBlockSize = is.readInt();
        if (compressedBlockSize <= 0) {
          throw new IOException("Could not read compressed block size");
        }


        // See LzopInputStream.getCompressedData
        boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize);
        int numChecksumsToSkip = isUncompressedBlock ?
            numDecompressedChecksums : numDecompressedChecksums + numCompressedChecksums;
        long pos = is.getPos();
        // write the pos of the block start
        os.writeLong(pos - 8);
        // seek to the start of the next block, skip any checksums
        is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
      }
      // If we're here, indexing was successful.
      indexingSucceeded = true;
    } finally {
      // Close any open streams.
      if (is != null) {
        is.close();
      }


      if (os != null) {
        os.close();
      }


      if (!indexingSucceeded) {
        // If indexing didn't succeed (i.e. an exception was thrown), clean up after ourselves.
        fs.delete(tmpOutputFile, false);
      } else {
        // Otherwise, rename filename.lzo.index.tmp to filename.lzo.index.
        fs.rename(tmpOutputFile, outputFile);
      }
    }
  }
}
Source Code of com.hadoop.compression.lzo.LzoIndex

Related Classes of com.hadoop.compression.lzo.LzoIndex