Usage examples of the LzoIndex class

Examples of com.hadoop.compression.lzo.LzoIndex

      // Keep only files carrying the configured LZO extension; anything else
      // is removed from the listing via the iterator.
      if (!file.toString().endsWith(fileExtension)) {
        // Get rid of non-LZO files.
        it.remove();
      } else {
        // Eagerly read the block index for each LZO file (readIndex may yield
        // an empty index when no .index file exists) and cache it for later
        // splittability / split-alignment decisions.
        FileSystem fs = file.getFileSystem(conf);
        LzoIndex index = LzoIndex.readIndex(fs, file);
        indexes.put(file, index);
      }
    }

    // Return the surviving LZO files as the listing result.
    return files.toArray(new FileStatus[] {});
View Full Code Here

Examples of com.hadoop.compression.lzo.LzoIndex

    return files.toArray(new FileStatus[] {});
  }

  @Override
  protected boolean isSplitable(FileSystem fs, Path filename) {
    LzoIndex index = indexes.get(filename);
    return !index.isEmpty();
  }
View Full Code Here

Examples of com.hadoop.compression.lzo.LzoIndex

    // Rebuild each default split so its boundaries land on LZO block starts.
    List<FileSplit> result = new ArrayList<FileSplit>();

    for (FileSplit fileSplit: splits) {
      Path file = fileSplit.getPath();
      FileSystem fs = file.getFileSystem(conf);
      LzoIndex index = indexes.get(file);
      if (index == null) {
        // Every listed file should have been indexed earlier; a missing entry
        // indicates an inconsistent internal state, so fail loudly.
        throw new IOException("Index not found for " + file);
      }
      if (index.isEmpty()) {
        // Empty index, keep it as is.
        result.add(fileSplit);
        continue;
      }

      long start = fileSplit.getStart();
      long end = start + fileSplit.getLength();

      // Snap the split's start and end forward to indexed block boundaries;
      // NOT_FOUND signals that no boundary exists within the given range.
      long lzoStart = index.alignSliceStartToIndex(start, end);
      long lzoEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

      if (lzoStart != LzoIndex.NOT_FOUND  && lzoEnd != LzoIndex.NOT_FOUND) {
        // Emit a split only when both ends landed on valid boundaries;
        // otherwise the bytes are covered by a neighboring split.
        result.add(new FileSplit(file, lzoStart, lzoEnd - lzoStart, fileSplit.getLocations()));
      }
    }
View Full Code Here

Examples of com.hadoop.compression.lzo.LzoIndex

    // Resolve the file behind this split and record its total length; the
    // length later serves as the sentinel end offset for the final split.
    Path file = fileSplit.getPath();
    fileSize = file.getFileSystem(context.getConfiguration())
        .getFileStatus(file).getLen();

    // Load the block index, which lists the start offset of every compressed
    // LZO block in the file.
    LzoIndex lzoBlockIndex = LzoIndex.readIndex(
        file.getFileSystem(context.getConfiguration()), file);

    if (lzoBlockIndex.isEmpty()) {

      // Without an index there is no way to locate block boundaries.
      throw new IOException("No LZO index file exists for the input file "
          + file.toString() + " cannot index the input file");
    }

    int num_lzo_blocks = lzoBlockIndex.getNumberOfBlocks();
    // Start offset of the final LZO block in the file — presumably used later
    // to recognize the last block; TODO confirm against the rest of the class.
    lastLZOBlockStartOffset = lzoBlockIndex.getPosition(num_lzo_blocks - 1);

    LOG.info(context.getTaskAttemptID() + " splitStart= " + splitStart
        + " splitEnd=" + splitEnd + " splitLength=" + splitLength);
    LOG.info(context.getTaskAttemptID() + ":total LZOblocks in this file: "
        + num_lzo_blocks);

    // first loop to get the range of block offsets in lzoBlockIndex this mapper
    // is responsible for;
    int startPos = 0;
    int endPos = 0;

    boolean foundStartPos = false;
    boolean foundEndPos = false;
    for (int i = 0; i < num_lzo_blocks; i++) {
      long currentBlockOffset = lzoBlockIndex.getPosition(i);
      // The first block at or beyond splitStart begins this mapper's range.
      if (currentBlockOffset >= splitStart) {
        if (!foundStartPos) {
          startPos = i;
          foundStartPos = true;
        }
      }
      // The first block at or beyond splitEnd ends this mapper's range.
      if (currentBlockOffset >= splitEnd) {
        if (!foundEndPos) {
          endPos = i;
          foundEndPos = true;
        }
      }

      // Both boundaries found: no need to scan the remaining blocks.
      if (foundStartPos && foundEndPos)
        break;
    }

    if (!foundEndPos) {
      // splitEnd lies past the last indexed block: this is the final split,
      // so it owns every block from startPos through the end of the index.
      endPos = num_lzo_blocks - 1;
      totalLZOBlocks = endPos - startPos + 1;
      // the last split, we need to copy from startPos to the end and additional
      // add the end of the file to the array lzoBlockOffset
    } else {
      if (endPos < num_lzo_blocks - 1)
        endPos++;
      if (endPos == num_lzo_blocks - 1) // treat as if it's the last split;
      {
        totalLZOBlocks = endPos - startPos + 1;
        // Reuse foundEndPos as the "interior split" flag for the copy below.
        foundEndPos = false;
      } else
        totalLZOBlocks = endPos - startPos;
    }

    // special treatment for the first lzoblock offset, due to the current
    // lzoindex implementation, we have to
    // use 0 for the first lzo block in any lzo compressed file though in fact
    // the actual start offset to the first lzoblock is not 0.
    // later we may consider to change the lzo related package to make sure all
    // lzo block start offsets are treated the same way.

    // One extra slot holds the end offset of the last block in this range.
    lzoOffsets = new long[totalLZOBlocks + 1];

    if (foundEndPos) {
      // Interior split: the sentinel end offset is simply the start of the
      // block that follows this range.
      for (int i = 0; i <= totalLZOBlocks; i++)
        lzoOffsets[i] = lzoBlockIndex.getPosition(i + startPos);
    } else {
      // treat the last InputSplit differently
      if (LOG.isDebugEnabled()) {
        LOG.debug("read the last lzo block offset, add the file end offset to the last element in the index array");
      }
      for (int i = 0; i < totalLZOBlocks; i++)
        lzoOffsets[i] = lzoBlockIndex.getPosition(i + startPos);
      lzoOffsets[totalLZOBlocks] = fileSize;
    }

    if (splitStart == 0) {
      lzoOffsets[0] = 0;
View Full Code Here

Examples of com.hadoop.compression.lzo.LzoIndex

    //create 3 files to test globs and test on single lzo block in a split;
    //create File 1, which has only one lzo block.
    FileSystem fs = FileSystem.get(conf);
    String baseFilePath = TESTDIR + INPUTDIR;
    LzoIndex index ;
    // The repeat factor controls how much data is written; 1 keeps the file
    // small enough to fit in a single LZO block.
    int repeatFactor1 = 1;
    createLZOFile(baseFilePath+"11.lzo", repeatFactor1, true);
    index = LzoIndex.readIndex(fs, new Path(baseFilePath+"11.lzo"));
    // Sanity-check the fixture: file 1 must contain exactly one block.
    if(index.getNumberOfBlocks() > 1)
      throw new RuntimeException(baseFilePath+"11.lzo has more than one " +
          "lzo block" );

    //create File 2, which has more than 1 lzo blocks.
    int repeatFactor2 = 10;
    createLZOFile(baseFilePath+"21.lzo", repeatFactor2, true);
    index = LzoIndex.readIndex(fs, new Path(baseFilePath+"21.lzo"));
    // File 2 must span at least two blocks for the multi-block tests.
    if(index.getNumberOfBlocks() < 2)
      throw new RuntimeException(baseFilePath+"21.lzo has only one lzo block" );


    //create a new lzo file 3 to test combining lzo blocks.

    int repeatFactor3 = 30;
    createLZOFile(baseFilePath + "31.lzo", repeatFactor3, true); //b64 format
    index = LzoIndex.readIndex(fs, new Path(baseFilePath+"31.lzo"));
    if(index.getNumberOfBlocks() < 2)
      throw new RuntimeException(baseFilePath+"31.lzo has only one lzo block" );


    // A second fixture set ("b"-prefixed) mirrors the first three files.
    int repeatFactor4 = 1;
    createLZOFile(baseFilePath + "b11.lzo", repeatFactor4, true);
    index = LzoIndex.readIndex(fs, new Path(baseFilePath + "b11.lzo"));
    if(index.getNumberOfBlocks() > 1)
      throw new RuntimeException(baseFilePath+"b11.lzo has more than one " +
          "lzo block" );

    //create File 2, which has more than 1 lzo blocks.
    int repeatFactor5 = 10;
    createLZOFile(baseFilePath + "b21.lzo", repeatFactor5, true);
    index = LzoIndex.readIndex(fs, new Path(baseFilePath + "b21.lzo"));
    if(index.getNumberOfBlocks() < 2)
      throw new RuntimeException(baseFilePath + "b21.lzo has only one lzo block" );

    int repeatFactor6 = 30;
    createLZOFile(baseFilePath + "b31.lzo", repeatFactor6, true);
    index = LzoIndex.readIndex(fs, new Path(baseFilePath + "b31.lzo"));
    if(index.getNumberOfBlocks() < 2)
      throw new RuntimeException(baseFilePath+"b31.lzo has only one lzo block" );

    //index the created lzo files without combining lzo blocks;
    String[] args = new String[] {
        "-jobpoolsize=1",
View Full Code Here

Examples of com.hadoop.compression.lzo.LzoIndex

    // Find new starts and ends of the file splits that align with the lzo blocks.
    List<InputSplit> result = new ArrayList<InputSplit>();

    // Cache the most recently read index: consecutive splits typically come
    // from the same file, so this avoids re-reading the index per split.
    Path prevFile = null;
    LzoIndex prevIndex = null;

    for (InputSplit genericSplit : defaultSplits) {
      // Load the index.
      FileSplit fileSplit = (FileSplit)genericSplit;
      Path file = fileSplit.getPath();

      LzoIndex index; // reuse index for files with multiple blocks.
      if ( file.equals(prevFile) ) {
        index = prevIndex;
      } else {
        index = LzoIndex.readIndex(file.getFileSystem(HadoopCompat.getConfiguration(job)), file);
        prevFile = file;
        prevIndex = index;
      }

      if (index == null) {
        // In listStatus above, a (possibly empty, but non-null) index was put in for every split.
        throw new IOException("Index not found for " + file);
      }

      if (index.isEmpty()) {
        // Empty index, so leave the default split.
        // split's start position should be 0.
        result.add(fileSplit);
        continue;
      }

      long start = fileSplit.getStart();
      long end = start + fileSplit.getLength();

      // Snap both boundaries forward onto indexed LZO block starts;
      // NOT_FOUND means no block boundary falls within the requested range.
      long lzoStart = index.alignSliceStartToIndex(start, end);
      long lzoEnd = index.alignSliceEndToIndex(end, file.getFileSystem(HadoopCompat.getConfiguration(job)).getFileStatus(file).getLen());

      if (lzoStart != LzoIndex.NOT_FOUND  && lzoEnd != LzoIndex.NOT_FOUND) {
        // Bytes outside [lzoStart, lzoEnd) are covered by neighboring splits.
        result.add(new FileSplit(file, lzoStart, lzoEnd - lzoStart, fileSplit.getLocations()));
        LOG.debug("Added LZO split for " + file + "[start=" + lzoStart + ", length=" + (lzoEnd - lzoStart) + "]");
      }
View Full Code Here

Examples of com.hadoop.compression.lzo.LzoIndex

  /**
   * Make sure the lzo index class works as described.
   */
  @Test
  public void testLzoIndex() {
    LzoIndex index = new LzoIndex();
    assertTrue(index.isEmpty());
    index = new LzoIndex(4);
    index.set(0, 0);
    index.set(1, 5);
    index.set(2, 10);
    index.set(3, 15);
    assertFalse(index.isEmpty());

    assertEquals(0, index.findNextPosition(-1));
    assertEquals(5, index.findNextPosition(1));
    assertEquals(5, index.findNextPosition(5));
    assertEquals(15, index.findNextPosition(11));
    assertEquals(15, index.findNextPosition(15));
    assertEquals(-1, index.findNextPosition(16));

    assertEquals(5, index.alignSliceStartToIndex(3, 20));
    assertEquals(15, index.alignSliceStartToIndex(15, 20));
    assertEquals(10, index.alignSliceEndToIndex(8, 30));
    assertEquals(10, index.alignSliceEndToIndex(10, 30));
    assertEquals(30, index.alignSliceEndToIndex(17, 30));
    assertEquals(LzoIndex.NOT_FOUND, index.alignSliceStartToIndex(16, 20));
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.