/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.hive.ql.exec.persistence;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.debug.Utils;
import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.WriteBuffers;

import com.google.common.annotations.VisibleForTesting;


/**
* HashMap that maps byte arrays to byte arrays, with only the limited functionality
* necessary for MapJoin hash tables and with small memory overhead. Supports multiple
* values for a single key. Values can be added for a key (but cannot be removed);
* values can be retrieved for a key. Some things (like entrySet) would be easy to add;
* others (e.g. deletion) are pretty hard to do well.
* Additionally, for each key it contains a magic "state byte" which is not part of the
* key and can be updated on every put for that key. That is really silly; we use it to
* store aliasFilter. The magic byte could be removed for generality.
* Initially inspired by HPPC LongLongOpenHashMap; however, the code is almost completely reworked
* and there's very little in common left save for quadratic probing (and that with some changes).
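*
* A minimal usage sketch (illustrative only; keyBytes and kvSource are assumed to come
* from the caller, with kvSource implementing the KvSource interface below):
* <pre>{@code
*   BytesBytesMultiHashMap map = new BytesBytesMultiHashMap(1024, 0.75f, 1 << 20, -1, 0);
*   map.put(kvSource); // writes one key and one value via the KvSource callbacks
*   List<WriteBuffers.ByteSegmentRef> values = new ArrayList<WriteBuffers.ByteSegmentRef>();
*   byte stateByte = map.getValueRefs(keyBytes, keyBytes.length, values);
*   for (WriteBuffers.ByteSegmentRef ref : values) {
*     map.populateValue(ref); // make each value segment self-contained
*   }
* }</pre>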
*/
public final class BytesBytesMultiHashMap {
  public static final Log LOG = LogFactory.getLog(BytesBytesMultiHashMap.class);

  /*
   * This hashtable stores "references" in an array of longs;  index in the array is hash of
   * the key; these references point into infinite byte buffer (see below). This buffer contains
   * records written one after another. There are several simple record formats.
   * - single record for the key
   *    [key bytes][value bytes][vlong value length][vlong key length][padding]
   *    We leave padding to ensure we have at least 5 bytes after key and value.
   * - first of multiple records for the key (updated from "single value for the key")
   *    [key bytes][value bytes][5-byte long offset to a list start record]
   *  - list start record
   *    [vlong value length][vlong key length][5-byte long offset to the 2nd list record]
   *    Lengths are preserved from the first record. Offset is discussed above.
   *  - subsequent values in the list
   *    [value bytes][vlong value length][vlong relative offset to next record].
   *
   * In summary, because we have a separate list record, we have very little list overhead
   * for the typical case of a primary-key join, where there's no list for any key; large
   * lists also don't have a lot of relative overhead (also see the TODO below).
   *
   * So the record looks as follows for one value per key (the hash is fixed, 4 bytes,
   * and is stored to allow expanding w/o rehashing, and to deal with collisions more
   * efficiently):
   *
   *             i = key hash
   *           ._______.
   * REFS: ... |offset | ....
   *           `--\----'
   *               `-------------.
   *                            \|/
   *          .______._____._____'__.__._.
   * WBS: ... | hash | key | val |vl|kl| | ....
   *          `------'-----'-----'--'--'-'
   *
   * After that refs don't change so they are not pictured.
   * When we add the 2nd value, we overwrite the lengths with a relative offset to the list
   * start record. That way, the first record points to the "list record".
   *                         ref .---------.
   *                         \|/ |        \|/
   *       .______._____._____'__|___.     '__.__.______.
   * WBS:  | hash | key | val |offset| ... |vl|kl|      |
   *       `------'-----'-----'------'     '--'--'------'
   * The list record points to the 2nd value.
   *                         ref .---------.        .---------------.
   *                         \|/ |        \|/       |              \|/
   *       .______._____._____'__|___.     '__.__.__|___.     ._____'__._.
   * WBS:  | hash | key | val |offset| ... |vl|kl|offset| ... | val |vl|0|
   *       `------'-----'-----'------'     '--'--'------'     '-----'--'-'
   * If we add another value, we overwrite the 5-byte offset in the list record; since that
   * field is fixed-size, we never have to rewrite variable-length vlongs in place.
   *                         ref .---------.         .-------------------------------.
   *                         \|/ |        \|/        |                              \|/
   *       .______._____._____'__|___.     '__.__.___|__.     ._____.__._.     ._____'__.______.
   * WBS:  | hash | key | val |offset| ... |vl|kl|offset| ... | val |vl|0| ... | val |vl|offset|
   *       `------'-----'-----'------'     '--'--'------'     '-----:--'-'     '-----'--'--|---'
   *                                                               /|\                     |
   *                                                                `----------------------'
   * And another value (for example)
   * ... ---.         .-----------------------------------------------------.
   *       \|/        |                                                    \|/
   *        '__.__.___|__.     ._____.__._.     ._____.__.______.     ._____'__.______.
   * ...    |vl|kl|offset| ... | val |vl|0| ... | val |vl|offset| ... | val |vl|offset|
   *        '--'--'------'     '-----:--'-'     '-----'--:--|---'     '-----'--'--|---'
   *                                /|\                 /|\ |                     |
   *                                 `-------------------+--'                     |
   *                                                     `------------------------'
   */
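
  /*
   * Illustrative decoding sketch for the single-record layout above (this mirrors what
   * isSameKey and getValueRefs below actually do):
   *
   *   long tailOffset = Ref.getOffset(ref);          // points right past [value bytes]
   *   writeBuffers.setReadPoint(tailOffset);
   *   int valueLength = (int)writeBuffers.readVLong();
   *   int keyLength = (int)writeBuffers.readVLong();
   *   long keyOffset = tailOffset - valueLength - keyLength;
   *   // the 4-byte hash code is stored at keyOffset - 4
   */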

  /**
   * Write buffers for keys and values. For the description of the structure above, think
   * of this as one infinite byte buffer.
   */
  private WriteBuffers writeBuffers;

  private final float loadFactor;

  private int resizeThreshold;
  private int keysAssigned;

  /**
   * Largest number of probe steps ever taken to find the location for a key. When getting,
   * we can conclude that the key is not in the hashtable once we have made this many steps
   * and not found it.
   */
  private int largestNumberOfSteps = 0;

  /**
   * References to keys of the hashtable. The index is hash of the key; collisions are
   * resolved using open addressing with quadratic probing. Reference format
   * [40: offset into writeBuffers][8: state byte][1: has list flag]
   * [15: part of hash used to optimize probing]
   * Offset is tail offset of the first record for the key (the one containing the key).
   * Storing 15 hash bits in particular is not essential to optimize probing; in fact, since
   * we always store the full hash in front of the key, they are redundant. But we have
   * nothing else to do with those bits.
   * TODO: actually we could also use a few bits to store largestNumberOfSteps for each key,
   *      so we'd stop earlier on read collision. Need to profile on real queries.
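   *
   * Packing example (illustrative): offset 0x5, state byte 0x3, list flag clear, and hash
   * part 0x7FFF pack as (0x5L << 24) | (0x3L << 16) | 0x7FFF == 0x5037FFF (see
   * Ref.makeFirstRef below).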
   */
  private long[] refs;
  private int startingHashBitCount, hashBitCount;

  private int metricPutConflict = 0, metricExpands = 0, metricExpandsUs = 0;

  /** We have 39 bits to store the list pointer from the first record; this is the size limit. */
  final static long MAX_WB_SIZE = ((long)1) << 38;
  /** 8 GB of refs is the max capacity if the memory limit is not specified. If someone has
   * 100s of GBs of memory (this might happen pretty soon) we'd need to string together
   * arrays anyway. */
  private final static int DEFAULT_MAX_CAPACITY = 1024 * 1024 * 1024;

  public BytesBytesMultiHashMap(int initialCapacity,
      float loadFactor, int wbSize, long memUsage, int defaultCapacity) {
    if (loadFactor <= 0 || loadFactor > 1) {
      throw new AssertionError("Load factor must be in (0, 1].");
    }
    assert initialCapacity > 0;
    initialCapacity = (Long.bitCount(initialCapacity) == 1)
        ? initialCapacity : nextHighestPowerOfTwo(initialCapacity);
    // 8 bytes per long in the refs, assume data will be empty. This is just a sanity check.
    int maxCapacity =  (memUsage <= 0) ? DEFAULT_MAX_CAPACITY
        : (int)Math.min((long)DEFAULT_MAX_CAPACITY, memUsage / 8);
    if (maxCapacity < initialCapacity || initialCapacity <= 0) {
      // Either initialCapacity is too large, or nextHighestPowerOfTwo overflows
      initialCapacity = (Long.bitCount(maxCapacity) == 1)
          ? maxCapacity : nextLowestPowerOfTwo(maxCapacity);
    }

    validateCapacity(initialCapacity);
    startingHashBitCount = 63 - Long.numberOfLeadingZeros(initialCapacity);
    this.loadFactor = loadFactor;
    refs = new long[initialCapacity];
    writeBuffers = new WriteBuffers(wbSize, MAX_WB_SIZE);
    resizeThreshold = (int)(initialCapacity * this.loadFactor);
  }

  @VisibleForTesting
  BytesBytesMultiHashMap(int initialCapacity, float loadFactor, int wbSize) {
    this(initialCapacity, loadFactor, wbSize, -1, 100000);
  }

  /** The source of keys and values to put into hashtable; avoids byte copying. */
  public static interface KvSource {
    /** Write key into output. */
    public void writeKey(RandomAccessOutput dest) throws SerDeException;

    /** Write value into output. */
    public void writeValue(RandomAccessOutput dest) throws SerDeException;

    /**
     * Provide updated value for state byte for a key.
     * @param previousValue Previous value; null if this is the first call per key.
     * @return The updated value.
     */
    public byte updateStateByte(Byte previousValue);
  }
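
  /*
   * Illustrative KvSource sketch (not part of this class; assumes RandomAccessOutput
   * exposes the usual OutputStream-style write(byte[], int, int)):
   *
   *   class ByteArrayKvSource implements KvSource {
   *     private final byte[] key, value;
   *     ByteArrayKvSource(byte[] key, byte[] value) { this.key = key; this.value = value; }
   *     public void writeKey(RandomAccessOutput dest) { dest.write(key, 0, key.length); }
   *     public void writeValue(RandomAccessOutput dest) { dest.write(value, 0, value.length); }
   *     public byte updateStateByte(Byte previousValue) {
   *       return previousValue == null ? 0 : previousValue; // keep the state byte as-is
   *     }
   *   }
   */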

  private static final byte[] FOUR_ZEROES = new byte[] { 0, 0, 0, 0 };

  /**
   * Adds a new value for a new or existing key in the hashmap.
   * @param kv Key-value writer. Each method will be called at most once.
   */
  public void put(KvSource kv) throws SerDeException {
    if (resizeThreshold <= keysAssigned) {
      expandAndRehash();
    }

    // Reserve 4 bytes for the hash (don't just reserve, there may be junk there)
    writeBuffers.write(FOUR_ZEROES);

    // Write key to buffer to compute hashcode and compare; if it's a new key, it will
    // become part of the record; otherwise, we will just write over it later.
    long keyOffset = writeBuffers.getWritePoint();

    kv.writeKey(writeBuffers);
    int keyLength = (int)(writeBuffers.getWritePoint() - keyOffset);
    int hashCode = writeBuffers.hashCode(keyOffset, keyLength);

    int slot = findKeySlotToWrite(keyOffset, keyLength, hashCode);
    // LOG.info("Write hash code is " + Integer.toBinaryString(hashCode) + " - " + slot);

    long ref = refs[slot];
    if (ref == 0) {
      // This is a new key, keep writing the first record.
      long tailOffset = writeFirstValueRecord(kv, keyOffset, keyLength, hashCode);
      byte stateByte = kv.updateStateByte(null);
      refs[slot] = Ref.makeFirstRef(tailOffset, stateByte, hashCode, startingHashBitCount);
      ++keysAssigned;
    } else {
      // This is not a new key; we'll overwrite the key and hash bytes - not needed anymore.
      writeBuffers.setWritePoint(keyOffset - 4);
      long lrPtrOffset = createOrGetListRecord(ref);
      long tailOffset = writeValueAndLength(kv);
      addRecordToList(lrPtrOffset, tailOffset);
      byte oldStateByte = Ref.getStateByte(ref);
      byte stateByte = kv.updateStateByte(oldStateByte);
      if (oldStateByte != stateByte) {
        ref = Ref.setStateByte(ref, stateByte);
      }
      refs[slot] = Ref.setListFlag(ref);
    }
  }

  /**
   * Gets "lazy" values for a key (as a set of byte segments in underlying buffer).
   * @param key Key buffer.
   * @param length Length of the key in buffer.
   * @param result The list to use to store the results.
   * @return the state byte for the key (see class description).
   */
  public byte getValueRefs(byte[] key, int length, List<WriteBuffers.ByteSegmentRef> result) {
    // First, find first record for the key.
    result.clear();
    long ref = findKeyRefToRead(key, length);
    if (ref == 0) {
      return 0;
    }
    boolean hasList = Ref.hasList(ref);

    // This relies on findKeyRefToRead doing key equality check and leaving read ptr where needed.
    long lrPtrOffset = hasList ? writeBuffers.getReadPoint() : 0;

    writeBuffers.setReadPoint(getFirstRecordLengthsOffset(ref));
    int valueLength = (int)writeBuffers.readVLong();
    // LOG.info("Returning value at " + (Ref.getOffset(ref) - valueLength) +  " length " + valueLength);
    result.add(new WriteBuffers.ByteSegmentRef(Ref.getOffset(ref) - valueLength, valueLength));
    byte stateByte = Ref.getStateByte(ref);
    if (!hasList) {
      return stateByte;
    }

    // There are multiple records for the key; get the offset of the next one.
    long nextTailOffset = writeBuffers.readFiveByteULong(lrPtrOffset);
    // LOG.info("Next tail offset " + nextTailOffset);

    while (nextTailOffset > 0) {
      writeBuffers.setReadPoint(nextTailOffset);
      valueLength = (int)writeBuffers.readVLong();
      // LOG.info("Returning value at " + (nextTailOffset - valueLength) +  " length " + valueLength);
      result.add(new WriteBuffers.ByteSegmentRef(nextTailOffset - valueLength, valueLength));
      // Now read the relative offset to next record. Next record is always before the
      // previous record in the write buffers (see writeBuffers javadoc).
      long delta = writeBuffers.readVLong();
      nextTailOffset = delta == 0 ? 0 : (nextTailOffset - delta);
      // LOG.info("Delta " + delta +  ", next tail offset " + nextTailOffset);
    }
    return stateByte;
  }


  /**
   * Takes a segment reference from the {@link #getValueRefs(byte[], int, List)}
   * result and makes it self-contained: adds the byte array where the value is stored, and
   * changes the offset from a "global" write-buffers offset to an offset within that array.
   */
  public void populateValue(WriteBuffers.ByteSegmentRef valueRef) {
    writeBuffers.populateValue(valueRef);
  }

  public int size() {
    return keysAssigned;
  }

  public void seal() {
    writeBuffers.seal();
  }

  public void clear() {
    // This will make the object completely unusable. Semantics of clear are not defined...
    this.writeBuffers.clear();
    this.refs = new long[1];
    this.keysAssigned = 0;
  }

  private static void validateCapacity(long capacity) {
    if (capacity <= 0) {
      throw new AssertionError("Invalid capacity " + capacity);
    }
    if (Long.bitCount(capacity) != 1) {
      throw new AssertionError("Capacity must be a power of two");
    }
  }

  /**
   * Finds the slot to use for writing, based on the key bytes already written to buffers.
   * @param keyOffset Offset to the key.
   * @param keyLength Length of the key.
   * @param hashCode Hash code of the key (passed in because java doesn't have ref/pointers).
   * @return The slot to use for writing; can be new, or matching existing key.
   */
  private int findKeySlotToWrite(long keyOffset, int keyLength, int hashCode) {
    final int bucketMask = (refs.length - 1);
    int slot = hashCode & bucketMask;
    long probeSlot = slot;
    int i = 0;
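    // Triangular-number probing: on the i-th collision we advance by i more slots, visiting
    // h, h+1, h+3, h+6, ... (mod capacity). With a power-of-two capacity this sequence
    // eventually reaches every slot.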
    while (true) {
      long ref = refs[slot];
      if (ref == 0 || isSameKey(keyOffset, keyLength, ref, hashCode)) {
        break;
      }
      ++metricPutConflict;
      // Some other key (collision) - keep probing.
      probeSlot += (++i);
      slot = (int)(probeSlot & bucketMask);
    }
    if (largestNumberOfSteps < i) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Probed " + i + " slots (the longest so far) to find space");
      }
      largestNumberOfSteps = i;
      // debugDumpKeyProbe(keyOffset, keyLength, hashCode, slot);
    }
    return slot;
  }

  /**
   * Finds the slot to use for reading.
   * @param key Read key array.
   * @param length Read key length.
   * @return The ref to use for reading.
   */
  private long findKeyRefToRead(byte[] key, int length) {
    final int bucketMask = (refs.length - 1);
    int hashCode = writeBuffers.hashCode(key, 0, length);
    int slot = hashCode & bucketMask;
    // LOG.info("Read hash code for " + Utils.toStringBinary(key, 0, length)
    //   + " is " + Integer.toBinaryString(hashCode) + " - " + slot);
    long probeSlot = slot;
    int i = 0;
    while (true) {
      long ref = refs[slot];
      // When we were inserting the key, we would have inserted here; so, there's no key.
      if (ref == 0) {
        return 0;
      }
      if (isSameKey(key, length, ref, hashCode)) {
        return ref;
      }
      probeSlot += (++i);
      if (i > largestNumberOfSteps) {
        // We know we never went that far when we were inserting.
        return 0;
      }
      slot = (int)(probeSlot & bucketMask);
    }
  }

  /**
   * Puts ref into new refs array.
   * @param newRefs New refs array.
   * @param newRef New ref value.
   * @param hashCode Hash code to use.
   * @return The number of probe steps taken to find key position.
   */
  private int relocateKeyRef(long[] newRefs, long newRef, int hashCode) {
    final int bucketMask = newRefs.length - 1;
    int newSlot = hashCode & bucketMask;
    long probeSlot = newSlot;
    int i = 0;
    while (true) {
      long current = newRefs[newSlot];
      if (current == 0) {
        newRefs[newSlot] = newRef;
        break;
      }
      // New array cannot contain the records w/the same key, so just advance, don't check.
      probeSlot += (++i);
      newSlot = (int)(probeSlot & bucketMask);
    }
    return i;
  }

  /**
   * Verifies that the key matches a requisite key.
   * @param cmpOffset The offset to the key to compare with.
   * @param cmpLength The length of the key to compare with.
   * @param ref The ref that can be used to retrieve the candidate key.
   * @param hashCode Hash code of the key to compare with.
   * @return Whether the candidate key referenced by ref is the same as the key at
   *         cmpOffset/cmpLength.
   */
  private boolean isSameKey(long cmpOffset, int cmpLength, long ref, int hashCode) {
    if (!compareHashBits(ref, hashCode)) {
      return false; // Hash bits in ref don't match.
    }
    writeBuffers.setReadPoint(getFirstRecordLengthsOffset(ref));
    int valueLength = (int)writeBuffers.readVLong(), keyLength = (int)writeBuffers.readVLong();
    if (keyLength != cmpLength) {
      return false;
    }
    long keyOffset = Ref.getOffset(ref) - (valueLength + keyLength);
    // There's full hash code stored in front of the key. We could check that first. If keyLength
    // is <= 4 it obviously doesn't make sense, less bytes to check in a key. Then, if there's a
    // match, we check it in vain. But what is the proportion of matches? For writes it could be 0
    // if all keys are unique, for reads we hope it's really high. Then if there's a mismatch what
    // probability is there that key mismatches in <4 bytes (so just checking the key is faster)?
    // We assume the latter is pretty high, so we don't check for now.
    return writeBuffers.isEqual(cmpOffset, cmpLength, keyOffset, keyLength);
  }

  /**
   * Same as {@link #isSameKey(long, int, long, int)} but for externally stored key.
   */
  private boolean isSameKey(byte[] key, int length, long ref, int hashCode) {
    if (!compareHashBits(ref, hashCode)) {
      return false; // Hash bits don't match.
    }
    writeBuffers.setReadPoint(getFirstRecordLengthsOffset(ref));
    int valueLength = (int)writeBuffers.readVLong(), keyLength = (int)writeBuffers.readVLong();
    long keyOffset = Ref.getOffset(ref) - (valueLength + keyLength);
    // See the comment in the other isSameKey
    return writeBuffers.isEqual(key, length, keyOffset, keyLength);
  }

  private boolean compareHashBits(long ref, int hashCode) {
    long fakeRef = Ref.makeFirstRef(0, (byte)0, hashCode, startingHashBitCount);
    return (Ref.getHashBits(fakeRef) == Ref.getHashBits(ref));
  }

  /**
   * @param ref Reference.
   * @return The offset to value and key length vlongs of the first record referenced by ref.
   */
  private long getFirstRecordLengthsOffset(long ref) {
    long tailOffset = Ref.getOffset(ref);
    if (Ref.hasList(ref)) {
      long relativeOffset = writeBuffers.readFiveByteULong(tailOffset);
      tailOffset += relativeOffset;
    }
    return tailOffset;
  }

  private void expandAndRehash() {
    long expandTime = System.nanoTime();
    final long[] oldRefs = refs;
    long capacity = refs.length << 1;
    validateCapacity(capacity);
    long[] newRefs = new long[(int)capacity];

    // We store some hash bits in the ref; in principle each expansion only needs one more
    // bit of the hash. Currently we just re-read the full hash code stored in front of each
    // key (see the TODO below).
    // LOG.info("Expanding the hashtable to " + capacity + " capacity");
    int newHashBitCount = hashBitCount + 1;

    // Relocate all assigned slots from the old hash table.
    int maxSteps = 0;
    for (int oldSlot = 0; oldSlot < oldRefs.length; ++oldSlot) {
      long oldRef = oldRefs[oldSlot];
      if (oldRef == 0) {
        continue;
      }
      // TODO: we could actually store a bit flag in ref indicating whether this is a hash
      //       match or a probe, and in the former case use hash bits (for a first few resizes).
      // int hashCodeOrPart = oldSlot | Ref.getNthHashBit(oldRef, startingHashBitCount, newHashBitCount);
      writeBuffers.setReadPoint(getFirstRecordLengthsOffset(oldRef));
      // Read the stored value and key lengths to locate the 4-byte hash code that sits
      // right in front of the key.
      int hashCode = writeBuffers.readInt(Ref.getOffset(oldRef)
          - writeBuffers.readVLong() - writeBuffers.readVLong() - 4);
      int probeSteps = relocateKeyRef(newRefs, oldRef, hashCode);
      maxSteps = Math.max(probeSteps, maxSteps);
    }
    this.refs = newRefs;
    this.largestNumberOfSteps = maxSteps;
    this.hashBitCount = newHashBitCount;
    this.resizeThreshold = (int)(capacity * loadFactor);
    // nanoTime is in nanoseconds; convert to microseconds to match the metric's unit.
    metricExpandsUs += (System.nanoTime() - expandTime) / 1000;
    ++metricExpands;
  }

  /**
   * @param ref The ref.
   * @return The offset to list record pointer; list record is created if it doesn't exist.
   */
  private long createOrGetListRecord(long ref) {
    if (Ref.hasList(ref)) {
      // LOG.info("Found list record at " + writeBuffers.getReadPoint());
      return writeBuffers.getReadPoint(); // Assumes we are here after key compare.
    }
    long firstTailOffset = Ref.getOffset(ref);
    // LOG.info("First tail offset to create list record is " + firstTailOffset);

    // Determine the length of storage for value and key lengths of the first record.
    writeBuffers.setReadPoint(firstTailOffset);
    writeBuffers.skipVLong();
    writeBuffers.skipVLong();
    int lengthsLength = (int)(writeBuffers.getReadPoint() - firstTailOffset);

    // Create the list record, copy first record value/key lengths there.
    writeBuffers.writeBytes(firstTailOffset, lengthsLength);
    long lrPtrOffset = writeBuffers.getWritePoint();
    // LOG.info("Creating list record: copying " + lengthsLength + ", lrPtrOffset " + lrPtrOffset);

    // Reserve 5 bytes for addRecordToList to fill in later. There might be junk there, so
    // zero them out.
    writeBuffers.write(FIVE_ZEROES);
    // Link the list record to the first element.
    writeBuffers.writeFiveByteULong(firstTailOffset,
        lrPtrOffset - lengthsLength - firstTailOffset);
    return lrPtrOffset;
  }

  /**
   * Adds a newly-written record to existing list.
   * @param lrPtrOffset List record pointer offset.
   * @param tailOffset New record offset.
   */
  private void addRecordToList(long lrPtrOffset, long tailOffset) {
    // Now, insert this record into the list.
    long prevHeadOffset = writeBuffers.readFiveByteULong(lrPtrOffset);
    // LOG.info("Reading offset " + prevHeadOffset + " at " + lrPtrOffset);
    assert prevHeadOffset < tailOffset; // We replace an earlier element, must have lower offset.
    writeBuffers.writeFiveByteULong(lrPtrOffset, tailOffset);
    // LOG.info("Writing offset " + tailOffset + " at " + lrPtrOffset);
    writeBuffers.writeVLong(prevHeadOffset == 0 ? 0 : (tailOffset - prevHeadOffset));
  }


  /**
   * Writes first value and lengths to finish the first record after the key has been written.
   * @param kv Key-value writer.
   * @param keyOffset Offset of the key already written to the write buffers.
   * @param keyLength Key length (already written).
   * @param hashCode Hash code of the key.
   * @return The offset of the new record.
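   * The resulting record is laid out as [hash][key bytes][value bytes][vlong value length]
   * [vlong key length][padding], where the padding ensures at least 5 bytes follow the
   * value (so a 5-byte list pointer can later overwrite the lengths in place).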
   */
  private long writeFirstValueRecord(
      KvSource kv, long keyOffset, int keyLength, int hashCode) throws SerDeException {
    long valueOffset = writeBuffers.getWritePoint();
    kv.writeValue(writeBuffers);
    long tailOffset = writeBuffers.getWritePoint();
    int valueLength = (int)(tailOffset - valueOffset);
    // LOG.info("Writing value at " + valueOffset + " length " + valueLength);
    // In the unlikely case of a 0-length key and value for the very first entry, the tail
    // offset would be 0, which we couldn't tell apart from an empty slot. We'll just
    // advance one byte; this byte will be lost.
    if (tailOffset == 0) {
      writeBuffers.reserve(1);
      ++tailOffset;
    }
    // LOG.info("First tail offset " + writeBuffers.getWritePoint());
    writeBuffers.writeVLong(valueLength);
    writeBuffers.writeVLong(keyLength);
    long lengthsLength = writeBuffers.getWritePoint() - tailOffset;
    if (lengthsLength < 5) { // Reserve space for potential future list
      writeBuffers.reserve(5 - (int)lengthsLength);
    }
    // Finally write the hash code.
    writeBuffers.writeInt(keyOffset - 4, hashCode);
    return tailOffset;
  }

  /**
   * Writes the value and value length for non-first record.
   * @param kv Key-value writer.
   * @return The offset of the new record.
   */
  private long writeValueAndLength(KvSource kv) throws SerDeException {
    long valueOffset = writeBuffers.getWritePoint();
    kv.writeValue(writeBuffers);
    long tailOffset = writeBuffers.getWritePoint();
    writeBuffers.writeVLong(tailOffset - valueOffset);
    // LOG.info("Writing value at " + valueOffset + " length " + (tailOffset - valueOffset));
    return tailOffset;
  }

  /** Writes the debug dump of the table into logs. */
  public void debugDumpTable() {
    StringBuilder dump = new StringBuilder(keysAssigned + " keys\n");
    TreeMap<Long, Integer> byteIntervals = new TreeMap<Long, Integer>();
    List<WriteBuffers.ByteSegmentRef> results = new ArrayList<WriteBuffers.ByteSegmentRef>();
    int examined = 0;
    for (int slot = 0; slot < refs.length; ++slot) {
      long ref = refs[slot];
      if (ref == 0) {
        continue;
      }
      ++examined;
      long recOffset = getFirstRecordLengthsOffset(ref);
      long tailOffset = Ref.getOffset(ref);
      writeBuffers.setReadPoint(recOffset);
      int valueLength = (int)writeBuffers.readVLong(), keyLength = (int)writeBuffers.readVLong();
      long ptrOffset = writeBuffers.getReadPoint();
      if (Ref.hasList(ref)) {
        byteIntervals.put(recOffset, (int)(ptrOffset + 5 - recOffset));
      }
      long keyOffset = tailOffset - valueLength - keyLength;
      byte[] key = new byte[keyLength];
      WriteBuffers.ByteSegmentRef fakeRef = new WriteBuffers.ByteSegmentRef(keyOffset, keyLength);
      byteIntervals.put(keyOffset - 4, keyLength + 4);
      writeBuffers.populateValue(fakeRef);
      System.arraycopy(fakeRef.getBytes(), (int)fakeRef.getOffset(), key, 0, keyLength);
      getValueRefs(key, key.length, results);
      dump.append(Utils.toStringBinary(key, 0, key.length)).append(" ref [").append(dumpRef(ref))
        .append("]: ").append(results.size()).append(" rows\n");
      for (int i = 0; i < results.size(); ++i) {
        WriteBuffers.ByteSegmentRef segment = results.get(i);
        byteIntervals.put(segment.getOffset(),
            segment.getLength() + ((i == 0) ? 1 : 0)); // state byte in the first record
      }
    }
    if (examined != keysAssigned) {
      dump.append("Found " + examined + " keys!\n");
    }
    // Report suspicious gaps in writeBuffers
    long currentOffset = 0;
    for (Map.Entry<Long, Integer> e : byteIntervals.entrySet()) {
      long start = e.getKey(), len = e.getValue();
      if (start - currentOffset > 4) {
        dump.append("Gap! [" + currentOffset + ", " + start + ")\n");
      }
      currentOffset = start + len;
    }
    LOG.info("Hashtable dump:\n " + dump.toString());
  }

  private final static byte[] FIVE_ZEROES = new byte[] { 0, 0, 0, 0, 0 };

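  /** Note: if v is already a power of two this returns 2*v, and for very large v the shift
   *  overflows to a non-positive value; both cases are handled by the bitCount and capacity
   *  checks in the constructor. */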
  private static int nextHighestPowerOfTwo(int v) {
    return Integer.highestOneBit(v) << 1;
  }

  private static int nextLowestPowerOfTwo(int v) {
    return Integer.highestOneBit(v);
  }

  @VisibleForTesting
  int getCapacity() {
    return refs.length;
  }

  /** Static helper for manipulating refs */
  private final static class Ref {
    private final static int OFFSET_SHIFT = 24;
    private final static int STATE_BYTE_SHIFT = 16;
    private final static long STATE_BYTE_MASK = ((long)1 << (OFFSET_SHIFT - STATE_BYTE_SHIFT)) - 1;
    public final static long HASH_BITS_COUNT = STATE_BYTE_SHIFT - 1;
    private final static long HAS_LIST_MASK = (long)1 << HASH_BITS_COUNT;
    private final static long HASH_BITS_MASK = HAS_LIST_MASK - 1;

    private final static long REMOVE_STATE_BYTE = ~(STATE_BYTE_MASK << STATE_BYTE_SHIFT);

    public static long getOffset(long ref) {
      return ref >>> OFFSET_SHIFT;
    }

    public static byte getStateByte(long ref) {
      return (byte)((ref >>> STATE_BYTE_SHIFT) & STATE_BYTE_MASK);
    }

    public static boolean hasList(long ref) {
      return (ref & HAS_LIST_MASK) == HAS_LIST_MASK;
    }

    public static long getHashBits(long ref) {
      return ref & HASH_BITS_MASK;
    }

    public static long makeFirstRef(long offset, byte stateByte, int hashCode, int skipHashBits) {
      long hashPart = (hashCode >>> skipHashBits) & HASH_BITS_MASK;
      return offset << OFFSET_SHIFT | hashPart | ((stateByte & 0xffl) << STATE_BYTE_SHIFT);
    }

    public static int getNthHashBit(long ref, int skippedBits, int position) {
      long hashPart = getHashBits(ref) << skippedBits; // Original hash code with zeroed low bits.
      return (int)(hashPart & (1 << (position - 1)));
    }


    public static long setStateByte(long ref, byte stateByte) {
      return (ref & REMOVE_STATE_BYTE) | ((stateByte & 0xffl) << STATE_BYTE_SHIFT);
    }

    public static long setListFlag(long ref) {
      return ref | HAS_LIST_MASK;
    }
  }

  private static String dumpRef(long ref) {
    return StringUtils.leftPad(Long.toBinaryString(ref), 64, "0") + " o="
        + Ref.getOffset(ref) + " s=" + Ref.getStateByte(ref) + " l=" + Ref.hasList(ref)
        + " h=" + Long.toBinaryString(Ref.getHashBits(ref));
  }

  public void debugDumpMetrics() {
    LOG.info("Map metrics: keys allocated " + this.refs.length +", keys assigned " + keysAssigned
        + ", write conflict " + metricPutConflict  + ", write max dist " + largestNumberOfSteps
        + ", expanded " + metricExpands + " times in " + metricExpandsUs + "us");
  }

  private void debugDumpKeyProbe(long keyOffset, int keyLength, int hashCode, int finalSlot) {
    final int bucketMask = refs.length - 1;
    WriteBuffers.ByteSegmentRef fakeRef = new WriteBuffers.ByteSegmentRef(keyOffset, keyLength);
    writeBuffers.populateValue(fakeRef);
    int slot = hashCode & bucketMask;
    long probeSlot = slot;
    StringBuilder sb = new StringBuilder("Probe path debug for [");
    sb.append(Utils.toStringBinary(
        fakeRef.getBytes(), (int)fakeRef.getOffset(), fakeRef.getLength()));
    sb.append("] hashCode ").append(Integer.toBinaryString(hashCode)).append(" is: ");
    int i = 0;
    while (slot != finalSlot) {
      probeSlot += (++i);
      slot = (int)(probeSlot & bucketMask);
      sb.append(slot).append(" - ").append(probeSlot).append(" - ")
        .append(Long.toBinaryString(refs[slot])).append("\n");
    }
    LOG.info(sb.toString());
  }
}