/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/

package eu.stratosphere.pact.runtime.hash;


import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import eu.stratosphere.api.common.typeutils.TypeComparator;
import eu.stratosphere.api.common.typeutils.TypePairComparator;
import eu.stratosphere.api.common.typeutils.TypeSerializer;
import eu.stratosphere.core.memory.MemorySegment;
import eu.stratosphere.core.memory.MemorySegmentSource;
import eu.stratosphere.core.memory.SeekableDataOutputView;
import eu.stratosphere.nephele.services.iomanager.BlockChannelReader;
import eu.stratosphere.nephele.services.iomanager.BulkBlockChannelReader;
import eu.stratosphere.nephele.services.iomanager.Channel;
import eu.stratosphere.nephele.services.iomanager.ChannelReaderInputView;
import eu.stratosphere.nephele.services.iomanager.HeaderlessChannelReaderInputView;
import eu.stratosphere.nephele.services.iomanager.IOManager;
import eu.stratosphere.pact.runtime.io.ChannelReaderInputViewIterator;
import eu.stratosphere.pact.runtime.iterative.io.HashPartitionIterator;
import eu.stratosphere.pact.runtime.util.MathUtils;
import eu.stratosphere.util.MutableObjectIterator;


/**
* An implementation of a Hybrid Hash Join. The join starts operating in memory and gradually
* starts spilling contents to disk when the memory is not sufficient. It does not need to know
* a priori how large the input will be.
* <p>
* The design of this class follows in many parts the design presented in
* "Hash joins and hash teams in Microsoft SQL Server" by Goetz Graefe et al. In its current state, the
* implementation lacks features like dynamic role reversal, partition tuning, or histogram-guided partitioning.
* <p>
* <hr>
*
* The layout of the buckets inside a memory segment is as follows:
*
* <pre>
* +----------------------------- Bucket x ----------------------------
* |Partition (1 byte) | Status (1 byte) | element count (2 bytes) |
* | next-bucket-in-chain-pointer (8 bytes) | reserved (4 bytes) |
* |
* |hashCode 1 (4 bytes) | hashCode 2 (4 bytes) | hashCode 3 (4 bytes) |
* | ... hashCode n-1 (4 bytes) | hashCode n (4 bytes)
* |
* |pointer 1 (8 bytes) | pointer 2 (8 bytes) | pointer 3 (8 bytes) |
* | ... pointer n-1 (8 bytes) | pointer n (8 bytes)
* |
* +---------------------------- Bucket x + 1--------------------------
* |Partition (1 byte) | Status (1 byte) | element count (2 bytes) |
* | next-bucket-in-chain-pointer (8 bytes) | reserved (4 bytes) |
* |
* |hashCode 1 (4 bytes) | hashCode 2 (4 bytes) | hashCode 3 (4 bytes) |
* | ... hashCode n-1 (4 bytes) | hashCode n (4 bytes)
* |
* |pointer 1 (8 bytes) | pointer 2 (8 bytes) | pointer 3 (8 bytes) |
* | ... pointer n-1 (8 bytes) | pointer n (8 bytes)
* +-------------------------------------------------------------------
* | ...
* |
* </pre>
*
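* <p>
* A sketch of the intended calling sequence (hypothetical caller code; the input iterators,
* serializers, comparators, memory segments, and I/O manager are assumed to be set up elsewhere):
* <pre>{@code
* MutableHashTable<BT, PT> table = new MutableHashTable<BT, PT>(
*     buildSerializer, probeSerializer, buildComparator, probeComparator,
*     pairComparator, memorySegments, ioManager);
* table.open(buildInput, probeInput);
* while (table.nextRecord()) {
*     PT probeRecord = table.getCurrentProbeRecord();
*     MutableObjectIterator<BT> matches = table.getBuildSideIterator();
*     BT match = buildSerializer.createInstance();
*     while ((match = matches.next(match)) != null) {
*         // join probeRecord with match
*     }
* }
* table.close();
* }</pre>
*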
* @param <BT> The type of records from the build side that are stored in the hash table.
* @param <PT> The type of records from the probe side that are probed against the hash table.
*/
public class MutableHashTable<BT, PT> implements MemorySegmentSource {
 
  private static final Log LOG = LogFactory.getLog(MutableHashTable.class);
 
  // ------------------------------------------------------------------------
  //                         Internal Constants
  // ------------------------------------------------------------------------
 
  /**
   * The maximum number of recursive partitionings that the join does before giving up.
   */
  private static final int MAX_RECURSION_DEPTH = 3;
 
  /**
   * The minimum number of memory segments the hash join needs to be supplied with in order to work.
   */
  private static final int MIN_NUM_MEMORY_SEGMENTS = 33;
 
  /**
   * The maximum number of partitions, which defines the spilling granularity. In each recursion,
   * the data is divided into at most that many partitions, which are processed in one chunk.
   */
  private static final int MAX_NUM_PARTITIONS = Byte.MAX_VALUE;
 
  /**
   * The default record width that is used when no width is given. The record width is
   * used to determine the ratio of the number of memory segments intended for partition
   * buffers and the number of memory segments in the hash-table structure.
   */
  private static final int DEFAULT_RECORD_LEN = 24;
 
  /**
   * The length of the hash code stored in the bucket.
   */
  private static final int HASH_CODE_LEN = 4;
 
  /**
   * The length of a pointer from a hash bucket to the record in the buffers.
   */
  private static final int POINTER_LEN = 8;
 
  /**
   * The number of bytes for the serialized record length in the partition buffers.
   */
  private static final int SERIALIZED_LENGTH_FIELD_BYTES = 0;
 
  /**
   * The number of bytes that an entry in the hash structure occupies.
   * It corresponds to a 4 byte hash value and an 8 byte pointer.
   */
  private static final int RECORD_TABLE_BYTES = HASH_CODE_LEN + POINTER_LEN;
 
  /**
   * The total storage overhead per record, in bytes. This corresponds to the space in the
   * actual hash table buckets, consisting of a 4 byte hash value and an 8 byte
   * pointer, plus the overhead for the stored length field.
   */
  private static final int RECORD_OVERHEAD_BYTES = RECORD_TABLE_BYTES + SERIALIZED_LENGTH_FIELD_BYTES;
 
  // -------------------------- Bucket Size and Structure -------------------------------------
 
  static final int NUM_INTRA_BUCKET_BITS = 7;
 
  static final int HASH_BUCKET_SIZE = 0x1 << NUM_INTRA_BUCKET_BITS;
 
  static final int BUCKET_HEADER_LENGTH = 16;
 
  private static final int NUM_ENTRIES_PER_BUCKET = (HASH_BUCKET_SIZE - BUCKET_HEADER_LENGTH) / RECORD_OVERHEAD_BYTES;
 
  private static final int BUCKET_POINTER_START_OFFSET = BUCKET_HEADER_LENGTH + (NUM_ENTRIES_PER_BUCKET * HASH_CODE_LEN);
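 
  // with the constants above: NUM_ENTRIES_PER_BUCKET = (128 - 16) / 12 = 9 entries per bucket,
  // and BUCKET_POINTER_START_OFFSET = 16 + 9 * 4 = 52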
 
  // ------------------------------ Bucket Header Fields ------------------------------
 
  /**
   * Offset of the field in the bucket header indicating the bucket's partition.
   */
  private static final int HEADER_PARTITION_OFFSET = 0;
 
  /**
   * Offset of the field in the bucket header indicating the bucket's status (spilled or in-memory).
   */
  private static final int HEADER_STATUS_OFFSET = 1;
 
  /**
   * Offset of the field in the bucket header indicating the bucket's element count.
   */
  private static final int HEADER_COUNT_OFFSET = 2;
 
  /**
   * Offset of the field in the bucket header that holds the forward pointer to its
   * first overflow bucket.
   */
  private static final int HEADER_FORWARD_OFFSET = 4;
 
  /**
   * Constant for the forward pointer, indicating that the pointer is not set.
   */
  private static final long BUCKET_FORWARD_POINTER_NOT_SET = ~0x0L;
 
//  private static final byte BUCKET_STATUS_SPILLED = 1;
 
  /**
   * Constant for the bucket status, indicating that the bucket is in memory.
   */
  private static final byte BUCKET_STATUS_IN_MEMORY = 0;
 
  // ------------------------------------------------------------------------
  //                              Members
  // ------------------------------------------------------------------------

  /**
   * The utilities to serialize the build side data types.
   */
  protected final TypeSerializer<BT> buildSideSerializer;

  /**
   * The utilities to serialize the probe side data types.
   */
  protected final TypeSerializer<PT> probeSideSerializer;
 
  /**
   * The utilities to hash and compare the build side data types.
   */
  protected final TypeComparator<BT> buildSideComparator;

  /**
   * The utilities to hash and compare the probe side data types.
   */
  private final TypeComparator<PT> probeSideComparator;
 
  /**
   * The comparator used to determine (in)equality between probe side and build side records.
   */
  private final TypePairComparator<PT, BT> recordComparator;
 
  /**
   * The free memory segments currently available to the hash join.
   */
  protected final List<MemorySegment> availableMemory;
 
  /**
   * The queue of buffers that can be used for write-behind. Any buffer that is written
   * asynchronously to disk is returned through this queue. Hence, it may sometimes contain more
   * buffers than the configured number of write-behind buffers.
   */
  protected final LinkedBlockingQueue<MemorySegment> writeBehindBuffers;
 
  /**
   * The I/O manager used to instantiate writers for the spilled partitions.
   */
  protected final IOManager ioManager;
 
  /**
   * The size of the segments used by the hash join buckets. All segments must be of equal size to ease offset computations.
   */
  protected final int segmentSize;
 
  /**
   * The total number of memory segments available to the hash join.
   */
  private final int totalNumBuffers;
 
  /**
   * The number of write-behind buffers used.
   */
  private final int numWriteBehindBuffers;
 
  /**
   * The number of hash table buckets in a single memory segment - 1.
   * Because memory segments can be comparatively large, we fit multiple buckets into one memory segment.
   * This variable is a mask that is 1 in the lower bits that define the number of a bucket
   * in a segment.
   */
  protected final int bucketsPerSegmentMask;
 
  /**
   * The number of bits that describe the position of a bucket in a memory segment. Computed as log2(bucketsPerSegment).
   */
  protected final int bucketsPerSegmentBits;
 
  /**
   * An estimate for the average record length.
   */
  private final int avgRecordLen;
 
  // ------------------------------------------------------------------------
 
  /**
   * The partitions that are built by processing the current partition.
   */
  protected final ArrayList<HashPartition<BT, PT>> partitionsBeingBuilt;
 
  /**
   * The partitions that have been spilled previously and are pending to be processed.
   */
  private final ArrayList<HashPartition<BT, PT>> partitionsPending;
 
  /**
   * Iterator over the elements in the hash table.
   */
  private HashBucketIterator<BT, PT> bucketIterator;
 
//  private LazyHashBucketIterator<BT, PT> lazyBucketIterator;
 
  /**
   * Iterator over the elements from the probe side.
   */
  protected ProbeIterator<PT> probeIterator;
 
  /**
   * The reader for the spilled-file of the probe partition that is currently read.
   */
  private BlockChannelReader currentSpilledProbeSide;
 
  /**
   * The channel enumerator that is used while processing the current partition to create
   * channels for the spill partitions it requires.
   */
  protected Channel.Enumerator currentEnumerator;
 
  /**
   * The array of memory segments that contain the buckets which form the actual hash-table
   * of hash-codes and pointers to the elements.
   */
  protected MemorySegment[] buckets;
 
  /**
   * The number of buckets in the current table. The bucket array is not necessarily fully
   * used; the last memory segment may hold more bucket slots than the table actually uses.
   */
  protected int numBuckets;
 
  /**
   * The number of buffers in the write behind queue that are actually not write behind buffers,
   * but regular buffers that have not yet been returned. This is part of an optimization: the
   * spilling code need not wait until the partition is completely spilled before proceeding.
   */
  protected int writeBehindBuffersAvailable;
 
  /**
   * The recursion depth of the partition that is currently processed. The initial table
   * has a recursion depth of 0. Partitions spilled from a table that is built for a partition
   * with recursion depth <i>n</i> have a recursion depth of <i>n+1</i>.
   */
  protected int currentRecursionDepth;
 
  /**
   * Flag indicating that the closing logic has been invoked.
   */
  protected AtomicBoolean closed = new AtomicBoolean();
 
  /**
   * If true, build side partitions are kept for multiple probe steps.
   */
  protected boolean keepBuildSidePartitions = false;
 
  protected boolean furtherPartitioning = false;
 
  private boolean running = true;

  // ------------------------------------------------------------------------
  //                         Construction and Teardown
  // ------------------------------------------------------------------------
 
  public MutableHashTable(TypeSerializer<BT> buildSideSerializer, TypeSerializer<PT> probeSideSerializer,
      TypeComparator<BT> buildSideComparator, TypeComparator<PT> probeSideComparator,
      TypePairComparator<PT, BT> comparator, List<MemorySegment> memorySegments, IOManager ioManager)
  {
    this(buildSideSerializer, probeSideSerializer, buildSideComparator, probeSideComparator, comparator,
      memorySegments, ioManager, DEFAULT_RECORD_LEN);
  }
 
  public MutableHashTable(TypeSerializer<BT> buildSideSerializer, TypeSerializer<PT> probeSideSerializer,
      TypeComparator<BT> buildSideComparator, TypeComparator<PT> probeSideComparator,
      TypePairComparator<PT, BT> comparator, List<MemorySegment> memorySegments,
      IOManager ioManager, int avgRecordLen)
  {
    // some sanity checks first
    if (memorySegments == null) {
      throw new NullPointerException();
    }
    if (memorySegments.size() < MIN_NUM_MEMORY_SEGMENTS) {
      throw new IllegalArgumentException("Too few memory segments provided. Hash Join needs at least " +
        MIN_NUM_MEMORY_SEGMENTS + " memory segments.");
    }
   
    // assign the members
    this.buildSideSerializer = buildSideSerializer;
    this.probeSideSerializer = probeSideSerializer;
    this.buildSideComparator = buildSideComparator;
    this.probeSideComparator = probeSideComparator;
    this.recordComparator = comparator;
    this.availableMemory = memorySegments;
    this.ioManager = ioManager;
   
    this.avgRecordLen = avgRecordLen > 0 ? avgRecordLen :
        buildSideSerializer.getLength() == -1 ? DEFAULT_RECORD_LEN : buildSideSerializer.getLength();
   
    // check the size of the first buffer and record it. all further buffers must have the same size.
    // the size must also be a power of 2
    this.totalNumBuffers = memorySegments.size();
    this.segmentSize = memorySegments.get(0).size();
    if ( (this.segmentSize & this.segmentSize - 1) != 0) {
      throw new IllegalArgumentException("Hash Table requires buffers whose size is a power of 2.");
    }
    int bucketsPerSegment = this.segmentSize >> NUM_INTRA_BUCKET_BITS;
    if (bucketsPerSegment == 0) {
      throw new IllegalArgumentException("Hash Table requires buffers of at least " + HASH_BUCKET_SIZE + " bytes.");
    }
    this.bucketsPerSegmentMask = bucketsPerSegment - 1;
    this.bucketsPerSegmentBits = MathUtils.log2strict(bucketsPerSegment);
   
    // take away the write behind buffers
    this.writeBehindBuffers = new LinkedBlockingQueue<MemorySegment>();
    this.numWriteBehindBuffers = getNumWriteBehindBuffers(memorySegments.size());
   
    this.partitionsBeingBuilt = new ArrayList<HashPartition<BT, PT>>();
    this.partitionsPending = new ArrayList<HashPartition<BT, PT>>();
   
    // because we allow the table to be opened and closed multiple times, the state is initially closed
    this.closed.set(true);
  }
 
 
  // ------------------------------------------------------------------------
  //                              Life-Cycle
  // ------------------------------------------------------------------------
 
  /**
   * Opens the hash join. This method reads the build-side input and constructs the initial
   * hash table, gradually spilling partitions that do not fit into memory.
   *
   * @throws IOException Thrown, if an I/O problem occurs while spilling a partition.
   */
  public void open(final MutableObjectIterator<BT> buildSide, final MutableObjectIterator<PT> probeSide)
  throws IOException
  {
    // sanity checks
    if (!this.closed.compareAndSet(true, false)) {
      throw new IllegalStateException("Hash Join cannot be opened, because it is currently not closed.");
    }
   
    // grab the write behind buffers first
    for (int i = this.numWriteBehindBuffers; i > 0; --i)
    {
      this.writeBehindBuffers.add(this.availableMemory.remove(this.availableMemory.size() - 1));
    }
    // open builds the initial table by consuming the build-side input
    this.currentRecursionDepth = 0;
    buildInitialTable(buildSide);
   
    // the first prober is the probe-side input
    this.probeIterator = new ProbeIterator<PT>(probeSide, this.probeSideSerializer.createInstance());
   
    // the bucket iterator can remain constant over time
    this.bucketIterator = new HashBucketIterator<BT, PT>(this.buildSideSerializer, this.recordComparator);
  }
 
  protected boolean processProbeIter() throws IOException {
    final ProbeIterator<PT> probeIter = this.probeIterator;
    final TypeComparator<PT> probeAccessors = this.probeSideComparator;
   
    PT next;
    while ((next = probeIter.next()) != null) {
      final int hash = hash(probeAccessors.hash(next), this.currentRecursionDepth);
      final int posHashCode = hash % this.numBuckets;
     
      // get the bucket for the given hash code
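      // (the upper bits of the bucket position select the memory segment, the lower, masked
      // bits select the bucket within that segment, scaled by the 128 byte bucket size)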
      final int bucketArrayPos = posHashCode >> this.bucketsPerSegmentBits;
      final int bucketInSegmentOffset = (posHashCode & this.bucketsPerSegmentMask) << NUM_INTRA_BUCKET_BITS;
      final MemorySegment bucket = this.buckets[bucketArrayPos];
     
      // get the basic characteristics of the bucket
      final int partitionNumber = bucket.get(bucketInSegmentOffset + HEADER_PARTITION_OFFSET);
      final HashPartition<BT, PT> p = this.partitionsBeingBuilt.get(partitionNumber);
     
      // for an in-memory partition, set the reference and the bucket iterator; otherwise, spill the probe record
      if (p.isInMemory()) {
        this.recordComparator.setReference(next);
        this.bucketIterator.set(bucket, p.overflowSegments, p, hash, bucketInSegmentOffset);
        return true;
      }
      else {
        p.insertIntoProbeBuffer(next);
      }
    }
   
    // -------------- partition done ---------------
   
    return false;
  }
 
  protected boolean prepareNextPartition() throws IOException {
    // finalize and cleanup the partitions of the current table
    int buffersAvailable = 0;
    for (int i = 0; i < this.partitionsBeingBuilt.size(); i++) {
      final HashPartition<BT, PT> p = this.partitionsBeingBuilt.get(i);
      p.setFurtherPatitioning(this.furtherPartitioning);
      buffersAvailable += p.finalizeProbePhase(this.availableMemory, this.partitionsPending);
    }
   
    this.partitionsBeingBuilt.clear();
    this.writeBehindBuffersAvailable += buffersAvailable;
   
    releaseTable();

    if (this.currentSpilledProbeSide != null) {
      this.currentSpilledProbeSide.closeAndDelete();
      this.currentSpilledProbeSide = null;
    }

    // check if there are pending partitions
    if (!this.partitionsPending.isEmpty())
    {
      final HashPartition<BT, PT> p = this.partitionsPending.get(0);

      // build the next table
      buildTableFromSpilledPartition(p);

      // set the probe side - gather memory segments for reading
      LinkedBlockingQueue<MemorySegment> returnQueue = new LinkedBlockingQueue<MemorySegment>();
      this.currentSpilledProbeSide = this.ioManager.createBlockChannelReader(p.getProbeSideChannel().getChannelID(), returnQueue);

      List<MemorySegment> memory = new ArrayList<MemorySegment>();
      memory.add(getNextBuffer());
      memory.add(getNextBuffer());

      ChannelReaderInputViewIterator<PT> probeReader = new ChannelReaderInputViewIterator<PT>(this.currentSpilledProbeSide,
        returnQueue, memory, this.availableMemory, this.probeSideSerializer, p.getProbeSideBlockCount());
      this.probeIterator.set(probeReader);

      // unregister the pending partition
      this.partitionsPending.remove(0);
      this.currentRecursionDepth = p.getRecursionLevel() + 1;
     
      // recursively get the next
      return nextRecord();
    }
    else {
      // no more data
      return false;
    }
  }
 
  /**
   * Moves the table to the next probe record that falls into an in-memory partition, setting
   * the bucket iterator for it.
   *
   * @return True, if there is a next record to process; false, if all inputs and pending
   *         partitions are exhausted.
   * @throws IOException Thrown, if an I/O problem occurs while reading a spilled partition.
   */
  public boolean nextRecord() throws IOException {
   
    final boolean probeProcessing = processProbeIter();
    if(probeProcessing) {
      return true;
    }
    return prepareNextPartition();
  }
 
  public HashBucketIterator<BT, PT> getMatchesFor(PT record) throws IOException
  {
    final TypeComparator<PT> probeAccessors = this.probeSideComparator;
    final int hash = hash(probeAccessors.hash(record), this.currentRecursionDepth);
    final int posHashCode = hash % this.numBuckets;
   
    // get the bucket for the given hash code
    final int bucketArrayPos = posHashCode >> this.bucketsPerSegmentBits;
    final int bucketInSegmentOffset = (posHashCode & this.bucketsPerSegmentMask) << NUM_INTRA_BUCKET_BITS;
    final MemorySegment bucket = this.buckets[bucketArrayPos];
   
    // get the basic characteristics of the bucket
    final int partitionNumber = bucket.get(bucketInSegmentOffset + HEADER_PARTITION_OFFSET);
    final HashPartition<BT, PT> p = this.partitionsBeingBuilt.get(partitionNumber);
   
    // for an in-memory partition, set the reference and return the bucket iterator; spilled partitions cannot be probed this way
    if (p.isInMemory()) {
      this.recordComparator.setReference(record);
      this.bucketIterator.set(bucket, p.overflowSegments, p, hash, bucketInSegmentOffset);
      return this.bucketIterator;
    }
    else {
      throw new IllegalStateException("Method is not applicable to partially spilled hash tables.");
    }
  }
 
//  public LazyHashBucketIterator<BT, PT> getLazyMatchesFor(PT record) throws IOException
//  {
//    final TypeComparator<PT> probeAccessors = this.probeSideComparator;
//    final int hash = hash(probeAccessors.hash(record), this.currentRecursionDepth);
//    final int posHashCode = hash % this.numBuckets;
//   
//    // get the bucket for the given hash code
//    final int bucketArrayPos = posHashCode >> this.bucketsPerSegmentBits;
//    final int bucketInSegmentOffset = (posHashCode & this.bucketsPerSegmentMask) << NUM_INTRA_BUCKET_BITS;
//    final MemorySegment bucket = this.buckets[bucketArrayPos];
//   
//    // get the basic characteristics of the bucket
//    final int partitionNumber = bucket.get(bucketInSegmentOffset + HEADER_PARTITION_OFFSET);
//    final HashPartition<BT, PT> p = this.partitionsBeingBuilt.get(partitionNumber);
//   
//    // for an in-memory partition, process set the return iterators, else spill the probe records
//    if (p.isInMemory()) {
//      this.recordComparator.setReference(record);
//      this.lazyBucketIterator.set(bucket, p.overflowSegments, p, hash, bucketInSegmentOffset);
//      return this.lazyBucketIterator;
//    }
//    else {
//      throw new IllegalStateException("Method is not applicable to partially spilled hash tables.");
//    }
//  }
 
  /**
   * @return The current record from the probe side.
   */
  public PT getCurrentProbeRecord() {
    return this.probeIterator.getCurrent();
  }
 
  /**
   * @return The iterator over the build-side records that match the current probe record.
   */
  public HashBucketIterator<BT, PT> getBuildSideIterator() {
    return this.bucketIterator;
  }

  public MutableObjectIterator<BT> getPartitionEntryIterator() {
    return new HashPartitionIterator<BT, PT>(this.partitionsBeingBuilt.iterator(), this.buildSideSerializer);
  }
 
  /**
   * Closes the hash table. This effectively releases all internal structures and closes all
   * open files and removes them. The call to this method is valid both as a cleanup after the
   * complete inputs were properly processed, and as a cancellation call, which cleans up
   * all resources that are currently held by the hash join.
   */
  public void close() {
    // make sure that we close only once
    if (!this.closed.compareAndSet(false, true)) {
      return;
    }
   
    // clear the iterators, so the next call to next() will notice
    this.bucketIterator = null;
    this.probeIterator = null;
   
    // release the table structure
    releaseTable();
   
    // clear the memory in the partitions
    clearPartitions();
   
    // clear the current probe side channel, if there is one
    if (this.currentSpilledProbeSide != null) {
      try {
        this.currentSpilledProbeSide.closeAndDelete();
      }
      catch (Throwable t) {
        LOG.warn("Could not close and delete the temp file for the current spilled partition probe side.", t);
      }
    }
   
    // clear the partitions that are still to be done (that have files on disk)
    for (int i = 0; i < this.partitionsPending.size(); i++) {
      final HashPartition<BT, PT> p = this.partitionsPending.get(i);
      p.clearAllMemory(this.availableMemory);
    }
   
    // return the write-behind buffers
    for (int i = 0; i < this.numWriteBehindBuffers + this.writeBehindBuffersAvailable; i++) {
      try {
        this.availableMemory.add(this.writeBehindBuffers.take());
      }
      catch (InterruptedException iex) {
        throw new RuntimeException("Hashtable closing was interrupted");
      }
    }
  }
 
  public void abort() {
    this.running = false;
  }
 
  public List<MemorySegment> getFreedMemory() {
    if (!this.closed.get()) {
      throw new IllegalStateException("Cannot return memory while join is open.");
    }
   
    return this.availableMemory;
  }
 
  // ------------------------------------------------------------------------
  //                       Hash Table Building
  // ------------------------------------------------------------------------
 
  /**
   * Creates the initial hash table and reads the complete build side input into it, gradually
   * spilling partitions when memory runs out.
   *
   * @param input The build side input.
   * @throws IOException Thrown, if an I/O problem occurs while spilling a partition.
   */
  protected void buildInitialTable(final MutableObjectIterator<BT> input)
  throws IOException
  {
    // create the partitions
    final int partitionFanOut = getPartitioningFanOutNoEstimates(this.availableMemory.size());
    if (partitionFanOut > MAX_NUM_PARTITIONS) {
      throw new RuntimeException("Hash join partitions estimate exeeds maximum number of partitions.");
    }
    createPartitions(partitionFanOut, 0);
   
    // set up the table structure. the write behind buffers are taken away, as are one buffer per partition
    final int numBuckets = getInitialTableSize(this.availableMemory.size(), this.segmentSize,
      partitionFanOut, this.avgRecordLen);
    initTable(numBuckets, (byte) partitionFanOut);
   
    final TypeComparator<BT> buildTypeComparator = this.buildSideComparator;
    BT record = this.buildSideSerializer.createInstance();

    // go over the complete input and insert every element into the hash table
    while (this.running && ((record = input.next(record)) != null)) {
      final int hashCode = hash(buildTypeComparator.hash(record), 0);
      insertIntoTable(record, hashCode);
    }
   
    if (!this.running) {
      return;
    }

    // finalize the partitions
    for (int i = 0; i < this.partitionsBeingBuilt.size(); i++) {
      HashPartition<BT, PT> p = this.partitionsBeingBuilt.get(i);
      p.finalizeBuildPhase(this.ioManager, this.currentEnumerator, this.writeBehindBuffers);
    }
  }
 
  /**
   * Re-reads a previously spilled partition and builds a hash table from it, either entirely
   * in memory or by partitioning it further.
   *
   * @param p The spilled partition to process.
   * @throws IOException Thrown, if an I/O problem occurs while reading the partition back in.
   */
  protected void buildTableFromSpilledPartition(final HashPartition<BT, PT> p) throws IOException {
   
    final int nextRecursionLevel = p.getRecursionLevel() + 1;
    if (nextRecursionLevel > MAX_RECURSION_DEPTH) {
      throw new RuntimeException("Hash join exceeded maximum number of recursions, without reducing "
        + "partitions enough to be memory resident. Probably cause: Too many duplicate keys.");
    }
   
    // we distinguish two cases here:
    // 1) The partition fits entirely into main memory. That is the case if we have enough buffers for
    //    all partition segments, plus enough buffers to hold the table structure.
    //    --> We read the partition in as it is and create a hashtable that references only
    //        that single partition.
    // 2) We cannot guarantee that enough memory segments are available. In that case, we read
    //    the partition in, distributing its data among newly created (sub-)partitions.
    final int totalBuffersAvailable = this.availableMemory.size() + this.writeBehindBuffersAvailable;
    if (totalBuffersAvailable != this.totalNumBuffers - this.numWriteBehindBuffers) {
      throw new RuntimeException("Hash Join bug in memory management: Memory buffers leaked.");
    }
   
    long numBuckets = (p.getBuildSideRecordCount() * RECORD_TABLE_BYTES) / (HASH_BUCKET_SIZE - BUCKET_HEADER_LENGTH) + 1;
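    // for example (hypothetical figures), 100,000 build-side records give
    // (100,000 * 12) / (128 - 16) + 1 = 10,715 buckets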
   
    // we need to consider the worst case where everything hashes to one bucket which needs to overflow by the same
    // number of total buckets again.
    final long totalBuffersNeeded = (numBuckets * 2) / (this.bucketsPerSegmentMask + 1) + p.getBuildSideBlockCount() + 1;
   
    if (totalBuffersNeeded < totalBuffersAvailable) {
      // we are guaranteed to stay in memory
      ensureNumBuffersReturned(p.getBuildSideBlockCount());
     
      // first read the partition in
      final BulkBlockChannelReader reader = this.ioManager.createBulkBlockChannelReader(p.getBuildSideChannel().getChannelID(),
        this.availableMemory, p.getBuildSideBlockCount());
      // call waits until all is read
      if (keepBuildSidePartitions && p.recursionLevel == 0) {
        reader.close(); // keep the partitions
      } else {
        reader.closeAndDelete();
      }
     
      final List<MemorySegment> partitionBuffers = reader.getFullSegments();
      final HashPartition<BT, PT> newPart = new HashPartition<BT, PT>(this.buildSideSerializer, this.probeSideSerializer,
          0, nextRecursionLevel, partitionBuffers, p.getBuildSideRecordCount(), this.segmentSize, p.getLastSegmentLimit());
     
      this.partitionsBeingBuilt.add(newPart);
     
      // erect the buckets
      initTable((int) numBuckets, (byte) 1);
     
      // now, index the partition through a hash table
      final HashPartition<BT, PT>.PartitionIterator pIter = newPart.getPartitionIterator(this.buildSideComparator);
      BT record = this.buildSideSerializer.createInstance();
     
      while ((record = pIter.next(record)) != null) {
        final int hashCode = hash(pIter.getCurrentHashCode(), nextRecursionLevel);
        final int posHashCode = hashCode % this.numBuckets;
        final long pointer = pIter.getPointer();
       
        // get the bucket for the given hash code
        final int bucketArrayPos = posHashCode >> this.bucketsPerSegmentBits;
        final int bucketInSegmentPos = (posHashCode & this.bucketsPerSegmentMask) << NUM_INTRA_BUCKET_BITS;
        final MemorySegment bucket = this.buckets[bucketArrayPos];
       
        insertBucketEntry(newPart, bucket, bucketInSegmentPos, hashCode, pointer);
      }
    }
    else {
      // we need to partition and partially spill
      final int avgRecordLenPartition = (int) (((long) p.getBuildSideBlockCount()) *
          this.segmentSize / p.getBuildSideRecordCount());
     
      final int bucketCount = (int) (((long) totalBuffersAvailable) * RECORD_TABLE_BYTES /
          (avgRecordLenPartition + RECORD_OVERHEAD_BYTES));
     
      // compute into how many splits we would need to partition the result
      final int splits = (int) (totalBuffersNeeded / totalBuffersAvailable) + 1;
      final int partitionFanOut = Math.min(10 * splits /* being conservative */, MAX_NUM_PARTITIONS);
     
      createPartitions(partitionFanOut, nextRecursionLevel);
     
      // set up the table structure. the write behind buffers are taken away, as are one buffer per partition
      initTable(bucketCount, (byte) partitionFanOut);
     
      // go over the complete input and insert every element into the hash table
      // first set up the reader with some memory.
      final List<MemorySegment> segments = new ArrayList<MemorySegment>(2);
      segments.add(getNextBuffer());
      segments.add(getNextBuffer());
     
      final BlockChannelReader inReader = this.ioManager.createBlockChannelReader(p.getBuildSideChannel().getChannelID());
      final ChannelReaderInputView inView = new HeaderlessChannelReaderInputView(inReader, segments,
            p.getBuildSideBlockCount(), p.getLastSegmentLimit(), false);
      final ChannelReaderInputViewIterator<BT> inIter = new ChannelReaderInputViewIterator<BT>(inView,
          this.availableMemory, this.buildSideSerializer);
      final TypeComparator<BT> btComparator = this.buildSideComparator;
      BT rec = this.buildSideSerializer.createInstance();
      while ((rec = inIter.next(rec)) != null)
      { 
        final int hashCode = hash(btComparator.hash(rec), nextRecursionLevel);
        insertIntoTable(rec, hashCode);
      }

      // finalize the partitions
      for (int i = 0; i < this.partitionsBeingBuilt.size(); i++) {
        HashPartition<BT, PT> part = this.partitionsBeingBuilt.get(i);
        part.finalizeBuildPhase(this.ioManager, this.currentEnumerator, this.writeBehindBuffers);
      }
    }
  }
 
  /**
   * Inserts the given record into the hash table, writing it into its partition's buffers and,
   * for an in-memory partition, adding an entry to the corresponding bucket.
   *
   * @param record The record to insert.
   * @param hashCode The hash code of the record.
   * @throws IOException Thrown, if an I/O problem occurs while spilling a partition.
   */
  protected final void insertIntoTable(final BT record, final int hashCode) throws IOException {
    final int posHashCode = hashCode % this.numBuckets;
   
    // get the bucket for the given hash code
    final int bucketArrayPos = posHashCode >> this.bucketsPerSegmentBits;
    final int bucketInSegmentPos = (posHashCode & this.bucketsPerSegmentMask) << NUM_INTRA_BUCKET_BITS;
    final MemorySegment bucket = this.buckets[bucketArrayPos];
   
    // get the basic characteristics of the bucket
    final int partitionNumber = bucket.get(bucketInSegmentPos + HEADER_PARTITION_OFFSET);
   
    // get the partition descriptor for the bucket
    if (partitionNumber < 0 || partitionNumber >= this.partitionsBeingBuilt.size()) {
      throw new RuntimeException("Error: Hash structures in Hash-Join are corrupt. Invalid partition number for bucket.");
    }
    final HashPartition<BT, PT> p = this.partitionsBeingBuilt.get(partitionNumber);
   
    // --------- Step 1: Get the partition for this pair and put the pair into the buffer ---------
   
    long pointer = p.insertIntoBuildBuffer(record);
    if (pointer != -1) {
      // record was inserted into an in-memory partition. a pointer must be inserted into the buckets
      insertBucketEntry(p, bucket, bucketInSegmentPos, hashCode, pointer);
    }
  }
 
  /**
   * Inserts a (hash code, pointer) pair into the given bucket, chaining a new overflow bucket
   * if the bucket and its existing overflow buckets are full.
   *
   * @param p The partition that the bucket belongs to.
   * @param bucket The memory segment containing the bucket.
   * @param bucketInSegmentPos The offset of the bucket within the segment.
   * @param hashCode The hash code to store.
   * @param pointer The pointer to the record to store.
   * @throws IOException Thrown, if an I/O problem occurs while spilling a partition.
   */
  final void insertBucketEntry(final HashPartition<BT, PT> p, final MemorySegment bucket,
      final int bucketInSegmentPos, final int hashCode, final long pointer)
  throws IOException
  {
    // find the position to put the hash code and pointer
    final int count = bucket.getShort(bucketInSegmentPos + HEADER_COUNT_OFFSET);
    if (count < NUM_ENTRIES_PER_BUCKET)
    {
      // we are good in our current bucket, put the values
      bucket.putInt(bucketInSegmentPos + BUCKET_HEADER_LENGTH + (count * HASH_CODE_LEN), hashCode); // hash code
      bucket.putLong(bucketInSegmentPos + BUCKET_POINTER_START_OFFSET + (count * POINTER_LEN), pointer); // pointer
      bucket.putShort(bucketInSegmentPos + HEADER_COUNT_OFFSET, (short) (count + 1)); // update count
    }
    else {
      // we need to go to the overflow buckets
      final long originalForwardPointer = bucket.getLong(bucketInSegmentPos + HEADER_FORWARD_OFFSET);
      final long forwardForNewBucket;
     
      if (originalForwardPointer != BUCKET_FORWARD_POINTER_NOT_SET) {
       
        // forward pointer set
        final int overflowSegNum = (int) (originalForwardPointer >>> 32);
        final int segOffset = (int) (originalForwardPointer & 0xffffffff);
        final MemorySegment seg = p.overflowSegments[overflowSegNum];
       
        final short obCount = seg.getShort(segOffset + HEADER_COUNT_OFFSET);
       
        // check if there is space in this overflow bucket
        if (obCount < NUM_ENTRIES_PER_BUCKET) {
          // space in this bucket and we are done
          seg.putInt(segOffset + BUCKET_HEADER_LENGTH + (obCount * HASH_CODE_LEN), hashCode); // hash code
          seg.putLong(segOffset + BUCKET_POINTER_START_OFFSET + (obCount * POINTER_LEN), pointer); // pointer
          seg.putShort(segOffset + HEADER_COUNT_OFFSET, (short) (obCount + 1)); // update count
          return;
        }
        else {
          // no space here, we need a new bucket. this current overflow bucket will be the
          // target of the new overflow bucket
          forwardForNewBucket = originalForwardPointer;
        }
      }
      else {
        // no overflow bucket yet, so we need a first one
        forwardForNewBucket = BUCKET_FORWARD_POINTER_NOT_SET;
      }
     
      // we need a new overflow bucket
      MemorySegment overflowSeg;
      final int overflowBucketNum;
      final int overflowBucketOffset;
     
     
      // first, see if there is space for an overflow bucket remaining in the last overflow segment
      if (p.nextOverflowBucket == 0) {
        // no space left in last bucket, or no bucket yet, so create an overflow segment
        overflowSeg = getNextBuffer();
        if (overflowSeg == null) {
          // no memory available to create overflow bucket. we need to spill a partition
          final int spilledPart = spillPartition();
          if (spilledPart == p.getPartitionNumber()) {
            // this bucket is no longer in-memory
            return;
          }
          overflowSeg = getNextBuffer();
          if (overflowSeg == null) {
            throw new RuntimeException("Bug in HybridHashJoin: No memory became available after spilling a partition.");
          }
        }
        overflowBucketOffset = 0;
        overflowBucketNum = p.numOverflowSegments;
       
        // add the new overflow segment
        if (p.overflowSegments.length <= p.numOverflowSegments) {
          MemorySegment[] newSegsArray = new MemorySegment[p.overflowSegments.length * 2];
          System.arraycopy(p.overflowSegments, 0, newSegsArray, 0, p.overflowSegments.length);
          p.overflowSegments = newSegsArray;
        }
        p.overflowSegments[p.numOverflowSegments] = overflowSeg;
        p.numOverflowSegments++;
      }
      else {
        // there is space in the last overflow bucket
        overflowBucketNum = p.numOverflowSegments - 1;
        overflowSeg = p.overflowSegments[overflowBucketNum];
        overflowBucketOffset = p.nextOverflowBucket << NUM_INTRA_BUCKET_BITS;
      }
     
      // next overflow bucket is one ahead. if the segment is full, the next will be at the beginning
      // of a new segment
      p.nextOverflowBucket = (p.nextOverflowBucket == this.bucketsPerSegmentMask ? 0 : p.nextOverflowBucket + 1);
     
      // insert the new overflow bucket in the chain of buckets
      // 1) set the old forward pointer
      // 2) let the bucket in the main table point to this one
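      // (the pointer encodes the overflow segment index in the upper 32 bits and the byte
      // offset within that segment in the lower 32 bits)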
      overflowSeg.putLong(overflowBucketOffset + HEADER_FORWARD_OFFSET, forwardForNewBucket);
      final long pointerToNewBucket = (((long) overflowBucketNum) << 32) | ((long) overflowBucketOffset);
      bucket.putLong(bucketInSegmentPos + HEADER_FORWARD_OFFSET, pointerToNewBucket);
     
      // finally, insert the values into the overflow buckets
      overflowSeg.putInt(overflowBucketOffset + BUCKET_HEADER_LENGTH, hashCode); // hash code
      overflowSeg.putLong(overflowBucketOffset + BUCKET_POINTER_START_OFFSET, pointer); // pointer
     
      // set the count to one
      overflowSeg.putShort(overflowBucketOffset + HEADER_COUNT_OFFSET, (short) 1);
    }
  }
 
  // --------------------------------------------------------------------------------------------
  //                          Setup and Tear Down of Structures
  // --------------------------------------------------------------------------------------------
 
  /**
   * Returns a new inMemoryPartition object.
   * This is required as a plug for ReOpenableMutableHashTable.
   */
  protected HashPartition<BT, PT> getNewInMemoryPartition(int number, int recursionLevel) {
    return new HashPartition<BT, PT>(this.buildSideSerializer, this.probeSideSerializer,
        number, recursionLevel, this.availableMemory.remove(this.availableMemory.size() - 1),
        this, this.segmentSize);
  }
 
  /**
   * Creates the given number of partitions for the given recursion level.
   *
   * @param numPartitions The number of partitions to create.
   * @param recursionLevel The recursion level of the new partitions.
   */
  protected void createPartitions(int numPartitions, int recursionLevel) {
    // sanity check
    ensureNumBuffersReturned(numPartitions);
   
    this.currentEnumerator = this.ioManager.createChannelEnumerator();
   
    this.partitionsBeingBuilt.clear();
    for (int i = 0; i < numPartitions; i++) {
      HashPartition<BT, PT> p = getNewInMemoryPartition(i, recursionLevel);
      this.partitionsBeingBuilt.add(p);
    }
  }
 
  /**
   * This method clears all partitions currently residing (partially) in memory. It releases all memory
   * and deletes all spilled partitions.
   * <p>
   * This method is intended for a hard cleanup in the case that the join is aborted.
   */
  protected void clearPartitions() {
    for (int i = this.partitionsBeingBuilt.size() - 1; i >= 0; --i) {
      final HashPartition<BT, PT> p = this.partitionsBeingBuilt.get(i);
      try {
        p.clearAllMemory(this.availableMemory);
      } catch (Exception e) {
        LOG.error("Error during partition cleanup.", e);
      }
    }
    this.partitionsBeingBuilt.clear();
  }
 
  /**
   * Initializes the bucket table with the given number of buckets, assigning each bucket to
   * one of the given number of partitions.
   *
   * @param numBuckets The number of buckets to create.
   * @param numPartitions The number of partitions to distribute the buckets across.
   */
  protected void initTable(int numBuckets, byte numPartitions) {
    final int bucketsPerSegment = this.bucketsPerSegmentMask + 1;
    final int numSegs = (numBuckets >>> this.bucketsPerSegmentBits) + ( (numBuckets & this.bucketsPerSegmentMask) == 0 ? 0 : 1);
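    // numSegs = ceil(numBuckets / bucketsPerSegment)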
    final MemorySegment[] table = new MemorySegment[numSegs];
   
    ensureNumBuffersReturned(numSegs);
   
    // go over all segments that are part of the table
    for (int i = 0, bucket = 0; i < numSegs && bucket < numBuckets; i++) {
      final MemorySegment seg = getNextBuffer();
     
      // go over all buckets in the segment
      for (int k = 0; k < bucketsPerSegment && bucket < numBuckets; k++, bucket++) {
        final int bucketOffset = k * HASH_BUCKET_SIZE; 
       
        // compute the partition that the bucket corresponds to
        final byte partition = assignPartition(bucket, numPartitions);
       
        // initialize the header fields
        seg.put(bucketOffset + HEADER_PARTITION_OFFSET, partition);
        seg.put(bucketOffset + HEADER_STATUS_OFFSET, BUCKET_STATUS_IN_MEMORY);
        seg.putShort(bucketOffset + HEADER_COUNT_OFFSET, (short) 0);
        seg.putLong(bucketOffset + HEADER_FORWARD_OFFSET, BUCKET_FORWARD_POINTER_NOT_SET);
      }
     
      table[i] = seg;
    }
    this.buckets = table;
    this.numBuckets = numBuckets;
  }
 
  /**
   * Releases the table (the array of buckets) and returns the occupied memory segments to the list of free segments.
   */
  protected void releaseTable() {
    // set the counters back
    this.numBuckets = 0;
   
    if (this.buckets != null) {
      for (int i = 0; i < this.buckets.length; i++) {
        this.availableMemory.add(this.buckets[i]);
      }
      this.buckets = null;
    }
  }
 
  // --------------------------------------------------------------------------------------------
  //                                    Memory Handling
  // --------------------------------------------------------------------------------------------
 
  /**
   * Selects a partition and spills it. The number of the spilled partition is returned.
   *
   * @return The number of the spilled partition.
   */
  protected int spillPartition() throws IOException {
    // find the largest partition
    ArrayList<HashPartition<BT, PT>> partitions = this.partitionsBeingBuilt;
    int largestNumBlocks = 0;
    int largestPartNum = -1;
   
    for (int i = 0; i < partitions.size(); i++) {
      HashPartition<BT, PT> p = partitions.get(i);
      if (p.isInMemory() && p.getBuildSideBlockCount() > largestNumBlocks) {
        largestNumBlocks = p.getBuildSideBlockCount();
        largestPartNum = i;
      }
    }
    if (largestPartNum < 0) {
      // all partitions have been spilled already; nothing can be freed
      throw new IOException("Hash join: no in-memory partition left to spill.");
    }
    final HashPartition<BT, PT> p = partitions.get(largestPartNum);
   
    // spill the partition
    int numBuffersFreed = p.spillPartition(this.availableMemory, this.ioManager,
                    this.currentEnumerator.next(), this.writeBehindBuffers);
    this.writeBehindBuffersAvailable += numBuffersFreed;
    // grab as many buffers as are available directly
    MemorySegment currBuff = null;
    while (this.writeBehindBuffersAvailable > 0 && (currBuff = this.writeBehindBuffers.poll()) != null) {
      this.availableMemory.add(currBuff);
      this.writeBehindBuffersAvailable--;
    }
    return largestPartNum;
  }
 
  /**
   * This method makes sure that at least a certain number of memory segments is in the list of free segments.
   * Free memory can be in the list of free segments, or in the return-queue where segments used to write behind are
   * put. The number of segments that are in that return-queue, but are actually reclaimable is tracked. This method
   * makes sure at least a certain number of buffers is reclaimed.
   * 
   * @param minRequiredAvailable The minimum number of buffers that needs to be reclaimed.
   */
  final void ensureNumBuffersReturned(final int minRequiredAvailable) {
    if (minRequiredAvailable > this.availableMemory.size() + this.writeBehindBuffersAvailable) {
      throw new IllegalArgumentException("More buffers requested available than totally available.");
    }
   
    try {
      while (this.availableMemory.size() < minRequiredAvailable) {
        this.availableMemory.add(this.writeBehindBuffers.take());
        this.writeBehindBuffersAvailable--;
      }
    }
    catch (InterruptedException iex) {
      throw new RuntimeException("Hash Join was interrupted.");
    }
  }
 
  /**
   * Gets the next buffer to be used with the hash-table, either for an in-memory partition, or for the
   * table buckets. This method returns <tt>null</tt>, if no more buffer is available. Spilling a partition
   * may free new buffers then.
   *
   * @return The next buffer to be used by the hash-table, or null, if no buffer remains.
   */
  final MemorySegment getNextBuffer() {
    // check if the list directly offers memory
    int s = this.availableMemory.size();
    if (s > 0) {
      return this.availableMemory.remove(s-1);
    }
   
    // check if there are write behind buffers that actually are to be used for the hash table
    if (this.writeBehindBuffersAvailable > 0) {
      // grab at least one, no matter what
      MemorySegment toReturn;
      try {
        toReturn = this.writeBehindBuffers.take();
      }
      catch (InterruptedException iex) {
        throw new RuntimeException("Hybrid Hash Join was interrupted while taking a buffer.");
      }
      this.writeBehindBuffersAvailable--;
     
      // grab as many more buffers as are available directly
      MemorySegment currBuff = null;
      while (this.writeBehindBuffersAvailable > 0 && (currBuff = this.writeBehindBuffers.poll()) != null) {
        this.availableMemory.add(currBuff);
        this.writeBehindBuffersAvailable--;
      }
      return toReturn;
    } else {
      // no memory available
      return null;
    }
  }
 

  @Override
  public MemorySegment nextSegment() {
    final MemorySegment seg = getNextBuffer();
    if (seg == null) {
      try {
        spillPartition();
      } catch (IOException ioex) {
        throw new RuntimeException("Error spilling Hash Join Partition" + (ioex.getMessage() == null ?
          "." : ": " + ioex.getMessage()), ioex);
      }
     
      MemorySegment fromSpill = getNextBuffer();
      if (fromSpill == null) {
        throw new RuntimeException("BUG in Hybrid Hash Join: Spilling did not free a buffer.");
      } else {
        return fromSpill;
      }
    } else {
      return seg;
    }
  }

  // --------------------------------------------------------------------------------------------
  //                             Utility Computational Functions
  // --------------------------------------------------------------------------------------------
 
  /**
   * Determines the number of buffers to be used for asynchronous write behind. It is currently
   * computed as the logarithm of the number of buffers to the base 4, rounded up, minus 2.
   * The upper limit for the number of write behind buffers is six.
   *
   * @param numBuffers The number of available buffers.
   * @return The number of buffers to use for write behind.
   */
  public static final int getNumWriteBehindBuffers(int numBuffers) {
    int numIOBufs = (int) (Math.log(numBuffers) / Math.log(4) - 1.5);
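    // e.g., 64 buffers: (int) (3.0 - 1.5) = 1; 32,768 buffers: (int) (7.5 - 1.5) = 6 (the cap)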
    return numIOBufs > 6 ? 6 : numIOBufs;
  }
 
  /**
   * Gets the number of partitions to be used for an initial hash-table, when no estimates are
   * available.
   * <p>
   * The current logic makes sure that there are always between 10 and 127 partitions, and
   * close to one tenth of the number of buffers. For example, 330 buffers yield 33 partitions,
   * while anything below 100 buffers yields the minimum of 10.
   *
   * @param numBuffers The number of buffers available.
   * @return The number of partitions to use.
   */
  public static final int getPartitioningFanOutNoEstimates(int numBuffers) {
    return Math.max(10, Math.min(numBuffers / 10, MAX_NUM_PARTITIONS));
  }
 
  public static final int getInitialTableSize(int numBuffers, int bufferSize, int numPartitions, int recordLenBytes) {
    // ----------------------------------------------------------------------------------------
    // the following observations hold:
    // 1) If the records are assumed to be very large, then many buffers need to go to the partitions
    //    and fewer to the table
    // 2) If the records are small, then comparatively many have to go to the buckets, and fewer to the
    //    partitions
    // 3) If the bucket-table is chosen too small, we will eventually get many collisions and will grow the
    //    hash table, incrementally adding buffers.
    // 4) If the bucket-table is chosen to be large and we actually need more buffers for the partitions, we
    //    cannot subtract them afterwards from the table
    //
    // ==> We start with a comparatively small hash-table. We aim for a 200% utilization of the bucket table
    //     when all the partition buffers are full. Most likely, that will cause some buckets to be re-hashed
    //     and grab additional buffers away from the partitions.
    // NOTE: This decision may be subject to changes after conclusive experiments!
    // ----------------------------------------------------------------------------------------
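    // Example (hypothetical figures): 128 buffers of 32 KB each and 24 byte records give
    // totalSize = 4,194,304 B, numRecordsStorable = 4,194,304 / 36 = 116,508,
    // bucketBytes = 116,508 * 12 = 1,398,096, and numBuckets = 1,398,096 / 256 + 1 = 5,462.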
   
    final long totalSize = ((long) bufferSize) * numBuffers;
    final long numRecordsStorable = totalSize / (recordLenBytes + RECORD_OVERHEAD_BYTES);
    final long bucketBytes = numRecordsStorable * RECORD_TABLE_BYTES;
    final long numBuckets = bucketBytes / (2 * HASH_BUCKET_SIZE) + 1;
   
    return numBuckets > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) numBuckets;
  }
 
  /**
   * Assigns a partition to a bucket.
   *
   * @param bucket The bucket index.
   * @param numPartitions The number of partitions.
   * @return The partition number for the bucket.
   */
  public static final byte assignPartition(int bucket, byte numPartitions) {
    return (byte) (bucket % numPartitions);
  }
 
  /**
   * This function hashes an integer value. It is adapted from Bob Jenkins' website
   * <a href="http://www.burtleburtle.net/bob/hash/integer.html">http://www.burtleburtle.net/bob/hash/integer.html</a>.
   * The hash function has the <i>full avalanche</i> property, meaning that every bit of the value to be hashed
   * affects every bit of the hash value.
   *
   * @param code The integer to be hashed.
   * @param level The recursion level, used to rotate the input so that successive recursion levels hash differently.
   * @return The non-negative hash code for the integer.
   */
  public static final int hash(int code, int level) {
    final int rotation = level * 11;
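    // rotate the code left by (11 * level) bits, so that each recursion level hashes a different
    // permutation of the input bits; Java masks shift distances, so for rotation in [1, 31],
    // (code >>> -rotation) == (code >>> (32 - rotation))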
   
    code = (code << rotation) | (code >>> -rotation);

    code = (code + 0x7ed55d16) + (code << 12);
    code = (code ^ 0xc761c23c) ^ (code >>> 19);
    code = (code + 0x165667b1) + (code << 5);
    code = (code + 0xd3a2646c) ^ (code << 9);
    code = (code + 0xfd7046c5) + (code << 3);
    code = (code ^ 0xb55a4f09) ^ (code >>> 16);
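    // fold negative results into the non-negative range; -(code + 1) maps Integer.MIN_VALUE to Integer.MAX_VALUE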
    return code >= 0 ? code : -(code + 1);
  }

  public TypeComparator<PT> getProbeSideComparator() {
    return this.probeSideComparator;
  }
 
  // ======================================================================================================
 
  /**
   * An iterator over the entries of one hash bucket (including its overflow buckets) that
   * returns the build-side records which compare equal to the current probe-side reference.
   */
  public static class HashBucketIterator<BT, PT> implements MutableObjectIterator<BT> {
   
    private final TypeSerializer<BT> accessor;
   
    private final TypePairComparator<PT, BT> comparator;
   
    private MemorySegment bucket;
   
    private MemorySegment[] overflowSegments;
   
    private HashPartition<BT, PT> partition;
   
    private int bucketInSegmentOffset;
   
    private int searchHashCode;
   
    private int posInSegment;
   
    private int countInSegment;
   
    private int numInSegment;
   
    private int originalBucketInSegmentOffset;
   
    private MemorySegment originalBucket;
   
    private long lastPointer;
   
   
    HashBucketIterator(TypeSerializer<BT> accessor, TypePairComparator<PT, BT> comparator) {
      this.accessor = accessor;
      this.comparator = comparator;
    }
   
   
    void set(MemorySegment bucket, MemorySegment[] overflowSegments, HashPartition<BT, PT> partition,
        int searchHashCode, int bucketInSegmentOffset)
    {
      this.bucket = bucket;
      this.originalBucket = bucket;
      this.overflowSegments = overflowSegments;
      this.partition = partition;
      this.searchHashCode = searchHashCode;
      this.bucketInSegmentOffset = bucketInSegmentOffset;
      this.originalBucketInSegmentOffset = bucketInSegmentOffset;
     
      this.posInSegment = this.bucketInSegmentOffset + BUCKET_HEADER_LENGTH;
      this.countInSegment = bucket.getShort(bucketInSegmentOffset + HEADER_COUNT_OFFSET);
      this.numInSegment = 0;
    }
   

    public BT next(BT reuse) {
      // loop over all segments that are involved in the bucket (original bucket plus overflow buckets)
      while (true) {
       
        while (this.numInSegment < this.countInSegment) {
         
          final int thisCode = this.bucket.getInt(this.posInSegment);
          this.posInSegment += HASH_CODE_LEN;
           
          // check if the hash code matches
          if (thisCode == this.searchHashCode) {
            // get the pointer to the pair
            final long pointer = this.bucket.getLong(this.bucketInSegmentOffset +
                          BUCKET_POINTER_START_OFFSET + (this.numInSegment * POINTER_LEN));
            this.numInSegment++;
           
            // deserialize the key to check whether it is really equal, or whether we had only a hash collision
            try {
              this.partition.setReadPosition(pointer);
              reuse = this.accessor.deserialize(reuse, this.partition);
              if (this.comparator.equalToReference(reuse)) {
                this.lastPointer = pointer;
                return reuse;
              }
            }
            catch (IOException ioex) {
              throw new RuntimeException("Error deserializing key or value from the hashtable: " +
                  ioex.getMessage(), ioex);
            }
          }
          else {
            this.numInSegment++;
          }
        }
       
        // this segment is done. check if there is another chained bucket
        final long forwardPointer = this.bucket.getLong(this.bucketInSegmentOffset + HEADER_FORWARD_OFFSET);
        if (forwardPointer == BUCKET_FORWARD_POINTER_NOT_SET) {
          return null;
        }
       
        final int overflowSegNum = (int) (forwardPointer >>> 32);
        this.bucket = this.overflowSegments[overflowSegNum];
        this.bucketInSegmentOffset = (int) (forwardPointer & 0xffffffff);
        this.countInSegment = this.bucket.getShort(this.bucketInSegmentOffset + HEADER_COUNT_OFFSET);
        this.posInSegment = this.bucketInSegmentOffset + BUCKET_HEADER_LENGTH;
        this.numInSegment = 0;
      }
    }
   
    public void writeBack(BT value) throws IOException {
      final SeekableDataOutputView outView = this.partition.getWriteView();
      outView.setWritePosition(this.lastPointer);
      this.accessor.serialize(value, outView);
    }
   
    public void reset() {
      this.bucket = this.originalBucket;
      this.bucketInSegmentOffset = this.originalBucketInSegmentOffset;
     
      this.posInSegment = this.bucketInSegmentOffset + BUCKET_HEADER_LENGTH;
      this.countInSegment = bucket.getShort(bucketInSegmentOffset + HEADER_COUNT_OFFSET);
      this.numInSegment = 0;
    }

  } // end HashBucketIterator
 

  // ======================================================================================================
 
//  public static final class LazyHashBucketIterator<BT, PT> {
//   
//    private final TypePairComparator<PT, BT> comparator;
//   
//    private MemorySegment bucket;
//   
//    private MemorySegment[] overflowSegments;
//   
//    private HashPartition<BT, PT> partition;
//   
//    private int bucketInSegmentOffset;
//   
//    private int searchHashCode;
//   
//    private int posInSegment;
//   
//    private int countInSegment;
//   
//    private int numInSegment;
//   
//    private LazyHashBucketIterator(TypePairComparator<PT, BT> comparator) {
//      this.comparator = comparator;
//    }
//   
//   
//    void set(MemorySegment bucket, MemorySegment[] overflowSegments, HashPartition<BT, PT> partition,
//        int searchHashCode, int bucketInSegmentOffset) {
//     
//      this.bucket = bucket;
//      this.overflowSegments = overflowSegments;
//      this.partition = partition;
//      this.searchHashCode = searchHashCode;
//      this.bucketInSegmentOffset = bucketInSegmentOffset;
//     
//      this.posInSegment = this.bucketInSegmentOffset + BUCKET_HEADER_LENGTH;
//      this.countInSegment = bucket.getShort(bucketInSegmentOffset + HEADER_COUNT_OFFSET);
//      this.numInSegment = 0;
//    }
//
//    public boolean next(BT target) {
//      // loop over all segments that are involved in the bucket (original bucket plus overflow buckets)
//      while (true) {
//       
//        while (this.numInSegment < this.countInSegment) {
//         
//          final int thisCode = this.bucket.getInt(this.posInSegment);
//          this.posInSegment += HASH_CODE_LEN;
//           
//          // check if the hash code matches
//          if (thisCode == this.searchHashCode) {
//            // get the pointer to the pair
//            final long pointer = this.bucket.getLong(this.bucketInSegmentOffset +
//                          BUCKET_POINTER_START_OFFSET + (this.numInSegment * POINTER_LEN));
//            this.numInSegment++;
//             
//            // check whether it is really equal, or whether we had only a hash collision
//            LazyDeSerializable lds = (LazyDeSerializable) target;
//            lds.setDeSerializer(this.partition, this.partition.getWriteView(), pointer);
//            if (this.comparator.equalToReference(target)) {
//              return true;
//            }
//          }
//          else {
//            this.numInSegment++;
//          }
//        }
//       
//        // this segment is done. check if there is another chained bucket
//        final long forwardPointer = this.bucket.getLong(this.bucketInSegmentOffset + HEADER_FORWARD_OFFSET);
//        if (forwardPointer == BUCKET_FORWARD_POINTER_NOT_SET) {
//          return false;
//        }
//       
//        final int overflowSegNum = (int) (forwardPointer >>> 32);
//        this.bucket = this.overflowSegments[overflowSegNum];
//        this.bucketInSegmentOffset = (int) (forwardPointer & 0xffffffff);
//        this.countInSegment = this.bucket.getShort(this.bucketInSegmentOffset + HEADER_COUNT_OFFSET);
//        this.posInSegment = this.bucketInSegmentOffset + BUCKET_HEADER_LENGTH;
//        this.numInSegment = 0;
//      }
//    }
//  }
 

  // ======================================================================================================
 
  public static final class ProbeIterator<PT> {
   
    private MutableObjectIterator<PT> source;
   
    private PT instance;
   
   
    ProbeIterator(MutableObjectIterator<PT> source, PT instance) {
      this.instance = instance;
      set(source);
    }
   
    void set(MutableObjectIterator<PT> source) {
      this.source = source;
    }
   
    public PT next() throws IOException {
      PT retVal = this.source.next(this.instance);
      if (retVal != null) {
        this.instance = retVal;
        return retVal;
      } else {
        return null;
      }
    }
   
    public PT getCurrent() {
      return this.instance;
    }
  }
}