package com.salesforce.phoenix.cache.aggcache;
import java.io.IOException;
import java.nio.BufferOverflowException;
import java.nio.MappedByteBuffer;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.hbase.util.Bytes;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import com.salesforce.hbase.index.util.ImmutableBytesPtr;
/**
 * Class implements an active spilled partition. Serialized tuples are first written into an in-memory data structure
 * that represents a single page. As the page fills up, it is written to the current spillFile or spill partition. For
 * fast tuple discovery, the class maintains a per-page bloom filter and never de-serializes elements. The element
 * spilling employs an extendible hashing technique.
 */
public class SpillMap extends AbstractMap<ImmutableBytesPtr, byte[]> implements Iterable<byte[]> {

    // Threshold is typically the page size
    private final int thresholdBytes;
    // Expected number of inserts per page; sizes the per-page bloom filter
    private final int pageInserts;
    // Global directory depth; directory size is always (1 << globalDepth)
    private int globalDepth;
    // Directory index whose bucket is currently paged into memory.
    // Invariant: at most one bucket is paged in at any time to bound memory use.
    private int curMapBufferIndex;
    private SpillFile spillFile;
    // Directory of hash buckets --> extendible hashing implementation.
    // Multiple directory slots may point to the same bucket (n:1) until that bucket splits.
    private MappedByteBufferMap[] directory;
    // Query-level cache consulted during redistribution to drop keys already held in memory
    private final SpillableGroupByCache.QueryCache cache;

    /**
     * Creates a spill map backed by the given spill file, starting with a directory of
     * depth 1 (two buckets), with bucket 0 paged into memory.
     *
     * @param file           backing spill file that hands out memory-mapped pages
     * @param thresholdBytes page size in bytes; usable payload is reduced by one int
     *                       reserved for the per-page element-count header
     * @param estValueSize   estimated serialized size of a single value, used to derive
     *                       the expected insert count for the bloom filter
     * @param cache          in-memory query cache checked while redistributing elements
     * @throws IOException if the initial pages cannot be created or flushed
     */
    public SpillMap(SpillFile file, int thresholdBytes, int estValueSize, SpillableGroupByCache.QueryCache cache)
            throws IOException {
        // Reserve room for the per-page element-count header (a single int)
        this.thresholdBytes = thresholdBytes - Bytes.SIZEOF_INT;
        this.pageInserts = thresholdBytes / estValueSize;
        this.spillFile = file;
        this.cache = cache;

        // Init the extendible-hashing directory structure
        globalDepth = 1;
        directory = new MappedByteBufferMap[(1 << globalDepth)];

        for (int i = 0; i < directory.length; i++) {
            // Create an empty bucket list
            directory[i] = new MappedByteBufferMap(i, this.thresholdBytes, pageInserts, file);
            directory[i].flushBuffer();
        }
        // Establish the one-page-in-memory invariant with bucket 0
        directory[0].pageIn();
        curMapBufferIndex = 0;
    }

    /**
     * Returns the directory index for a specific key by masking the key's hash
     * down to the low {@code globalDepth} bits.
     */
    private int getBucketIndex(ImmutableBytesPtr key) {
        // Get key hash
        int hashCode = key.hashCode();
        // Mask all but globalDepth low n bits
        return hashCode & ((1 << globalDepth) - 1);
    }

    // Function redistributes the elements in the current index
    // to two new buckets, based on the bit at localDepth + 1 position.
    // Optionally this function also doubles the directory to allow
    // for bucket splits.
    // Precondition: the bucket at 'index' is the currently paged-in bucket,
    // so its pageMap holds the elements to be redistributed.
    private void redistribute(int index, ImmutableBytesPtr keyNew, byte[] valueNew) {
        // Get the respective bucket
        MappedByteBufferMap byteMap = directory[index];
        // Get the actual bucket index, that the directory index points to
        int mappedIdx = byteMap.pageIndex;
        int localDepth = byteMap.localDepth;

        ArrayList<Integer> buckets = Lists.newArrayList();
        // Get all directory entries that point to the same bucket.
        // TODO: can be made faster!
        for (int i = 0; i < directory.length; i++) {
            if (directory[i].pageIndex == mappedIdx) {
                buckets.add(i);
            }
        }

        // Assuming no directory doubling for now
        // compute the two new bucket Ids for splitting
        // SpillFile adds new files dynamically in case the directory points to pageIDs
        // that exceed the size limit of a single file.
        // TODO verify if some sort of de-fragmentation might be helpful
        // The sibling index differs from 'index' only in the bit at position localDepth.
        int tmpIndex = index ^ ((1 << localDepth));
        int b1Index = Math.min(index, tmpIndex);
        int b2Index = Math.max(index, tmpIndex);

        // Create two new split buckets
        MappedByteBufferMap b1 = new MappedByteBufferMap(b1Index, thresholdBytes, pageInserts, spillFile);
        MappedByteBufferMap b2 = new MappedByteBufferMap(b2Index, thresholdBytes, pageInserts, spillFile);

        // redistribute old elements into b1 and b2
        for (Entry<ImmutableBytesPtr, byte[]> element : byteMap.pageMap.entrySet()) {
            ImmutableBytesPtr key = element.getKey();
            byte[] value = element.getValue();
            // Only add key during redistribution if its not in the cache
            // Otherwise this is a good point to reduce the number of spilled elements
            if (!cache.isKeyContained(key)) {
                // Re-distribute element onto the new 2 split buckets using the
                // newly significant hash bit at position localDepth
                if ((key.hashCode() & ((1 << localDepth))) != 0) {
                    b2.addElement(null, key, value);
                } else {
                    b1.addElement(null, key, value);
                }
            }
        }

        // Clear and GC the old now redistributed bucket
        byteMap.pageMap.clear();
        byteMap = null;

        // Increase local bucket depths
        b1.localDepth = localDepth + 1;
        b2.localDepth = localDepth + 1;

        boolean doubleDir = false;
        if (globalDepth < (localDepth + 1)) {
            // Double directory structure and re-adjust pointers
            doubleDir = true;
            b2Index = doubleDirectory(b2Index, keyNew);
        }

        if (!doubleDir) {
            // This is a bit more tricky, we have to cover scenarios where
            // globalDepth - localDepth > 1
            // Here even after bucket splitting, multiple directory entries point to
            // the new buckets
            for (int i = 0; i < buckets.size(); i++) {
                if ((buckets.get(i) & (1 << (localDepth))) != 0) {
                    directory[buckets.get(i)] = b2;
                } else {
                    directory[buckets.get(i)] = b1;
                }
            }
        } else {
            // Update the directory indexes in case of directory doubling
            directory[b1Index] = b1;
            directory[b2Index] = b2;
        }
    }

    // Doubles the directory and readjusts pointers.
    // Returns the recomputed directory index for the new "high" split bucket,
    // derived from keyNew's hash under the old depth with the new top bit set.
    private int doubleDirectory(int b2Index, ImmutableBytesPtr keyNew) {
        // Double the directory in size, second half points to original first half
        int newDirSize = 1 << (globalDepth + 1);
        // Ensure that the new directory size does not exceed size limits
        // NOTE(review): once globalDepth + 1 reaches 31, the left shift overflows to a
        // negative int which still satisfies this check — the guard does not actually
        // catch overflow. TODO confirm the intended maximum directory size.
        Preconditions.checkArgument(newDirSize < Integer.MAX_VALUE);

        // Double it!
        MappedByteBufferMap[] newDirectory = new MappedByteBufferMap[newDirSize];
        for (int i = 0; i < directory.length; i++) {
            newDirectory[i] = directory[i];
            newDirectory[i + directory.length] = directory[i];
        }
        directory = newDirectory;
        newDirectory = null;

        // Adjust the index for new split bucket, according to the directory double
        b2Index = (keyNew.hashCode() & ((1 << globalDepth) - 1)) | (1 << globalDepth);

        // Increment global depth
        globalDepth++;

        return b2Index;
    }

    /**
     * Get a key from the spillable data structures. page is determined via hash partitioning, and a bloomFilter check
     * is used to determine if its worth paging in the data.
     */
    @Override
    public byte[] get(Object key) {
        if (!(key instanceof ImmutableBytesPtr)) {
            // TODO ... work on type safety
            // NOTE(review): this guard is empty and falls through to an unchecked cast,
            // so a non-ImmutableBytesPtr key throws ClassCastException instead of
            // returning null as Map.get conventionally would — confirm intended.
        }
        ImmutableBytesPtr ikey = (ImmutableBytesPtr)key;
        byte[] value = null;

        int bucketIndex = getBucketIndex(ikey);
        MappedByteBufferMap byteMap = directory[bucketIndex];

        // Decision based on bucket ID, not the directory ID due to the n:1 relationship
        if (directory[curMapBufferIndex].pageIndex != byteMap.pageIndex) {
            // map not paged in
            MappedByteBufferMap curByteMap = directory[curMapBufferIndex];

            // Use bloomFilter to check if key was spilled before; skips a page-in
            // for keys that were definitely never written to this bucket
            if (byteMap.containsKey(ikey.copyBytesIfNecessary())) {
                // ensure consistency and flush current memory page to disk
                // fflush current buffer
                curByteMap.flushBuffer();
                // page in new buffer
                byteMap.pageIn();
                // update index
                curMapBufferIndex = bucketIndex;
            }
        }
        // get KV from current map
        value = byteMap.getPagedInElement(ikey);
        return value;
    }

    // Similar as get(Object key) function, however
    // always pages in page a key is spilled to, no bloom filter decision.
    // Used by put(), which needs the target page in memory regardless of
    // whether the key was spilled before.
    private byte[] getAlways(ImmutableBytesPtr key) {
        byte[] value = null;
        int bucketIndex = getBucketIndex(key);
        MappedByteBufferMap byteMap = directory[bucketIndex];

        if (directory[curMapBufferIndex].pageIndex != byteMap.pageIndex) {
            MappedByteBufferMap curByteMap = directory[curMapBufferIndex];

            // ensure consistency and flush current memory page to disk
            curByteMap.flushBuffer();

            byteMap.pageIn();
            curMapBufferIndex = bucketIndex;
        }
        // get KV from current queue
        value = byteMap.getPagedInElement(key);
        return value;
    }

    /**
     * Spill a key First we discover if the key has been spilled before and load it into memory: #ref get() if it was
     * loaded before just replace the old value in the memory page if it was not loaded before try to store it in the
     * current page alternatively if not enough memory available, request new page.
     *
     * @param key   key to spill
     * @param value serialized value to store
     * @return the value passed in (per Map.put convention this would be the previous
     *         value; here the new value is returned instead)
     */
    @Override
    public byte[] put(ImmutableBytesPtr key, byte[] value) {
        boolean redistributed = false;
        // page in element and replace if present
        byte[] spilledValue = getAlways(key);

        MappedByteBufferMap byteMap = directory[curMapBufferIndex];
        int index = curMapBufferIndex;

        // TODO: We split buckets until the new element fits onto a
        // one of the new buckets. Might consider the use of an overflow
        // bucket, especially in case the directory runs out of page IDs.
        while (!byteMap.canFit(spilledValue, value)) {
            // Element does not fit... Split the bucket!
            redistribute(index, key, value);
            redistributed = true;

            // Re-hash under the (possibly deeper) directory to find the new target bucket
            index = getBucketIndex(key);
            byteMap = directory[index];
        }
        // Ensure that all pages that were paged in during redistribution are flushed back out
        // to disk to keep memory footprint small.
        if (redistributed) {
            for (int i = 0; i < directory.length; i++) {
                if (directory[i].pageIndex != byteMap.pageIndex) {
                    directory[i].flushBuffer();
                }
            }
            // Ensure the page that receives the new key is in memory
            spilledValue = getAlways(key);
        }
        byteMap.addElement(spilledValue, key, value);

        return value;
    }

    /**
     * Function returns the current spill file
     */
    public SpillFile getSpillFile() {
        return spillFile;
    }

    /**
     * This inner class represents the currently mapped file region. It uses a Map to represent the current in memory
     * page for easy get() and update() calls on an individual key The class keeps track of the current size of the in
     * memory page and handles flushing and paging in respectively
     */
    private static class MappedByteBufferMap {
        private SpillFile spillFile;
        // Page ID within the spill file; doubles as the bucket identity for the
        // n:1 directory-to-bucket mapping
        private int pageIndex;
        private final int thresholdBytes;
        // Running byte size of the in-memory page contents (values + length prefixes)
        private long totalResultSize;
        // True while pageMap reflects the on-disk page contents
        private boolean pagedIn;
        // Number of hash bits this bucket discriminates on (extendible hashing)
        private int localDepth;
        // dirtyPage flag tracks if a paged in page was modified
        // if not, no need to flush it back out to disk
        private boolean dirtyPage;
        // Use a map for in memory page representation
        Map<ImmutableBytesPtr, byte[]> pageMap = Maps.newHashMap();
        // Used to determine is an element was written to this page before or not.
        // May return false positives but never false negatives.
        BloomFilter<byte[]> bFilter;

        public MappedByteBufferMap(int id, int thresholdBytes, int pageInserts, SpillFile spillFile) {
            this.spillFile = spillFile;
            // size threshold of a page
            this.thresholdBytes = thresholdBytes;
            this.pageIndex = id;
            pageMap.clear();
            bFilter = BloomFilter.create(Funnels.byteArrayFunnel(), pageInserts);
            pagedIn = true;
            totalResultSize = 0;
            localDepth = 1;
            dirtyPage = true;
        }

        // Bloom-filter membership test: false means the key was definitely never
        // written to this page; true means it may have been.
        private boolean containsKey(byte[] key) {
            return bFilter.mightContain(key);
        }

        // Returns true if newValue (replacing curValue if non-null) still fits
        // within this page's byte threshold.
        private boolean canFit(byte[] curValue, byte[] newValue) {
            if (thresholdBytes < newValue.length) {
                // TODO resize page size if single element is too big,
                // Can this ever happen?
                throw new RuntimeException("page size too small to store a single KV element");
            }

            int resultSize = newValue.length + Bytes.SIZEOF_INT;
            if (curValue != null) {
                // Key existed before
                // Ensure to compensate for potential larger byte[] for agg
                resultSize = Math.max(0, resultSize - (curValue.length + Bytes.SIZEOF_INT));
            }

            if ((thresholdBytes - totalResultSize) <= (resultSize)) {
                // KV does not fit
                return false;
            }
            // KV fits
            return true;
        }

        // Flush the current page to the memory mapped byte buffer.
        // Page layout: [elementCount:int] then per element [length:int][bytes].
        // Always clears the in-memory map and marks the page as paged out.
        private void flushBuffer() throws BufferOverflowException {
            if (pagedIn) {
                MappedByteBuffer buffer;
                // Only flush if page was changed
                if (dirtyPage) {
                    Collection<byte[]> values = pageMap.values();
                    buffer = spillFile.getPage(pageIndex);
                    buffer.clear();
                    // number of elements
                    buffer.putInt(values.size());
                    for (byte[] value : values) {
                        // element length
                        buffer.putInt(value.length);
                        // element
                        buffer.put(value, 0, value.length);
                    }
                }
                buffer = null;
                // Reset page stats
                pageMap.clear();
                totalResultSize = 0;
            }
            pagedIn = false;
            dirtyPage = false;
        }

        // load memory mapped region into a map for fast element access.
        // No-op if the page is already in memory.
        private void pageIn() throws IndexOutOfBoundsException {
            if (!pagedIn) {
                // Map the memory region
                MappedByteBuffer buffer = spillFile.getPage(pageIndex);
                int numElements = buffer.getInt();
                for (int i = 0; i < numElements; i++) {
                    int kvSize = buffer.getInt();
                    byte[] data = new byte[kvSize];
                    buffer.get(data, 0, kvSize);
                    try {
                        // The key is re-derived from the serialized element itself
                        pageMap.put(SpillManager.getKey(data), data);
                        totalResultSize += (data.length + Bytes.SIZEOF_INT);
                    } catch (IOException ioe) {
                        // Error during key access on spilled resource
                        // TODO rework error handling
                        throw new RuntimeException(ioe);
                    }
                }
                pagedIn = true;
                dirtyPage = false;
            }
        }

        /**
         * Return a cache element currently paged into memory Direct access via mapped page map
         *
         * @param key key to look up in the in-memory page
         * @return the serialized value, or null if not present in this page
         */
        public byte[] getPagedInElement(ImmutableBytesPtr key) {
            return pageMap.get(key);
        }

        /**
         * Inserts / Replaces cache element in the currently loaded page. Direct access via mapped page map
         *
         * @param spilledValue previous value for this key if it was already spilled, else null
         * @param key          element key
         * @param value        serialized value to store
         */
        public void addElement(byte[] spilledValue, ImmutableBytesPtr key, byte[] value) {

            // put Element into map
            pageMap.put(key, value);
            // Update bloom filter
            bFilter.put(key.copyBytesIfNecessary());
            // track current Map size to prevent Buffer overflows
            if (spilledValue != null) {
                // if previous key was present, just add the size difference
                // NOTE(review): when the new value is smaller, the clamp to 0 means
                // totalResultSize is never decreased — conservative over-estimate.
                totalResultSize += Math.max(0, value.length - (spilledValue.length));
            } else {
                // Add new size information
                totalResultSize += (value.length + Bytes.SIZEOF_INT);
            }

            dirtyPage = true;
        }

        /**
         * Returns a value iterator over the pageMap, paging the bucket in first if needed.
         */
        public Iterator<byte[]> getPageMapEntries() {
            pageIn();
            return pageMap.values().iterator();
        }
    }

    /**
     * Iterate over all spilled elements, including the ones that are currently paged into memory.
     * Walks the directory in order, paging buckets in one at a time and clearing each previous
     * bucket's in-memory map; duplicate directory slots pointing at the same bucket are skipped
     * via the visited-pageIndex set.
     */
    @Override
    public Iterator<byte[]> iterator() {
        // Flush the active page so every bucket's state is on disk before the walk
        directory[curMapBufferIndex].flushBuffer();

        return new Iterator<byte[]>() {
            int pageIndex = 0;
            Iterator<byte[]> entriesIter = directory[pageIndex].getPageMapEntries();
            // pageIndex values of buckets already visited (handles n:1 directory slots)
            HashSet<Integer> dups = new HashSet<Integer>();

            @Override
            public boolean hasNext() {
                if (!entriesIter.hasNext()) {
                    boolean found = false;
                    // Clear in memory map
                    while (!found) {
                        pageIndex++;
                        if (pageIndex >= directory.length) { return false; }
                        // Release the previous bucket's in-memory page
                        directory[pageIndex - 1].pageMap.clear();
                        // get keys from all spilled pages
                        if (!dups.contains(directory[pageIndex].pageIndex)) {
                            dups.add(directory[pageIndex].pageIndex);
                            entriesIter = directory[pageIndex].getPageMapEntries();
                            if (entriesIter.hasNext()) {
                                found = true;
                            }
                        }
                    }
                }
                dups.add(directory[pageIndex].pageIndex);
                return true;
            }

            @Override
            public byte[] next() {
                // get elements from in memory map first
                return entriesIter.next();
            }

            @Override
            public void remove() {
                // NOTE(review): Iterator convention is UnsupportedOperationException;
                // IllegalAccessError is an Error and unusual here — confirm before changing.
                throw new IllegalAccessError("Iterator does not support removal operation");
            }
        };
    }

    // TODO implement this method to make the SpillMap a true Map implementation
    @Override
    public Set<java.util.Map.Entry<ImmutableBytesPtr, byte[]>> entrySet() {
        throw new IllegalAccessError("entrySet is not supported for this type of cache");
    }
}