/**
* Copyright 2005-2012 Akiban Technologies, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.persistit;

import static com.persistit.util.Util.NS_PER_S;

import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicLongArray;
import java.util.concurrent.locks.ReentrantLock;

import com.persistit.JournalManager.PageNode;
import com.persistit.exception.InUseException;
import com.persistit.exception.InvalidPageAddressException;
import com.persistit.exception.InvalidPageStructureException;
import com.persistit.exception.PersistitException;
import com.persistit.exception.PersistitIOException;
import com.persistit.exception.PersistitInterruptedException;
import com.persistit.exception.RetryException;
import com.persistit.exception.VolumeClosedException;
import com.persistit.util.Debug;
import com.persistit.util.Util;

/**
* A pool of {@link Buffer} objects, maintained on various lists that permit
* rapid lookup and replacement of page images within <code>Buffer</code>s.
*
* @author peter
* @version 2.1
*/
public class BufferPool {
    /**
     * Default PageWriter polling interval
     */
    private final static long DEFAULT_WRITER_POLL_INTERVAL = 5000;

    private final static int PAGE_WRITER_TRANCHE_SIZE = 5000;

    /**
     * Sleep time when buffers are exhausted
     */
    private final static long RETRY_SLEEP_TIME = 50;

    /**
     * The ratio of hash table slots per buffer in this pool
     */
    private final static int HASH_MULTIPLE = 3;

    /**
     * Minimum number of buffers this pool may have
     */
    public final static int MINIMUM_POOL_COUNT = 7;
    /**
     * Maximum number of buffers this pool may have
     */
    public final static int MAXIMUM_POOL_COUNT = Integer.MAX_VALUE;

    /**
     * The maximum number of lock buckets
     */
    private final static int HASH_LOCKS = 4096;

    /**
     * Ratio determines which of two volume invalidation algorithms to invoke.
     */
    private final static float SMALL_VOLUME_RATIO = 0.1f;

    /**
     * Ratio of age-based write priority bump
     */
    private final static int WRITE_AGE_THRESHOLD_RATIO = 4;

    private final static String INVENTORY_TREE_NAME = "_buffers";
    /**
     * Maximum number of buffer inventory versions to retain
     */
    private final static int INVENTORY_VERSIONS = 3;

    /**
     * Preload log message interval, in nanoseconds
     */
    private final static long INVENTORY_PRELOAD_LOG_MESSAGE_NS = 60L * NS_PER_S;

    /**
     * The Persistit instance that references this BufferPool.
     */
    private final Persistit _persistit;

    /**
     * Hash table - fast access to buffer by hash of address.
     */
    private final Buffer[] _hashTable;

    /**
     * Locks used to lock hashtable entries.
     */
    private final ReentrantLock[] _hashLocks;

    /**
     * All Buffers in this pool
     */
    private final Buffer[] _buffers;
    /**
     * Count of Buffers allocated to this pool.
     */
    private final int _bufferCount;

    /**
     * Size of each buffer
     */
    private final int _bufferSize;

    /**
     * Bit map for invalidated pages. Elements in this array, one bit per page,
     * indicate buffers that have been invalidated and are therefore able to be
     * allocated without evicting a valid page.
     */
    private final AtomicLongArray _availablePagesBits;

    private final AtomicBoolean _availablePages = new AtomicBoolean();

    /**
     * The maximum number of keys allowed in Buffers in this pool
     */
    private final int _maxKeys;

    /**
     * Pointer to next location to look for a replacement buffer
     */
    private final AtomicInteger _clock = new AtomicInteger();

    /**
     * Count of buffer pool misses (buffer not found in pool)
     */
    private final AtomicLong _missCounter = new AtomicLong();

    /**
     * Count of buffer pool hits (buffer found in pool)
     */
    private final AtomicLong _hitCounter = new AtomicLong();

    /**
     * Count of newly created pages
     */
    private final AtomicLong _newCounter = new AtomicLong();

    /**
     * Count of valid buffers evicted to make room for another page.
     */
    private final AtomicLong _evictCounter = new AtomicLong();

    /**
     * Count of dirty pages
     */
    private final AtomicInteger _dirtyPageCount = new AtomicInteger();

    /**
     * Count of pages written from this pool
     */
    private final AtomicLong _writeCounter = new AtomicLong();
    /**
     * Count of pages written due to being dirty when selected by the buffer
     * allocator.
     */
    private final AtomicLong _forcedWriteCounter = new AtomicLong();

    /**
     * Count of pages written due to being dirty before a checkpoint
     */
    private final AtomicLong _forcedCheckpointWriteCounter = new AtomicLong();
    /**
     * Indicates that Persistit has closed this buffer pool.
     */
    private final AtomicBoolean _closed = new AtomicBoolean(false);

    /**
     * Oldest update timestamp found during PAGE_WRITER's most recent scan.
     */
    private volatile long _earliestDirtyTimestamp = Long.MIN_VALUE;

    /**
     * Timestamp to which all dirty pages should be written. PAGE_WRITER writes
     * any page with a lower update timestamp regardless of urgency.
     */
    private final AtomicLong _flushTimestamp = new AtomicLong();

    /**
     * Polling interval for PageWriter
     */
    private volatile long _writerPollInterval = DEFAULT_WRITER_POLL_INTERVAL;

    private volatile int _pageWriterTrancheSize = PAGE_WRITER_TRANCHE_SIZE;

    /**
     * The PAGE_WRITER IOTaskRunnable
     */
    private PageWriter _writer;

    /**
     * Construct a BufferPool with the specified count of <code>Buffer</code>s
     * of the specified size.
     *
     * @param count
     *            The number of buffers in the pool
     * @param size
     *            The size (in bytes) of each buffer
     */
    BufferPool(final int count, final int size, final Persistit persistit) {
        _persistit = persistit;
        if (count < MINIMUM_POOL_COUNT) {
            throw new IllegalArgumentException("Buffer pool count too small: " + count);
        }
        if (count > MAXIMUM_POOL_COUNT) {
            throw new IllegalArgumentException("Buffer pool count too large: " + count);
        }

        int possibleSize = Buffer.MIN_BUFFER_SIZE;
        boolean ok = false;
        while (!ok && possibleSize <= Buffer.MAX_BUFFER_SIZE) {
            if (size == possibleSize)
                ok = true;
            else
                possibleSize *= 2;
        }
        if (!ok)
            throw new IllegalArgumentException("Invalid buffer size requested: " + size);

        _bufferCount = count;
        _bufferSize = size;
        _buffers = new Buffer[_bufferCount];
        _availablePagesBits = new AtomicLongArray((count + 63) / 64);
        _hashTable = new Buffer[_bufferCount * HASH_MULTIPLE];
        _hashLocks = new ReentrantLock[HASH_LOCKS];
        _maxKeys = (_bufferSize - Buffer.HEADER_SIZE) / Buffer.MAX_KEY_RATIO;

        for (int index = 0; index < HASH_LOCKS; index++) {
            _hashLocks[index] = new ReentrantLock();
        }

        int buffers = 0;
        //
        // Allocate this here so that in the event of an OOME we can release it
        // to free enough memory to write the error information out.
        //
        byte[] reserve = new byte[1024 * 1024];
        try {
            for (int index = 0; index < _bufferCount; index++) {
                final Buffer buffer = new Buffer(size, index, this, _persistit);
                _buffers[index] = buffer;
                buffers++;
            }
        } catch (final OutOfMemoryError e) {
            //
            // Note: written this way to try to avoid another OOME.
            // Do not use String.format here.
            //
            reserve = null;
            System.err.print("Out of memory with ");
            System.err.print(Runtime.getRuntime().freeMemory());
            System.err.print(" bytes free after creating ");
            System.err.print(buffers);
            System.err.print("/");
            System.err.print(_bufferCount);
            System.err.print(" buffers from maximum heap ");
            System.err.println(_persistit.getAvailableHeap());
            throw e;
        }
        _writer = new PageWriter();
    }

    void startThreads() throws PersistitException {
        _writer.start();
    }

    void close() {
        _closed.set(true);
        _persistit.waitForIOTaskStop(_writer);
        _writer = null;
    }

    /**
     * Abruptly stop (using {@link Thread#stop()}) the PAGE_WRITER thread. This
     * method should be used only by tests.
     */
    void crash() {
        IOTaskRunnable.crash(_writer);
    }

    void flush(final long timestamp) throws PersistitInterruptedException {
        setFlushTimestamp(timestamp);
        _writer.kick();
        while (isFlushing()) {
            Util.sleep(RETRY_SLEEP_TIME);
        }
    }

    boolean isFlushing() {
        return _flushTimestamp.get() != 0;
    }

    int hashIndex(final Volume vol, final long page) {
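        // Combine the page address with the Volume's hash code, mask to a
        // non-negative value, and reduce modulo the hash table size. Callers
        // serialize access to a hash chain by locking
        // _hashLocks[hashIndex % HASH_LOCKS].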
        return (int) (((page ^ vol.hashCode()) & Integer.MAX_VALUE) % _hashTable.length);
    }

    int countInUse(final Volume vol, final boolean writer) {
        int count = 0;
        for (int i = 0; i < _bufferCount; i++) {
            final Buffer buffer = _buffers[i];
            if ((vol == null || buffer.getVolume() == vol)
                    && ((buffer.getStatus() & SharedResource.CLAIMED_MASK) != 0 && (!writer || (buffer.getStatus() & SharedResource.WRITER_MASK) != 0))) {
                count++;
            }
        }
        return count;
    }

    void populateBufferPoolInfo(final ManagementImpl.BufferPoolInfo info) {
        info.bufferCount = _bufferCount;
        info.bufferSize = _bufferSize;
        info.missCount = _missCounter.get();
        info.hitCount = _hitCounter.get();
        info.newCount = _newCounter.get();
        info.evictCount = _evictCounter.get();
        info.dirtyPageCount = _dirtyPageCount.get();
        info.writeCount = _writeCounter.get();
        info.forcedCheckpointWriteCount = _forcedCheckpointWriteCounter.get();
        info.forcedWriteCount = _forcedWriteCounter.get();
        int validPages = 0;
        int readerClaimedPages = 0;
        int writerClaimedPages = 0;

        for (int index = 0; index < _bufferCount; index++) {
            final Buffer buffer = _buffers[index];
            final int status = buffer.getStatus();
            if ((status & SharedResource.VALID_MASK) != 0)
                validPages++;
            if ((status & SharedResource.WRITER_MASK) != 0)
                writerClaimedPages++;
            else if ((status & SharedResource.CLAIMED_MASK) != 0)
                readerClaimedPages++;
        }
        info.validPageCount = validPages;
        info.readerClaimedPageCount = readerClaimedPages;
        info.writerClaimedPageCount = writerClaimedPages;
        info.earliestDirtyTimestamp = getEarliestDirtyTimestamp();

        info.updateAcquisitonTime();
    }

    int populateInfo(final ManagementImpl.BufferInfo[] array, final int traversalType, final int includeMask,
            final int excludeMask) {
        int index = 0;
        switch (traversalType) {
        case 0:
            for (int i = 0; i < _bufferCount; i++) {
                final Buffer buffer = _buffers[i];
                if (selected(buffer, includeMask, excludeMask)) {
                    populateInfo1(array, index, buffer);
                    index++;
                }
            }
            break;

        default:
            index = -1;
            break;
        }
        return index;
    }

    private static void populateInfo1(final ManagementImpl.BufferInfo[] array, final int index, final Buffer buffer) {
        if (index < array.length) {
            if (array[index] == null)
                array[index] = new ManagementImpl.BufferInfo();
            buffer.populateInfo(array[index]);
        }
    }

    private boolean selected(final Buffer buffer, final int includeMask, final int excludeMask) {
        return ((includeMask == 0) || (buffer.getStatus() & includeMask) != 0)
                && (buffer.getStatus() & excludeMask) == 0;
    }

    /**
     * @return Size (in bytes) of each <code>Buffer</code> managed by this pool.
     */
    public int getBufferSize() {
        return _bufferSize;
    }

    /**
     * @return The count of <code>Buffer</code>s managed by this pool.
     */
    public int getBufferCount() {
        return _bufferCount;
    }

    /**
     * @return The count of lookup operations for page images in this pool that
     *         required a physical read operation. This number, in comparison
     *         with the hit counter, indicates how effective the cache is in
     *         reducing disk I/O.
     */
    public long getMissCounter() {
        return _missCounter.get();
    }

    /**
     * @return The count of lookup operations for page images in this pool for
     *         which the page image was already found in this
     *         <code>BufferPool</code>. This number, in comparison with the get
     *         counter, indicates how effective the cache is in reducing disk
     *         I/O.
     */
    public long getHitCounter() {
        return _hitCounter.get();
    }

    /**
     * @return The count of buffers newly created in this pool. Each time a new
     *         page is added to a Volume, this counter is incremented.
     */
    public long getNewCounter() {
        return _newCounter.get();
    }

    /**
     * This counter is incremented each time the eviction algorithm selects a
     * dirty buffer to evict. Normally dirty pages are written by the background
     * PAGE_WRITER thread, and therefore an abnormally large forcedWrite count
     * indicates the PAGE_WRITER thread is falling behind.
     *
     * @return The count of buffers written to disk when evicted.
     */
    public long getForcedWriteCounter() {
        return _forcedWriteCounter.get();
    }

    /**
     * This counter is incremented each time an application modifies a buffer
     * that is (a) dirty, and (b) required to be written as part of a
     * checkpoint. An abnormally large count indicates that the PAGE_WRITER
     * thread is falling behind.
     *
     * @return The count of buffers written to disk due to a checkpoint.
     */
    public long getForcedCheckpointWriteCounter() {
        return _forcedCheckpointWriteCounter.get();
    }

    /**
     * Resets the miss, hit, new, and evict counters to zero.
     */
    public void resetCounters() {
        _missCounter.set(0);
        _hitCounter.set(0);
        _newCounter.set(0);
        _evictCounter.set(0);
    }

    int getMaxKeys() {
        return _maxKeys;
    }

    private void bumpHitCounter() {
        _hitCounter.incrementAndGet();
    }

    private void bumpMissCounter() {
        _missCounter.incrementAndGet();
    }

    private void bumpNewCounter() {
        _newCounter.incrementAndGet();
    }

    void bumpWriteCounter() {
        _writeCounter.incrementAndGet();
    }

    void bumpForcedCheckpointWrites() {
        _forcedCheckpointWriteCounter.incrementAndGet();
    }

    /**
     * Get the "hit ratio" - the number of hits divided by the number of overall
     * gets. A value close to 1.0 indicates that most attempts to find data in
     * the <code>BufferPool</code> are successful - i.e., that the cache is
     * effectively reducing the need for disk read operations.
     *
     * @return The ratio
     */
    public double getHitRatio() {
        final long hitCounter = _hitCounter.get();
        final long getCounter = hitCounter + _missCounter.get() + _newCounter.get();
        if (getCounter == 0)
            return 0.0;
        else
            return ((double) hitCounter) / ((double) getCounter);
    }

    void incrementDirtyPageCount() {
        _dirtyPageCount.incrementAndGet();
    }

    void decrementDirtyPageCount() {
        _dirtyPageCount.decrementAndGet();
    }

    int getDirtyPageCount() {
        return _dirtyPageCount.get();
    }

    /**
     * Invalidate all buffers from a specified Volume.
     *
     * @param volume
     *            The volume
     * @throws PersistitInterruptedException
     */
    boolean invalidate(final Volume volume) throws PersistitException {
        final float ratio = (float) volume.getStorage().getNextAvailablePage() / (float) _bufferCount;
        if (ratio < SMALL_VOLUME_RATIO) {
            return invalidateSmallVolume(volume, false);
        } else {
            return invalidateLargeVolume(volume, false);
        }
    }

    boolean evict(final Volume volume) throws PersistitException {
        return invalidateSmallVolume(volume, true);
    }

    boolean invalidateSmallVolume(final Volume volume, final boolean mustWrite) throws PersistitException {
        boolean result = true;
        int markedAvailable = 0;
        for (long page = 1; page < volume.getStorage().getNextAvailablePage(); page++) {
            final int hashIndex = hashIndex(volume, page);
            _hashLocks[hashIndex % HASH_LOCKS].lock();
            try {
                for (Buffer buffer = _hashTable[hashIndex]; buffer != null; buffer = buffer.getNext()) {
                    if ((buffer.getVolume() == volume || volume == null) && !buffer.isFixed() && buffer.isValid()) {
                        if (buffer.claim(true, 0)) {
                            // re-check after claim
                            boolean invalidated = false;
                            try {
                                if ((buffer.getVolume() == volume || volume == null) && !buffer.isFixed()
                                        && buffer.isValid()) {
                                    if (mustWrite && buffer.isDirty()) {
                                        buffer.writePage();
                                    }
                                    invalidate(buffer);
                                    invalidated = true;
                                }
                            } finally {
                                buffer.release();
                            }
                            if (invalidated) {
                                final int q = buffer.getIndex() / 64;
                                final int p = buffer.getIndex() % 64;
                                final long bits = _availablePagesBits.get(q);
                                if (_availablePagesBits.compareAndSet(q, bits, bits | (1L << p))) {
                                    markedAvailable++;
                                }
                            }
                        } else {
                            result = false;
                        }
                    }
                }
            } finally {
                _hashLocks[hashIndex % HASH_LOCKS].unlock();
            }
        }
        if (markedAvailable > 0) {
            _availablePages.set(true);
        }
        return result;

    }

    boolean invalidateLargeVolume(final Volume volume, final boolean mustWrite) throws PersistitException {
        boolean result = true;
        int markedAvailable = 0;
        for (int index = 0; index < _bufferCount; index++) {
            final Buffer buffer = _buffers[index];
            if ((buffer.getVolume() == volume || volume == null) && !buffer.isFixed() && buffer.isValid()) {
                if (buffer.claim(true, 0)) {
                    // re-check after claim
                    boolean invalidated = false;
                    try {
                        if ((buffer.getVolume() == volume || volume == null) && !buffer.isFixed() && buffer.isValid()) {
                            if (mustWrite && buffer.isDirty()) {
                                buffer.writePage();
                            }
                            invalidate(buffer);
                            invalidated = true;
                        }
                    } finally {
                        buffer.release();
                    }
                    if (invalidated) {
                        final int q = buffer.getIndex() / 64;
                        final int p = buffer.getIndex() % 64;
                        final long bits = _availablePagesBits.get(q);
                        if (_availablePagesBits.compareAndSet(q, bits, bits | (1L << p))) {
                            markedAvailable++;
                        }
                    }
                } else {
                    result = false;
                }
            }
        }
        if (markedAvailable > 0) {
            _availablePages.set(true);
        }
        return result;
    }

    private void invalidate(final Buffer buffer) {
        Debug.$assert0.t(buffer.isValid() && buffer.isOwnedAsWriterByMe());

        while (!detach(buffer)) {
            /*
             * Spin until detach succeeds. Note: this method must not throw an
             * Exception because it is called at a critical time when
             * cleanup must be done. It is not possible to lock the hash bucket
             * here due to possible deadlock. However, the likelihood of a
             * lengthy live-lock is infinitesimal so polling is acceptable.
             */
            try {
                Thread.sleep(1);
            } catch (final InterruptedException ie) {
                // ignore
            }
        }
        buffer.clearValid();
        buffer.clearDirty();
        buffer.setPageAddressAndVolume(0, null);
    }

    private boolean detach(final Buffer buffer) {
        final int hash = hashIndex(buffer.getVolume(), buffer.getPageAddress());
        if (!_hashLocks[hash % HASH_LOCKS].tryLock()) {
            return false;
        }
        try {

            // Detach this buffer from the hash table.
            //
            if (_hashTable[hash] == buffer) {
                _hashTable[hash] = buffer.getNext();
            } else {
                Buffer prev = _hashTable[hash];
                for (Buffer next = prev.getNext();; next = prev.getNext()) {
                    assert next != null : "Attempting to detach an unattached Buffer";
                    if (next == buffer) {
                        prev.setNext(next.getNext());
                        break;
                    }
                    prev = next;
                }
            }
        } finally {
            _hashLocks[hash % HASH_LOCKS].unlock();
        }
        return true;
    }

    /**
     * Find or load a page given its Volume and address. The returned page has a
     * reader or a writer lock, depending on whether the writer parameter is
     * true on entry. Waits up to {@value SharedResource#DEFAULT_MAX_WAIT_TIME}
     * milliseconds to acquire the desired lock on the page
     *
     * @param vol
     *            The Volume
     * @param page
     *            The address of the page
     * @param writer
     *            <i>true</i> if a write lock is required.
     * @param wantRead
     *            <i>true</i> if the caller wants the page read from disk.
     *            <i>false</i> to allocate a new blank page.
     * @return Buffer The Buffer describing the buffer containing the page.
     * @throws InUseException
     *             if the specific lock could not be acquired
     */
    Buffer get(final Volume vol, final long page, final boolean writer, final boolean wantRead)
            throws PersistitException {
        return get(vol, page, writer, wantRead, SharedResource.DEFAULT_MAX_WAIT_TIME);
    }
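
    /*
     * Illustrative usage sketch (not part of the original source; "pool",
     * "volume" and "pageAddress" are hypothetical names): callers typically
     * pair get() with release() in a try/finally block.
     *
     *   final Buffer buffer = pool.get(volume, pageAddress, false, true);
     *   try {
     *       // ... read key and value data from the page image ...
     *   } finally {
     *       buffer.release();
     *   }
     */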

    /**
     * Find or load a page given its Volume and address. The returned page has a
     * reader or a writer lock, depending on whether the writer parameter is
     * true on entry.
     *
     * @param vol
     *            The Volume
     * @param page
     *            The address of the page
     * @param writer
     *            <i>true</i> if a write lock is required.
     * @param wantRead
     *            <i>true</i> if the caller wants the page read from disk.
     *            <i>false</i> to allocate a new blank page.
     * @param timeout
     *            maximum time to wait for the page to become available before
     *            throwing an InUseException
     * @return Buffer The Buffer describing the buffer containing the page.
     * @throws InUseException
     *             if the specific lock could not be acquired within the
     *             specified timeout
     */
    Buffer get(final Volume vol, final long page, final boolean writer, final boolean wantRead, final long timeout)
            throws PersistitException {
        final int hash = hashIndex(vol, page);
        Buffer buffer = null;

        for (;;) {
            boolean mustClaim = false;
            _hashLocks[hash % HASH_LOCKS].lock();
            try {
                buffer = _hashTable[hash];
                //
                // Search for the page
                //
                while (buffer != null) {
                    Debug.$assert0.t(buffer.getNext() != buffer);
                    if (buffer.getPageAddress() == page && buffer.getVolume() == vol) {
                        //
                        // Found it - now claim it.
                        //
                        if (buffer.claim(writer, 0)) {
                            vol.getStatistics().bumpGetCounter();
                            bumpHitCounter();
                            assert !buffer.isOwnedAsWriterByOther();
                            return buffer;
                        } else {
                            mustClaim = true;
                            break;
                        }
                    }
                    buffer = buffer.getNext();
                }

                if (buffer == null) {
                    //
                    // Page not found. Allocate an available buffer and read
                    // in the page from the Volume.
                    //
                    buffer = allocBuffer();
                    Debug.$assert1.t(!buffer.isDirty());
                    Debug.$assert0.t(buffer != _hashTable[hash]);
                    Debug.$assert0.t(buffer.getNext() != buffer);

                    buffer.setPageAddressAndVolume(page, vol);
                    buffer.setNext(_hashTable[hash]);
                    _hashTable[hash] = buffer;
                    //
                    // It's not really valid yet, but it does have a writer
                    // claim on it so no other Thread can access it. In the
                    // meantime, any other Thread seeking access to the same
                    // page will find it.
                    //
                    buffer.setValid();
                    if (vol.isTemporary() || vol.isLockVolume()) {
                        buffer.setTemporary();
                    } else {
                        buffer.clearTemporary();
                    }
                    Debug.$assert0.t(buffer.getNext() != buffer);
                }
            } finally {
                _hashLocks[hash % HASH_LOCKS].unlock();
            }
            if (mustClaim) {
                boolean claimed = false;
                boolean same = true;
                final long start = System.currentTimeMillis();
                while (same && !claimed && System.currentTimeMillis() - start < timeout) {
                    /*
                     * We're here because we found the page we want, but another
                     * thread has an incompatible claim on it. Here we wait,
                     * then recheck to make sure the buffer still represents the
                     * same page.
                     */
                    claimed = buffer.claim(writer, Persistit.SHORT_DELAY);
                    //
                    // Test whether the buffer we picked out is still valid
                    //
                    same = buffer.isValid() && buffer.getPageAddress() == page && buffer.getVolume() == vol;
                    /*
                     * Loop will terminate if we got the claim, if the page
                     * changed, or if the timeout expired.
                     */
                }
                if (same) {
                    if (claimed) {
                        //
                        // If so, then we're done.
                        //
                        vol.getStatistics().bumpGetCounter();
                        bumpHitCounter();
                        assert !buffer.isOwnedAsWriterByOther();
                        return buffer;
                    } else {
                        throw new InUseException("Thread " + Thread.currentThread().getName() + " failed to acquire "
                                + (writer ? "writer" : "reader") + " claim on " + buffer);
                    }
                } else if (claimed) {
                    buffer.release();
                }
                continue;
            } else {
                /*
                 * We're here because the required page was not found in the
                 * pool so we have to read it from the Volume. We have a writer
                 * claim on the buffer, so anyone else attempting to get this
                 * page will simply wait for us to finish reading it.
                 *
                 * At this point, the Buffer has been fully set up. It is on the
                 * hash table chain under its new page address, it is marked
                 * valid, and this Thread has a writer claim. If the read
                 * attempt fails, we need to mark the page INVALID so that any
                 * Thread waiting for access to this buffer will not use it. We
                 * also need to demote the writer claim to a reader claim unless
                 * the caller originally asked for a writer claim.
                 */
                if (wantRead) {
                    boolean loaded = false;
                    try {
                        Debug.$assert0.t(buffer.getPageAddress() == page && buffer.getVolume() == vol
                                && hashIndex(buffer.getVolume(), buffer.getPageAddress()) == hash);
                        buffer.load(vol, page);
                        loaded = true;
                        vol.getStatistics().bumpGetCounter();
                        bumpMissCounter();
                    } finally {
                        if (!loaded) {
                            invalidate(buffer);
                            buffer.release();
                        }
                    }
                } else {
                    buffer.clear();
                    buffer.init(Buffer.PAGE_TYPE_UNALLOCATED);
                    bumpNewCounter();
                }
                if (!writer) {
                    buffer.releaseWriterClaim();
                }
                return buffer;
            }
        }
    }

    /**
     * Returns a copy of the Buffer for the specified page. The returned
     * buffer is newly created, is not a
     * member of the buffer pool, and is not claimed. There is no guarantee that
     * the content of this copy is internally consistent because another thread
     * may be modifying the buffer while the copy is being made. The returned
     * Buffer should be used only for display and diagnostic purposes.
     *
     * @param vol
     * @param page
     * @return Copy of the Buffer
     * @throws InvalidPageAddressException
     * @throws InvalidPageStructureException
     * @throws VolumeClosedException
     * @throws PersistitInterruptedException
     * @throws RetryException
     * @throws IOException
     */
    public Buffer getBufferCopy(final Volume vol, final long page) throws InvalidPageAddressException,
            InvalidPageStructureException, VolumeClosedException, InUseException, PersistitIOException,
            PersistitInterruptedException {
        final int hash = hashIndex(vol, page);
        Buffer buffer = null;
        _hashLocks[hash % HASH_LOCKS].lock();
        try {
            buffer = _hashTable[hash];
            //
            // Search for the page
            //
            while (buffer != null) {
                Debug.$assert0.t(buffer.getNext() != buffer);

                if (buffer.getPageAddress() == page && buffer.getVolume() == vol) {
                    Debug.$assert0.t(buffer.isValid());
                    //
                    // Found it - now return a copy of it.
                    //
                    return new Buffer(buffer);
                }
                buffer = buffer.getNext();
            }
        } finally {
            _hashLocks[hash % HASH_LOCKS].unlock();
        }
        //
        // Didn't find it in the pool, so we'll read a copy.
        //
        buffer = new Buffer(_bufferSize, -1, this, _persistit);
        final boolean acquired = buffer.claim(true);
        assert acquired : "buffer not available";
        buffer.load(vol, page);
        buffer.setValid();
        buffer.release();
        return buffer;
    }

    public Buffer getBufferCopy(final int index) throws IllegalArgumentException {
        if (index < 0 || index >= _bufferCount) {
            throw new IllegalArgumentException("Index " + index + " is out of range in " + this);
        }
        return new Buffer(_buffers[index]);
    }

    /**
     * Returns an available buffer. The replacement policy is to return a buffer
     * that's already been marked invalid, if available. Otherwise use the Clock
     * algorithm to choose a page for replacement that is approximately the
     * least-recently-used page.
     *
     * @return Buffer An available buffer with a writer claim.
     * @throws PersistitException
     * @throws IllegalStateException
     *             if there is no available buffer.
     */
    private Buffer allocBuffer() throws PersistitException {
        //
        // Start by searching for an invalid page. It's preferable
        // since no valid page will need to be evicted.
        //
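        // Note: _availablePages is a cheap hint that at least one bit in
        // _availablePagesBits may be set; it is cleared further below when a
        // full scan of the bit map finds no allocatable buffer.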
        if (_availablePages.get()) {
            final int start = (_clock.get() / 64) * 64;
            for (int q = start;;) {
                q += 64;
                if (q >= _bufferCount) {
                    q = 0;
                }
                long bits = _availablePagesBits.get(q / 64);
                if (bits != 0) {
                    for (int p = 0; p < 64; p++) {
                        if ((bits & (1L << p)) != 0) {
                            final Buffer buffer = _buffers[q + p];
                            //
                            // Note: need to verify that there are no claims -
                            // including those of the current thread.
                            //
                            if ((buffer.getStatus() & SharedResource.CLAIMED_MASK) == 0 && buffer.claim(true, 0)) {
                                if (!buffer.isValid()) {
                                    bits = _availablePagesBits.get(q / 64);
                                    if (_availablePagesBits.compareAndSet(q / 64, bits, bits & ~(1L << p))) {
                                        buffer.clearDirty();
                                        return buffer;
                                    }
                                }
                                buffer.release();
                            }
                        }
                    }
                }
                if (q == start) {
                    break;
                }

            }
            _availablePages.set(false);
        }
        //
        // Look for a page to evict.
        //
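        // Each pass advances the clock hand with a CAS. A buffer whose
        // "touched" bit is set gets a second chance (the bit is cleared and it
        // is skipped); otherwise, if it is unclaimed and not fixed, it becomes
        // an eviction candidate, and a valid dirty page is written before it
        // is evicted.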
        for (int retry = 0; retry < _bufferCount * 2;) {
            final int clock = _clock.get();
            assert clock < _bufferCount;
            if (!_clock.compareAndSet(clock, (clock + 1) % _bufferCount)) {
                continue;
            }
            final Buffer buffer = _buffers[clock];
            if (buffer.isTouched()) {
                buffer.clearTouched();
            } else {
                //
                // Note: need to verify that there are no claims - including
                // those of the current thread.
                //
                if (!buffer.isFixed() && (buffer.getStatus() & SharedResource.CLAIMED_MASK) == 0
                        && buffer.claim(true, 0)) {
                    if (buffer.isDirty()) {
                        // An invalid dirty buffer is available and does not
                        // need to be written.
                        if (!buffer.isValid()) {
                            buffer.clearDirty();
                            return buffer;
                        }
                        // A dirty valid buffer needs to be written and then
                        // marked invalid. Can't prune it before writing it in
                        // this context
                        try {
                            buffer.writePage(false);
                            if (detach(buffer)) {
                                buffer.clearValid();
                                _forcedWriteCounter.incrementAndGet();
                                _evictCounter.incrementAndGet();
                                _persistit.getIOMeter().chargeEvictPageFromPool(buffer.getVolume(),
                                        buffer.getPageAddress(), buffer.getBufferSize(), buffer.getIndex());
                            }
                        } finally {
                            if (!buffer.isValid()) {
                                return buffer;
                            } else {
                                buffer.release();
                            }
                        }
                    } else {
                        if (buffer.isValid() && detach(buffer)) {
                            buffer.clearValid();
                            _evictCounter.incrementAndGet();
                            _persistit.getIOMeter().chargeEvictPageFromPool(buffer.getVolume(),
                                    buffer.getPageAddress(), buffer.getBufferSize(), buffer.getIndex());
                        }
                        if (!buffer.isValid()) {
                            return buffer;
                        } else {
                            buffer.release();
                        }
                    }
                }
            }
            retry++;
        }
        throw new IllegalStateException("No available Buffers");
    }

    enum Result {
        WRITTEN, UNAVAILABLE, ERROR
    };

    /**
     * @return Earliest timestamp of any dirty page in this
     *         <code>BufferPool</code>.
     */
    public long getEarliestDirtyTimestamp() {
        return _earliestDirtyTimestamp;
    }

    void setFlushTimestamp(final long timestamp) {
        while (true) {
            final long current = _flushTimestamp.get();
            if (timestamp > current) {
                if (_flushTimestamp.compareAndSet(current, timestamp)) {
                    break;
                }
            } else {
                break;
            }
        }
    }

    /**
     * Heuristic to determine when the PAGE_WRITER thread(s) should do work.
     *
     * @return whether PAGE_WRITER thread should write pages on the current
     *         polling cycle
     */
    boolean shouldWritePages() {
        final int cleanCount = _bufferCount - _dirtyPageCount.get();
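        // Write if a requested flush is still pending, if dirty pages predate
        // the current checkpoint, or if the supply of clean buffers is
        // running low.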
        if (getEarliestDirtyTimestamp() < _flushTimestamp.get()) {
            return true;
        }
        if (getEarliestDirtyTimestamp() <= _persistit.getCurrentCheckpoint().getTimestamp()) {
            return true;
        }
        if (cleanCount < _pageWriterTrancheSize * 2) {
            return true;
        }
        if (cleanCount < _bufferCount / 8) {
            return true;
        }
        return false;
    }

    void writeDirtyBuffers(final int[] priorities, final BufferHolder[] selectedBuffers) throws PersistitException {
        final int count = selectDirtyBuffers(priorities, selectedBuffers);
        if (count > 0) {
            Arrays.sort(selectedBuffers, 0, count);
            for (int index = 0; index < count; index++) {
                final BufferHolder holder = selectedBuffers[index];
                final Buffer buffer = holder._buffer;
                if (buffer.claim(true, 0)) {
                    try {
                        if (holder.matches(buffer) && buffer.isDirty() && buffer.isValid()) {
                            buffer.writePage();
                        }
                    } finally {
                        buffer.release();
                    }
                }
            }
        }
    }

    int selectDirtyBuffers(final int[] priorities, final BufferHolder[] holders) throws PersistitException {
        Debug.suspend();
        int count = 0;
        final int clock = _clock.get();

        final long checkpointTimestamp = _persistit.getCurrentCheckpoint().getTimestamp();
        final long currentTimestamp = _persistit.getCurrentTimestamp();

        long earliestDirtyTimestamp = currentTimestamp;
        final long flushTimestamp = _flushTimestamp.get();

        boolean flushed = true;
        for (int index = clock; index < clock + _bufferCount; index++) {
            final Buffer buffer = _buffers[index % _bufferCount];
            long timestamp = buffer.getTimestamp();
            /*
             * The following is a subtle timing dance. If the buffer claim is
             * acquired here then no other thread can modify its timestamp or
             * dirty flag until it is released, and the timestamp reflects the
             * true sequence at which the buffer became dirty. However, if the
             * buffer is in use by another thread, then the timestamp reflects
             * either (a) the timestamp at which the buffer became dirty if it
             * is dirty, or (b) the timestamp at which the thread holding
             * the claim acquired it.
             */
            if (!buffer.claim(false, 0)) {
                /*
                 * Without a claim, we are still guaranteed that the buffer will
                 * never receive a dirty timestamp less than its current
                 * timestamp.
                 */
                if (timestamp < earliestDirtyTimestamp) {
                    earliestDirtyTimestamp = timestamp;
                }
                if (timestamp < flushTimestamp) {
                    flushed = false;
                }
            } else {
                try {
                    if (buffer.isDirty()) {
                        final int priority = writePriority(buffer, clock, checkpointTimestamp, currentTimestamp);
                        if (priority > 0) {
                            count = addSelectedBufferByPriority(buffer, priority, priorities, holders, count);
                            if (!buffer.isTemporary()) {
                                timestamp = buffer.getTimestamp();
                                if (timestamp < earliestDirtyTimestamp) {
                                    earliestDirtyTimestamp = timestamp;
                                }
                                if (timestamp <= flushTimestamp) {
                                    flushed = false;
                                }
                            }
                        }
                    }
                } finally {
                    buffer.release();
                }
            }
        }

        _earliestDirtyTimestamp = earliestDirtyTimestamp;

        if (flushed) {
            _flushTimestamp.compareAndSet(flushTimestamp, 0);
        }
        return count;
    }

    int addSelectedBufferByPriority(final Buffer buffer, final int priority, final int[] priorities,
            final BufferHolder[] holders, final int initialCount) {
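        // Insert the buffer into the selection arrays, keeping "priorities"
        // sorted in descending order. The BufferHolder displaced from the end
        // of the shifted range is reused for the new entry so no allocation
        // occurs; when the arrays are already full, the lowest-priority entry
        // is dropped.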
        int count = initialCount;
        if (priority > 0) {
            if (count == 0 || priorities[count - 1] > priority) {
                if (count < priorities.length) {
                    priorities[count] = priority;
                    holders[count].set(buffer);
                    count++;
                }
            } else {
                count = Math.min(count, priorities.length - 1);
                int where = count;
                while (where > 0 && priorities[where - 1] < priority) {
                    where--;
                }
                final int move = count - where;
                if (move > 0) {
                    final BufferHolder lastHolder = holders[count];
                    System.arraycopy(priorities, where, priorities, where + 1, move);
                    System.arraycopy(holders, where, holders, where + 1, move);
                    holders[where] = lastHolder;
                }
                priorities[where] = priority;
                holders[where].set(buffer);
                count++;
            }
        }
        return count;
    }

    /**
     * Compute a priority for writing the specified Buffer. A larger value
     * denotes a greater priority. Priority 0 indicates the buffer is ineligible
     * to be written.
     *
     * @return priority
     */
    int writePriority(final Buffer buffer, final int clock, final long checkpointTimestamp, final long currentTimestamp) {
        final int status = buffer.getStatus();
        if ((status & Buffer.VALID_MASK) == 0 || (status & Buffer.DIRTY_MASK) == 0) {
            // ineligible
            return 0;
        }
        //
        // compute "distance" between this buffer and the clock. A larger
        // distance results in lower priority.
        //
        int distance = (buffer.getIndex() - _clock.get() + _bufferCount) % _bufferCount;
        int age = 0;
        //
        // If this buffer has been touched, then it won't be evicted for at
        // least another _bufferCount cycles, and its distance is therefore
        // increased.
        //
        if ((status & Buffer.TOUCHED_MASK) != 0) {
            distance += _bufferCount;
        }

        if (!buffer.isTemporary()) {
            long timestampThreshold = (currentTimestamp * WRITE_AGE_THRESHOLD_RATIO + checkpointTimestamp)
                    / WRITE_AGE_THRESHOLD_RATIO;
            if (_flushTimestamp.get() > timestampThreshold) {
                timestampThreshold = _flushTimestamp.get();
            }
            //
            // Give higher priority to older dirty buffers that need to be
            // written soon to allow a checkpoint.
            //
            if (buffer.getTimestamp() < timestampThreshold) {
                age = (int) Math.min(timestampThreshold - buffer.getTimestamp(), Integer.MAX_VALUE / 2);
                distance = 0;
            }
        } else {
            //
            // Temporary buffer - don't write it at all until the clock goes
            // through at least a full cycle.
            //
            if (distance > _bufferCount) {
                return 0;
            }
        }
        //
        // Bias to a large positive integer (magnitude doesn't matter)
        //
        return _bufferCount * 2 - distance + age;
    }

    static class BufferHolder implements Comparable<BufferHolder> {

        long _page;
        long _volumeId;
        Buffer _buffer;

        private void set(final Buffer buffer) {
            _page = buffer.getPageAddress();
            _volumeId = buffer.getVolumeId();
            _buffer = buffer;
        }

        /**
         * @return the page address
         */
        long getPage() {
            return _page;
        }

        /**
         * @return the volumeId
         */
        long getVolumeId() {
            return _volumeId;
        }

        /**
         * @return the Buffer
         */
        Buffer getBuffer() {
            return _buffer;
        }

        private boolean matches(final Buffer buffer) {
            return buffer == _buffer && buffer.getPageAddress() == _page && buffer.getVolumeId() == _volumeId;
        }

        /**
         * Used to sort buffers in ascending page address order by volume.
         *
         * @param buffer
         * @return -1, 0 or 1 as this <code>Buffer</code> falls before, at, or
         *         after the supplied <code>Buffer</code> in the desired page
         *         address order.
         */
        @Override
        public int compareTo(final BufferHolder buffer) {
            return _volumeId > buffer._volumeId ? 1 : _volumeId < buffer._volumeId ? -1 : _page > buffer._page ? 1
                    : _page < buffer._page ? -1 : 0;

        }

        @Override
        public String toString() {
            final Buffer buffer = _buffer;
            return buffer == null ? null : buffer.toString();
        }
    }

    /**
     * Implementation of PAGE_WRITER thread.
     */
    class PageWriter extends IOTaskRunnable {

        int[] _priorities = new int[0];
        BufferHolder[] _selectedBuffers = new BufferHolder[0];
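        // Reused across polling cycles; reallocated in runTask() when the
        // configured tranche size changes.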

        PageWriter() {
            super(BufferPool.this._persistit);
        }

        void start() {
            start("PAGE_WRITER:" + _bufferSize, _writerPollInterval);
        }

        @Override
        public void runTask() throws PersistitException {
            final int size = _pageWriterTrancheSize;
            if (size != _priorities.length) {
                _priorities = new int[size];
                _selectedBuffers = new BufferHolder[size];
                for (int index = 0; index < size; index++) {
                    _selectedBuffers[index] = new BufferHolder();
                }
            }

            if (shouldWritePages()) {
                writeDirtyBuffers(_priorities, _selectedBuffers);
            }
        }

        @Override
        protected boolean shouldStop() {
            return _closed.get() && !isFlushing();
        }

        @Override
        protected long pollInterval() {
            return isFlushing() ? 0 : _writerPollInterval;
        }
    }

    @Override
    public String toString() {
        return "BufferPool[" + _bufferCount + "@" + _bufferSize + (_closed.get() ? ":closed" : "") + "]";
    }

    /**
     * @param i
     * @param detail
     * @return toString value for buffer at index <code>i</code>.
     */
    String toString(final int i, final boolean detail) {
        if (detail) {
            return _buffers[i].toStringDetail();
        } else {
            return _buffers[i].toString();
        }
    }

    /**
     * Dump the content of this <code>BufferPool</code> to the supplied stream.
     * Format is identical to the journal, consisting of a stream of IV and PA
     * records.
     *
     * @param stream
     *            DataOutputStream to write to
     * @param bb
     *            ByteBuffer used to buffer intermediate results
     * @param secure
     *            true to obscure data values in the dump
     * @throws Exception
     */
    void dump(final DataOutputStream stream, final ByteBuffer bb, final boolean secure, final boolean verbose)
            throws Exception {
        final String toString = toString();
        if (verbose) {
            System.out.println(toString);
        }

        final Set<Volume> identifiedVolumes = new HashSet<Volume>();
        for (final Buffer buffer : _buffers) {
            buffer.dump(bb, secure, verbose, identifiedVolumes);
            if (bb.remaining() < _bufferSize * 2) {
                bb.flip();
                stream.write(bb.array(), 0, bb.limit());
                bb.clear();
            }
        }
        if (bb.remaining() > 0) {
            bb.flip();
            stream.write(bb.array(), 0, bb.limit());
            bb.clear();
        }
        stream.flush();
    }

    void recordBufferInventory(final long timestamp) throws PersistitException {
        final Exchange exchange = getBufferInventoryExchange();
        /*
         * Advisory only - transaction integrity not needed
         */
        exchange.ignoreTransactions();
        try {
            int total = 0;
            exchange.clear().append(_bufferSize).append(timestamp).append(Key.BEFORE);
            final Value value = exchange.getValue();
            final int clockValueBefore = _clock.get();
            for (int index = 0; index < _buffers.length; index++) {
                final Buffer buffer = _buffers[index];
                long page1 = -1, page2 = -1;
                Volume volume1 = null, volume2 = null;
                if (buffer != null && buffer.isValid()) {
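                    /*
                     * Read the page address and volume twice without claiming
                     * the buffer, retrying until both reads agree, so the
                     * recorded pair is a consistent snapshot.
                     */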
                    while (true) {
                        page1 = buffer.getPageAddress();
                        volume1 = buffer.getVolume();
                        page2 = buffer.getPageAddress();
                        volume2 = buffer.getVolume();
                        if (page1 == page2 && volume1 == volume2) {
                            break;
                        }
                        Util.spinSleep();
                    }
                    if (volume1 != null && !volume1.isTemporary() && !volume1.isLockVolume()) {
                        value.clear().setStreamMode(true);
                        value.put(volume1.getHandle());
                        value.put(page1);
                        exchange.to(index).store();
                        total++;
                    }
                }
            }
            final int clockValueAfter = _clock.get();
            exchange.cut();
            value.clear().setStreamMode(true);
            value.put(_bufferCount);
            value.put(total);
            value.put(clockValueBefore);
            value.put(clockValueAfter);
            value.put(System.currentTimeMillis());
            exchange.store();
            int count = 0;
            while (exchange.previous()) {
                if (++count > INVENTORY_VERSIONS) {
                    exchange.remove(Key.GTEQ);
                }
            }
        } catch (final PersistitException e) {
            _persistit.getLogBase().bufferInventoryException.log(e);
        }
    }

    void preloadBufferInventory() {
        int count = 0;
        int total = 0;
        final long startTime = System.nanoTime();
        long reportTime = startTime;

        try {
            final JournalManager jman = _persistit.getJournalManager();
            final Exchange exchange = getBufferInventoryExchange();
            final Value value = exchange.getValue();
            final List<PageNode> pageNodes = new ArrayList<PageNode>();
            boolean foundInventory = false;
            exchange.clear().append(_bufferSize).append(Key.AFTER);
            while (exchange.previous()) {
                if (exchange.getValue().isDefined()) {
                    foundInventory = true;
                    break;
                }
            }
            if (!foundInventory) {
                return;
            }
            value.setStreamMode(true);
            /* int bufferCount = */value.getInt();
            total = value.getInt();
            /* int clockValueBefore = */value.getInt();
            /* int clockValueAfter = */value.getInt();
            final long systemTime = value.getLong();

            _persistit.getLogBase().bufferInventoryLoad.log(systemTime);

            exchange.append(Key.BEFORE);

            while (exchange.next()) {
                value.setStreamMode(true);
                final int volumeHandle = value.getInt();
                final long pageAddress = value.getLong();
                final PageNode pn = new PageNode(volumeHandle, pageAddress);
                pageNodes.add(pn);
            }

            Collections.sort(pageNodes, PageNode.READ_COMPARATOR);
            for (final PageNode pn : pageNodes) {
                final Volume vol = jman.volumeForHandle(pn.getVolumeHandle());
                if (vol == null) {
                    continue;
                }
                try {
                    final Buffer buff = get(vol, pn.getPageAddress(), false, true);
                    buff.release();
                    count++;
                    final long now = System.nanoTime();
                    if (now - reportTime >= INVENTORY_PRELOAD_LOG_MESSAGE_NS) {
                        _persistit.getLogBase().bufferInventoryProgress
                                .log(count, total, (now - reportTime) / NS_PER_S);
                        reportTime = now;
                    }
                    if (count >= _bufferCount) {
                        //
                        // If the buffer pool is now smaller, no need to load
                        // more pages
                        //
                        break;
                    }
                } catch (final PersistitException e) {
                    // ignore it
                }
            }
        } catch (final PersistitException e) {
            _persistit.getLogBase().bufferInventoryException.log(e);
        } finally {
            final long now = System.nanoTime();
            _persistit.getLogBase().bufferInventoryProgress.log(count, total, (now - reportTime) / NS_PER_S);
        }
    }

    private Exchange getBufferInventoryExchange() throws PersistitException {
        final Volume sysvol = _persistit.getSystemVolume();
        return _persistit.getExchange(sysvol, INVENTORY_TREE_NAME, true);
    }
}