// Source code of com.persistit.RecoveryManager
// (includes nested class RecoveryManager$RecoveryTransactionPlayerSupport)

/**
* Copyright 2011-2012 Akiban Technologies, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.persistit;

import static com.persistit.JournalRecord.OVERHEAD;
import static com.persistit.JournalRecord.getLength;
import static com.persistit.JournalRecord.getTimestamp;
import static com.persistit.JournalRecord.getType;
import static com.persistit.JournalRecord.isValidType;
import static com.persistit.TransactionStatus.ABORTED;
import static com.persistit.util.SequencerConstants.RECOVERY_PRUNING_A;
import static com.persistit.util.ThreadSequencer.sequence;
import static com.persistit.util.Util.println;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;

import com.persistit.CheckpointManager.Checkpoint;
import com.persistit.JournalManager.PageNode;
import com.persistit.JournalManager.TransactionMapItem;
import com.persistit.JournalManager.TreeDescriptor;
import com.persistit.JournalRecord.CP;
import com.persistit.JournalRecord.D0;
import com.persistit.JournalRecord.D1;
import com.persistit.JournalRecord.DR;
import com.persistit.JournalRecord.DT;
import com.persistit.JournalRecord.IT;
import com.persistit.JournalRecord.IV;
import com.persistit.JournalRecord.JE;
import com.persistit.JournalRecord.JH;
import com.persistit.JournalRecord.PA;
import com.persistit.JournalRecord.PM;
import com.persistit.JournalRecord.SR;
import com.persistit.JournalRecord.TM;
import com.persistit.JournalRecord.TX;
import com.persistit.TransactionPlayer.TransactionPlayerListener;
import com.persistit.exception.CorruptJournalException;
import com.persistit.exception.PersistitException;
import com.persistit.exception.PersistitIOException;
import com.persistit.exception.PersistitInterruptedException;
import com.persistit.exception.TestException;
import com.persistit.mxbeans.RecoveryManagerMXBean;
import com.persistit.util.ArgParser;
import com.persistit.util.Util;

/**
* <p>
* Manages the recovery process during Persistit startup. This method is called
* every time Persistit starts up, even if the previous shutdown was normal.
* </p>
*
* <dl>
* <dt>Phase 1:</dt>
*
* <dd>
* <p>
* Find the most recent valid journal file. This is the "keystone" journal file
* because everything will be based on its content. Read its JH (JournalHeader)
* record. Validate all fields in the JH.
* </p>
* <p>
* Read the records in the keystone journal file. Included are IV, PM and TM
* records that provide an initial load of the pageMap, liveTransactionMap and
* volume/handle maps for JournalManager. Included also is a keystone CP
* (checkpoint) record; the presence of a CP record indicates that the IV, PM
* and TM records constitute a complete checkpoint of the journal to the
* specified timestamp. Absence of a CP before the scan terminates indicates
* that the journal file is not a complete snapshot, and therefore the chosen
* keystone journal file is not valid. In this event, restart Phase 1 using the
* immediate predecessor file.
* </p>
* <p>
* During this phase, build a transaction map containing the timestamp and file
* address of every transaction that had not committed before the last valid
* checkpoint. The scan stops when recovery finds a JE "journal end" record,
* end-of-file or an invalid record. The presence of a valid JE record indicates
* a clean shutdown.
* </p>
* <p>
* Finally, after processing the keystone journal file, validate all other
* required journal files. For each journal files from base address to current
* address, read its JH records, verify contiguity (same creation timestamp),
* verify that the page image and transaction records closest to the end of that
* journal file (if there are any) can be read. This last step is a plausibility
* test to make sure each required journal file is present, readable, and long
* enough to support recovery. This step does not test for data corruption
* within the journal since doing so would require a significant amount of
* additional I/O.
* </p>
* </dd>
*
* <dt>Phase 2:</dt>
*
* <dd>
* <p>
* Executed after the buffer pools have been loaded and the journal manager has
* been instantiated. This step applies every committed transaction in the
* transaction map and prunes the MVV values for every uncommitted transaction
* in the transaction map.
* </p>
* </dd>
* </dl>
* <p>
* Transactions are applied in their commit timestamp ordering so that their
 * effect on the recovered database is consistent with their original serial
* order. A checkpoint at timestamp T indicates that all pages made dirty prior
* to T have been written to the journal; therefore any transaction with a
* commit timestamp before T does not need to be reapplied because its effects
* are already present in the recovered B-Trees.
* </p>
* <p>
* This class is not threadsafe; it is intended to be called only during the
* single-threaded recovery process.
* </p>
*
* @author peter
*
*/
public class RecoveryManager implements RecoveryManagerMXBean, VolumeHandleLookup {

    // Size of the shared journal-scan read buffer (1 MiB).
    final static int DEFAULT_BUFFER_SIZE = 1 * 1024 * 1024;

    /**
     * Number of transactions to apply per progress log message
     */
    private final static int APPLY_TRANSACTION_LOG_COUNT = 10000;

    // Owning Persistit instance; supplies the journal manager, transaction
    // index and log base used during recovery.
    private final Persistit _persistit;

    //
    // These structures mirror those of JournalManager. However, after recovery
    // is complete, only some of the members of these maps will be donated to
    // JournalManager for ongoing processing.
    //
    // Transactions discovered during the keystone scan, keyed by timestamp.
    private final Map<Long, TransactionMapItem> _recoveredTransactionMap = new HashMap<Long, TransactionMapItem>();

    // Transactions known to have aborted, keyed by timestamp.
    private final Map<Long, TransactionMapItem> _abortedTransactionMap = new HashMap<Long, TransactionMapItem>();

    // Page versions written at or before the keystone checkpoint.
    private final Map<PageNode, PageNode> _pageMap = new HashMap<PageNode, PageNode>();

    // Page versions written after the keystone checkpoint; see
    // collectRecoveredPages for how entries are split between the two maps.
    private final Map<PageNode, PageNode> _branchMap = new HashMap<PageNode, PageNode>();

    // Bidirectional volume <-> handle mappings recovered from the journal.
    private final Map<Volume, Integer> _volumeToHandleMap = new HashMap<Volume, Integer>();

    private final Map<Integer, Volume> _handleToVolumeMap = new HashMap<Integer, Volume>();

    // Bidirectional tree <-> handle mappings recovered from the journal.
    private final Map<TreeDescriptor, Integer> _treeToHandleMap = new HashMap<TreeDescriptor, Integer>();

    private final Map<Integer, TreeDescriptor> _handleToTreeMap = new HashMap<Integer, TreeDescriptor>();

    // Most recent checkpoint proven valid during the keystone scan.
    private Checkpoint _lastValidCheckpoint = new Checkpoint(0, 0);

    private long _lastValidCheckpointJournalAddress;

    // Read-only channels for journal files, keyed by generation number
    // (address / _blockSize); see getFileChannel.
    private final Map<Long, FileChannel> _journalFileChannels = new HashMap<Long, FileChannel>();

    private volatile int _committedTransactionCount;

    private volatile int _uncommittedTransactionCount;

    private volatile int _appliedTransactionCount;

    private volatile int _abortedTransactionCount;

    private volatile int _errorCount;

    // When true, phase 2 is skipped so unit tests can inspect the plan.
    private volatile boolean _recoveryDisabledForTestMode;

    private String _journalFilePath;

    // File containing the keystone journal header, once identified.
    private File _keystoneFile;

    private long _journalCreatedTime;

    // Journal block (file) size read from the keystone JH record.
    private long _blockSize;

    private long _baseAddress = 0;

    private long _keystoneAddress;

    // Shared scan buffer; _readBufferAddress is the journal address of its
    // first byte (read() treats a negative value as "no valid data").
    private ByteBuffer _readBuffer;

    private final int _readBufferSize = DEFAULT_BUFFER_SIZE;

    private long _readBufferAddress;

    private long _currentAddress;

    // NOTE(review): declared final and never reassigned in this file, so
    // populateRecoveryInfo always reports Long.MIN_VALUE - confirm intended.
    private final long _recoveryStatus = Long.MIN_VALUE;

    private long _recoveryEndedAddress;

    private String _recoveryEndedException;

    // Listener applied when replaying committed transactions (phase 2).
    private TransactionPlayerListener _defaultCommitListener = new DefaultRecoveryListener();

    // Listener applied when pruning uncommitted transactions (phase 2).
    private TransactionPlayerListener _defaultRollbackListener = new DefaultRollbackListener();

    // Replays transaction records via RecoveryTransactionPlayerSupport below.
    private final TransactionPlayer _player = new TransactionPlayer(new RecoveryTransactionPlayerSupport());

    static class DefaultRecoveryListener implements TransactionPlayerListener {

        @Override
        public void store(final long address, final long timestamp, final Exchange exchange) throws PersistitException {
            if (exchange.isDirectoryExchange() && exchange.getValue().isDefined()
                    && exchange.getValue().getTypeHandle() == Value.CLASS_TREE) {
                /*
                 * Don't recover tree structure updates within transactions
                 * because the allocation of root pages is not transactional.
                 * The intent of the change is conveyed by the implicit creation
                 * of new trees and explicit remove tree records.
                 */
                return;
            }
            exchange.store();
        }

        @Override
        public void removeKeyRange(final long address, final long timestamp, final Exchange exchange, final Key from,
                final Key to) throws PersistitException {
            if (exchange.isDirectoryExchange()) {
                /*
                 * Don't recover directory tree removes because they are implied
                 * by Remove Tree records in the journal.
                 */
                return;
            }
            exchange.raw_removeKeyRangeInternal(from, to, false, false);
        }

        @Override
        public void removeTree(final long address, final long timestamp, final Exchange exchange)
                throws PersistitException {
            exchange.removeTree();
        }

        @Override
        public void delta(final long address, final long timestamp, final Tree tree, final int index,
                final int accumulatorTypeOrdinal, final long value) throws PersistitException {
            final Accumulator.Type type = Accumulator.Type.values()[accumulatorTypeOrdinal];
            final Accumulator accumulator = tree.getAccumulator(type, index);
            accumulator.updateBaseValue(value, timestamp);
        }

        @Override
        public void startRecovery(final long address, final long timestamp) throws PersistitException {
            // Default: do nothing
        }

        @Override
        public void startTransaction(final long address, final long startTimestamp, final long commitTimestamp)
                throws PersistitException {
            // Default: do nothing
        }

        @Override
        public void endTransaction(final long address, final long timestamp) throws PersistitException {
            // Default: do nothing
        }

        @Override
        public void endRecovery(final long address, final long timestamp) throws PersistitException {
            // Default: do nothing
        }

        @Override
        public boolean requiresLongRecordConversion() {
            return true;
        }

        @Override
        public boolean createTree(final long timestamp) throws PersistitException {
            return true;
        }
    }

    class DefaultRollbackListener implements TransactionPlayerListener {
        @Override
        public void store(final long address, final long timestamp, final Exchange exchange) throws PersistitException {
            exchange.prune();
        }

        @Override
        public void removeKeyRange(final long address, final long timestamp, final Exchange exchange, final Key from,
                final Key to) throws PersistitException {
            exchange.prune(from, to);
        }

        @Override
        public void removeTree(final long address, final long timestamp, final Exchange exchange)
                throws PersistitException {
            // TODO
        }

        @Override
        public void delta(final long address, final long timestamp, final Tree tree, final int index,
                final int accumulatorType, final long value) throws PersistitException {
            // Nothing to to undo.
        }

        @Override
        public void startRecovery(final long address, final long timestamp) throws PersistitException {
            // Default: do nothing
        }

        @Override
        public void startTransaction(final long address, final long startTimestamp, final long commitTimestamp)
                throws PersistitException {
            // Default: do nothing
        }

        @Override
        public void endTransaction(final long address, final long timestamp) throws PersistitException {
            final TransactionStatus ts = _persistit.getTransactionIndex().getStatus(timestamp);
            assert ts != null : "Missing TransactionStatus for timestamp " + timestamp;
            /*
             * Having pruned all pages involved in this transaction, now declare
             * it has no MVVs left. This will allow the cleanup process to
             * remove it entirely.
             */
            ts.setMvvCount(0);
            sequence(RECOVERY_PRUNING_A);
            _persistit.getJournalManager().writeTransactionToJournal(ByteBuffer.allocate(0), timestamp, ABORTED, 0);
        }

        @Override
        public void endRecovery(final long address, final long timestamp) throws PersistitException {
            // Default: do nothing
        }

        @Override
        public boolean requiresLongRecordConversion() {
            return false;
        }

        @Override
        public boolean createTree(final long timestamp) throws PersistitException {
            return false;
        }

    }

    /**
     * TransactionPlayerSupport implementation that exposes this
     * RecoveryManager's read buffer, buffered journal read and long-record
     * conversion methods to the TransactionPlayer.
     */
    private class RecoveryTransactionPlayerSupport implements TransactionPlayerSupport {

        @Override
        public void read(final long address, final int size) throws PersistitIOException {
            // Delegate to the outer class's buffered journal read.
            RecoveryManager.this.read(address, size);
        }

        @Override
        public ByteBuffer getReadBuffer() {
            return _readBuffer;
        }

        @Override
        public void convertToLongRecord(final Value value, final int treeHandle, final long address,
                final long commitTimestamp) throws PersistitException {
            // Delegate to the outer class's long-record materialization.
            RecoveryManager.this.convertToLongRecord(value, treeHandle, address, commitTimestamp);
        }

        @Override
        public Persistit getPersistit() {
            return _persistit;
        }
    }

    static File[] files(final String pathName) {
        final File directory;
        final File path = new File(pathName);
        if (!path.isDirectory()) {
            directory = path.getParentFile() == null ? new File(".") : path.getParentFile();
        } else {
            directory = path;
        }
        final String pathString = path.getPath();
        final File[] files = directory.listFiles(new FileFilter() {

            @Override
            public boolean accept(final File candidate) {
                final String candidateString = candidate.getPath();
                return candidateString.startsWith(pathString)
                        && JournalManager.PATH_PATTERN.matcher(candidateString).matches();
            }
        });

        if (files == null) {
            return new File[0];
        }
        Arrays.sort(files);
        return files;
    }

    static void validate(final long value, final File file, final long address, final long expected,
            final String message) throws CorruptJournalException {
        if (value == expected) {
            return;
        }

        throw new CorruptJournalException(String.format(message, file, address, value, expected));
    }

    static void validate(final long value, final File file, final long address, final long min, final long max,
            final String message) throws CorruptJournalException {
        if (value >= min && value <= max) {
            return;
        }

        throw new CorruptJournalException(String.format(message, file, address, value, min, max));
    }

    /**
     * Construct a RecoveryManager bound to the supplied Persistit instance.
     */
    RecoveryManager(final Persistit persistit) {
        _persistit = persistit;
    }

    synchronized void populateRecoveryInfo(final Management.RecoveryInfo info) {
        info.keystoneJournalAddress = _keystoneAddress;
        info.currentAddress = _currentAddress;
        info.recoveryStatus = _recoveryStatus;
        info.recoveryEndAddress = _recoveryEndedAddress;
        info.recoveryException = _recoveryEndedException == null ? "" : _recoveryEndedException;
        if (_keystoneAddress > 0) {
            info.keystoneJournalFile = addressToFile(_keystoneAddress).getPath();
            if (_lastValidCheckpointJournalAddress != 0)
                info.lastValidCheckpointSystemTime = _lastValidCheckpoint.getSystemTimeMillis();
            info.lastValidCheckpointTimestamp = _lastValidCheckpoint.getTimestamp();
            info.lastValidCheckpointJournalFile = addressToFile(_lastValidCheckpointJournalAddress).getPath();
            info.lastValidCheckpointJournalAddress = _lastValidCheckpointJournalAddress;
        } else {
            info.lastValidCheckpointSystemTime = 0;
            info.lastValidCheckpointTimestamp = 0;
            info.lastValidCheckpointJournalFile = null;
            info.lastValidCheckpointJournalAddress = 0;
        }
        info.blockSize = _blockSize;
        info.pageMapSize = _pageMap.size();
        info.baseAddress = _baseAddress;
        info.appliedTransactions = _appliedTransactionCount;
        info.committedTransactions = getCommittedCount();
        info.uncommittedTransactions = getUncommittedCount();
    }

    /**
     * Initialize this RecoveryManager: resolve the journal file path and
     * allocate the shared scan buffer. Must be called before recovery starts.
     *
     * @param path
     *            journal path or directory, resolved via
     *            {@link JournalManager#journalPath}
     */
    public void init(final String path) throws PersistitException {
        _journalFilePath = JournalManager.journalPath(path).getAbsolutePath();
        _readBuffer = ByteBuffer.allocate(_readBufferSize);
    }

    // MXBean accessor: resolved path of the journal files being recovered.
    @Override
    public String getJournalFilePath() {
        return _journalFilePath;
    }

    @Override
    public int getCommittedCount() {
        int count = 0;
        for (final TransactionMapItem trecord : _recoveredTransactionMap.values()) {
            if (trecord.isCommitted()) {
                count++;
            }
        }
        return count;
    }

    @Override
    public int getUncommittedCount() {
        int count = 0;
        for (final TransactionMapItem trecord : _recoveredTransactionMap.values()) {
            if (!trecord.isCommitted()) {
                count++;
            }
        }
        return count;
    }

    @Override
    public int getAppliedTransactionCount() {
        return _appliedTransactionCount;
    }

    @Override
    public int getErrorCount() {
        return _errorCount;
    }

    // Most recent checkpoint proven valid during the keystone scan.
    public Checkpoint getLastValidCheckpoint() {
        return _lastValidCheckpoint;
    }

    @Override
    public long getLastValidCheckpointTimestamp() {
        return _lastValidCheckpoint.getTimestamp();
    }

    @Override
    public long getLastValidCheckpointAddress() {
        return _lastValidCheckpointJournalAddress;
    }

    @Override
    public String getRecoveryEndedException() {
        return _recoveryEndedException;
    }

    @Override
    public long getRecoveryEndedAddress() {
        return _recoveryEndedAddress;
    }

    @Override
    public long getKeystoneAddress() {
        return _keystoneAddress;
    }

    @Override
    public long getBaseAddress() {
        return _baseAddress;
    }

    @Override
    public long getBlockSize() {
        return _blockSize;
    }

    @Override
    public long getJournalCreatedTime() {
        return _journalCreatedTime;
    }

    @Override
    public int getTransactionMapSize() {
        return _recoveredTransactionMap.size();
    }

    @Override
    public int getPageMapSize() {
        return _pageMap.size();
    }

    // Listener used when replaying committed transactions.
    public TransactionPlayerListener getDefaultCommitListener() {
        return _defaultCommitListener;
    }

    public void setDefaultCommitListener(final TransactionPlayerListener listener) {
        this._defaultCommitListener = listener;
    }

    // Listener used when pruning uncommitted (rolled-back) transactions.
    public TransactionPlayerListener getDefaultRollbackListener() {
        return _defaultRollbackListener;
    }

    public void setDefaultRollbackListener(final TransactionPlayerListener listener) {
        this._defaultRollbackListener = listener;
    }

    // Resolve a recovered volume handle to its Volume, or null if unknown.
    @Override
    public synchronized Volume lookupVolumeHandle(final int handle) {
        return _handleToVolumeMap.get(Integer.valueOf(handle));
    }

    // Map a journal address to the file of its generation (address/_blockSize).
    File addressToFile(final long address) {
        return JournalManager.generationToFile(_journalFilePath, address / _blockSize);
    }

    /**
     * Copy PageNodes from the recovered page map. This method distributes
     * PageNodes for updates that happened before the keystone checkpoint to
     * <code>pageMap</code> and those that happened after the keystone
     * checkpoint to <code>branchMap</code>. Updates in <code>branchMap</code>
     * are used only in recovering certain transactions (insertions with
     * LONG_RECORD values).
     *
     * @param pageMap
     *            receives, per page, the newest PageNode at or before the
     *            checkpoint timestamp (its history chain truncated)
     * @param branchMap
     *            receives, per page, the newest PageNode written after the
     *            checkpoint timestamp
     */
    void collectRecoveredPages(final Map<PageNode, PageNode> pageMap, final Map<PageNode, PageNode> branchMap) {
        if (_lastValidCheckpoint != null) {
            final long lastValidTimestamp = _lastValidCheckpoint.getTimestamp();

            for (final PageNode lastPageNode : _pageMap.values()) {
                boolean branched = false;
                PageNode previous = null;
                // Walk the version chain from newest to oldest.
                for (PageNode pageNode = lastPageNode; pageNode != null; pageNode = pageNode.getPrevious()) {
                    if (pageNode.getTimestamp() <= lastValidTimestamp && pageNode.getJournalAddress() >= _baseAddress) {
                        // Newest version at or before the checkpoint: truncate
                        // its older history and keep it in pageMap.
                        pageNode.setPrevious(null);
                        if (branched) {
                            // Detach the post-checkpoint branch chain from the
                            // checkpointed version.
                            previous.setPrevious(null);
                        }
                        pageMap.put(pageNode, pageNode);
                        break;
                    } else {
                        // Version is newer than the checkpoint (or its address
                        // precedes the base); the newest such version heads the
                        // branch-map entry.
                        if (!branched) {
                            branchMap.put(pageNode, pageNode);
                            branched = true;
                        }
                        previous = pageNode;
                    }
                }
            }
        }
    }

    void collectRecoveredVolumeMaps(final Map<Integer, Volume> handleToVolumeMap,
            final Map<Volume, Integer> volumeToHandleMap) {
        for (final Map.Entry<Integer, Volume> entry : _handleToVolumeMap.entrySet()) {
            final Volume volume = entry.getValue();
            if (!volume.isTemporary()) {
                volumeToHandleMap.put(volume, entry.getKey());
                handleToVolumeMap.put(entry.getKey(), volume);
            }
        }
    }

    /**
     * Copy the recovered tree/handle mappings (both directions) into the
     * supplied maps.
     */
    void collectRecoveredTreeMaps(final Map<Integer, TreeDescriptor> handleToTreeMap,
            final Map<TreeDescriptor, Integer> treeToHandleMap) {
        treeToHandleMap.putAll(_treeToHandleMap);
        handleToTreeMap.putAll(_handleToTreeMap);
    }

    /**
     * Copy the recovered transaction map into the supplied map.
     */
    void collectRecoveredTransactionMap(final Map<Long, TransactionMapItem> map) {
        map.putAll(_recoveredTransactionMap);
    }

    /**
     * Clear the maps created during recovery. This method is called after
     * recovery has been completed and the maps are no longer needed.
     *
     */
    void close() {
        // _logWriter.close();

        if (_recoveryDisabledForTestMode) {
            return;
        }
        for (final FileChannel channel : _journalFileChannels.values()) {
            if (channel != null) {
                try {
                    channel.close();
                } catch (final IOException ioe) {
                    // Closing it just to be polite to file handle count
                }
            }
        }
        _recoveredTransactionMap.clear();
        _pageMap.clear();
        _volumeToHandleMap.clear();
        _handleToVolumeMap.clear();
        _treeToHandleMap.clear();
        _handleToTreeMap.clear();
        _readBuffer = null;
        _journalFileChannels.clear();
    }

    /**
     * @return <code>true</code> if the {@link #applyAllCommittedTransactions()}
     *         method should do nothing. (Lets unit tests look at the plan
     *         before executing it.)
     */
    boolean isRecoveryDisabledForTestMode() {
        return _recoveryDisabledForTestMode;
    }

    /**
     * @param recoveryDisabledForTestMode
     *            Set this to <code>true</code> to disable the
     *            {@link #applyAllCommittedTransactions()} method. (Lets unit
     *            tests look at the plan before executing it.)
     */
    void setRecoveryDisabledForTestMode(final boolean recoveryDisabledForTestMode) {
        _recoveryDisabledForTestMode = recoveryDisabledForTestMode;
    }

    /**
     * Return the <code>FileChannel</code> for the journal file containing the
     * supplied <code>address</code>. If necessary, create a new
     * {@link MediatedFileChannel}.
     *
     * @param address
     *            the journal address of a record in the journal for which the
     *            corresponding channel will be returned
     * @throws PersistitIOException
     *             if the <code>MediatedFileChannel</code> cannot be created
     */
    synchronized FileChannel getFileChannel(final long address) throws PersistitIOException {
        final long generation = address / _blockSize;
        FileChannel channel = _journalFileChannels.get(generation);
        if (channel == null) {
            try {
                channel = new MediatedFileChannel(addressToFile(address), "r");
                _journalFileChannels.put(generation, channel);
            } catch (final IOException ioe) {
                throw new PersistitIOException(ioe);
            }
        }
        return channel;
    }

    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();
        final SortedSet<TransactionMapItem> sorted = new TreeSet<TransactionMapItem>(_recoveredTransactionMap.values());
        for (final TransactionMapItem ts : sorted) {
            sb.append(ts);
            sb.append(Util.NEW_LINE);
        }
        return sb.toString();
    }

    // Convenience wrappers around TransactionPlayer's address formatting.
    String addressToString(final long address) {
        return TransactionPlayer.addressToString(address);
    }

    private String addressToString(final long address, final long timestamp) {
        return TransactionPlayer.addressToString(address, timestamp);
    }

    /**
     * @return the TransactionPlayer used to replay recovered transactions
     */
    TransactionPlayer getPlayer() {
        return _player;
    }

    // ----------------------------Phase 1----------------------------

    /**
     * Phase 1: scan journal files from newest to oldest looking for the
     * "keystone" file - the most recent journal file whose header is valid and
     * which contains a complete checkpoint (a CP record). On success,
     * _keystoneAddress/_keystoneFile identify the keystone and the record scan
     * has populated the page, transaction and handle maps; on failure
     * _keystoneAddress remains -1. One trailing corrupt file (a partially
     * written "primordial" stub from a dirty shutdown) is tolerated by
     * retrying with its predecessor; a second corrupt file aborts recovery.
     *
     * @throws PersistitIOException
     *             on I/O failure or unrecoverable journal corruption
     */
    private void findAndValidateKeystone() throws PersistitIOException {
        _keystoneAddress = -1;

        final File[] files = files(_journalFilePath);
        if (files.length == 0) {
            return;
        }

        File rejectedPrimordialFile = null;
        CorruptJournalException savedException = null;

        // Iterate from the newest file backward.
        for (int fileIndex = files.length; --fileIndex >= 0;) {
            final File candidate = files[fileIndex];
            _keystoneFile = candidate;
            final long generation = JournalManager.fileToGeneration(candidate);
            final long size;

            try {
                //
                // Attempt to read and validate a journal file as a candidate
                // keystone.
                //
                final RandomAccessFile raf = new RandomAccessFile(candidate, "r");
                final FileChannel readChannel = raf.getChannel();
                size = Math.min(readChannel.size(), DEFAULT_BUFFER_SIZE);
                if (size < JH.OVERHEAD) {
                    // This file cannot be a valid journal file because
                    // it's too short.
                    throw new CorruptJournalException(String.format(
                            "Invalid Persistit journal file %s - no journal header", candidate));
                }
                // Load and parse the JH (journal header) record.
                _readBufferAddress = 0;
                _readBuffer.limit(JH.MAX_LENGTH);
                readChannel.read(_readBuffer, 0);
                _readBuffer.flip();
                final int recordSize = JH.getLength(_readBuffer);

                final long version = JH.getVersion(_readBuffer);
                _blockSize = JH.getBlockSize(_readBuffer);
                _baseAddress = JH.getBaseJournalAddress(_readBuffer);
                _journalCreatedTime = JH.getJournalCreatedTime(_readBuffer);
                _keystoneAddress = JH.getCurrentJournalAddress(_readBuffer);
                _currentAddress = _keystoneAddress + recordSize;

                // Validate every field of the JH record; each failure throws
                // CorruptJournalException and is handled below.
                validate(version, candidate, 0, JournalManager.VERSION, "Unsupported Version %3$d at %1$s:%2$d");

                validate(_blockSize, candidate, 0, JournalManager.MINIMUM_BLOCK_SIZE,
                        JournalManager.MAXIMUM_BLOCK_SIZE, "Journal file size %3$,d not in valid range "
                                + "[%4$,d:%5$,d] at %1$s:%2$,d");

                validate(_keystoneAddress, candidate, 0, generation * _blockSize,
                        "Invalid current address %3$,d at %1$s:%2$,d");

                validate(_baseAddress, candidate, 0, 0, _keystoneAddress,
                        "Base address %3$,d after current address %4$,d:  at %1$s:%2$,d");

                readChannel.close();
                //
                // The JH record is valid. Now read records until a CP record is
                // found.
                //
                // _logWriter.println("Scanning records");

                boolean checkpointFound = false;
                while (true) {
                    try {
                        final int type = scanOneRecord();
                        if (type == CP.TYPE) {
                            checkpointFound = true;
                        } else if (type == JE.TYPE) {
                            // JE marks a clean shutdown; stop scanning.
                            break;
                        }
                    } catch (final CorruptJournalException cje) {
                        _recoveryEndedException = cje.toString();
                        _recoveryEndedAddress = _currentAddress;
                        if (!checkpointFound) {
                            // Corruption before any checkpoint: this candidate
                            // is unusable.
                            throw cje;
                        } else {
                            // Corruption after a checkpoint: recovery ends
                            // here but the keystone is still valid.
                            break;
                        }
                    }
                }
                if (checkpointFound) {
                    _persistit.getLogBase().recoveryKeystone.log(addressToFile(_keystoneAddress), _currentAddress);
                    _recoveryEndedAddress = _currentAddress;
                    break;
                }
            } catch (final CorruptJournalException je) {
                if (rejectedPrimordialFile == null) {
                    //
                    // Normal case - there was a dirty shutdown, but it's the
                    // primordial stub of a new journal file that didn't get
                    // completed. It's OK to go back to previous file.
                    //
                    rejectedPrimordialFile = candidate;
                    savedException = je;

                    // Reset all state accumulated from the rejected candidate.
                    _keystoneAddress = -1;
                    _keystoneFile = null;
                    _recoveredTransactionMap.clear();
                    _pageMap.clear();
                    _branchMap.clear();
                    _treeToHandleMap.clear();
                    _volumeToHandleMap.clear();
                    _handleToTreeMap.clear();
                    _handleToVolumeMap.clear();

                } else {
                    // Two corrupt files in a row: report the first failure.
                    throw savedException;
                }
            } catch (final IOException ioe) {
                throw new PersistitIOException(ioe);
            }
        }
    }

    /**
     * @return the journal address of the first byte of the block (file)
     *         following the one containing <code>address</code>
     */
    private long addressUp(final long address) {
        return ((address / _blockSize) + 1) * _blockSize;
    }

    /*
     * Bug 942669 - a transaction found during recovery that has a start address
     * less than the base address recorded during the keystone checkpoint has
     * already been pruned. Simply ignore it during recovery.
     */
    private boolean isZombieTransaction(final long address) {
        return address < _baseAddress;
    }

    /**
     * Make <code>size</code> bytes starting at journal <code>address</code>
     * available in <code>_readBuffer</code>, positioned at
     * <code>address</code>. If the requested range already lies inside the
     * currently buffered region this only repositions the buffer; otherwise
     * the buffer is refilled from the journal file containing
     * <code>address</code>. A refill never crosses a journal file boundary.
     *
     * @param address absolute journal address of the first byte required
     * @param size number of bytes that must be readable at that address
     * @throws PersistitIOException on an underlying I/O failure; a
     *         {@link CorruptJournalException} is thrown if the file ends
     *         before <code>size</code> bytes are available
     */
    private void read(final long address, final int size) throws PersistitIOException {
        if (_readBufferAddress >= 0 && address >= _readBufferAddress
                && size + address - _readBufferAddress <= _readBuffer.limit()) {
            // Requested range is already buffered - just reposition.
            _readBuffer.position((int) (address - _readBufferAddress));
        } else {
            try {
                final FileChannel fc = getFileChannel(address);
                _readBuffer.clear();

                // Limit the read so it never extends past the end of the
                // current journal file (block).
                int maxSize = _readBuffer.capacity();
                final long remainingInBlock = addressUp(address) - address;
                if (remainingInBlock < maxSize) {
                    maxSize = (int) remainingInBlock;
                }

                _readBuffer.limit(maxSize);
                int offset = 0;
                // FileChannel#read may transfer fewer bytes than requested, so
                // loop until the buffer is full; a negative return signals EOF.
                while (_readBuffer.remaining() > 0) {
                    final int readSize = fc.read(_readBuffer, offset + address % _blockSize);
                    if (readSize < 0) {
                        break;
                    }
                    offset += readSize;
                }
                _readBufferAddress = address;
                _readBuffer.flip();
                if (_readBuffer.remaining() < size) {
                    throw new CorruptJournalException("End of file at " + addressToString(address));
                }
            } catch (final IOException e) {
                throw new PersistitIOException("Reading from " + addressToString(address), e);
            }
        }
    }

    /**
     * Reads and processes the record at _currentAddress. The record header is
     * read first; the type determines which scanXxx handler is invoked. If
     * the record is valid, _currentAddress is advanced to the start of the
     * next record and the record type is returned. SR/DR/DT/D0/D1 records are
     * never expected at this scanning level and indicate a corrupt journal.
     *
     * @return The record type: one of the type values specified in
     *         {@link com.persistit.JournalRecord})
     * @throws CorruptJournalException
     *             if the record length or type is invalid
     * @throws PersistitIOException
     *             on an underlying I/O failure
     */
    private int scanOneRecord() throws PersistitIOException {

        final long from = _currentAddress;
        read(_currentAddress, OVERHEAD);
        final int recordSize = getLength(_readBuffer);
        final int type = getType(_readBuffer);
        final long timestamp = getTimestamp(_readBuffer);
        // Keep the timestamp allocator ahead of every timestamp observed in
        // the journal.
        _persistit.getTimestampAllocator().updateTimestamp(timestamp);

        if (recordSize >= _blockSize || recordSize < OVERHEAD) {
            throw new CorruptJournalException("Bad JournalRecord length " + recordSize + " at position "
                    + addressToString(from, timestamp));
        }

        switch (type) {

        case JE.TYPE:
            scanJournalEnd(from, timestamp, recordSize);
            break;

        case JH.TYPE:
            // Journal header carries no recovery state at this level.
            break;

        case SR.TYPE:
        case DR.TYPE:
        case DT.TYPE:
        case D0.TYPE:
        case D1.TYPE:
            // These record types occur only inside TX records and must not
            // appear at the top scanning level.
            throw new CorruptJournalException("Unexpected record of type " + type + " at " + addressToString(from));

        case IV.TYPE:
            scanIdentifyVolume(from, timestamp, recordSize);
            break;

        case IT.TYPE:
            scanIdentifyTree(from, timestamp, recordSize);
            break;

        case PA.TYPE:
            scanLoadPage(from, timestamp, recordSize);
            break;

        case PM.TYPE:
            scanLoadPageMap(from, timestamp, recordSize);
            break;

        case TM.TYPE:
            scanLoadTransactionMap(from, timestamp, recordSize);
            break;

        case TX.TYPE:
            scanOneTransaction(from, timestamp, recordSize);
            break;

        case CP.TYPE:
            scanCheckpoint(from, timestamp, recordSize);
            break;

        default:
            if (!isValidType(type)) {
                // NOTE(review): rewinds _currentAddress by OVERHEAD before
                // failing - presumably so the reported recovery-end address
                // precedes this header; confirm against callers.
                _currentAddress -= OVERHEAD;
                throw new CorruptJournalException("Invalid record type " + type + " at " + addressToString(from));
            }
        }
        // Advance to the first byte following this record.
        _currentAddress = from + recordSize;
        return type;
    }

    /**
     * Process an IV (identify volume) record in the journal. Adds a handle ->
     * volume descriptor pair to the handle maps.
     *
     * @param address
     * @param timestamp
     * @param recordSize
     * @throws PersistitIOException
     */
    void scanIdentifyVolume(final long address, final long timestamp, final int recordSize) throws PersistitIOException {
        if (recordSize > IV.MAX_LENGTH) {
            throw new CorruptJournalException("IV JournalRecord too long: " + recordSize + " bytes at position "
                    + addressToString(address, timestamp));
        }
        read(address, recordSize);
        final Integer handle = Integer.valueOf(IV.getHandle(_readBuffer));
        final long id = IV.getVolumeId(_readBuffer);
        final String specification = IV.getVolumeSpecification(_readBuffer);
        final VolumeSpecification vs = new VolumeSpecification(specification);
        vs.setCreate(false);
        vs.setCreateOnly(false);
        final Volume volume = new Volume(vs);
        volume.setId(id);

        _handleToVolumeMap.put(handle, volume);
        _volumeToHandleMap.put(volume, handle);

        _persistit.getLogBase().recoveryRecord.log("IV", addressToString(address, timestamp), vs.getName(), timestamp);
    }

    /**
     * Processes an IT (identify tree) record in the journal. Adds a handle ->
     * TreeDescriptor entry in the handle maps.
     *
     * @param address
     * @param timestamp
     * @param recordSize
     * @throws PersistitIOException
     */
    void scanIdentifyTree(final long address, final long timestamp, final int recordSize) throws PersistitIOException {
        if (recordSize > IT.MAX_LENGTH) {
            throw new CorruptJournalException("IT JournalRecord too long: " + recordSize + " bytes at position "
                    + addressToString(address, timestamp));
        }
        if (_readBuffer.remaining() < recordSize) {
            read(address, recordSize);
        }
        final Integer handle = Integer.valueOf(IT.getHandle(_readBuffer));
        final String treeName = IT.getTreeName(_readBuffer);
        final Integer volumeHandle = Integer.valueOf(IT.getVolumeHandle(_readBuffer));
        final Volume volume = _handleToVolumeMap.get(volumeHandle);
        // Handle records written incorrectly due to bug 1125603
        if (volumeHandle == Volume.LOCK_VOLUME_HANDLE) {
            return;
        }
        if (volume == null) {
            throw new CorruptJournalException("IT JournalRecord refers to unidentified volume handle " + volumeHandle
                    + " at position " + addressToString(address, timestamp));
        }
        if (!volume.isTemporary()) {
            final TreeDescriptor td = new TreeDescriptor(volumeHandle, treeName);
            _handleToTreeMap.put(handle, td);
            _treeToHandleMap.put(td, handle);
            _persistit.getLogBase().recoveryRecord.log("IT", addressToString(address, timestamp), treeName, timestamp);
        }
    }

    /**
     * Process a PA (page) record in the journal. Adds an entry to the Page Map.
     *
     * @param address
     * @param timestamp
     * @param recordSize
     * @throws PersistitIOException
     */
    void scanLoadPage(final long address, final long timestamp, final int recordSize) throws PersistitIOException {
        if (recordSize > Buffer.MAX_BUFFER_SIZE + PA.OVERHEAD) {
            throw new CorruptJournalException("PA JournalRecord too long: " + recordSize + " bytes at position "
                    + addressToString(address, timestamp));
        }
        //
        // timestamp <= 0 means this is a page from a transient volume
        // and should not be added to the recovery set.
        //
        if (timestamp > 0) {
            read(address, recordSize);
            final int volumeHandle = PA.getVolumeHandle(_readBuffer);
            final long pageAddress = PA.getPageAddress(_readBuffer);

            final Volume volume = _handleToVolumeMap.get(volumeHandle);
            if (volume == null) {
                throw new CorruptJournalException("PA reference to volume " + volumeHandle
                        + " is not preceded by an IV record for that handle at " + addressToString(address, timestamp));
            }

            final PageNode pageNode = new PageNode(volumeHandle, pageAddress, address, timestamp);
            final PageNode oldPageNode = _pageMap.get(pageNode);
            pageNode.setPrevious(oldPageNode);
            _pageMap.put(pageNode, pageNode);
            _persistit.getLogBase().recoveryRecord.log("PA", pageNode.toStringJournalAddress(this),
                    pageNode.toStringPageAddress(this), timestamp);
        }
    }

    /**
     * Process a PM (page map) record. There is one PM record near the
     * beginning of each journal file. It provides a copy of the page map that
     * existed at the time the journal file was created, thereby eliminating
     * the need for scanning all of the previous journal files in the journal.
     *
     * @param from journal address of the PM record
     * @param timestamp timestamp recorded in the PM record header
     * @param recordSize total size of the PM record in bytes
     * @throws PersistitIOException if the record is malformed or unreadable
     */
    void scanLoadPageMap(final long from, final long timestamp, final int recordSize) throws PersistitIOException {
        read(from, PM.OVERHEAD);
        final int count = PM.getEntryCount(_readBuffer);
        // The record must hold exactly `count` fixed-size entries.
        if (count * PM.ENTRY_SIZE + PM.OVERHEAD != recordSize) {
            throw new CorruptJournalException("Invalid record size " + recordSize + " for PM record at "
                    + addressToString(from, timestamp));
        }

        long address = from + PM.OVERHEAD;
        int index = 0;   // next entry to consume within the loaded segment
        int loaded = 0;  // number of entries currently loaded in _readBuffer

        for (int remaining = count; remaining > 0; remaining--) {
            if (index == loaded) {
                // Refill the buffer with as many whole entries as will fit.
                final int loadedSize = Math.min((_readBuffer.capacity() / PM.ENTRY_SIZE), remaining) * PM.ENTRY_SIZE;
                read(address, loadedSize);
                address += loadedSize;
                index = 0;
                loaded = loadedSize / PM.ENTRY_SIZE;
                if (loaded <= 0) {
                    throw new CorruptJournalException("Could not load PageMap segment in entry "
                            + (count - remaining + 1) + " at " + addressToString(from, timestamp));
                }
            }
            final int volumeHandle = PM.getEntryVolumeHandle(_readBuffer, index);
            final Volume volume = _handleToVolumeMap.get(volumeHandle);
            if (volume == null) {
                throw new CorruptJournalException("Page map refers to undefined volume handle " + volumeHandle
                        + " in entry " + (count - remaining + 1) + " at " + addressToString(from, timestamp));
            }
            final long pageAddress = PM.getEntryPageAddress(_readBuffer, index);
            final long pageTimestamp = PM.getEntryTimestamp(_readBuffer, index);
            final long journalAddress = PM.getEntryJournalAddress(_readBuffer, index);
            final PageNode pageNode = new PageNode(volumeHandle, pageAddress, journalAddress, pageTimestamp);
            final PageNode lastPageNode;
            boolean linked = false;
            //
            // The following logic places the recovered PageNode in either the
            // page map or the branch map. The timestamp written in the PM
            // record determines which map receives the page, corresponding with
            // two different recovery scenarios.
            //
            // In one scenario, the PM was written as part of a normal rollover,
            // and all pages in it are part of the recovered history. In this
            // the timestamp at the time the PM record is written will be larger
            // than any existing page, and therefore the page will be added to
            // the page map.
            //
            // However, if the PM record was written immediately after a dirty
            // startup, the PM's timestamp will be consistent with the recovery
            // checkpoint, and there will be pages with timestamps after that.
            // Those pages are part of the branch; they are retained in the
            // recovery state solely to allow long-record recovery, and will
            // then be discarded.
            //
            // Because pre-2.4.1 PM records were written with a timestamp of
            // zero, this is handled as a special case. All pages from such
            // journals are recovered.
            //
            if (timestamp != 0 && timestamp < pageTimestamp) {
                // Newer-than-checkpoint page: goes on the branch map.
                lastPageNode = _branchMap.get(pageNode);
                if (lastPageNode == null || journalAddress > lastPageNode.getJournalAddress()) {
                    pageNode.setPrevious(lastPageNode);
                    _branchMap.put(pageNode, pageNode);
                    linked = true;
                }
            } else {
                // Part of the recovered history: goes on the page map.
                lastPageNode = _pageMap.get(pageNode);
                if (lastPageNode == null || journalAddress > lastPageNode.getJournalAddress()) {
                    pageNode.setPrevious(lastPageNode);
                    _pageMap.put(pageNode, pageNode);
                    linked = true;
                }
            }
            if (!linked) {
                // The new node is not the newest version of this page: walk
                // the "previous" chain (ordered by descending journal address)
                // and splice it in at the correct position.
                for (PageNode pn = lastPageNode; pn != null; pn = pn.getPrevious()) {
                    if (journalAddress == pn.getJournalAddress()) {
                        // TODO - redundant entry
                        break;
                    }
                    if (pn.getPrevious() == null || journalAddress > pn.getPrevious().getJournalAddress()) {
                        pageNode.setPrevious(pn.getPrevious());
                        pn.setPrevious(pageNode);
                        break;
                    }
                }
            }

            index++;
        }
    }

    /*
     * Process a TM (transaction map) record. There is one TM record near the
     * beginning of each journal file. It provides a copy of the live
     * transactions that existed at the time the journal file was created,
     * thereby eliminating the need for scanning all of the previous journal
     * files in the journal.
     */
    void scanLoadTransactionMap(final long from, final long timestamp, final int recordSize)
            throws PersistitIOException {
        read(from, TM.OVERHEAD);
        final int count = TM.getEntryCount(_readBuffer);
        if (count * TM.ENTRY_SIZE + TM.OVERHEAD != recordSize) {
            throw new CorruptJournalException("Invalid record size " + recordSize + " for TM record at "
                    + addressToString(from, timestamp));
        }
        long address = from + TM.OVERHEAD;
        int index = 0;
        int loaded = 0;
        for (int remaining = count; remaining > 0; remaining--) {
            if (index == loaded) {
                final int loadedSize = Math.min(_readBuffer.capacity() / TM.ENTRY_SIZE, remaining) * TM.ENTRY_SIZE;
                read(address, loadedSize);
                address += loadedSize;
                index = 0;
                loaded = loadedSize / TM.ENTRY_SIZE;
                if (loaded <= 0) {
                    throw new CorruptJournalException("Could not load TramsactionMap segment in entry "
                            + (count - remaining + 1) + " at " + addressToString(from, timestamp));
                }
            }
            final long startTimestamp = TM.getEntryStartTimestamp(_readBuffer, index);
            final long commitTimestamp = TM.getEntryCommitTimestamp(_readBuffer, index);
            final long journalAddress = TM.getEntryJournalAddress(_readBuffer, index);
            final long lastRecordAddress = TM.getLastRecordAddress(_readBuffer, index);

            if (!isZombieTransaction(journalAddress)) {
                final TransactionMapItem ts = new TransactionMapItem(startTimestamp, journalAddress);
                final Long key = Long.valueOf(startTimestamp);
                ts.setCommitTimestamp(commitTimestamp);
                ts.setLastRecordAddress(lastRecordAddress);
                if (_recoveredTransactionMap.put(key, ts) != null) {
                    throw new CorruptJournalException("Redundant record in TransactionMap record " + ts + " entry "
                            + (count - remaining + 1) + " at " + addressToString(address, startTimestamp));

                }
                _persistit.getTimestampAllocator().updateTimestamp(commitTimestamp);
            }
            index++;
        }
    }

    void scanJournalEnd(final long address, final long timestamp, final int recordSize) throws PersistitIOException {
        if (recordSize != JE.OVERHEAD) {
            throw new CorruptJournalException("JE JournalRecord has incorrect length: " + recordSize
                    + " bytes at position " + addressToString(address, timestamp));
        }
        read(address, JE.OVERHEAD);
        final long currentAddress = JE.getCurrentJournalAddress(_readBuffer);
        final long baseAddress = JE.getBaseAddress(_readBuffer);
        final long journalCreated = JE.getJournalCreatedTime(_readBuffer);

        validate(journalCreated, _keystoneFile, address, _journalCreatedTime, "JE wrong record journalCreatedTime "
                + " %3$,d: expected %4$,d at %1$s:%2$,d");
        validate(currentAddress, _keystoneFile, address, address,
                "JE record currentAddress %3$,d mismatch at %1$s:%2$,d");
        validate(baseAddress, _keystoneFile, address, _baseAddress,
                "JE record wrong base address %3$,d: expected %4$,d at %1$s:%2$,d");
    }

    void scanCheckpoint(final long address, final long timestamp, final int recordSize) throws PersistitIOException {
        if (recordSize != CP.OVERHEAD) {
            throw new CorruptJournalException("CP JournalRecord has incorrect length: " + recordSize
                    + " bytes at position " + addressToString(address, timestamp));
        }
        read(address, CP.OVERHEAD);
        final long systemTimeMillis = CP.getSystemTimeMillis(_readBuffer);
        final Checkpoint checkpoint = new Checkpoint(timestamp, systemTimeMillis, true);
        final long baseAddress = CP.getBaseAddress(_readBuffer);

        if (baseAddress < _baseAddress || baseAddress > _currentAddress) {
            throw new CorruptJournalException("Invalid base journal address " + baseAddress + " for CP record at "
                    + addressToString(address, timestamp));
        }
        _baseAddress = baseAddress;

        _lastValidCheckpoint = checkpoint;
        _lastValidCheckpointJournalAddress = address;

        for (final Iterator<Map.Entry<Long, TransactionMapItem>> iterator = _recoveredTransactionMap.entrySet()
                .iterator(); iterator.hasNext();) {
            final Map.Entry<Long, TransactionMapItem> entry = iterator.next();
            final TransactionMapItem ts = entry.getValue();
            if (ts.isCommitted() && ts.getCommitTimestamp() < timestamp) {
                iterator.remove();
            } else if (_abortedTransactionMap.get(ts.getStartTimestamp()) != null) {
                iterator.remove();
                _abortedTransactionMap.remove(ts.getStartTimestamp());
            } else if (isZombieTransaction(ts.getStartAddress())) {
                iterator.remove();
            }
        }

        _persistit.getLogBase().checkpointRecovered
                .log(checkpoint, addressToString(address, checkpoint.getTimestamp()));
        _persistit.getLogBase().recoveryRecord.log("CP", addressToString(address, timestamp), checkpoint
                + " pageMap.size()=" + _pageMap.size(), timestamp);
    }

    /**
     * Validates non-keystone journal files referenced by the keystone. Not all
     * records are read; these files are needed only to complete committed
     * transactions and to supply pages from the page map.
     *
     * @param generation
     * @throws PersistitIOException
     */
    private void validateMemberFile(final long generation) throws PersistitIOException {
        final File file = JournalManager.generationToFile(_journalFilePath, generation);
        if (!file.exists()) {
            throw new CorruptJournalException("Missing journal file " + file);
        }
        read(generation * _blockSize, JH.OVERHEAD);
        int recordSize = getLength(_readBuffer);

        validate(recordSize, file, 0, JH.OVERHEAD, JH.MAX_LENGTH,
                "Journal header record size %3$,d is not in valid range " + "[%4$,d:%5$,d] at %1$s:%2$,d");

        int type = getType(_readBuffer);
        validate(type, file, 0, JH.TYPE, "Invalid record type %$3,d at  at %1$s:%2$d");

        final long version = JH.getVersion(_readBuffer);
        final long currentAddress = JH.getCurrentJournalAddress(_readBuffer);
        final long blockSize = JH.getBlockSize(_readBuffer);
        final long baseAddress = JH.getBaseJournalAddress(_readBuffer);
        final long journalCreatedTime = JH.getJournalCreatedTime(_readBuffer);

        validate(version, file, 0, JournalManager.VERSION, "Unsupported Version %3$d at %1$s:%2$d");

        validate(blockSize, file, 0, _blockSize, "Journal file size %3$,d differs from keystone value "
                + "%4$,d at %1$s:%2$,d");

        validate(journalCreatedTime, file, 0, _journalCreatedTime,
                "Journal creation time %3$,d differs from keystone value " + "%4$,d at %1$s:%2$,d");

        validate(baseAddress, file, 0, 0, _baseAddress, "Journal base address %3$,d not in valid range "
                + "[%4$,d:%5$,d] at %1$s:%2$,d");

        validate(currentAddress, file, 0, 0, _keystoneAddress, "Journal base address %3$,d not in valid range "
                + "[%4$,d:%5$,d] at %1$s:%2$,d");

        _persistit.getLogBase().recoveryValidFile.log(file.getPath());

        //
        // Now make sure we can read the last PA record required for
        // recovery
        //
        final long startingAddress = generation * _blockSize;
        final long endingAddress = startingAddress + blockSize;
        long lastRequiredJournalAddress = startingAddress;

        PageNode lastRequiredPageNode = null;
        for (final PageNode pageNode : _pageMap.values()) {
            for (PageNode pn = pageNode; pn != null; pn = pn.getPrevious()) {
                if (pn.getJournalAddress() < lastRequiredJournalAddress) {
                    break;
                }
                if (pn.getJournalAddress() < endingAddress) {
                    lastRequiredJournalAddress = pn.getJournalAddress();
                    lastRequiredPageNode = pn;
                }
            }
        }
        if (lastRequiredJournalAddress > startingAddress) {
            read(lastRequiredJournalAddress, PA.OVERHEAD);
            type = getType(_readBuffer);
            validate(type, file, startingAddress, PA.TYPE, "Invalid record type %3$,d at %1$s:%2$d");
            recordSize = getLength(_readBuffer);
            validate(recordSize, file, startingAddress, PA.OVERHEAD + Buffer.HEADER_SIZE, PA.OVERHEAD
                    + Buffer.MAX_BUFFER_SIZE, "PA record size %3$,d not in valid range "
                    + "[%4$,d:%5$,d] at %1$s:%2$,d");
            final long pageAddress = PA.getPageAddress(_readBuffer);
            validate(pageAddress, file, startingAddress, lastRequiredPageNode.getPageAddress(),
                    "Mismatched page address %3$d at %1$s:%2$d");
            // confirm that we can read the data
            read(lastRequiredJournalAddress, recordSize);

        }

    }

    public void buildRecoveryPlan() throws PersistitIOException, PersistitInterruptedException {
        try {
            //
            // Find the keystone (last) journal file and validate it.
            findAndValidateKeystone();
            if (_keystoneAddress == -1) {
                return;
            }

            //
            // Validate the previous journal files required to complete
            // recovery.
            //
            final long fromGeneration = _baseAddress / _blockSize;
            final long toGeneration = _keystoneAddress / _blockSize;
            for (long generation = fromGeneration; generation < toGeneration; generation++) {
                validateMemberFile(generation);
            }
            //
            // Count committed and uncommitted transactions.
            //
            for (final Iterator<TransactionMapItem> iterator = _recoveredTransactionMap.values().iterator(); iterator
                    .hasNext();) {
                final TransactionMapItem item = iterator.next();
                if (item.isCommitted()) {
                    _committedTransactionCount++;
                } else if (item.getStartTimestamp() < _lastValidCheckpoint.getTimestamp()) {
                    _uncommittedTransactionCount++;
                    try {
                        _persistit.getTransactionIndex().injectAbortedTransaction(item.getStartTimestamp());
                    } catch (final InterruptedException ie) {
                        throw new PersistitInterruptedException(ie);
                    }
                } else {
                    /*
                     * An uncommitted transaction that started after the last
                     * valid checkpoint is of no interest.
                     */
                    iterator.remove();
                }
            }
            _persistit.getLogBase().recoveryPlan.log(_pageMap.size(), _committedTransactionCount,
                    _uncommittedTransactionCount);
        } catch (final PersistitIOException pe) {
            _persistit.getLogBase().recoveryFailure.log(pe);
            throw pe;
        }

    }

    /**
     * Called during Phase 2 to record the FileAddress of a Transaction Update
     * record in the journal.
     *
     * @param ja
     * @throws CorruptJournalException
     */
    void scanOneTransaction(final long address, final long startTimestamp, final int recordSize)
            throws PersistitIOException {
        read(address, recordSize);
        final Long key = Long.valueOf(startTimestamp);
        final long commitTimestamp = TX.getCommitTimestamp(_readBuffer);
        final long backchainAddress = TX.getBackchainAddress(_readBuffer);

        if (isZombieTransaction(address)) {
            return;
        }
        if (commitTimestamp == ABORTED) {
            TransactionMapItem item = _abortedTransactionMap.get(key);
            if (item == null) {
                item = new TransactionMapItem(startTimestamp, address);
                item.setCommitTimestamp(ABORTED);
                _abortedTransactionMap.put(key, item);
            } else {
                throw new CorruptJournalException("Duplicate transaction abort records with same timestamp(" + key
                        + "): previous/current=" + item.getStartAddress() + "/"
                        + addressToString(address, startTimestamp));
            }
        } else {
            TransactionMapItem item = _recoveredTransactionMap.get(key);
            if (item == null) {
                if (backchainAddress != 0) {
                    throw new CorruptJournalException("Missing transaction record at with timestamp(" + key
                            + "): previous/current=" + backchainAddress + "/"
                            + addressToString(address, startTimestamp));
                }
                item = new TransactionMapItem(startTimestamp, address);
                _recoveredTransactionMap.put(key, item);

            } else {
                if (backchainAddress == 0) {
                    throw new CorruptJournalException("Duplicate transactions with same timestamp(" + key
                            + "): previous/current=" + item.getStartAddress() + "/"
                            + addressToString(address, startTimestamp));
                }
                if (item.isCommitted()) {
                    throw new CorruptJournalException("Redundant Transaction Commit Record for " + item + " at "
                            + addressToString(address, startTimestamp));
                }
                if (backchainAddress != item.getLastRecordAddress()) {
                    throw new CorruptJournalException("Broken backchain at " + addressToString(address)
                            + " does not match previous record " + item);
                }
                item.setLastRecordAddress(address);
            }
            item.setCommitTimestamp(commitTimestamp);
            _persistit.getTimestampAllocator().updateTimestamp(commitTimestamp);
        }

    }

    // ---------------------------- Phase 3 ------------------------------------

    public void applyAllRecoveredTransactions(final TransactionPlayerListener commitListener,
            final TransactionPlayerListener rollbackListener) throws TestException {

        if (_recoveryDisabledForTestMode) {
            return;
        }
        boolean started = false;
        /*
         * If there is a checkpoint Transaction record, reset its commit
         * timestamp to the checkpoint timestamp to ensure it gets applied
         * first. This is required to ensure all Accumulator values are
         * initialized correctly to their checkpoint snapshot values before any
         * deltas are applied.
         */
        final TransactionMapItem checkpointTransactionItem = _recoveredTransactionMap.get(_lastValidCheckpoint
                .getTimestamp());
        if (checkpointTransactionItem != null) {
            checkpointTransactionItem.setCommitTimestamp(_lastValidCheckpoint.getTimestamp());
        }

        final SortedSet<TransactionMapItem> sorted = new TreeSet<TransactionMapItem>(_recoveredTransactionMap.values());

        if (!sorted.isEmpty()) {
            final TransactionMapItem last = sorted.last();
            assert last.getCommitTimestamp() <= _persistit.getTimestampAllocator().getCurrentTimestamp();
        }

        for (final TransactionMapItem item : sorted) {
            final TransactionPlayerListener listener = item.isCommitted() ? commitListener : rollbackListener;
            try {
                if (!started) {
                    commitListener.startRecovery(item.getStartAddress(), item.getCommitTimestamp());
                    started = true;
                }

                _player.applyTransaction(item, listener);
                if (item.isCommitted()) {
                    _appliedTransactionCount++;
                } else {
                    _abortedTransactionCount++;
                }

                if ((_appliedTransactionCount + _abortedTransactionCount) % APPLY_TRANSACTION_LOG_COUNT == 0) {
                    _persistit.getLogBase().recoveryProgress.log(_appliedTransactionCount, _abortedTransactionCount,
                            _recoveredTransactionMap.size() - _appliedTransactionCount - _abortedTransactionCount);
                }
            } catch (final TestException te) {
                // Exception thrown by a unit test to interrupt recovery
                _persistit.getLogBase().recoveryException.log(te, item);
                throw te;
            } catch (final Exception pe) {
                _persistit.getLogBase().recoveryException.log(pe, item);
                _errorCount++;
            }
        }
        _branchMap.clear();
    }

    /**
     * Assembles a long record into the provided Value object. This method
     * relies on finding the PAGE_TYPE_LONG_RECORD pages in the journal.
     * Transaction commit writes those pages before writing the TX record; this
     * ensures they are available in the journal.
     *
     * @param value
     *            Value object contains a value of type LONG_REC, is converted
     *            by this method to an actual long record.
     * @param treeHandle
     *            tree containing the long record
     * @param from
     *            journal address of the insert operation - used only in
     *            displaying error messages
     * @param timestamp
     *            timestamp of the transaction
     * @param page
     *
     * @throws PersistitException
     */
    void convertToLongRecord(final Value value, final int treeHandle, final long from, final long timestamp)
            throws PersistitException {
        final TreeDescriptor td = _handleToTreeMap.get(treeHandle);
        final int volumeHandle = td.getVolumeHandle();
        long page = Buffer.decodeLongRecordDescriptorPointer(value.getEncodedBytes(), 0);
        final int size = Buffer.decodeLongRecordDescriptorSize(value.getEncodedBytes(), 0);
        if (size < 0 || size > Value.MAXIMUM_SIZE) {
            throw new CorruptJournalException("Transactional long record specification " + "exceeds maximum size of "
                    + Value.MAXIMUM_SIZE + ":" + size);
        }
        final byte[] rawBytes = value.getEncodedBytes();
        final long startAddress = page;
        value.clear();
        if (size > value.getMaximumSize()) {
            value.setMaximumSize(size);
        }
        value.ensureFit(size);

        int offset = 0; // offset of next segment in the value
        int remainingSize = size;

        System.arraycopy(rawBytes, Buffer.LONGREC_PREFIX_OFFSET, value.getEncodedBytes(), offset,
                Buffer.LONGREC_PREFIX_SIZE);

        offset += Buffer.LONGREC_PREFIX_SIZE;
        remainingSize -= Buffer.LONGREC_PREFIX_SIZE;

        for (int count = 0; page != 0; count++) {

            if (remainingSize == 0) {
                throw new CorruptJournalException("Long record chain has more than " + size
                        + " bytes starting at page " + startAddress + " for transaction at "
                        + addressToString(from, timestamp));
            }
            //
            // Look for the latest version of the page which precedes the
            // record's timestamp.
            //
            final PageNode key = new PageNode(volumeHandle, page, -1, -1);
            PageNode pn = lastPageNodeBefore(_branchMap.get(key), timestamp);
            if (pn == null) {
                pn = lastPageNodeBefore(_pageMap.get(key), timestamp);
            }

            if (pn == null) {
                throw new CorruptJournalException("Long record chain missing page " + page + " at count " + count
                        + " at " + addressToString(from, timestamp));
            }

            _currentAddress = pn.getJournalAddress();
            read(_currentAddress, PA.OVERHEAD);
            final int type = PA.getType(_readBuffer);
            final int recordSize = PA.getLength(_readBuffer);
            final int payloadSize = recordSize - PA.OVERHEAD;
            final int leftSize = PA.getLeftSize(_readBuffer);
            final int bufferSize = PA.getBufferSize(_readBuffer);
            final long pageAddress = PA.getPageAddress(_readBuffer);
            //
            // Verify that this is the valid and appropriate PA record
            //
            if (type != PA.TYPE) {
                throw new CorruptJournalException("Record at " + pn.toStringJournalAddress(this)
                        + " is not a PAGE record");
            }

            if (leftSize < 0 || payloadSize < leftSize || payloadSize > bufferSize) {
                throw new CorruptJournalException("Record at " + pn.toStringJournalAddress(this)
                        + " invalid sizes: recordSize= " + payloadSize + " leftSize=" + leftSize + " bufferSize="
                        + bufferSize);
            }

            if (pageAddress != pn.getPageAddress()) {
                throw new CorruptJournalException("Record at " + pn.toStringJournalAddress(this)
                        + " mismatched page address: expected/actual=" + pn.getPageAddress() + "/" + pageAddress);
            }

            //
            // Verify that this is a PAGE_TYPE_LONG_RECORD
            //
            read(_currentAddress, recordSize);
            final int pageType = JournalRecord.getByte(_readBuffer, PA.OVERHEAD + Buffer.TYPE_OFFSET);

            if (pageType != Buffer.PAGE_TYPE_LONG_RECORD) {
                throw new CorruptJournalException("Long record chain contains invalid page type " + pageType
                        + " for page " + page + " at " + pn.toStringJournalAddress(this) + " in transaction at "
                        + addressToString(from, timestamp));
            }

            final int segmentSize = Math.min(remainingSize, payloadSize - Buffer.HEADER_SIZE);

            System.arraycopy(_readBuffer.array(), _readBuffer.position() + PA.OVERHEAD + Buffer.HEADER_SIZE,
                    value.getEncodedBytes(), offset, segmentSize);
            offset += segmentSize;
            remainingSize -= segmentSize;

            // Next page in chain
            page = JournalRecord.getLong(_readBuffer, PA.OVERHEAD + Buffer.RIGHT_SIBLING_OFFSET);

            if (count > Buffer.MAX_LONG_RECORD_CHAIN) {
                throw new CorruptJournalException("Long record chain has more than " + Buffer.MAX_LONG_RECORD_CHAIN
                        + " pages in starting at page " + startAddress + " for transaction at "
                        + addressToString(from, timestamp));
            }
        }

        if (remainingSize != 0) {
            throw new CorruptJournalException("Long record chain has fewer than " + size + " bytes (" + remainingSize
                    + " not recovered) starting at page " + startAddress + " for transaction at "
                    + addressToString(from, timestamp));
        }
        value.setEncodedSize(size);
    }

    private PageNode lastPageNodeBefore(final PageNode pageNode, final long timestamp) {
        PageNode pn = pageNode;
        while (pn != null) {
            if (pn.getTimestamp() <= timestamp) {
                break;
            }
            pn = pn.getPrevious();
        }
        return pn;
    }

    boolean analyze() throws Exception {
        findAndValidateKeystone();
        if (getKeystoneAddress() == -1) {
            println("No valid journal at %s", getJournalFilePath());
            return false;
        }
        println("Journal at %s:", getJournalFilePath());
        println("Keystone Address:  %,d", getKeystoneAddress());
        println("Base Address: %,d", getBaseAddress());
        println("Block Size: %,d", getBlockSize());
        println("Journal created: %s", new SimpleDateFormat("yyyyMMddHHmm").format(new Date(getJournalCreatedTime())));
        println("Last valid checkpoint: %s", getLastValidCheckpoint());
        println("Last valid checkpoint address: %,d", getLastValidCheckpointAddress());
        println("Recovered transaction count committed=%,d uncommitted=%,d", getCommittedCount(), getUncommittedCount());
        println("Recovered page count: %,d", getPageMapSize());
        println("Volume handle map--");
        for (final Map.Entry<Integer, Volume> entry : _handleToVolumeMap.entrySet()) {
            println(" %5d->%s", entry.getKey(), entry.getValue());
        }
        println("Tree handle map--");
        for (final Map.Entry<Integer, TreeDescriptor> entry : _handleToTreeMap.entrySet()) {
            println(" %5d->%s", entry.getKey(), entry.getValue());
        }

        final long fromGeneration = getBaseAddress() / getBlockSize();
        final long toGeneration = getKeystoneAddress() / getBlockSize();
        boolean okay = true;
        for (long generation = fromGeneration; generation < toGeneration; generation++) {
            final File file = addressToFile(generation * getBlockSize());
            println("Validating file %s", file);
            try {
                validateMemberFile(generation);
            } catch (final PersistitIOException ioe) {
                println("   Unrecoverable: %s", ioe);
                okay = false;
            }
        }
        return okay;
    }

    /**
     * Read and display information about a journal. Read-only - does not change
     * any file.
     *
     * @param args
     * @throws Exception
     */
    public static void main(final String[] args) throws Exception {
        final String[] template = {
                "path||pathname of journal, e.g., /xxx/yyy/zzz_journal "
                        + "for files such as /xxx/yyy/zzz_journal.0000000000000047",
                "_flags|t|emit transaction details" };
        final ArgParser argParser = new ArgParser("RecoveryManager", args, template).strict();
        final Persistit persistit = new Persistit();
        persistit.initializeJournal();
        final RecoveryManager rman = new RecoveryManager(persistit);
        rman.init(argParser.getStringValue("path"));
        rman.analyze();
    }

}
// End of file. (Removed non-Java footer text that a code-hosting site
// appended after the class body; it was not part of the source and would
// not compile.)