// Package com.persistit
//
// Source Code of com.persistit.Exchange$LevelCache

/**
* Copyright 2005-2012 Akiban Technologies, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.persistit;

import static com.persistit.Buffer.EXACT_MASK;
import static com.persistit.Buffer.HEADER_SIZE;
import static com.persistit.Buffer.KEYBLOCK_LENGTH;
import static com.persistit.Buffer.MAX_VALID_PAGE_ADDR;
import static com.persistit.Buffer.PAGE_TYPE_DATA;
import static com.persistit.Buffer.PAGE_TYPE_INDEX_MIN;
import static com.persistit.Buffer.P_MASK;
import static com.persistit.Buffer.TAILBLOCK_HDR_SIZE_INDEX;
import static com.persistit.Key.AFTER;
import static com.persistit.Key.BEFORE;
import static com.persistit.Key.EQ;
import static com.persistit.Key.GT;
import static com.persistit.Key.GTEQ;
import static com.persistit.Key.LEFT_GUARD_KEY;
import static com.persistit.Key.LT;
import static com.persistit.Key.LTEQ;
import static com.persistit.Key.RIGHT_GUARD_KEY;
import static com.persistit.Key.maxStorableKeySize;
import static com.persistit.util.SequencerConstants.DEALLOCATE_CHAIN_A;
import static com.persistit.util.SequencerConstants.WRITE_WRITE_STORE_A;
import static com.persistit.util.ThreadSequencer.sequence;

import java.util.ArrayList;
import java.util.List;

import com.persistit.CleanupManager.CleanupAction;
import com.persistit.Key.Direction;
import com.persistit.MVV.PrunedVersion;
import com.persistit.ValueHelper.MVVValueWriter;
import com.persistit.ValueHelper.RawValueWriter;
import com.persistit.VolumeStructure.Chain;
import com.persistit.exception.BufferSizeUnavailableException;
import com.persistit.exception.CorruptVolumeException;
import com.persistit.exception.InUseException;
import com.persistit.exception.PersistitException;
import com.persistit.exception.PersistitInterruptedException;
import com.persistit.exception.ReadOnlyVolumeException;
import com.persistit.exception.RebalanceException;
import com.persistit.exception.RetryException;
import com.persistit.exception.RollbackException;
import com.persistit.exception.TreeNotFoundException;
import com.persistit.exception.VersionsOutOfOrderException;
import com.persistit.exception.WWRetryException;
import com.persistit.policy.JoinPolicy;
import com.persistit.policy.SplitPolicy;
import com.persistit.util.Debug;
import com.persistit.util.Util;

/**
* <p>
* The main facade for fetching, storing and removing records from a
* Persistit&trade; database.
* </p>
* <p>
* Applications interact with Persistit through instances of this class. A
* <code>Exchange</code> has two important associated member objects, a
* {@link com.persistit.Key} and a {@link com.persistit.Value}. A
* <code>Key</code> is a mutable representation of a key, and a
* <code>Value</code> is a mutable representation of a value. Applications
* manipulate these objects and interact with the database through one of the
* following four general patterns:
* <ol>
* <li>
* Modify the <code>Key</code>, perform a {@link com.persistit.Exchange#fetch
* fetch} operation, and query the <code>Value</code>.</li>
* <li>
* Modify the <code>Key</code>, modify the <code>Value</code>, and then perform
* a {@link com.persistit.Exchange#store store} operation to insert or replace
* data in the database.</li>
* <li>
* Modify the <code>Key</code>, and then perform a
* {@link com.persistit.Exchange#remove remove} to remove one or more key/value
* pairs.</li>
* <li>
* Optionally modify the <code>Key</code>, perform a
* {@link com.persistit.Exchange#traverse traverse} operation, then query the
* resulting state of <code>Key</code> and/or <code>Value</code> to enumerate
* key/value pairs currently stored in the database.</li>
* </ol>
* <p>
* Additional methods of <code>Exchange</code> include {@link #fetchAndStore
* fetchAndStore} and {@link #fetchAndRemove fetchAndRemove} which atomically
* modify the database and return the former value associated with the current
* <code>Key</code>.
* </p>
* <p>
* <h3>Exchange is Not Threadsafe</h3>
* <em>Important:</em> an <code>Exchange</code> and its associated
* <code>Key</code> and <code>Value</code> instances are <i>not</i> thread-safe.
* Generally each <code>Thread</code> should allocate and use its own
* <code>Exchange</code> instances. Were it to occur, modification of the
* <code>Key</code> or <code>Value</code> objects associated with an
* <code>Exchange</code> by another thread could cause severe and unpredictable
* errors, including possible corruption of the underlying data storage. While
* the methods of one <code>Exchange</code> instance are not threadsafe,
* Persistit is designed to allow multiple threads, using <em>multiple</em>
* <code>Exchange</code> instances, to access and update the underlying database
* in a highly concurrent fashion.
* </p>
* <p>
* <h3>Exchange Pools</h3>
* Normally each thread should allocate its own <code>Exchange</code> instances.
* However, depending on the garbage collection performance characteristics of a
* particular JVM it may be desirable to maintain a pool of
* <code>Exchange</code>s available for reuse, thereby reducing the frequency
* with which <code>Exchange</code>s need to be constructed and then garbage
* collected. An application may get an Exchange using
* {@link Persistit#getExchange(String, String, boolean)} or
* {@link Persistit#getExchange(Volume, String, boolean)}. These methods reuse a
* previously constructed <code>Exchange</code> if one is available in a pool;
* otherwise these methods construct a new one. Applications using the
* Exchange pool should call
* {@link Persistit#releaseExchange(Exchange, boolean)} to relinquish an
* <code>Exchange</code> once it is no longer needed, thereby placing it in the
* pool for subsequent reuse.
* </p>
*
* @version 1.0
*/
public class Exchange implements ReadOnlyExchange {

    public enum Sequence {
        NONE, FORWARD, REVERSE
    }

    /**
     * {@link MVV.VersionVisitor} used by <code>Exchange</code> to examine the
     * versions of a multi-version value. Depending on the {@link Usage}
     * supplied to {@link #initInternal}, each visited version is either
     * evaluated for visibility (<code>FETCH</code>) or checked for a
     * write-write dependency (<code>STORE</code>). The result of a visit is
     * available through {@link #foundVersion()}, {@link #getOffset()} and
     * {@link #getLength()}.
     */
    private static class MvvVisitor implements MVV.VersionVisitor {
        /** The purpose for which versions are being visited. */
        enum Usage {
            FETCH, STORE
        }

        /**
         * Timestamp used for FETCH when no <code>TransactionStatus</code> was
         * supplied ('read committed' mode).
         */
        private final static long READ_COMMITTED_TS = TransactionStatus.UNCOMMITTED - 1;

        private final TransactionIndex _ti;
        private final Exchange _exchange;
        private TransactionStatus _status;
        private int _step;
        // State describing the best version seen so far; reset by init().
        private int _foundOffset;
        private int _foundLength;
        private long _foundVersion;
        private int _foundStep;
        private Usage _usage;

        private MvvVisitor(final TransactionIndex ti, final Exchange exchange) {
            _ti = ti;
            _exchange = exchange;
        }

        /**
         * Configure this visitor before a visit. Asserts (via
         * <code>Debug.$assert0</code>) that <code>status</code> is non-null
         * when <code>usage</code> is {@link Usage#STORE}.
         *
         * @param status
         *            Status to inspect the versions as. <code>null</code> is
         *            allowed iff <code>usage</code> is {@link Usage#FETCH},
         *            which signifies 'read committed' mode.
         * @param step
         *            Current step value associated with <code>status</code>.
         * @param usage
         *            What reason this visit is being done for.
         */
        public void initInternal(final TransactionStatus status, final int step, final Usage usage) {
            Debug.$assert0.t(status != null || usage != Usage.STORE);
            _status = status;
            _step = step;
            _usage = usage;
        }

        /**
         * @return the offset recorded for the selected version, or -1 if
         *         none has been recorded since {@link #init()}
         */
        public int getOffset() {
            return _foundOffset;
        }

        /**
         * @return the value length recorded for the selected version, or -1
         *         if none has been recorded since {@link #init()}
         */
        public int getLength() {
            return _foundLength;
        }

        /**
         * @return <code>true</code> if a version has been recorded since
         *         {@link #init()}
         */
        public boolean foundVersion() {
            return _foundVersion != MVV.VERSION_NOT_FOUND;
        }

        /**
         * Reset the found-version state so that no version is considered
         * found.
         */
        @Override
        public void init() {
            _foundVersion = MVV.VERSION_NOT_FOUND;
            _foundOffset = -1;
            _foundLength = -1;
            _foundStep = 0;
        }

        /**
         * Examine one version.
         * <p>
         * For <code>FETCH</code>, the version's commit status is obtained from
         * the <code>TransactionIndex</code>; a version whose status is
         * non-negative, not <code>UNCOMMITTED</code> and not less than the
         * status already recorded becomes the selected version if its step is
         * at least the recorded step or its status is strictly greater.
         * </p>
         * <p>
         * For <code>STORE</code>, the write-write dependency on the version is
         * obtained; a timeout raises {@link WWRetryException}, any other
         * non-zero, non-<code>ABORTED</code> result rolls back the
         * transaction and raises {@link RollbackException}. Otherwise the
         * largest version handle seen is recorded.
         * </p>
         *
         * @param version
         *            version handle of the visited version
         * @param offset
         *            offset reported for the visited version
         * @param valueLength
         *            length reported for the visited version
         * @throws PersistitException
         *             including {@link PersistitInterruptedException} if the
         *             thread is interrupted while consulting the
         *             <code>TransactionIndex</code>
         */
        @Override
        public void sawVersion(final long version, final int offset, final int valueLength) throws PersistitException {
            try {
                switch (_usage) {
                case FETCH:
                    // A null status means 'read committed' mode.
                    final long ts = _status != null ? _status.getTs() : READ_COMMITTED_TS;
                    final long status = _ti.commitStatus(version, ts, _step);
                    if (status >= 0 && status != TransactionStatus.UNCOMMITTED && status >= _foundVersion) {
                        assert status <= ts;
                        final int step = TransactionIndex.vh2step(version);
                        // Prefer a strictly newer status, or an equal status
                        // with an equal-or-later step.
                        if (step >= _foundStep || status > _foundVersion) {
                            _foundOffset = offset;
                            _foundLength = valueLength;
                            _foundVersion = status;
                            _foundStep = step;
                        }
                    }
                    break;

                case STORE:
                    final long depends = _ti.wwDependency(version, _status, 0);
                    if (depends == TransactionStatus.TIMED_OUT) {
                        throw new WWRetryException(version);
                    }
                    if (depends != 0 && depends != TransactionStatus.ABORTED) {
                        // version is from concurrent txn that already committed
                        // or timed out waiting to see. Either
                        // way, must abort.
                        _exchange._transaction.rollback();
                        throw new RollbackException();
                    }
                    // Track the highest version handle encountered.
                    if (version > _foundVersion) {
                        _foundVersion = version;
                    }
                    break;
                }
            } catch (final InterruptedException ie) {
                throw new PersistitInterruptedException(ie);
            }
        }
    }

    /**
     * Maximum number of levels in one tree. (This count represents a highly
     * pathological case: most trees, even large ones, are no more than four or
     * five levels deep.)
     */
    final static int MAX_TREE_DEPTH = 20;
    /**
     * Upper bound on horizontal page searches.
     */
    final static int MAX_WALK_RIGHT = 50;

    // NOTE(review): LEFT_CLAIMED and RIGHT_CLAIMED look like bit values for
    // LevelCache._flags; their use is not visible in this part of the file.
    private final static int LEFT_CLAIMED = 1;

    private final static int RIGHT_CLAIMED = 2;

    private final static int VERSIONS_OUT_OF_ORDER_RETRY_COUNT = 3;

    private Persistit _persistit;

    // The application-visible Key and Value, returned by getKey()/getValue().
    private final Key _key;
    private final Value _value;

    // One LevelCache per possible tree level; entries are created or
    // invalidated by initCache().
    private final LevelCache[] _levelCache = new LevelCache[MAX_TREE_DEPTH];

    private BufferPool _pool;
    private Volume _volume;
    private Tree _tree;
    private long _timeoutMillis = SharedResource.DEFAULT_MAX_WAIT_TIME;

    // Tree generation and depth last observed by checkLevelCache(); a
    // generation of -1 forces the next check to refresh the cache.
    private volatile long _cachedTreeGeneration = -1;
    private volatile int _cacheDepth = 0;

    // Scratch Key/Value instances allocated once by the constructor.
    private Key _spareKey1;
    private Key _spareKey2;
    private final Key _spareKey3;
    private final Key _spareKey4;

    private final Value _spareValue;

    // Initialized from the Persistit defaults in init(Tree) and removeState().
    private SplitPolicy _splitPolicy;
    private JoinPolicy _joinPolicy;

    // Set in init(Tree) to whether the tree is its volume's directory tree.
    private boolean _isDirectoryExchange = false;

    private Transaction _transaction;

    // Set in init(...) from Volume#isTemporary().
    private boolean _ignoreTransactions;
    private boolean _ignoreMVCCFetch;
    private boolean _storeCausedSplit;
    private int _keysVisitedDuringTraverse;

    private Object _appCache;

    private ReentrantResourceHolder _treeHolder;

    private final MvvVisitor _mvvVisitor;
    private final RawValueWriter _rawValueWriter = new RawValueWriter();
    private final MVVValueWriter _mvvValueWriter = new MVVValueWriter();
    private LongRecordHelper _longRecordHelper;

    // NOTE(review): presumably the thread checked by assertCorrectThread();
    // that method is not visible in this part of the file.
    private volatile Thread _thread;

    Exchange(final Persistit persistit) {
        _persistit = persistit;
        _key = new Key(_persistit);
        _spareKey1 = new Key(_persistit);
        _spareKey2 = new Key(_persistit);
        _spareKey3 = new Key(_persistit);
        _spareKey4 = new Key(_persistit);
        _value = new Value(_persistit);
        _spareValue = new Value(_persistit);
        _mvvVisitor = new MvvVisitor(_persistit.getTransactionIndex(), this);
    }

    /**
     * <p>
     * Construct a new <code>Exchange</code> object to create and/or access the
     * {@link Tree} specified by treeName within the {@link Volume} specified by
     * <code>volumeName</code>. This constructor optionally creates a new
     * <code>Tree</code>. If the <code>create</code> parameter is false and a
     * <code>Tree</code> by the specified name does not exist, this constructor
     * throws a {@link com.persistit.exception.TreeNotFoundException}.
     * </p>
     * <p>
     * The <code>volumeName</code> you supply must match exactly one open
     * <code>Volume</code>. The name matches if either (a) the
     * <code>Volume</code> has an optional alias that is equal to the supplied
     * name, or (b) if the supplied name matches a substring of the
     * <code>Volume</code>'s pathname. If there is no unique match for the name
     * you supply, this method throws a
     * {@link com.persistit.exception.VolumeNotFoundException}.
     * </p>
     *
     * @param persistit
     *            The <code>Persistit</code> instance used to look up the
     *            volume and to which the new <code>Exchange</code> belongs.
     *
     * @param volumeName
     *            The volume name that either matches the alias or partially
     *            matches the pathname of exactly one open <code>Volume</code>.
     *
     * @param treeName
     *            The tree name
     *
     * @param create
     *            <code>true</code> to create a new Tree if one by the specified
     *            name does not already exist.
     *
     * @throws PersistitException
     */
    public Exchange(final Persistit persistit, final String volumeName, final String treeName, final boolean create)
            throws PersistitException {
        this(persistit, persistit.getVolume(volumeName), treeName, create);
    }

    /**
     * <p>
     * Construct a new <code>Exchange</code> object to create and/or access the
     * {@link Tree} specified by treeName within the specified {@link Volume}.
     * This constructor optionally creates a new <code>Tree</code>. If the
     * <code>create</code> parameter is false and a <code>Tree</code> by the
     * specified name does not exist, this constructor throws a
     * {@link com.persistit.exception.TreeNotFoundException}.
     * </p>
     *
     * @param persistit
     *            The <code>Persistit</code> instance to which the new
     *            <code>Exchange</code> belongs.
     * @param volume
     *            The Volume; must not be <code>null</code>.
     * @param treeName
     *            The tree name
     * @param create
     *            <code>true</code> to create a new Tree if one by the specified
     *            name does not already exist.
     * @throws NullPointerException
     *             if <code>volume</code> is <code>null</code>
     * @throws PersistitException
     */
    public Exchange(final Persistit persistit, final Volume volume, final String treeName, final boolean create)
            throws PersistitException {
        this(persistit);
        // Fail fast here; init(Volume, String, boolean) repeats this check.
        if (volume == null) {
            throw new NullPointerException();
        }
        init(volume, treeName, create);
    }

    /**
     * Construct a new <code>Exchange</code> to access the same {@link Volume}
     * and {@link Tree} as the supplied <code>Exchange</code>. The states of the
     * supplied <code>Exchange</code>'s {@link Key} and {@link Value} objects
     * are copied to the new <code>Key</code> and new <code>Value</code>
     * associated with this <code>Exchange</code> so that operations on the two
     * <code>Exchange</code>s initially behave identically.
     *
     * @param exchange
     *            The <code>Exchange</code> to copy from.
     */
    public Exchange(final Exchange exchange) {
        this(exchange._persistit);
        init(exchange);
    }

    /**
     * Construct a new <code>Exchange</code> to access the specified
     * {@link Tree}.
     *
     * @param tree
     *            The <code>Tree</code> to access.
     * @throws BufferSizeUnavailableException
     *             NOTE(review): this exception is not declared in the
     *             constructor's signature; confirm whether this tag is still
     *             accurate.
     */
    public Exchange(final Tree tree) {
        this(tree._persistit);
        init(tree);
        // init(Tree) assigns the volume and directory flag and calls
        // initCache() whenever the volume or tree differs from the current
        // one; the same steps are repeated here unconditionally.
        _volume = tree.getVolume();
        _isDirectoryExchange = tree == _volume.getDirectoryTree();
        initCache();
    }

    void init(final Volume volume, final String treeName, final boolean create) throws PersistitException {
        if (volume == null) {
            throw new NullPointerException();
        }

        final Tree tree = volume.getTree(treeName, create);
        if (tree == null) {
            throw new TreeNotFoundException(treeName);
        }
        init(tree);
    }

    void init(final Tree tree) {
        assertCorrectThread(true);
        final Volume volume = tree.getVolume();
        _ignoreTransactions = volume.isTemporary();
        _ignoreMVCCFetch = false;
        _pool = volume.getStructure().getPool();

        _transaction = _persistit.getTransaction();
        _key.clear();
        _value.clear();

        if (_volume != volume || _tree != tree) {
            _volume = volume;
            _tree = tree;
            _treeHolder = new ReentrantResourceHolder(_tree);
            _cachedTreeGeneration = -1;
            _isDirectoryExchange = tree == _volume.getDirectoryTree();
            initCache();
        }
        _splitPolicy = _persistit.getDefaultSplitPolicy();
        _joinPolicy = _persistit.getDefaultJoinPolicy();
    }

    void init(final Exchange exchange) {
        assertCorrectThread(true);
        _persistit = exchange._persistit;
        _volume = exchange._volume;
        _ignoreTransactions = _volume.isTemporary();
        _ignoreMVCCFetch = false;
        _tree = exchange._tree;
        _treeHolder = new ReentrantResourceHolder(_tree);
        _pool = exchange._pool;

        _cachedTreeGeneration = -1;
        _transaction = _persistit.getTransaction();
        _cacheDepth = exchange._cacheDepth;

        initCache();

        for (int index = 0; index < _cacheDepth; index++) {
            exchange._levelCache[index].copyTo(_levelCache[index]);
        }

        exchange._key.copyTo(_key);
        exchange._value.copyTo(_value);
        _splitPolicy = exchange._splitPolicy;
        _joinPolicy = exchange._joinPolicy;
    }

    void removeState(final boolean secure) {
        assertCorrectThread(false);
        _key.clear(secure);
        _value.clear(secure);
        _spareKey1.clear(secure);
        _spareKey2.clear(secure);
        _spareValue.clear(secure);
        _transaction = null;
        _ignoreTransactions = false;
        _ignoreMVCCFetch = false;
        _splitPolicy = _persistit.getDefaultSplitPolicy();
        _joinPolicy = _persistit.getDefaultJoinPolicy();
        _treeHolder.verifyReleased();
    }

    /**
     * Drop all cached optimization information
     */
    public void initCache() {
        assertCorrectThread(true);
        for (int level = 0; level < MAX_TREE_DEPTH; level++) {
            if (_levelCache[level] != null)
                _levelCache[level].invalidate();
            else
                _levelCache[level] = new LevelCache(level);
        }
    }

    private void checkLevelCache() throws PersistitException {

        if (!_tree.isLive()) {
            if (_tree.getVolume().isTemporary()) {
                _tree = _tree.getVolume().getTree(_tree.getName(), true);
                _treeHolder = new ReentrantResourceHolder(_tree);
                _cachedTreeGeneration = -1;
            } else {
                throw new TreeNotFoundException();
            }
        }
        if (_cachedTreeGeneration != _tree.getGeneration()) {
            _cachedTreeGeneration = _tree.getGeneration();
            _cacheDepth = _tree.getDepth();
            for (int index = 0; index < MAX_TREE_DEPTH; index++) {
                final LevelCache lc = _levelCache[index];
                lc.invalidate();
            }
        }
    }

    private class LevelCache {
        int _level;
        Buffer _buffer;
        long _page;
        long _bufferGeneration;
        long _keyGeneration;
        int _foundAt;
        int _lastInsertAt;
        //
        // The remaining fields are used only by raw_removeKeyRangeInternal and
        // its helpers.
        //
        Buffer _leftBuffer;
        Buffer _rightBuffer;
        int _leftFoundAt;
        int _rightFoundAt;
        int _flags;
        long _deallocLeftPage;
        long _deallocRightPage;

        private LevelCache(final int level) {
            _level = level;
        }

        @Override
        public String toString() {
            if (_buffer == null)
                return "<empty>";

            return "Buffer=<" + _buffer + ">" + ", keyGeneration=" + _keyGeneration + ", bufferGeneration="
                    + _bufferGeneration + ", foundAt=" + _buffer.foundAtString(_foundAt) + ">";
        }

        private void copyTo(final LevelCache to) {
            Debug.$assert0.t(to._level == _level || to._level == -1);
            to._buffer = _buffer;
            to._page = _page;
            to._foundAt = _foundAt;
            to._keyGeneration = _keyGeneration;
            to._bufferGeneration = _bufferGeneration;
        }

        private void invalidate() {
            _buffer = null;
            _bufferGeneration = -1;
        }

        private void updateInsert(final Buffer buffer, final Key key, final int foundAt) {
            update(buffer, key, foundAt);
            _lastInsertAt = foundAt;
        }

        private void update(final Buffer buffer, final Key key, final int foundAt) {
            Debug.$assert0.t(_level + PAGE_TYPE_DATA == buffer.getPageType());
            // Debug.$assert0.t(foundAt == -1 || (foundAt & EXACT_MASK) == 0
            // || Buffer.decodeDepth(foundAt) == key.getEncodedSize());

            _page = buffer.getPageAddress();
            _buffer = buffer;
            _bufferGeneration = buffer.getGeneration();

            if (key == _key && foundAt > 0 && !buffer.isAfterRightEdge(foundAt)) {
                _keyGeneration = key.getGeneration();
                _foundAt = foundAt;
            } else {
                _keyGeneration = -1;
                _foundAt = -1;
            }
        }

        private Sequence sequence(final int foundAt) {
            final int delta = ((foundAt & P_MASK) - (_lastInsertAt & P_MASK));
            if ((foundAt & EXACT_MASK) == 0 && delta == KEYBLOCK_LENGTH) {
                return Sequence.FORWARD;
            }
            if ((foundAt & EXACT_MASK) == 0 && delta == 0) {
                return Sequence.REVERSE;
            }
            return Sequence.NONE;
        }

        private void initRemoveFields() {
            _leftBuffer = null;
            _rightBuffer = null;
            _leftFoundAt = -1;
            _rightFoundAt = -1;
            _flags = 0;
        }
    }

    /**
     * Bit flags that are passed to {@link #storeInternal(Key, Value, int, int)}
     * to control various behavior. See each member for specifics.
     */
    static class StoreOptions {
        /** The default, implies none of the further options **/
        public static final int NONE = 0;

        /** Fetch the current value before replacing **/
        public static final int FETCH = 1 << 1;

        /** Use MVCC (store as version or fetch restricted version) **/
        public static final int MVCC = 1 << 2;

        /** Block and use for any acquire operation **/
        public static final int WAIT = 1 << 3;

        /** Perform the store only if key is currently visible **/
        public static final int ONLY_IF_VISIBLE = 1 << 4;

        /**
         * Don't write the store operation to the journal - used when storing
         * AntiValues
         **/
        public static final int DONT_JOURNAL = 1 << 5;
    }

    static enum PruneStatus {
        REMOVED, CHANGED, UNCHANGED
    }

    /**
     * A visitor used with the
     * {@link Exchange#traverse(Key.Direction, boolean, int, TraverseVisitor)}
     * The {@link #visit(ReadOnlyExchange)} method is called once for each
     * <code>Key</code> traversed by the <code>traverse</code> method.
     */
    public interface TraverseVisitor {
        /**
         * Receive an Exchange having <code>Key</code> and <code>Value</code>
         * values set by
         * {@link Exchange#traverse(Key.Direction, boolean, int, TraverseVisitor)}
         * . This method will be called once for each key encountered in the
         * traversal. This method may return <code>false</code> to stop
         * traversing additional keys. </p>
         * <p>
         * The implementation of this method:
         * <ul>
         * <li>Must return quickly, especially in a multi-threaded environment,
         * to avoid blocking other threads that may attempt to update records in
         * the same <code>Buffer</code>,
         * <li>Must not perform update operations on any <codeExchange</code>,
         * especially in a multi-threaded environment, to prevent deadlocks,
         * <li>May read and modify the <code>Key</code> and <code>Value</code>
         * fields of the supplied <code>ReadOnlyExchange</code>. Note, however,
         * that modifying the <code>Key</code> affects the results of subsequent
         * traversal operations.
         * </ul>
         *
         * @param ex
         *            a {@link ReadOnlyExchange} from which the current
         *            <code>Key</code> and <code>Value</code> may be read
         * @return <code>true</code> to continue traversing keys, or
         *         <code>false</code> to stop
         * @throws PersistitException
         */
        public boolean visit(final ReadOnlyExchange ex) throws PersistitException;
    }

    /**
     * Delegate to {@link Key#reset} on the associated <code>Key</code> object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange reset() {
        getKey().reset();
        return this;
    }

    /**
     * Delegate to {@link Key#clear} on the associated <code>Key</code> object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange clear() {
        getKey().clear();
        return this;
    }

    /**
     * Delegate to {@link Key#setDepth} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange setDepth(final int depth) {
        getKey().setDepth(depth);
        return this;
    }

    /**
     * Delegate to {@link Key#cut(int)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange cut(final int level) {
        getKey().cut(level);
        return this;
    }

    /**
     * Delegate to {@link Key#cut()} on the associated <code>Key</code> object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange cut() {
        getKey().cut();
        return this;
    }

    /**
     * Delegate to {@link Key#append(boolean)} on the associated
     * <code>Key</code> object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange append(final boolean item) {
        getKey().append(item);
        return this;
    }

    /**
     * Delegate to {@link Key#append(byte)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange append(final byte item) {
        getKey().append(item);
        return this;
    }

    /**
     * Delegate to {@link Key#append(short)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange append(final short item) {
        getKey().append(item);
        return this;
    }

    /**
     * Delegate to {@link Key#append(char)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange append(final char item) {
        getKey().append(item);
        return this;
    }

    /**
     * Delegate to {@link Key#append(int)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange append(final int item) {
        getKey().append(item);
        return this;
    }

    /**
     * Delegate to {@link Key#append(long)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange append(final long item) {
        getKey().append(item);
        return this;
    }

    /**
     * Delegate to {@link Key#append(float)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange append(final float item) {
        getKey().append(item);
        return this;
    }

    /**
     * Delegate to {@link Key#append(double)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange append(final double item) {
        getKey().append(item);
        return this;
    }

    /**
     * Delegate to {@link Key#append(Object)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange append(final Object item) {
        getKey().append(item);
        return this;
    }

    /**
     * Delegate to {@link Key#to(boolean)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange to(final boolean item) {
        getKey().to(item);
        return this;
    }

    /**
     * Delegate to {@link Key#to(byte)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange to(final byte item) {
        getKey().to(item);
        return this;
    }

    /**
     * Delegate to {@link Key#to(short)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange to(final short item) {
        getKey().to(item);
        return this;
    }

    /**
     * Delegate to {@link Key#to(char)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange to(final char item) {
        getKey().to(item);
        return this;
    }

    /**
     * Delegate to {@link Key#to(int)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange to(final int item) {
        getKey().to(item);
        return this;
    }

    /**
     * Delegate to {@link Key#to(long)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange to(final long item) {
        getKey().to(item);
        return this;
    }

    /**
     * Delegate to {@link Key#to(float)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange to(final float item) {
        getKey().to(item);
        return this;
    }

    /**
     * Delegate to {@link Key#to(double)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange to(final double item) {
        getKey().to(item);
        return this;
    }

    /**
     * Delegate to {@link Key#to(Object)} on the associated <code>Key</code>
     * object.
     *
     * @return This <code>Exchange</code> to permit method call chaining.
     */
    public Exchange to(final Object item) {
        getKey().to(item);
        return this;
    }

    /**
     * Return the {@link Key} associated with this <code>Exchange</code>.
     *
     * @return This <code>Key</code>.
     */
    @Override
    public Key getKey() {
        assertCorrectThread(true);
        return _key;
    }

    /**
     * Return the {@link Value} associated with this <code>Exchange</code>.
     *
     * @return The <code>Value</code>.
     */
    @Override
    public Value getValue() {
        assertCorrectThread(true);
        return _value;
    }

    BufferPool getBufferPool() {
        return _pool;
    }

    /**
     * Return the {@link Volume} containing the data accessed by this
     * <code>Exchange</code>.
     *
     * @return The <code>Volume</code>.
     */
    @Override
    public Volume getVolume() {
        assertCorrectThread(true);
        return _volume;
    }

    /**
     * Return the {@link Tree} on which this <code>Exchange</code> operates.
     *
     * @return The <code>Tree</code>
     */
    @Override
    public Tree getTree() {
        assertCorrectThread(true);
        return _tree;
    }

    /**
     * Return the Persistit instance from which this Exchange was created.
     *
     * @return The <code>Persistit</code> instance.
     */
    @Override
    public Persistit getPersistitInstance() {
        assertCorrectThread(true);
        return _persistit;
    }

    /**
     * Return the count of structural changes committed to the {@link Tree} on
     * which this <code>Exchange</code> operates. This count includes changes
     * committed by all Threads, including the current one. A structural change
     * is one in which at least one key is inserted or deleted. Replacement of
     * an existing value is not counted.
     *
     * @return The change count
     */
    @Override
    public long getChangeCount() {
        assertCorrectThread(true);
        return _tree.getChangeCount();
    }

    /**
     * An additional <code>Key</code> maintained for the convenience of
     * {@link Transaction}, {@link PersistitMap} and {@link JournalManager}.
     *
     * @return spareKey1
     */
    Key getAuxiliaryKey1() {
        return _spareKey1;
    }

    /**
     * An additional <code>Key</code> maintained for the convenience of
     * {@link Transaction}, {@link PersistitMap} and {@link JournalManager}.
     *
     * @return spareKey3
     */
    Key getAuxiliaryKey3() {
        return _spareKey3;
    }

    /**
     * An additional <code>Key</code> maintained for the convenience of
     * {@link Transaction}, {@link PersistitMap} and {@link JournalManager}.
     *
     * @return spareKey4
     */
    Key getAuxiliaryKey4() {
        return _spareKey4;
    }

    /**
     * An additional <code>Key</code> maintained for the convenience of
     * {@link Transaction}, {@link PersistitMap} and {@link JournalManager}.
     *
     * @return spareKey2
     */
    Key getAuxiliaryKey2() {
        return _spareKey2;
    }

    /**
     * An additional <code>Value</code> maintained for the convenience of
     * {@link Transaction}.
     *
     * @return spareValue
     */
    Value getAuxiliaryValue() {
        return _spareValue;
    }

    /**
     * Internal value that, if <code>true</code>, indicates the last store
     * operation caused a page split. Reset on every call to a store method.
     *
     * @return storeCausedSplit
     */
    boolean getStoreCausedSplit() {
        return _storeCausedSplit;
    }

    /**
     * Internal value that is a counter of the total loops of the inner loop of
     * {@link #traverse(com.persistit.Key.Direction, boolean, int, int, int)}.
     *
     * @return {@link #_keysVisitedDuringTraverse}
     */
    int getKeysVisitedDuringTraverse() {
        return _keysVisitedDuringTraverse;
    }

    /**
     * Return a displayable String containing the volume name, tree name and
     * current key state for this <code>Exchange</code>.
     *
     * @return The displayable String
     */
    @Override
    public String toString() {
        return "Exchange(Volume=" + _volume.getPath() + ",Tree=" + _tree.getName() + "," + ",Key=<" + _key.toString()
                + ">)";

    }

    /**
     * Search for a data record by key. Uses and maintains level cache. This
     * method returns a foundAt location within a Buffer.
     * <p />
     * As a side effect, this method populates the root LevelCache instance
     * (_levelCache[0]) and establishes a claim on a Buffer at that level to
     * which the foundAt value refers. The caller of this method MUST release
     * that Buffer when finished with it.
     *
     * @return Encoded key location within the data page. The page itself is
     *         made valid in the level cache.
     */
    private int search(final Key key, final boolean writer) throws PersistitException {
        Buffer buffer = null;
        checkLevelCache();
        final LevelCache lc = _levelCache[0];
        buffer = quicklyReclaimBuffer(lc, writer);

        if (buffer == null) {
            return searchTree(key, 0, writer);
        }

        checkPageType(buffer, PAGE_TYPE_DATA, true);

        final int foundAt = findKey(buffer, key, lc);

        if (buffer.isBeforeLeftEdge(foundAt) || buffer.isAfterRightEdge(foundAt)) {
            buffer.release();
            return searchTree(key, 0, writer);
        }
        return foundAt;
    }

    /**
     * Helper method to return the result of the {@link Buffer#findKey(Key)}
     * method given a Buffer, a Key and a LevelCache instance. The caller must
     * establish a claim on the Buffer before calling this method. This method
     * determines whether information cached in the LevelCache is still valid;
     * if so the previous result is still valid.
     *
     * @param buffer
     * @param key
     * @param lc
     * @return foundAt value
     * @throws PersistitInterruptedException
     */
    private int findKey(final Buffer buffer, final Key key, final LevelCache lc) throws PersistitInterruptedException {
        //
        // Possibly we can accelerate.
        //
        // TODO - metrics on hits vs. misses
        //
        int foundAt = lc._foundAt;
        if (foundAt != -1 && buffer.getGeneration() == lc._bufferGeneration && key == _key
                && key.getGeneration() == lc._keyGeneration) {
            Debug.$assert0.t(buffer.findKey(key) == foundAt);
            return foundAt;
        }
        //
        // Otherwise look it up again.
        //
        foundAt = buffer.findKey(key);
        // TODO - why do this if key != _key
        lc.update(buffer, key, foundAt);
        return foundAt;
    }

    /**
     * Searches for the supplied key from the top of the tree down to
     * <code>toLevel</code> and populates the level cache while doing so.
     * <p />
     * A reader claim on the tree holder is held for the duration of the
     * descent and released before returning. At each level the page is
     * searched via <code>searchLevel</code>; the buffer of the level above is
     * released only after the buffer for the current level has been obtained.
     * <p />
     * As a side effect, this method populates the LevelCache instance for
     * <code>toLevel</code> (_levelCache[0] when searching for a data record)
     * and establishes a claim on a Buffer at that level to which the foundAt
     * value refers. The caller of this method MUST release that Buffer when
     * finished with it.
     *
     * @param key
     *            the key to search for
     * @param toLevel
     *            the level at which the descent stops; 0 is the level used by
     *            <code>search</code> for data records
     * @param writer
     *            forwarded to <code>searchLevel</code> only for the page at
     *            <code>toLevel</code>; pages above that level are requested
     *            with <code>false</code>
     * @return Encoded key location within the level. The page itself is valid
     *         within the level cache.
     * @throws PersistitException
     *             if the tree claim cannot be obtained
     *             (<code>InUseException</code>), or propagated from the
     *             search and corruption-reporting calls
     */
    private int searchTree(final Key key, final int toLevel, final boolean writer) throws PersistitException {
        // Buffer from the previous (higher) level that still has to be
        // released; the finally block releases whatever is left here.
        Buffer oldBuffer = null;
        int currentLevel;
        int foundAt = -1;

        if (!_treeHolder.claim(false)) {
            Debug.$assert0.t(false);
            throw new InUseException("Thread " + Thread.currentThread().getName() + " failed to get reader claim on "
                    + _tree);
        }
        checkLevelCache();

        long pageAddress = _tree.getRootPageAddr();
        long oldPageAddress = pageAddress;
        Debug.$assert0.t(pageAddress != 0);

        try {
            // Walk from the topmost cached level down to toLevel.
            for (currentLevel = _cacheDepth; --currentLevel >= toLevel;) {
                if (pageAddress <= 0) {
                    corrupt("Volume " + _volume + " level=" + currentLevel + " page=" + pageAddress + " oldPage="
                            + oldPageAddress + " key=<" + key.toString() + "> " + " invalid page address");
                }

                // Only the page at the target level is requested with the
                // caller's writer flag.
                foundAt = searchLevel(key, false, pageAddress, currentLevel, writer && currentLevel == toLevel);
                // The current level's buffer is now claimed, so the parent
                // can be let go.
                if (oldBuffer != null) {
                    oldBuffer.releaseTouched();
                    oldBuffer = null;
                }

                final LevelCache lc = _levelCache[currentLevel];
                final Buffer buffer = lc._buffer;

                if (buffer == null || buffer.isBeforeLeftEdge(foundAt)) {
                    oldBuffer = buffer; // So it will be released
                    corrupt("Volume " + _volume + " level=" + currentLevel + " page=" + pageAddress + " key=<"
                            + key.toString() + "> " + " is before left edge");
                }

                checkPageType(buffer, currentLevel + PAGE_TYPE_DATA, true);

                if (currentLevel == toLevel) {
                    // Invalidate cached levels strictly between the target
                    // level and level 0.
                    // NOTE(review): because the condition is "--level > 0",
                    // level 0 is never invalidated here - confirm intended.
                    for (int level = currentLevel; --level > 0;) {
                        _levelCache[level].invalidate();
                    }
                    // The claimed buffer for this level stays with the
                    // caller; oldBuffer is null at this point.
                    return foundAt;
                } else if (buffer.isIndexPage()) {
                    // Offset of the key block whose pointer leads to the
                    // child page; without an exact match, step back one key
                    // block.
                    int p = foundAt & P_MASK;
                    if ((foundAt & EXACT_MASK) == 0) {
                        p -= KEYBLOCK_LENGTH;
                    }
                    oldBuffer = buffer; // So it will be released
                    oldPageAddress = pageAddress;
                    pageAddress = buffer.getPointer(p);

                    Debug.$assert0.t(pageAddress > 0 && pageAddress < MAX_VALID_PAGE_ADDR);
                } else {
                    // Above the target level every page must be an index
                    // page.
                    oldBuffer = buffer; // So it will be released
                    corrupt("Volume " + _volume + " level=" + currentLevel + " page=" + pageAddress + " key=<"
                            + key.toString() + ">" + " page type=" + buffer.getPageType() + " is invalid");
                }
            }
            // Should never get here.
            return -1;

        } finally {
            // Release any buffer still pending and always give up the tree
            // claim taken at the top of this method.
            if (oldBuffer != null) {
                oldBuffer.releaseTouched();
                oldBuffer = null;
            }
            _treeHolder.release();
        }
    }

    /**
     * Search for the key in the specified page (data or index). This method
     * gets and claims the identified page. If the key is found to be after the
     * right key of that page, this method "walks" right by getting and claiming
     * the right sibling page and then releasing the original page. This pattern
     * implements the B-link-tree semantic that allows searches to proceed while
     * inserts are adjusting the index structure.
     * <p />
     * As a side effect, this method populates the LevelCache instance for the
     * specified <code>currentLevel</code> and establishes a claim on a Buffer
     * at that level. The caller of this method MUST release that Buffer when
     * finished with it.
     *
     * @param key
     *            Key to search for
     * @param edge
     *            if <code>true</code> select the right-edge key of the left
     *            page, otherwise select the left key of the right page.
     * @param pageAddress
     *            The address of the page to search
     * @param currentLevel
     *            current level in the tree
     * @return Encoded key location within the page.
     */
    private int searchLevel(final Key key, final boolean edge, long pageAddress, final int currentLevel,
            final boolean writer) throws PersistitException {
        Buffer oldBuffer = null;
        try {
            final long initialPageAddress = pageAddress; // DEBUG - debugging
                                                         // only
            long oldPageAddress = pageAddress;
            for (int rightWalk = MAX_WALK_RIGHT; rightWalk-- > 0;) {
                Buffer buffer = null;
                if (pageAddress <= 0 || pageAddress >= _volume.getStorage().getNextAvailablePage()) {
                    corrupt("Volume " + _volume + " level=" + currentLevel + " page=" + pageAddress + " previousPage="
                            + oldPageAddress + " initialPage=" + initialPageAddress + " key=<" + key.toString() + ">"
                            + " oldBuffer=<" + oldBuffer + ">" + " invalid page address");
                }
                final LevelCache lc = _levelCache[currentLevel];

                if (lc._page == pageAddress) {
                    buffer = quicklyReclaimBuffer(lc, writer);
                }

                if (buffer == null) {
                    buffer = _pool.get(_volume, pageAddress, writer, true, _timeoutMillis);
                }
                checkPageType(buffer, currentLevel + PAGE_TYPE_DATA, true);

                //
                // Release previous buffer after claiming this one. This
                // prevents another Thread from inserting pages to the left
                // of our new buffer.
                //
                if (oldBuffer != null) {
                    oldBuffer.releaseTouched();
                    oldBuffer = null;
                }

                if (pageAddress != lc._page) {
                    lc.invalidate();
                }

                final int foundAt = findKey(buffer, key, lc);
                if (!buffer.isAfterRightEdge(foundAt) || edge & (foundAt & EXACT_MASK) != 0) {
                    lc.update(buffer, key, foundAt);
                    return foundAt;
                }

                oldPageAddress = pageAddress;
                pageAddress = buffer.getRightSibling();

                Debug.$assert0.t(pageAddress > 0 && pageAddress < MAX_VALID_PAGE_ADDR);
                oldBuffer = buffer;
            }
            corrupt("Volume " + _volume + " level=" + currentLevel + " page=" + oldPageAddress + " initialPage="
                    + initialPageAddress + " key=<" + key.toString() + ">" + " walked right more than "
                    + MAX_WALK_RIGHT + " pages" + " last page visited=" + pageAddress);
            // won't happen - here to make compiler happy.
            return -1;
        } finally {
            if (oldBuffer != null) {
                oldBuffer.releaseTouched();
            }
        }
    }

    int maxValueSize(final int keySize) {
        final int pageSize = _volume.getPageSize();
        final int reserveForKeys = (KEYBLOCK_LENGTH + TAILBLOCK_HDR_SIZE_INDEX) * 3 + maxStorableKeySize(pageSize) * 2
                + keySize;
        return (pageSize - HEADER_SIZE - reserveForKeys) / 2;
    }

    /**
     * Inserts or replaces a data value in the database.
     *
     * @param key
     *            The key to store.
     * @param value
     *            The value to store.
     * @return This <code>Exchange</code> to permit method call chaining.
     * @throws PersistitException
     *             Upon error
     */
    Exchange store(final Key key, final Value value) throws PersistitException {
        assertCorrectThread(true);
        _persistit.checkClosed();
        if (_volume.isReadOnly()) {
            throw new ReadOnlyVolumeException(_volume.toString());
        }
        key.testValidForStoreAndFetch(_volume.getPageSize());
        if (!isDirectoryExchange()) {
            _persistit.checkSuspended();
        }
        throttle();
        int options = StoreOptions.WAIT;
        options |= (!_ignoreTransactions && _transaction.isActive()) ? StoreOptions.MVCC : 0;
        storeInternal(key, value, 0, options);
        _treeHolder.verifyReleased();

        return this;
    }

    /**
     * Inserts or replaces a data value in the database starting at a specified
     * level and working up toward the root of the tree.
     *
     * <p>
     * <b>Note: Fetch and MVCC are exclusive options.</b>
     * </p>
     *
     * @param key
     *            The key to store.
     * @param value
     *            The value to store.
     * @param level
     *            The level of the backing tree to start the insert at.
     * @param options
     *            Bit flag integer controlling various internal behavior. See
     *            members of {@link StoreOptions} for details.
     * @return <code>true</code> if <b>any version</b> of the key already
     *         existed
     * @throws PersistitException
     *             uponError
     */
    boolean storeInternal(Key key, final Value value, int level, final int options) throws PersistitException {

        final boolean doMVCC = (options & StoreOptions.MVCC) > 0;
        final boolean doFetch = (options & StoreOptions.FETCH) > 0;

        // spares used for new splits/levels
        Debug.$assert0.t(key != _spareKey1);

        _storeCausedSplit = false;
        boolean treeClaimRequired = false;
        boolean treeClaimAcquired = false;
        boolean treeWriterClaimRequired = false;
        boolean committed = false;
        boolean incrementMVVCount = false;

        final int maxSimpleValueSize = maxValueSize(key.getEncodedSize());
        final Value spareValue = _persistit.getThreadLocalValue();
        assert !(doMVCC & value == spareValue || doFetch && value == _spareValue) : "storeInternal may use the supplied Value: "
                + value;

        //
        // First insert the record in the data page
        //
        Buffer buffer = null;

        //
        // The LONG_RECORD pointer that was present before the update, if
        // there is a long record being replaced.
        //
        long oldLongRecordPointer = 0;
        long oldLongRecordPointerMVV = 0;
        //
        // The LONG_RECORD pointer for a new long record value, if the
        // the new value is long.
        //
        long newLongRecordPointer = 0;
        long newLongRecordPointerMVV = 0;

        final boolean isLongRecord = value.getEncodedSize() > maxSimpleValueSize;
        if (isLongRecord) {
            //
            // This method may delay significantly for I/O and must
            // be called when there are no other claimed resources.
            //
            newLongRecordPointer = getLongRecordHelper().storeLongRecord(value, _transaction.isActive());
        }

        if (!_ignoreTransactions && ((options & StoreOptions.DONT_JOURNAL) == 0)) {
            _transaction.store(this, key, value);
        }

        boolean keyExisted = false;

        try {

            Value valueToStore = value;

            mainRetryLoop: for (;;) {
                Debug.$assert0.t(buffer == null);
                if (Debug.ENABLED) {
                    Debug.suspend();
                }

                /*
                 * Can't save the old pointer as the state may have changed
                 * since the last claim, could have even been de-allocated, and
                 * just as equally can't hold onto the new one either.
                 */
                oldLongRecordPointerMVV = 0;
                if (!committed && newLongRecordPointerMVV != 0) {
                    _volume.getStructure().deallocateGarbageChain(newLongRecordPointerMVV, 0);
                    newLongRecordPointerMVV = 0;
                    spareValue.changeLongRecordMode(false);
                }

                if (treeClaimRequired && !treeClaimAcquired) {
                    if (!_treeHolder.claim(treeWriterClaimRequired)) {
                        Debug.$assert0.t(false);
                        throw new InUseException("Thread " + Thread.currentThread().getName() + " failed to get "
                                + (treeWriterClaimRequired ? "writer" : "reader") + " claim on " + _tree);
                    }
                    treeClaimAcquired = true;
                }

                checkLevelCache();
                final List<PrunedVersion> prunedVersions = new ArrayList<PrunedVersion>();

                try {
                    if (level >= _cacheDepth) {
                        Debug.$assert0.t(level == _cacheDepth);
                        //
                        // Need to lock the tree because we may need to change
                        // its root.
                        //
                        if (!treeClaimAcquired || !_treeHolder.upgradeClaim()) {
                            treeClaimRequired = true;
                            treeWriterClaimRequired = true;
                            throw RetryException.SINGLE;
                        }

                        Debug.$assert0.t(valueToStore.getPointerValue() > 0);
                        insertIndexLevel(key, valueToStore);
                        break mainRetryLoop;
                    }

                    Debug.$assert0.t(buffer == null);
                    int foundAt = -1;
                    final LevelCache lc = _levelCache[level];
                    buffer = quicklyReclaimBuffer(lc, true);

                    if (buffer != null) {
                        //
                        // Start by assuming cached value is okay
                        //
                        foundAt = findKey(buffer, key, lc);

                        if (buffer.isBeforeLeftEdge(foundAt) || buffer.isAfterRightEdge(foundAt)) {
                            buffer.release();
                            buffer = null;
                        }
                    }

                    if (buffer == null) {
                        foundAt = searchTree(key, level, true);
                        buffer = lc._buffer;
                    }

                    Debug.$assert0.t(buffer != null && (buffer.getStatus() & SharedResource.WRITER_MASK) != 0
                            && (buffer.getStatus() & SharedResource.CLAIMED_MASK) != 0);

                    boolean didPrune = false;
                    boolean splitRequired = false;

                    if (buffer.isDataPage()) {
                        keyExisted = (foundAt & EXACT_MASK) != 0;
                        if (keyExisted) {
                            oldLongRecordPointer = buffer.fetchLongRecordPointer(foundAt);
                        }

                        if (doFetch || doMVCC) {
                            buffer.fetch(foundAt, spareValue);
                            if (oldLongRecordPointer != 0) {
                                if (isLongMVV(spareValue)) {
                                    oldLongRecordPointerMVV = oldLongRecordPointer;
                                    fetchFixupForLongRecords(spareValue, Integer.MAX_VALUE);
                                }
                            }
                            /*
                             * If it was a long MVV we saved it into the
                             * variable above. Otherwise it is a primordial
                             * value that we can't get rid of.
                             */
                            oldLongRecordPointer = 0;

                            if (doFetch) {
                                spareValue.copyTo(_spareValue);
                                fetchFromValueInternal(_spareValue, Integer.MAX_VALUE, buffer);
                            }
                        }

                        /*
                         * If the Tree is private to an active transaction, and
                         * if this is a virgin value, then we can store it
                         * primordially because if the transaction rolls back,
                         * the entire Tree will be removed.
                         */
                        if (doMVCC && (_spareValue.isDefined() || !_tree.isTransactionPrivate(true))) {
                            valueToStore = spareValue;
                            final int valueSize = value.getEncodedSize();
                            int retries = VERSIONS_OUT_OF_ORDER_RETRY_COUNT;

                            for (;;) {
                                try {
                                    /*
                                     * If key didn't exist the value is truly
                                     * non-existent and not just undefined/zero
                                     * length
                                     */
                                    byte[] spareBytes = spareValue.getEncodedBytes();
                                    int spareSize;
                                    if (keyExisted) {
                                        spareSize = MVV.prune(spareBytes, 0, spareValue.getEncodedSize(),
                                                _persistit.getTransactionIndex(), false, prunedVersions);
                                        spareValue.setEncodedSize(spareSize);
                                    } else {
                                        spareSize = -1;
                                    }

                                    final TransactionStatus tStatus = _transaction.getTransactionStatus();
                                    final int tStep = _transaction.getStep();

                                    if ((options & StoreOptions.ONLY_IF_VISIBLE) != 0) {
                                        /*
                                         * Could be single visit of all versions
                                         * but current TI would still require
                                         * calls to both commitStatus() and
                                         * wwDependency()
                                         */
                                        _mvvVisitor.initInternal(tStatus, tStep, MvvVisitor.Usage.FETCH);
                                        MVV.visitAllVersions(_mvvVisitor, spareBytes, 0, spareSize);
                                        final int offset = _mvvVisitor.getOffset();
                                        if (!_mvvVisitor.foundVersion()
                                                || (_mvvVisitor.getLength() > 0 && spareBytes[offset] == MVV.TYPE_ANTIVALUE)) {
                                            // Completely done, nothing to store
                                            keyExisted = false;
                                            break mainRetryLoop;
                                        }
                                    }

                                    // Visit all versions for ww detection
                                    _mvvVisitor.initInternal(tStatus, tStep, MvvVisitor.Usage.STORE);
                                    MVV.visitAllVersions(_mvvVisitor, spareBytes, 0, spareSize);

                                    final int mvvSize = MVV.estimateRequiredLength(spareBytes, spareSize, valueSize);
                                    spareValue.ensureFit(mvvSize);
                                    spareBytes = spareValue.getEncodedBytes();

                                    final long versionHandle = TransactionIndex.tss2vh(
                                            _transaction.getStartTimestamp(), tStep);
                                    int storedLength = MVV.storeVersion(spareBytes, 0, spareSize, spareBytes.length,
                                            versionHandle, value.getEncodedBytes(), 0, valueSize);

                                    incrementMVVCount = (storedLength & MVV.STORE_EXISTED_MASK) == 0;
                                    storedLength &= MVV.STORE_LENGTH_MASK;
                                    spareValue.setEncodedSize(storedLength);

                                    Debug.$assert0.t(MVV.verify(_persistit.getTransactionIndex(), spareBytes, 0,
                                            storedLength));

                                    if (spareValue.getEncodedSize() > maxSimpleValueSize) {
                                        newLongRecordPointerMVV = getLongRecordHelper().storeLongRecord(spareValue,
                                                _transaction.isActive());
                                    }
                                    break;
                                } catch (final VersionsOutOfOrderException e) {
                                    if (--retries <= 0) {
                                        throw e;
                                    }
                                }
                            }
                        }
                    }

                    Debug.$assert0.t(valueToStore.getEncodedSize() <= maxSimpleValueSize);
                    _rawValueWriter.init(valueToStore);

                    splitRequired = putLevel(lc, key, _rawValueWriter, buffer, foundAt, treeClaimAcquired);

                    Debug.$assert0.t((buffer.getStatus() & SharedResource.WRITER_MASK) != 0
                            && (buffer.getStatus() & SharedResource.CLAIMED_MASK) != 0);
                    //
                    // If a split is required but treeClaimAcquired is false
                    // then putLevel did not change anything. It just backed out
                    // so we can repeat after acquiring the claim. We need to
                    // repeat this after acquiring a tree claim.
                    //
                    if (splitRequired && !treeClaimAcquired) {
                        if (!didPrune && buffer.isDataPage()) {
                            didPrune = true;
                            if (buffer.pruneMvvValues(_tree, false, null)) {
                                continue;
                            }
                        }
                        //
                        // TODO - is it worth it to try an instantaneous claim
                        // and retry?
                        //
                        treeClaimRequired = true;
                        buffer.releaseTouched();
                        buffer = null;
                        continue;
                    }
                    //
                    // The value has been written to the buffer and the
                    // buffer is reserved and dirty. No backing out now.
                    // If we made it to here, any LONG_RECORD value is
                    // committed.
                    //
                    if (buffer.isDataPage()) {
                        if (!keyExisted) {
                            _tree.bumpChangeCount();
                        }
                        assert buffer.isDirty() : "Buffer must be dirty";
                        committed = true;
                        if (incrementMVVCount) {
                            _transaction.getTransactionStatus().incrementMvvCount();
                        }
                        Buffer.deallocatePrunedVersions(_persistit, _volume, prunedVersions);
                    }

                    buffer.releaseTouched();
                    buffer = null;

                    if (!splitRequired) {
                        //
                        // No split means we're totally done.
                        //
                        break;
                    } else {
                        // Otherwise we need to index the new right
                        // sibling at the next higher index level.
                        Debug.$assert0.t(valueToStore.getPointerValue() > 0);
                        //
                        // This maneuver sets key to the key value of
                        // the first record in the newly inserted page.
                        //
                        key = _spareKey1;
                        _spareKey1 = _spareKey2;
                        _spareKey2 = key;
                        //
                        // Bump key generation because it no longer matches
                        // what's in the LevelCache
                        //
                        key.bumpGeneration();
                        //
                        // And now cycle back to insert the key/pointer pair
                        // into the next higher index level.
                        //
                        level++;
                        continue;
                    }

                } catch (final WWRetryException re) {
                    if (buffer != null) {
                        buffer.releaseTouched();
                        buffer = null;
                    }
                    if (treeClaimAcquired) {
                        _treeHolder.release();
                        treeClaimAcquired = false;
                    }
                    try {
                        sequence(WRITE_WRITE_STORE_A);
                        final long depends = _persistit.getTransactionIndex().wwDependency(re.getVersionHandle(),
                                _transaction.getTransactionStatus(), _timeoutMillis);
                        if (depends != 0 && depends != TransactionStatus.ABORTED) {
                            // version is from concurrent txn that already
                            // committed
                            // or timed out waiting to see. Either
                            // way, must abort.
                            _transaction.rollback();
                            throw new RollbackException();
                        }
                    } catch (final InterruptedException ie) {
                        throw new PersistitInterruptedException(ie);
                    }
                } catch (final RetryException re) {
                    if (buffer != null) {
                        buffer.releaseTouched();
                        buffer = null;
                    }

                    if (treeClaimAcquired) {
                        _treeHolder.release();
                        treeClaimAcquired = false;
                    }
                    final boolean doWait = (options & StoreOptions.WAIT) != 0;
                    treeClaimAcquired = _treeHolder.claim(true, doWait ? _timeoutMillis : 0);
                    if (!treeClaimAcquired) {
                        if (!doWait) {
                            throw re;
                        } else {
                            throw new InUseException("Thread " + Thread.currentThread().getName()
                                    + " failed to get reader claim on " + _tree);
                        }
                    }
                } finally {
                    if (buffer != null) {
                        buffer.releaseTouched();
                        buffer = null;
                    }
                }
            }
        } finally {
            if (treeClaimAcquired) {
                _treeHolder.release();
                treeClaimAcquired = false;
            }

            value.changeLongRecordMode(false);
            spareValue.changeLongRecordMode(false);
            if (!committed) {
                //
                // We failed to write the new LONG_RECORD. If there was
                // previously no LONG_RECORD, then deallocate the newly
                // allocated LONG_RECORD chain if we had successfully
                // allocated one.
                //
                if (newLongRecordPointer != oldLongRecordPointer && newLongRecordPointer != 0) {
                    _volume.getStructure().deallocateGarbageChain(newLongRecordPointer, 0);
                }
                if (newLongRecordPointerMVV != 0) {
                    _volume.getStructure().deallocateGarbageChain(newLongRecordPointerMVV, 0);
                }
            } else {
                if (oldLongRecordPointer != newLongRecordPointer && oldLongRecordPointer != 0) {
                    _volume.getStructure().deallocateGarbageChain(oldLongRecordPointer, 0);
                }
                if (oldLongRecordPointerMVV != 0) {
                    _volume.getStructure().deallocateGarbageChain(oldLongRecordPointerMVV, 0);
                }
            }
        }
        _volume.getStatistics().bumpStoreCounter();
        _tree.getStatistics().bumpStoreCounter();
        if (doFetch || doMVCC) {
            _volume.getStatistics().bumpFetchCounter();
            _tree.getStatistics().bumpFetchCounter();
        }
        return keyExisted;
    }

    private long timestamp() {
        return _persistit.getTimestampAllocator().updateTimestamp();
    }

    private void insertIndexLevel(final Key key, final Value value) throws PersistitException {

        Buffer buffer = null;
        try {
            buffer = _volume.getStructure().allocPage();
            final long timestamp = timestamp();
            buffer.writePageOnCheckpoint(timestamp);

            buffer.init(PAGE_TYPE_INDEX_MIN + _tree.getDepth() - 1);

            final long newTopPage = buffer.getPageAddress();
            final long leftSiblingPointer = _tree.getRootPageAddr();

            Debug.$assert0.t(leftSiblingPointer == _tree.getRootPageAddr());
            final long rightSiblingPointer = value.getPointerValue();
            //
            // Note: left and right sibling are of the same level and therefore
            // it is not necessary to invoke value.setPointerPageType() here.
            //
            _rawValueWriter.init(value);
            value.setPointerValue(leftSiblingPointer);
            buffer.putValue(LEFT_GUARD_KEY, _rawValueWriter);

            value.setPointerValue(rightSiblingPointer);
            buffer.putValue(key, _rawValueWriter);

            value.setPointerValue(-1);
            buffer.putValue(RIGHT_GUARD_KEY, _rawValueWriter);

            buffer.setDirtyAtTimestamp(timestamp);

            _tree.changeRootPageAddr(newTopPage, 1);
            _tree.bumpGeneration();
            _volume.getStructure().updateDirectoryTree(_tree);

        } finally {
            if (buffer != null) {
                buffer.releaseTouched();
            }
        }
    }

    /**
     * Inserts a data or pointer value into a level of the tree.
     *
     * @param buffer
     *            The buffer containing the insert location. The buffer must
     *            have a writer claim on it, and must be reserved.
     * @param foundAt
     *            The encoded insert location.
     * @return <code>true</code> if it necessary to insert a key into the
     *         ancestor index page.
     */
    // TODO - Check insertIndexLevel timestamps
    private boolean putLevel(final LevelCache lc, final Key key, final ValueHelper valueWriter, final Buffer buffer,
            final int foundAt, final boolean okToSplit) throws PersistitException {
        Debug.$assert0.t((buffer.getStatus() & SharedResource.WRITER_MASK) != 0
                && (buffer.getStatus() & SharedResource.CLAIMED_MASK) != 0);
        final Sequence sequence = lc.sequence(foundAt);
        long timestamp = timestamp();
        buffer.writePageOnCheckpoint(timestamp);

        final int result = buffer.putValue(key, valueWriter, foundAt, false);
        if (result != -1) {
            buffer.setDirtyAtTimestamp(timestamp);
            lc.updateInsert(buffer, key, result);
            return false;
        } else {
            Debug.$assert0.t(buffer.getPageAddress() != _volume.getStructure().getGarbageRoot());
            Buffer rightSibling = null;

            try {
                // We can't perform the split because we don't have a claim
                // on the Tree. We will just return, and the caller will
                // call again with that claim.
                //
                if (!okToSplit) {
                    return true;
                }
                _storeCausedSplit = true;
                //
                // Allocate a new page
                //
                rightSibling = _volume.getStructure().allocPage();

                timestamp = timestamp();
                buffer.writePageOnCheckpoint(timestamp);
                rightSibling.writePageOnCheckpoint(timestamp);

                Debug.$assert0.t(rightSibling.getPageAddress() != 0);
                Debug.$assert0.t(rightSibling != buffer);

                rightSibling.init(buffer.getPageType());
                // debug
                //
                // Split the page. As a side-effect, this will bump the
                // generation counters of both buffers, and therefore the
                // level cache for this level will become
                // (appropriately) invalid.
                //

                final int at = buffer
                        .split(rightSibling, key, valueWriter, foundAt, _spareKey1, sequence, _splitPolicy);
                if (at < 0) {
                    lc.updateInsert(rightSibling, key, -at);
                } else {
                    lc.updateInsert(buffer, key, at);
                }

                final long oldRightSibling = buffer.getRightSibling();
                final long newRightSibling = rightSibling.getPageAddress();

                Debug.$assert0.t(newRightSibling > 0 && oldRightSibling != newRightSibling);
                Debug.$assert0.t(rightSibling.getPageType() == buffer.getPageType());

                rightSibling.setRightSibling(oldRightSibling);
                buffer.setRightSibling(newRightSibling);

                valueWriter.setPointerValue(newRightSibling);

                rightSibling.setDirtyAtTimestamp(timestamp);
                buffer.setDirtyAtTimestamp(timestamp);

                return true;

            } finally {
                if (rightSibling != null) {
                    rightSibling.releaseTouched();
                }
            }
        }
    }

    private Buffer quicklyReclaimBuffer(final LevelCache lc, final boolean writer) throws PersistitException {
        final Buffer buffer = lc._buffer;
        if (buffer == null)
            return null;

        final boolean available = buffer.claim(writer, 0);
        if (available) {
            if (buffer.getPageAddress() == lc._page && buffer.getVolume() == _volume
                    && _cachedTreeGeneration == _tree.getGeneration() && buffer.getGeneration() == lc._bufferGeneration
                    && buffer.isValid()) {
                return buffer;
            } else {
                buffer.release();
            }
        }
        return null;
    }

    /**
     * <p>
     * Performs generalized tree traversal. The direction value indicates
     * whether to traverse forward or backward in collation sequence, whether to
     * descend to child nodes, and whether the key being sought must be strictly
     * greater than or less then the supplied key.
     * </p>
     * <p>
     * The <code>direction</code> value must be one of:
     * <dl>
     * <dt>Key.GT:</dt>
     * <dd>Find the next key that is strictly greater than the supplied key. If
     * there is none, return false.</dd>
     * <dt>Key.GTEQ:</dt>
     * <dd>If the supplied key exists in the database, return that key;
     * otherwise find the next greater key and return it.</dd>
     * <dt>Key.EQ:</dt>
     * <dd>Return <code>true</code> iff the specified key exists in the
     * database. Does not update the Key.</dd>
     * <dt>Key.LT:</dt>
     * <dd>Find the next key that is strictly less than the supplied key. If
     * there is none, return false.</dd>
     * <dt>Key.LTEQ:</dt>
     * <dd>If the supplied key exists in the database, return that key;
     * otherwise find the next smaller key and return it.</dd>
     * </dl>
     * </p>
     *
     * @param direction
     *            One of Key.GT, Key.GTEQ, Key.EQ, Key.LT or Key.LTEQ.
     *
     * @param deep
     *            Determines whether the result should represent the next (or
     *            previous) physical key in the <code>Tree</code> or should be
     *            restricted to just the logical siblings of the current key.
     *            (See <a href="Key.html#_keyChildren">Logical Key Children and
     *            Siblings</a>).
     * @return <code>true</code> if there is a key to traverse to, else
     *         <code>false</code>.
     * @throws PersistitException
     */
    public boolean traverse(final Direction direction, final boolean deep) throws PersistitException {
        final boolean result = traverse(direction, deep, Integer.MAX_VALUE);
        return result;
    }

    /**
     * <p>
     * Performs generalized tree traversal. The direction value indicates
     * whether to traverse forward or backward in collation sequence and whether
     * the key being sought must be strictly greater than or less than the
     * supplied key.
     * </p>
     * <p>
     * This method normally modifies both the <code>Key</code> and
     * <code>Value</code> fields of this <code>Exchange</code>: the
     * <code>Key</code> is modified to reflect the key found through traversal,
     * and the <code>Value</code> field is modified to contain the value
     * associated with that key. However, this behavior can be modified by the
     * <code>minimumBytes</code> parameter. If <code>minimumBytes</code> is less
     * than 0 then this method modifies neither the <code>Key</code> nor the
     * <code>Value</code> field. If it is equal to zero then only the
     * <code>Key</code> is modified.
     * </p>
     * <p>
     * The <code>direction</code> value must be one of:
     * <dl>
     * <dt>Key.GT:</dt>
     * <dd>Find the next key that is strictly greater than the supplied key. If
     * there is none, return false.</dd>
     * <dt>Key.GTEQ:</dt>
     * <dd>If the supplied key exists in the database, return that key;
     * otherwise find the next greater key and return it.</dd>
     * <dt>Key.EQ:</dt>
     * <dd>Return <code>true</code> iff the specified key exists in the
     * database. Does not update the Key.</dd>
     * <dt>Key.LT:</dt>
     * <dd>Find the next key that is strictly less than the supplied key. If
     * there is none, return false.</dd>
     * <dt>Key.LTEQ:</dt>
     * <dd>If the supplied key exists in the database, return that key;
     * otherwise find the next smaller key and return it.</dd>
     * </dl>
     * </p>
     *
     * @param direction
     *            One of Key.GT, Key.GTEQ, Key.EQ, Key.LT or Key.LTEQ.
     *
     * @param deep
     *            Determines whether the result should represent the next (or
     *            previous) physical key in the <code>Tree</code> or should be
     *            restricted to just the logical siblings of the current key.
     *            (See <a href="Key.html#_keyChildren">Logical Key Children and
     *            Siblings</a>).
     *
     * @param minimumBytes
     *            The minimum number of bytes to fetch. See {@link #fetch(int)}.
     *
     * @return <code>true</code> if there is a key to traverse to, else
     *         <code>false</code>.
     *
     * @throws PersistitException
     */
    public boolean traverse(final Direction direction, final boolean deep, final int minimumBytes)
            throws PersistitException {
        return traverse(direction, deep, minimumBytes, 0, 0, null);
    }

    /**
     * Core implementation shared by the public <code>traverse</code> methods.
     * See {@link #traverse(com.persistit.Key.Direction, boolean, int)} for full
     * description of the traversal semantics.
     *
     * @param direction
     *            One of Key.GT, Key.GTEQ, Key.EQ, Key.LT or Key.LTEQ.
     * @param deep
     *            <code>true</code> to traverse physical keys,
     *            <code>false</code> to restrict the result to logical
     *            siblings of the current key.
     * @param minimumBytes
     *            If greater than zero the value is fetched into this
     *            Exchange's <code>Value</code>; if zero only the
     *            <code>Key</code> is modified; if negative the original key
     *            is restored before returning.
     * @param minKeyDepth
     *            Minimum valid key depth. If a key is found with a depth less
     *            than this value, <i>regardless of MVCC visibility</i>,
     *            <code>false</code> is immediately returned.
     * @param matchUpToIndex
     *            Length of minimum matching key fragment. If a key is found
     *            that does not match this many bytes, <i>regardless of MVCC
     *            visibility</i>, <code>false</code> is immediately returned.
     * @param visitor
     *            Optional visitor, may be <code>null</code>. When a matching
     *            key is found and <code>visitor.visit(this)</code> returns
     *            <code>true</code>, traversal continues with the next key
     *            instead of returning.
     * @return <code>true</code> if a matching key was found, else
     *         <code>false</code>. For direction Key.EQ with a visitor that
     *         returns <code>true</code>, the result is <code>false</code>.
     * @throws PersistitException
     */
    private boolean traverse(final Direction direction, final boolean deep, final int minimumBytes,
            final int minKeyDepth, final int matchUpToIndex, final TraverseVisitor visitor) throws PersistitException {
        assertCorrectThread(true);
        _persistit.checkClosed();
        final Key spareKey = _spareKey1;
        // doFetch: the value is wanted by the caller. doModify: the caller
        // wants _key updated to the key that was found.
        final boolean doFetch = minimumBytes > 0;
        final boolean doModify = minimumBytes >= 0;
        final boolean reverse = (direction == LT) || (direction == LTEQ);
        // When the value is not wanted it is read into the spare Value so
        // that _value is left untouched.
        final Value outValue = doFetch ? _value : _spareValue;
        outValue.clear();

        Direction dir = direction;
        Buffer buffer = null;
        // edge: the supplied key itself is an acceptable result.
        boolean edge = dir == EQ || dir == GTEQ || dir == LTEQ;
        // nudged: the key has already been adjusted so that a search will
        // not land on the supplied key itself.
        boolean nudged = false;

        // An empty key starts from the AFTER edge when traversing in reverse
        // and from the BEFORE edge otherwise.
        if (_key.getEncodedSize() == 0) {
            if (reverse) {
                _key.appendAfter();
            } else {
                _key.appendBefore();
            }
            nudged = true;
        }

        _key.testValidForTraverse();

        checkLevelCache();

        try {
            //
            // Now we are committed to computing a new key value. Save the
            // original key value for comparison.
            //
            _key.copyTo(spareKey);
            int index = _key.getEncodedSize();

            int foundAt = 0;
            // Set when a key that is not visible to this transaction was
            // found for GTEQ/LTEQ and the search must move past it.
            boolean nudgeForMVCC = false;
            _keysVisitedDuringTraverse = 0;

            for (;;) {
                ++_keysVisitedDuringTraverse;
                final LevelCache lc = _levelCache[0];
                boolean matches;
                //
                // Optimal path - pick up the buffer and location left
                // by previous operation.
                //
                if (buffer == null && lc._keyGeneration == _key.getGeneration()) {
                    buffer = quicklyReclaimBuffer(lc, false);
                    foundAt = lc._foundAt;
                }
                //
                // But if direction is leftward and the position is at the left
                // edge of the buffer, re-do with a key search - there is no
                // other way to find the left sibling page.
                //
                if (buffer != null && (nudgeForMVCC || (reverse && (foundAt & P_MASK) <= buffer.getKeyBlockStart()))) {
                    // Going left from first record in the page requires a
                    // key search.
                    buffer.releaseTouched();
                    buffer = null;
                }

                //
                // If the operations above failed to get the key, then
                // look it up with search.
                //
                if (buffer == null) {
                    if (nudgeForMVCC || (!edge && !nudged)) {
                        // Special keys are left as they are; any other key
                        // is nudged in the direction of travel first.
                        if (reverse) {
                            if (!_key.isSpecial()) {
                                _key.nudgeLeft();
                            }
                        } else {
                            if (!_key.isSpecial()) {
                                if (deep) {
                                    _key.nudgeDeeper();
                                } else {
                                    _key.nudgeRight();
                                }
                            }
                        }
                        nudged = true;
                        nudgeForMVCC = false;
                    }
                    foundAt = search(_key, false);
                    // NOTE(review): relies on search() leaving the located
                    // buffer, still claimed, in the level cache - confirm.
                    buffer = lc._buffer;
                }

                if (edge && (foundAt & EXACT_MASK) != 0) {
                    // The supplied key exists and is an acceptable result.
                    matches = true;
                } else if (edge && !deep && Buffer.decodeDepth(foundAt) == index) {
                    matches = true;
                } else if (dir == EQ) {
                    matches = false;
                } else {
                    edge = false;
                    foundAt = buffer.traverse(_key, dir, foundAt);
                    if (buffer.isAfterRightEdge(foundAt)) {
                        final long rightSiblingPage = buffer.getRightSibling();

                        Debug.$assert0.t(rightSiblingPage >= 0 && rightSiblingPage <= MAX_VALID_PAGE_ADDR);
                        if (rightSiblingPage > 0) {
                            // Get the right sibling before releasing the
                            // current buffer.
                            final Buffer rightSibling = _pool.get(_volume, rightSiblingPage, false, true,
                                    _timeoutMillis);
                            buffer.releaseTouched();
                            //
                            // Reset foundAt to point to the first key block
                            // of the right sibling page.
                            //
                            buffer = rightSibling;
                            checkPageType(buffer, PAGE_TYPE_DATA, false);
                            foundAt = buffer.traverse(_key, dir, buffer.toKeyBlock(0));
                            matches = !buffer.isAfterRightEdge(foundAt);
                        } else {
                            // No right sibling: nothing further to find.
                            matches = false;
                        }
                    } else {
                        matches = true;
                    }

                    //
                    // If (a) the key was not nudged, and (b) this is not a deep
                    // traverse, and (c) foundAt refers now to a child
                    // of the original key, then it's the wrong result - the
                    // optimistic assumption that the next key would be adjacent
                    // to the preceding result is wrong. To resolve this,
                    // invalidate the LevelCache entry and retry the loop. That
                    // will nudge the key appropriately and do a standard
                    // search.
                    //

                    if (!nudged && !deep && _key.compareKeyFragment(spareKey, 0, spareKey.getEncodedSize()) == 0) {
                        _key.setEncodedSize(spareKey.getEncodedSize());
                        lc._keyGeneration = -1;
                        buffer.release();
                        buffer = null;
                        continue;
                    }
                }

                //
                // Earliest point we can check for the quick exit. Internal
                // optimization (ignores visibility) that takes advantage of
                // physical key traversal for logical key semantics.
                //
                final boolean stopDueToKeyDepth;
                if (minKeyDepth > 0 && _key.getDepth() < minKeyDepth) {
                    stopDueToKeyDepth = true;
                } else if (matchUpToIndex > 0) {
                    stopDueToKeyDepth = spareKey.compareKeyFragment(_key, 0, matchUpToIndex) != 0;
                } else {
                    stopDueToKeyDepth = false;
                }

                // Original search loop end, MVCC must also inspect value before
                // finishing

                if (reverse && _key.isLeftEdge() || !reverse && _key.isRightEdge() || stopDueToKeyDepth) {
                    // Ran off the end in the direction of travel, or the
                    // quick-exit constraint applies.
                    matches = false;
                } else {
                    if (deep) {
                        matches |= dir != EQ;
                        index = _key.getEncodedSize();

                        if (matches) {
                            matches = fetchFromBufferInternal(buffer, outValue, foundAt, minimumBytes);
                            if (!matches && dir != EQ) {
                                // The fetch did not yield a match for this
                                // key: skip it and keep looking.
                                nudged = false;
                                nudgeForMVCC = (dir == GTEQ || dir == LTEQ);
                                buffer.release();
                                buffer = null;
                                continue;
                            }
                        }
                    } else {
                        // The candidate must agree with the original key up
                        // to the start of the original key's last element.
                        int parentIndex = spareKey.previousElementIndex(index);
                        if (parentIndex < 0) {
                            parentIndex = 0;
                        }

                        matches &= (spareKey.compareKeyFragment(_key, 0, parentIndex) == 0);

                        if (matches) {
                            index = _key.nextElementIndex(parentIndex);
                            if (index > 0) {
                                final boolean isVisibleMatch = fetchFromBufferInternal(buffer, outValue, foundAt,
                                        minimumBytes);
                                //
                                // In any case (matching sibling, child or
                                // niece/nephew) we need to ignore this
                                // particular key and continue search if not
                                // visible to current transaction
                                //
                                if (!isVisibleMatch) {
                                    nudged = false;
                                    buffer.release();
                                    buffer = null;
                                    if (dir == EQ) {
                                        matches = false;
                                    } else {
                                        nudgeForMVCC = (dir == GTEQ || dir == LTEQ);
                                        continue;
                                    }
                                }
                                //
                                // It was a niece or nephew, record non-exact
                                // match
                                //
                                if (index != _key.getEncodedSize()) {
                                    foundAt &= ~EXACT_MASK;
                                }
                            } else {
                                matches = false;
                            }
                        }
                    }
                }

                if (doModify) {
                    if (matches) {
                        if (_key.getEncodedSize() == index) {
                            // The key found is the result: remember where
                            // it was found for the next operation.
                            lc.update(buffer, _key, foundAt);
                        } else {
                            //
                            // Parent key determined from seeing a child or
                            // niece/nephew, need to fetch the actual
                            // value of this key before returning
                            //
                            _key.setEncodedSize(index);
                            if (buffer != null) {
                                buffer.releaseTouched();
                                buffer = null;
                            }
                            fetch(minimumBytes);
                        }
                    } else {
                        // No match: rebuild the key from the original (or
                        // from nothing when deep), cut it, and append the
                        // AFTER or BEFORE edge for the direction of travel.
                        if (deep) {
                            _key.setEncodedSize(0);
                        } else {
                            spareKey.copyTo(_key);
                        }
                        _key.cut();
                        if (reverse) {
                            _key.appendAfter();
                        } else {
                            _key.appendBefore();
                        }
                    }
                } else {
                    // Restore original key
                    spareKey.copyTo(_key);
                }

                // Done
                _volume.getStatistics().bumpTraverseCounter();
                _tree.getStatistics().bumpTraverseCounter();
                //
                // With a visitor that asks to continue, loop again for the
                // next key. An inclusive direction becomes strict so that the
                // key just visited is not returned again; EQ has no next key.
                //
                if (matches && visitor != null && visitor.visit(this)) {
                    nudged = false;
                    edge = false;
                    if (dir == GTEQ) {
                        dir = GT;
                    } else if (dir == LTEQ) {
                        dir = LT;
                    } else if (dir == EQ) {
                        return false;
                    }
                    continue;
                }
                return matches;
            }
        } finally {
            // Whatever the exit path, give back any buffer still held.
            if (buffer != null) {
                buffer.releaseTouched();
                buffer = null;
            }
        }
    }

    /**
     * <p>
     * Performs generalized tree traversal using a {@link TraverseVisitor}. The
     * direction value indicates whether to traverse forward or backward in
     * collation sequence and whether the key being sought must be strictly
     * greater than or less than the supplied key.
     * </p>
     * <p>
     * Unlike {@link #traverse(Key.Direction, boolean, int)}, this method does
     * not return each time a new key is encountered in the traversal. Instead,
     * the {@link TraverseVisitor#visit(ReadOnlyExchange)} method is called once
     * for each key. This method avoids performing initial verification of the
     * key value and usually avoids locking a <code>Buffer</code> for every
     * record returned. It may offer better performance in circumstances where a
     * long sequence of keys is being examined. Note that
     * <code>ReadOnlyExchange</code> is an interface implemented by this class
     * which supplies the subset of methods that may be used safely within the
     * visitor.
     * </p>
     * <p>
     * During the call the {@link Buffer} containing the key is locked with a
     * non-exclusive claim, and any thread attempting to update records in the
     * same <code>Buffer</code> will block. Therefore the <code>visit</code>
     * method must be written carefully. See
     * {@link TraverseVisitor#visit(ReadOnlyExchange)} for guidelines.
     * </p>
     * <p>
     * This method normally modifies both the <code>Key</code> and
     * <code>Value</code> fields of this <code>Exchange</code>: the
     * <code>Key</code> is modified to reflect the key found through traversal,
     * and the <code>Value</code> field is modified to contain the value
     * associated with that key. However, this behavior can be modified by the
     * <code>minimumBytes</code> parameter. If <code>minimumBytes</code> is less
     * than or equal to zero then only the <code>Key</code> is modified. If it
     * is greater than zero, then the traverse method may choose to populate
     * only the specified number of bytes of the <code>Value</code>.
     * </p>
     * <p>
     * The <code>direction</code> value must be one of:
     * <dl>
     * <dt>Key.GT:</dt>
     * <dd>Find the next key that is strictly greater than the supplied key. If
     * there is none, return false.</dd>
     * <dt>Key.GTEQ:</dt>
     * <dd>If the supplied key exists in the database, return that key;
     * otherwise find the next greater key and return it.</dd>
     * <dt>Key.EQ:</dt>
     * <dd>Return <code>true</code> iff the specified key exists in the
     * database. Does not update the Key.</dd>
     * <dt>Key.LT:</dt>
     * <dd>Find the next key that is strictly less than the supplied key. If
     * there is none, return false.</dd>
     * <dt>Key.LTEQ:</dt>
     * <dd>If the supplied key exists in the database, return that key;
     * otherwise find the next smaller key and return it.</dd>
     * </dl>
     * </p>
     *
     * @param direction
     *            One of Key.GT, Key.GTEQ, Key.EQ, Key.LT or Key.LTEQ.
     *
     * @param deep
     *            Determines whether the result should represent the next (or
     *            previous) physical key in the <code>Tree</code> or should be
     *            restricted to just the logical siblings of the current key.
     *            (See <a href="Key.html#_keyChildren">Logical Key Children and
     *            Siblings</a>).
     *
     * @param minimumBytes
     *            The minimum number of bytes to fetch. See {@link #fetch(int)}.
     *
     * @param visitor
     *            The application-supplied <code>TraverseVisitor</code>.
     *
     * @return <code>true</code> if additional keys remaining in the traversal
     *         set, or <code>false</code> to indicate that keys are exhausted.
     *
     * @throws PersistitException
     */

    public boolean traverse(final Direction direction, final boolean deep, final int minimumBytes,
            final TraverseVisitor visitor) throws PersistitException {
        return traverse(direction, deep, Math.max(0, minimumBytes), 0, 0, visitor);
    }

    /**
     * <p>
     * Performs generalized tree traversal constrained by a supplied
     * {@link KeyFilter}. The direction value indicates whether to traverse
     * forward or backward in collation sequence, and whether the key being
     * sought must be strictly greater than or less than the supplied key.
     * </p>
     * <p>
     * The direction value must be one of:
     * <dl>
     * <dt>Key.GT:</dt>
     * <dd>Find the next key that is strictly greater than the supplied key. If
     * there is none, return false.</dd>
     * <dt>Key.GTEQ:</dt>
     * <dd>If the supplied key exists in the database, return that key;
     * otherwise find the next greater key and return it.</dd>
     * <dt>Key.EQ:</dt>
     * <dd>Return <code>true</code> if the specified key exists in the database.
     * Does not update the Key.</dd>
     * <dt>Key.LT:</dt>
     * <dd>Find the next key that is strictly less than the supplied key. If
     * there is none, return false.</dd>
     * <dt>Key.LTEQ:</dt>
     * <dd>If the supplied key exists in the database, return that key;
     * otherwise find the next smaller key and return it.</dd>
     * </dl>
     * </p>
     *
     * @param direction
     *            One of Key.GT, Key.GTEQ, Key.EQ, Key.LT or Key.LTEQ.
     *
     * @param keyFilter
     *            A KeyFilter that constrains the keys returned by this
     *            operation.
     *
     * @param minBytes
     *            The minimum number of bytes to fetch. See {@link #fetch(int)}.
     *            If minBytes is less than or equal to 0 then this method does
     *            not update the Key and Value fields of the Exchange.
     *
     * @return <code>true</code> if there is a key to traverse to, else
     *         <code>false</code>.
     *
     * @throws PersistitException
     */
    public boolean traverse(final Direction direction, final KeyFilter keyFilter, final int minBytes)
            throws PersistitException {
        if (keyFilter == null) {
            return traverse(direction, true, minBytes);
        }

        if (direction == EQ) {
            return keyFilter.selected(_key) && traverse(direction, true, minBytes);
        }

        assertCorrectThread(true);
        if (_key.getEncodedSize() == 0) {
            if (direction == GT || direction == GTEQ) {
                _key.appendBefore();
            } else {
                _key.appendAfter();
            }
        }

        int totalVisited = 0;
        for (;;) {
            if (!keyFilter.next(_key, direction)) {
                _key.setEncodedSize(0);
                if (direction == LT || direction == LTEQ) {
                    _key.appendAfter();
                } else {
                    _key.appendBefore();
                }
                return false;
            }
            if (keyFilter.isKeyPrefixFilter()) {
                return traverse(direction, true, minBytes, keyFilter.getMinimumDepth(),
                        keyFilter.getKeyPrefixByteCount(), null);
            }
            final boolean matched = traverse(direction, true, minBytes);
            totalVisited += _keysVisitedDuringTraverse;
            _keysVisitedDuringTraverse = totalVisited;
            if (!matched) {
                return false;
            }
            if (keyFilter.selected(_key)) {
                return true;
            }
        }
    }

    /**
     * Traverses to the next logical sibling key value. Equivalent to
     * <code>traverse(Key.GT, false)</code>.
     *
     * @return <code>true</code> if there is a key to traverse to, else
     *         <code>false</code>.
     * @throws PersistitException
     */
    public boolean next() throws PersistitException {
        return traverse(GT, false);
    }

    /**
     * Traverses to the previous logical sibling key value. Equivalent to
     * <code>traverse(Key.LT, false)</code>.
     *
     * @return <code>true</code> if there is a key to traverse to, else
     *         <code>false</code>.
     * @throws PersistitException
     */
    public boolean previous() throws PersistitException {
        return traverse(LT, false);
    }

    /**
     * Traverses to the next key with control over depth. Equivalent to
     * <code>traverse(Key.GT, deep)</code>.
     *
     * @param deep
     *            Determines whether the result should represent the next (or
     *            previous) physical key in the <code>Tree</code> or should be
     *            restricted to just the logical siblings of the current key.
     *            (See <a href="Key.html#_keyChildren">Logical Key Children and
     *            Siblings</a>).
     *
     * @return <code>true</code> if there is a key to traverse to, else
     *         <code>false</code>.
     * @throws PersistitException
     */
    public boolean next(final boolean deep) throws PersistitException {
        return traverse(GT, deep);
    }

    /**
     * Traverses to the previous key with control over depth. Equivalent to
     * <code>traverse(Key.LT, deep)</code>.
     *
     * @param deep
     *            Determines whether the result should represent the next (or
     *            previous) physical key in the <code>Tree</code> or should be
     *            restricted to just the logical siblings of the current key.
     *            (See <a href="Key.html#_keyChildren">Logical Key Children and
     *            Siblings</a>).
     *
     * @return <code>true</code> if there is a key to traverse to, else
     *         <code>false</code>.
     *
     * @throws PersistitException
     */
    public boolean previous(final boolean deep) throws PersistitException {
        return traverse(LT, deep);
    }

    /**
     * Traverses to the next key value within the subset of all keys defined by
     * the supplied KeyFilter. Whether logical children of the current key value
     * are included in the result is determined by the <code>KeyFilter</code>.
     *
     * @return <code>true</code> if there is a key to traverse to, else
     *         <code>false</code>.
     * @throws PersistitException
     */
    public boolean next(final KeyFilter filter) throws PersistitException {
        return traverse(GT, filter, Integer.MAX_VALUE);
    }

    /**
     * Traverses to the previous key value within the subset of all keys defined
     * by the supplied KeyFilter. Whether logical children of the current key
     * value are included in the result is determined by the
     * <code>KeyFilter</code>.
     *
     * @return <code>true</code> if there is a key to traverse to, else
     *         <code>false</code>.
     * @throws PersistitException
     */
    public boolean previous(final KeyFilter filter) throws PersistitException {
        return traverse(LT, filter, Integer.MAX_VALUE);
    }

    /**
     * Determines whether the current key has a logical sibling successor,
     * without changing the state of <code>Key</code> or <code>Value</code>.
     * This method is equivalent to {@link #next()} except that no state is
     * changed.
     *
     * @return <code>true</code> if the key has a successor
     *
     * @throws PersistitException
     */
    public boolean hasNext() throws PersistitException {
        return traverse(GT, false, -1);
    }

    /**
     * Determines whether the current key has a successor within the subset of
     * all keys defined by a <code>KeyFilter</code>. This method does not change
     * the state of <code>Key</code> or <code>Value</code>.
     *
     * @return <code>true</code> if the key has a successor
     *
     * @throws PersistitException
     */
    public boolean hasNext(final KeyFilter filter) throws PersistitException {
        if (filter == null)
            return hasNext();
        _key.copyTo(_spareKey2);
        final boolean result = traverse(GT, filter, 0);
        _spareKey2.copyTo(_key);
        return result;
    }

    /**
     * Determines whether the current key has a logical sibling successor,
     * without changing the state of <code>Key</code> or <code>Value</code>.
     * This method is equivalent to {@link #next(boolean)} except that no state
     * is changed.
     *
     * @param deep
     *            Determines whether the successor may be of any logical depth
     *            (<code>true</code>), or must be restricted to the logical
     *            siblings (<code>false</code>) of the current key. (See <a
     *            href="Key.html#_keyChildren">Logical Key Children and
     *            Siblings</a>).
     *
     * @return <code>true</code> if the key has a successor
     *
     * @throws PersistitException
     */
    public boolean hasNext(final boolean deep) throws PersistitException {
        return traverse(GT, deep, -1);
    }

    /**
     * Determines whether the current key has a logical sibling predecessor,
     * without changing the state of <code>Key</code> or <code>Value</code>.
     * This method is equivalent to {@link #previous()} except that no state is
     * changed.
     *
     * @return <code>true</code> if the key has a predecessor
     * @throws PersistitException
     */
    public boolean hasPrevious() throws PersistitException {
        return traverse(LT, false, -1);
    }

    /**
     * Determines whether the current key has a logical sibling predecessor,
     * without changing the state of <code>Key</code> or <code>Value</code>.
     * This method is equivalent to {@link #previous(boolean)} except that no
     * state is changed.
     *
     * @param deep
     *            Determines whether the predecessor may be of any logical depth
     *            (<code>true</code>), or must be restricted to the logical
     *            siblings (<code>false</code>) of the current key. (See <a
     *            href="Key.html#_keyChildren">Logical Key Children and
     *            Siblings</a>).
     *
     * @return <code>true</code> if the key has a predecessor
     *
     * @throws PersistitException
     */
    public boolean hasPrevious(final boolean deep) throws PersistitException {
        return traverse(LT, deep, -1);
    }

    /**
     * Determines whether the current key has a predecessor within the subset of
     * all keys defined by a <code>KeyFilter</code>. This method does not change
     * the state of <code>Key</code> or <code>Value</code>.
     *
     * @return <code>true</code> if the key has a predecessor
     *
     * @throws PersistitException
     */
    public boolean hasPrevious(final KeyFilter filter) throws PersistitException {
        if (filter == null)
            return hasPrevious();
        _key.copyTo(_spareKey2);
        final boolean result = traverse(GT, filter, 0);
        _spareKey2.copyTo(_key);
        return result;
    }

    /**
     * Determines whether the current key has an associated value - that is,
     * whether a {@link #fetch} operation would return a defined value - without
     * actually changing the state of either the <code>Key</code> or the
     * <code>Value</code>.
     *
     * @return <code>true</code> if the key has an associated value
     *
     * @throws PersistitException
     */
    public boolean isValueDefined() throws PersistitException {
        return traverse(EQ, true, -1);
    }

    /**
     * Insert the current <code>Key</code> and <code>Value</code> pair into this
     * <code>Exchange</code>'s <code>Tree</code>. If there already is a value
     * associated with the current key, then replace it.
     *
     * @return This <code>Exchange</code> to permit method call chaining
     * @throws PersistitException
     */
    public Exchange store() throws PersistitException {
        return store(_key, _value);
    }

    /**
     * Invoke {@link #lock(Key, long)} with the current key and a default
     * timeout value of
     * {@value com.persistit.SharedResource#DEFAULT_MAX_WAIT_TIME} milliseconds.
     *
     * @throws PersistitException
     */
    public void lock() throws PersistitException {
        lock(_key, SharedResource.DEFAULT_MAX_WAIT_TIME);
    }

    /**
     * Invoke {@link #lock(Key, long)} with the supplied key and a default
     * timeout value of
     * {@value com.persistit.SharedResource#DEFAULT_MAX_WAIT_TIME} milliseconds.
     *
     * @param key
     *            The key to lock
     * @throws PersistitException
     */
    public void lock(final Key key) throws PersistitException {
        lock(key, SharedResource.DEFAULT_MAX_WAIT_TIME);
    }

    /**
     * <p>
     * Within a transaction, enforces a constraint that no other concurrent
     * transaction also successfully locks the same key. This method must run
     * within the scope of an active transaction.
     * </p>
     * <p>
     * This method is designed to help applications overcome problems with
     * "write skew" which is a type of isolation anomaly permitted by Snapshot
     * Isolation. See, for example, http://wikipedia.org/wiki/Snapshot_isolation
     * for a concise explanation of Snapshot Isolation and the write skew
     * anomaly.
     * </p>
     * <p>
     * To use this facility an application specifies a key which may or may not
     * be associated with an actual storage location, but which is designed to
     * conflict with any other transaction that could participate in a write
     * skew. Thus the operation serves as a way of ensuring serializable
     * execution of transactions that could otherwise experience write skew. A
     * key specified in this method is local to the <code>Exchange</code>'s
     * current {@link Tree}. Two concurrent threads locking the same key in
     * different trees do not have a write-write dependency. </p>
     * <p>
     * This method does not actually use any locking mechanism; rather, it
     * creates a write-write conflict with another transaction when both
     * transactions are concurrent and when both transactions attempt to lock
     * the same key. The result in that case is that one of the transactions
     * receives a {@link RollbackException}. An application using this facility
     * simply retries the transaction, at which point it is likely to
     * successfully execute the call to {@link #lock()}.
     * </p>
     * <p>
     * This method works by writing a short value associated with the provided
     * key into a temporary volume (accessible through the
     * {@link Persistit#getLockVolume()} method). The value is removed through
     * the normal pruning process soon after all potentially conflicting
     * transactions have either rolled back or committed.
     * </p>
     * <p>
     * All of these interactions are performed through the normal MVCC
     * transaction mechanism. This method differs from the {@link #store()}
     * method only in that the {@link Tree} to which a value is written is
     * located in a reserved temporary volume and is therefore normally not
     * written to disk. The key is removed by pruning once there are no
     * longer any concurrent transactions that could conflict with it.
     * </p>
     * <p>
     * As part of the normal MVCC process, if this method detects a potentially
     * conflicting lock written by another active concurrent transaction, this
     * transaction waits until the other transaction either commits or aborts,
     * or until the timeout interval expires. To prevent an unbounded wait time
     * this method accepts a timeout value in milliseconds. If the potentially
     * conflicting transaction neither commits nor aborts during the timeout
     * interval, this method throws a <code>RollbackException</code>. In the
     * event this method would enter a deadlock state with another concurrent
     * transaction, the potential deadlock is detected and this method
     * immediately throws a <code>RollbackException</code>.
     * </p>
     *
     * @param lockKey
     *            the source Key
     * @param timeout
     *            timeout interval in milliseconds, zero for default timeout
     * @throws PersistitException
     * @throws RollbackException
     *             in the specific case that another concurrent transaction has
     *             also locked the same key
     * @throws IllegalStateException
     *             if this Thread does not have an active transaction scope
     * @see Transaction
     */
    public void lock(final Key lockKey, final long timeout) throws PersistitException {
        assertCorrectThread(true);
        _persistit.checkClosed();
        if (!_transaction.isActive()) {
            throw new IllegalStateException("No active transaction scope");
        }
        final Exchange lockExchange = _persistit.getExchange(_persistit.getLockVolume(), _tree.getName(), true);
        /**
         * Lock table trees need tree handles for pruning
         */
        _persistit.getJournalManager().handleForTree(lockExchange.getTree());
        lockExchange.setTimeoutMillis(timeout);
        lockKey.copyTo(lockExchange.getKey());
        lockExchange.getKey().testValidForStoreAndFetch(_pool.getBufferSize());
        lockExchange.getValue().clear().putAntiValueMVV();
        final int options = StoreOptions.WAIT | StoreOptions.DONT_JOURNAL | StoreOptions.MVCC;
        lockExchange.storeInternal(lockExchange.getKey(), lockExchange.getValue(), 0, options);
        final long page = lockExchange._levelCache[0]._page;
        _transaction.addLockPage(page, lockExchange.getTree().getHandle());
        _persistit.releaseExchange(lockExchange);
    }

    /**
     * Fetches the value associated with the <code>Key</code>, then inserts or
     * updates the value. Effectively this swaps the content of
     * <code>Value</code> with the database record associated with the current
     * <code>key</code>. It is equivalent to the code: <blockquote>
     *
     * <pre>
     *  Value tempValue = new Value();
     *  <i>exchange</i>.fetch(tempValue);
     *  <i>exchange</i>.store();
     *  tempValue.copyTo(exchange.getValue());
     *  return <i>exchange</i>;
     * </pre>
     *
     * </blockquote> except that this operation is performed atomically, without
     * need for external synchronization.
     *
     * @return This <code>Exchange</code> to permit method call chaining
     * @throws PersistitException
     */
    public Exchange fetchAndStore() throws PersistitException {
        assertCorrectThread(true);
        _persistit.checkClosed();
        if (_volume.isReadOnly()) {
            throw new ReadOnlyVolumeException(_volume.toString());
        }
        _persistit.checkSuspended();
        _key.testValidForStoreAndFetch(_volume.getPageSize());
        int options = StoreOptions.WAIT | StoreOptions.FETCH;
        options |= (!_ignoreTransactions && _transaction.isActive()) ? StoreOptions.MVCC : 0;
        storeInternal(_key, _value, 0, options);
        _spareValue.copyTo(_value);
        return this;
    }

    /**
     * Fetches the value associated with the current <code>Key</code> into the
     * <code>Exchange</code>'s <code>Value</code>. The <code>Value</code> object
     * reflects the fetched state. If there is no value associated with the key
     * then {@link Value#isDefined} is false. Otherwise the value may be
     * retrieved using {@link Value#get} and other methods of
     * <code>Value</code>.
     *
     * @return This <code>Exchange</code> to permit method call chaining
     * @throws PersistitException
     */
    public Exchange fetch() throws PersistitException {
        return fetch(_value, Integer.MAX_VALUE);
    }

    /**
     * <p>
     * Fetches or partially fetches the value associated with the current
     * <code>Key</code> into the <code>Exchange</code>'s <code>Value</code>. The
     * <code>Value</code> object reflects the fetched state. If there is no
     * value associated with the key then {@link Value#isDefined} is false.
     * Otherwise the value may be retrieved using {@link Value#get} and other
     * methods of <code>Value</code>.
     * </p>
     * <p>
     * This method sets a lower bound on the number of bytes to be fetched. In
     * particular, it may be useful to retrieve only a small fraction of a very
     * long record such as the serialization of an image. Upon successful
     * completion of this method, at least <code>minimumBytes</code> of the
     * <code>Value</code> object will accurately reflect the value stored in the
     * database. This might allow an application to determine whether to
     * retrieve the rest of the value.
     * </p>
     *
     * @param minimumBytes
     *            specifies a length at which Persistit will truncate the
     *            returned value.
     *
     * @return This <code>Exchange</code> to permit method call chaining
     * @throws PersistitException
     */
    public Exchange fetch(final int minimumBytes) throws PersistitException {
        return fetch(_value, minimumBytes);
    }

    /**
     * Fetches the value associated with the current <code>Key</code> into the
     * supplied <code>Value</code> object (instead of the <code>Exchange</code>
     * 's assigned <code>Value</code>). The <code>Value</code> object reflects
     * the fetched state. If there is no value associated with the key then
     * {@link Value#isDefined} is false. Otherwise the value may be retrieved
     * using {@link Value#get} and other methods of <code>Value</code>.
     *
     * @param value
     *            the <code>Value</code> into which the database value should be
     *            fetched.
     *
     * @return This <code>Exchange</code> to permit method call chaining
     * @throws PersistitException
     */
    public Exchange fetch(final Value value) throws PersistitException {
        return fetch(value, Integer.MAX_VALUE);
    }

    /**
     * Fetch a single version of a value from a <code>Buffer</code> that is
     * assumed, but not required, to be an MVV. The correct version is
     * determined by the current transaction's start timestamp. If no transaction
     * is active, the highest committed version is returned.
     *
     * <p>
     * <b>Note</b>: This method only determines the visible version and copies
     * it into <code>value</code>, or clears it if there isn't one. It may still
     * be a LONG_RECORD or AntiValue.
     * </p>
     *
     * @param value
     *            The <code>Value</code> into which the value should be fetched.
     * @param minimumBytes
     *            The minimum number of bytes to copy into <code>value</code>.
     *            Note this only affects the final contents, not the amount of
     *            the internal MVV that was copied.
     * @return <code>true</code> if a version was visible, <code>false</code>
     *         otherwise.
     * @throws PersistitException
     *             for any internal error
     */
    private boolean mvccFetch(final Value value, final int minimumBytes) throws PersistitException {
        final TransactionStatus status;
        final int step;
        if (_transaction.isActive()) {
            status = _transaction.getTransactionStatus();
            step = _transaction.getStep();
        } else {
            status = null;
            step = 0;
        }
        _mvvVisitor.initInternal(status, step, MvvVisitor.Usage.FETCH);

        final int valueSize = value.getEncodedSize();
        final byte[] valueBytes = value.getEncodedBytes();
        MVV.visitAllVersions(_mvvVisitor, valueBytes, 0, valueSize);

        if (_mvvVisitor.foundVersion()) {
            final int finalSize = MVV.fetchVersionByOffset(valueBytes, valueSize, _mvvVisitor.getOffset(), valueBytes);
            value.setEncodedSize(finalSize);
            return true;
        } else {
            if (minimumBytes > 0) {
                value.clear();
            }
            return false;
        }
    }

    /**
     * <p>
     * Fetches or partially fetches the value associated with the current
     * <code>Key</code> into the supplied <code>Value</code> object (instead of
     * the <code>Exchange</code>'s assigned <code>Value</code>). The
     * <code>Value</code> object reflects the fetched state. If there is no
     * value associated with the key then {@link Value#isDefined} is false.
     * Otherwise the value may be retrieved using {@link Value#get} and other
     * methods of <code>Value</code>.
     * </p>
     * <p>
     * This method sets a lower bound on the number of bytes to be fetched. In
     * particular, it may be useful to retrieve only a small fraction of a very
     * long record such as the serialization of an image. Upon successful
     * completion of this method, at least <code>minimumBytes</code> of the
     * <code>Value</code> object will accurately reflect the value stored in the
     * database. This might allow an application to determine whether to
     * retrieve the rest of the value using the {@link #fetch()} operation.
     * </p>
     *
     * @param value
     *            the <code>Value</code> into which the database value should be
     *            fetched.
     * @param minimumBytes
     *            specifies a length at which Persistit will truncate the
     *            returned value.
     *
     *
     * @return This <code>Exchange</code> to permit method call chaining
     * @throws PersistitException
     */
    public Exchange fetch(final Value value, int minimumBytes) throws PersistitException {
        assertCorrectThread(true);
        _persistit.checkClosed();

        _key.testValidForStoreAndFetch(_volume.getPageSize());
        if (minimumBytes < 0) {
            minimumBytes = 0;
        }
        searchAndFetchInternal(value, minimumBytes);
        return this;
    }

    /**
     * Helper for fully pulling a value out of a Buffer. That is, if the value
     * is a LONG_RECORD it will also be fetched.
     *
     * @param buffer
     *            Buffer to read from.
     * @param value
     *            Value to write to.
     * @param foundAt
     *            Location within <code>buffer</code>.
     * @param minimumBytes
     *            Minimum amount of LONG_RECORD to fetch. If &lt;0, the
     *            <code>value</code> will contain just the descriptor portion.
     * @throws PersistitException
     *             As thrown from any internal method.
     * @return <code>true</code> if the value was visible.
     */
    private boolean fetchFromBufferInternal(final Buffer buffer, final Value value, final int foundAt,
            final int minimumBytes) throws PersistitException {
        buffer.fetch(foundAt, value);
        return fetchFromValueInternal(value, minimumBytes, buffer);
    }

    /**
     * Helper for finalizing the value to return from a, potentially, MVV
     * contained in the given Value.
     *
     * @param value
     *            Value to finalize.
     * @param minimumBytes
     *            Minimum amount of LONG_RECORD to fetch. If &lt;0, the
     *            <code>value</code> will contain just the descriptor portion.
     * @param bufferForPruning
     *            If not <code>null</code> and <code>Value</code> did contain an
     *            MVV, call {@link Buffer#enqueuePruningAction(int)}.
     * @throws PersistitException
     *             As thrown from any internal method.
     * @return <code>true</code> if the value was visible.
     */
    private boolean fetchFromValueInternal(final Value value, final int minimumBytes, final Buffer bufferForPruning)
            throws PersistitException {
        boolean visible = true;
        /*
         * We must fetch the full LONG_RECORD, if needed, while buffer is
         * claimed from calling code so that it can't be de-allocated as we are
         * reading it.
         */
        if (!_ignoreMVCCFetch) {
            /*
             * Must fetch entire record as it *could* be an MVV, and reading
             * partial MVV is not supported (need all for correct version)
             */
            fetchFixupForLongRecords(value, Integer.MAX_VALUE);
            if (MVV.isArrayMVV(value.getEncodedBytes(), 0, value.getEncodedSize())) {
                if (bufferForPruning != null) {
                    final int treeHandle = _tree.getHandle();
                    assert treeHandle != 0 : "MVV found in a temporary tree " + _tree;
                    bufferForPruning.enqueuePruningAction(treeHandle);
                }
                visible = mvccFetch(value, minimumBytes);
                fetchFixupForLongRecords(value, minimumBytes);
            }
            if (value.isDefined() && value.isAntiValue()) {
                value.clear();
                visible = false;
            }
        } else {
            fetchFixupForLongRecords(value, minimumBytes);
        }
        return visible;
    }

    /**
     * Looks the current key, {@link #_key}, up in the tree and fetches the
     * value from the page. The value is left as found. Specifically, that means
     * it can be a <b>user value, LONG_RECORD, or MVV</b>.
     *
     * @param value
     *            The value as found on the page.
     * @param minimumBytes
     *            If >= 0 and stored value is a LONG_RECORD, fetch at least this
     *            many bytes.
     * @throws PersistitException
     *             As thrown from {@link #search(Key, boolean)}
     */
    private void searchAndFetchInternal(final Value value, final int minimumBytes) throws PersistitException {
        Buffer buffer = null;
        try {
            final int foundAt = search(_key, false);
            final LevelCache lc = _levelCache[0];
            buffer = lc._buffer;
            fetchFromBufferInternal(buffer, value, foundAt, minimumBytes);
            _volume.getStatistics().bumpFetchCounter();
            _tree.getStatistics().bumpFetchCounter();
        } finally {
            if (buffer != null) {
                buffer.releaseTouched();
            }
            _treeHolder.verifyReleased();
        }
    }

    boolean isLongRecord(final Value value) {
        return value.isDefined() && Buffer.isLongRecord(value.getEncodedBytes(), 0, value.getEncodedSize());
    }

    boolean isLongMVV(final Value value) {
        return value.isDefined() && Buffer.isLongMVV(value.getEncodedBytes(), 0, value.getEncodedSize());
    }

    void fetchFixupForLongRecords(final Value value, final int minimumBytes) throws PersistitException {
        if (minimumBytes >= 0 && isLongRecord(value)) {
            //
            // This will potential require numerous pages: the buffer
            // claim is held for the duration to prevent a non-atomic
            // update.
            //
            getLongRecordHelper().fetchLongRecord(value, minimumBytes, _timeoutMillis);
        }
    }

    /**
     * Return true if there is at least one key stored in this
     * <code>Exchange</code> 's <code>Tree</code> that is a logical child of the
     * current <code>Key</code>. A logical child is a key that can be formed by
     * appending a value to the parent. (See <a
     * href="Key.html#_keyChildren">Logical Key Children and Siblings</a>).
     *
     * @return <code>true</code> if the current <code>Key</code> has logical
     *         children
     * @throws PersistitException
     */
    public boolean hasChildren() throws PersistitException {
        _key.copyTo(_spareKey2);
        final int size = _key.getEncodedSize();
        final boolean result = traverse(GT, true, 0, _key.getDepth() + 1, size, null);
        _spareKey2.copyTo(_key);
        return result;
    }

    /**
     * Remove a single key/value pair from this <code>Exchange</code>'s
     * <code>Tree</code> and return the removed value in the
     * <code>Exchange</code>'s <code>Value</code>. This method atomically
     * fetches the former value then deletes it. If there was no value formerly
     * associated with the key then <code>Value</code> becomes undefined - that
     * is, the value of {@link Value#isDefined} becomes <code>false</code>.
     *
     * @return <code>true</code> if there was a key/value pair to remove
     * @throws PersistitException
     */
    public boolean fetchAndRemove() throws PersistitException {
        assertCorrectThread(true);
        _persistit.checkClosed();
        _persistit.checkSuspended();
        _spareValue.clear();
        final boolean result = removeInternal(EQ, true);
        _spareValue.copyTo(_value);
        Debug.$assert0.t(_value.isDefined() == result);
        return result;
    }

    /**
     * Remove the entire <code>Tree</code> that this <code>Exchange</code> is
     * based on. Subsequent to successful completion of this method, the
     * <code>Exchange</code> will no longer be usable. Attempts to perform
     * operations on it will result in an <code>IllegalStateException</code>.
     *
     * @throws PersistitException
     */
    public void removeTree() throws PersistitException {
        assertCorrectThread(true);
        _persistit.checkSuspended();
        _persistit.checkClosed();

        _volume.getStructure().removeTree(_tree);
        if (!_ignoreTransactions) {
            assert !isDirectoryExchange();
            _transaction.removeTree(this);
        }
        _key.clear();
        _value.clear();
        initCache();
    }

    /**
     * Remove a single key/value pair from the this <code>Exchange</code>'s
     * <code>Tree</code>.
     *
     * @return <code>true</code> if there was a key/value pair to remove
     * @throws PersistitException
     */
    public boolean remove() throws PersistitException {
        return removeInternal(EQ, false);
    }

    /**
     * Remove all keys in this <code>Exchange</code>'s <code>Tree</code>.
     *
     * @return <code>true</code> if there were key/value pairs removed
     * @throws PersistitException
     */
    public boolean removeAll() throws PersistitException {
        clear();
        return removeInternal(GTEQ, false);
    }

    /**
     * <p>
     * Depending on the value of the selection parameter, remove the record
     * associated with the current key, its logical children, or both.
     * </p>
     * <p>
     * Following are valid values for selection: <br />
     * <dl>
     * <dt>Key.EQ</dt>
     * <dd>Remove the record associated with the current key if it exists.</dd>
     * <dt>Key.GT</dt>
     * <dd>Remove the records associated with logical children of the current
     * key.</dd>
     * <dt>Key.GTEQ</dt>
     * <dd>Remove the record associated with the current key AND its logical
     * children.</dd>
     * </dl>
     *
     * @param direction
     *            One of Key.EQ, Key.GT, Key.GTEQ
     * @return <code>true</code> if one or more records were actually removed,
     *         else </i>false</i>.
     * @throws PersistitException
     */
    public boolean remove(final Direction direction) throws PersistitException {
        return removeInternal(direction, false);
    }

    private boolean removeInternal(final Direction selection, final boolean fetchFirst) throws PersistitException {
        if (selection != EQ && selection != GTEQ && selection != GT) {
            throw new IllegalArgumentException("Invalid mode " + selection);
        }
        final int keySize = _key.getEncodedSize();

        _key.copyTo(_spareKey3);
        _key.copyTo(_spareKey4);

        // Special case for empty key
        if (keySize == 0) {
            if (selection == EQ) {
                assertCorrectThread(true);
                return false;
            }
            _spareKey3.append(BEFORE);
            _spareKey4.append(AFTER);
        } else {
            if (selection == EQ) {
                _spareKey4.nudgeDeeper();
            } else if (selection == GT) {
                _spareKey3.nudgeDeeper();
                _spareKey4.nudgeRight();
            } else if (selection == GTEQ) {
                _spareKey4.nudgeRight();
            }
        }

        final boolean result = removeKeyRangeInternal(_spareKey3, _spareKey4, fetchFirst);
        _treeHolder.verifyReleased();
        return result;
    }

    /**
     * Removes all records with keys falling between <code>key1</code> and
     * <code>key2</code>, left-inclusive.
     *
     * @param key1
     *            Start of the deletion range. No record with a key smaller than
     *            key1 will be removed. key1 may be empty, in which case all
     *            records having keys less than key2 will be removed.
     *
     * @param key2
     *            End of the deletion range. No record with a key greater than
     *            or equal to key2 will be removed. key2 may be empty, in which
     *            case all records having keys equal to or greater than key1
     *            will be removed.
     *
     * @return <code>true</code> if one or more records were actually removed,
     *         else <i>false</i>.
     *
     * @throws PersistitException
     *             if there are any internal errors
     * @throws IllegalArgumentException
     *             if key1 is equal to or greater than key2
     */
    public boolean removeKeyRange(final Key key1, final Key key2) throws PersistitException {
        key1.copyTo(_spareKey3);
        key2.copyTo(_spareKey4);

        // Special case for empty key
        if (key1.getEncodedSize() == 0) {
            _spareKey3.append(BEFORE);
        }

        if (key2.getEncodedSize() == 0) {
            _spareKey4.append(AFTER);
        }

        if (_spareKey3.compareTo(_spareKey4) >= 0) {
            throw new IllegalArgumentException("Second key must be greater than the first");
        }

        final boolean result = removeKeyRangeInternal(_spareKey3, _spareKey4, false);
        _treeHolder.verifyReleased();
        return result;
    }

    /**
     * Removes all records with keys falling between <code>key1</code> and
     * </code>key2</code>, left-inclusive. Validity checks and Key value
     * adjustments have been done by caller - this method does the work.
     *
     * @param key1
     *            Key that is less than or equal to the leftmost to be removed
     * @param key2
     *            Key that is greater than the rightmost to be removed
     * @param fetchFirst
     *            Control whether to copy the existing value for the first key
     *            into _spareValue before deleting the record.
     * @return <code>true</code> if any records were removed.
     * @throws PersistitException
     */
    private boolean removeKeyRangeInternal(final Key key1, final Key key2, final boolean fetchFirst)
            throws PersistitException {
        Debug.$assert0.t(key1.getEncodedSize() > 0);
        Debug.$assert0.t(key2.getEncodedSize() > 0);
        Debug.$assert0.t(key1.compareTo(key2) < 0);

        assertCorrectThread(true);
        _persistit.checkClosed();

        if (!isDirectoryExchange()) {
            _persistit.checkSuspended();
        }

        throttle();

        if (_ignoreTransactions || !_transaction.isActive()) {
            return raw_removeKeyRangeInternal(key1, key2, fetchFirst, false);
        }

        // Record the delete operation on the journal

        _transaction.remove(this, key1, key2);

        /*
         * If the Tree was created within this transaction then we can just
         * range-delete the tree since it is not visible outside this
         * transaction.
         */
        if (_tree.isTransactionPrivate(true)) {
            return raw_removeKeyRangeInternal(key1, key2, fetchFirst, false);
        }

        checkLevelCache();

        _value.clear().putAntiValueMVV();
        final int storeOptions = StoreOptions.MVCC | StoreOptions.WAIT | StoreOptions.ONLY_IF_VISIBLE
                | StoreOptions.DONT_JOURNAL | (fetchFirst ? StoreOptions.FETCH : 0);

        boolean anyRemoved = false;
        boolean keyIsLessThan = true;
        final Key nextKey = new Key(key1);

        while (keyIsLessThan && !key1.isRightEdge()) {
            Buffer buffer = null;
            try {
                int foundAt = search(key1, true);
                buffer = _levelCache[0]._buffer;

                while (!buffer.isAfterRightEdge(foundAt)) {
                    keyIsLessThan = key1.compareTo(key2) < 0;
                    if (!keyIsLessThan) {
                        break;
                    }
                    foundAt = buffer.nextKey(nextKey, foundAt);
                    buffer.releaseTouched();
                    buffer = null;
                    anyRemoved |= storeInternal(key1, _value, 0, storeOptions);
                    nextKey.copyTo(key1);
                    break;
                }
            } finally {
                if (buffer != null) {
                    buffer.releaseTouched();
                    buffer = null;
                }
            }
        }

        _value.clear();
        return anyRemoved;
    }

    /**
     * Removes all records with keys falling between <code>key1</code> and
     * </code>key2</code>, lefty-inclusive. Validity checks and Key value
     * adjustments have been done by caller - this method does the work.
     *
     * @param key1
     *            Key that is less than or equal to the leftmost to be removed
     * @param key2
     *            Key that is greater than the rightmost to be removed
     * @param fetchFirst
     *            Control whether to copy the existing value for the first key
     *            into _spareValue before deleting the record.
     * @param removeOnlyAntiValue
     *            Control whether to remove normal records or only an AntiValue.
     *            If true then this method tests whether there is one record
     *            being identified and removes it only if it is a primordial
     *            AntiValue.
     * @return <code>true</code> if any records were removed.
     * @throws PersistitException
     */
    boolean raw_removeKeyRangeInternal(final Key key1, final Key key2, final boolean fetchFirst,
            final boolean removeOnlyAntiValue) throws PersistitException {
        /*
         * _spareKey1 and _spareKey2 are mutated within the method and are then
         * wrong in the event of a retry loop.
         */

        assert key1 != _spareKey1 && key2 != _spareKey1 && key1 != _spareKey2 && key2 != _spareKey2;

        _persistit.checkClosed();
        _persistit.checkSuspended();

        if (_volume.isReadOnly()) {
            throw new ReadOnlyVolumeException(_volume.toString());
        }
        if (Debug.ENABLED) {
            Debug.suspend();
        }
        boolean treeClaimAcquired = false;
        boolean treeWriterClaimRequired = false;
        boolean result = false;

        boolean deallocationRequired = true; // assume until proven false
        boolean tryQuickDelete = true;

        if (!_ignoreTransactions) {
            _transaction.remove(this, key1, key2);
        }

        try {
            //
            // Retry here to get an exclusive Tree latch in the occasional case
            // where pages are being joined.
            //
            for (;;) {

                checkLevelCache();
                int depth = _cacheDepth; // The depth to which we have
                // populated the level cache.

                try {
                    //
                    // First try for a quick delete from a single data page.
                    //
                    if (tryQuickDelete) {
                        final List<Chain> chains = new ArrayList<Chain>();
                        Buffer buffer = null;
                        try {
                            final int foundAt1 = search(key1, true) & P_MASK;
                            buffer = _levelCache[0]._buffer;
                            //
                            // Re-check tree generation because a structure
                            // delete could have changed
                            // search results.
                            //
                            if (_tree.getGeneration() == _cachedTreeGeneration && foundAt1 > buffer.getKeyBlockStart()
                                    && foundAt1 < buffer.getKeyBlockEnd()) {
                                int foundAt2 = buffer.findKey(key2) & P_MASK;
                                if (!buffer.isBeforeLeftEdge(foundAt2) && !buffer.isAfterRightEdge(foundAt2)) {
                                    foundAt2 &= P_MASK;
                                    if (foundAt2 < buffer.getKeyBlockEnd()) {
                                        Debug.$assert0.t(foundAt2 >= foundAt1);

                                        if (removeOnlyAntiValue) {
                                            for (int p = foundAt1; p < foundAt2; p += KEYBLOCK_LENGTH) {
                                                if (!buffer.isPrimordialAntiValue(p)) {
                                                    return false;
                                                }
                                            }
                                        }
                                        if (fetchFirst) {
                                            removeFetchFirst(buffer, foundAt1, buffer, foundAt2);
                                        }

                                        final long timestamp = timestamp();
                                        buffer.writePageOnCheckpoint(timestamp);
                                        _volume.getStructure().harvestLongRecords(buffer, foundAt1, foundAt2, chains);

                                        final boolean removed = buffer.removeKeys(foundAt1, foundAt2, _spareKey1);
                                        if (removed) {
                                            _tree.bumpChangeCount();
                                            buffer.setDirtyAtTimestamp(timestamp);
                                        }
                                        result = removed;
                                        break;
                                    }
                                }
                            }
                            // If we didn't meet the criteria for quick delete,
                            // then don't try it again on a RetryException.
                            tryQuickDelete = false;
                        } finally {
                            if (buffer != null) {
                                buffer.releaseTouched();
                                buffer = null;
                            }
                        }
                        _volume.getStructure().deallocateGarbageChain(chains);
                    }

                    /*
                     * This deletion is more complicated and involves an index
                     * search. The tree must be latched.
                     */
                    if (!treeClaimAcquired) {
                        if (!_treeHolder.claim(treeWriterClaimRequired)) {
                            Debug.$assert0.t(false);
                            throw new InUseException("Thread " + Thread.currentThread().getName()
                                    + " failed to get writer claim on " + _tree);
                        }
                        treeClaimAcquired = true;
                    }
                    //
                    // Need to redo this check now that we have a
                    // claim on the Tree.
                    //
                    checkLevelCache();

                    long pageAddr1 = _tree.getRootPageAddr();
                    long pageAddr2 = pageAddr1;

                    for (int level = _cacheDepth; --level >= 0;) {
                        final LevelCache lc = _levelCache[level];
                        lc.initRemoveFields();
                        depth = level;

                        final int foundAt1 = searchLevel(key1, true, pageAddr1, level, true);
                        int foundAt2 = -1;

                        //
                        // Note: this buffer now has a writer claim on it.
                        //
                        Buffer buffer = lc._buffer;
                        lc._flags |= LEFT_CLAIMED;

                        lc._leftBuffer = buffer;
                        lc._leftFoundAt = foundAt1;
                        boolean samePage = pageAddr2 == pageAddr1;

                        if (samePage) {
                            foundAt2 = buffer.findKey(key2);

                            if (!buffer.isAfterRightEdge(foundAt2)) {
                                lc._rightBuffer = buffer;
                                lc._rightFoundAt = foundAt2;
                            } else {
                                pageAddr2 = buffer.getRightSibling();
                                samePage = false;
                            }
                        }
                        if (!samePage) {
                            //
                            // Since we are spanning pages we need an
                            // exclusive claim on the tree to prevent
                            // an insertion from propagating upward through
                            // the deletion range.
                            //
                            if (!treeWriterClaimRequired) {
                                treeWriterClaimRequired = true;
                                if (!_treeHolder.upgradeClaim()) {
                                    throw RetryException.SINGLE;
                                }
                            }

                            foundAt2 = searchLevel(key2, false, pageAddr2, level, true);

                            buffer = lc._buffer;
                            lc._flags |= RIGHT_CLAIMED;
                            lc._rightBuffer = buffer;
                            lc._rightFoundAt = foundAt2;
                            pageAddr2 = buffer.getPageAddress();
                        }

                        if (lc._leftBuffer.isIndexPage()) {
                            Debug.$assert0.t(lc._rightBuffer.isIndexPage() && depth > 0);
                            //
                            // Come down to the left of the key.
                            //
                            final int p1 = lc._leftBuffer.previousKeyBlock(foundAt1);
                            final int p2 = lc._rightBuffer.previousKeyBlock(foundAt2);

                            Debug.$assert0.t(p1 != -1 && p2 != -1);
                            pageAddr1 = lc._leftBuffer.getPointer(p1);
                            pageAddr2 = lc._rightBuffer.getPointer(p2);

                        } else {
                            Debug.$assert0.t(depth == 0);
                            break;
                        }
                    }

                    LevelCache lc = _levelCache[0];
                    if (removeOnlyAntiValue
                            & !isKeyRangeAntiValue(lc._leftBuffer, lc._leftFoundAt, lc._rightBuffer, lc._rightFoundAt)) {
                        result = false;
                        break;
                    }

                    if (fetchFirst) {
                        removeFetchFirst(lc._leftBuffer, lc._leftFoundAt, lc._rightBuffer, lc._rightFoundAt);
                    }
                    //
                    // We have fully delineated the subtree that
                    // needs to be removed. Now walk down the tree,
                    // stitching together the pages where necessary.
                    //
                    _tree.bumpGeneration();

                    final long timestamp = timestamp();
                    for (int level = _cacheDepth; --level >= 0;) {
                        lc = _levelCache[level];
                        final Buffer buffer1 = lc._leftBuffer;
                        final Buffer buffer2 = lc._rightBuffer;
                        int foundAt1 = lc._leftFoundAt;
                        int foundAt2 = lc._rightFoundAt;
                        foundAt1 &= P_MASK;
                        foundAt2 &= P_MASK;

                        boolean needsReindex = false;
                        buffer1.writePageOnCheckpoint(timestamp);
                        if (buffer1 != buffer2) {
                            buffer2.writePageOnCheckpoint(timestamp);
                            //
                            // Deletion spans multiple pages at this level.
                            // We will need to join or rebalance the pages.
                            //
                            final long leftGarbagePage = buffer1.getRightSibling();
                            _key.copyTo(_spareKey1);

                            // Before we remove the records in this range, we
                            // need to recover any LONG_RECORD pointers that
                            // are associated with keys in this range.
                            _volume.getStructure().harvestLongRecords(buffer1, foundAt1, Integer.MAX_VALUE);
                            _volume.getStructure().harvestLongRecords(buffer2, 0, foundAt2);

                            Debug.$assert0.t(_tree.isOwnedAsWriterByMe() && buffer1.isOwnedAsWriterByMe()
                                    && buffer2.isOwnedAsWriterByMe());
                            boolean rebalanced = false;
                            try {
                                rebalanced = buffer1.join(buffer2, foundAt1, foundAt2, _spareKey1, _spareKey2,
                                        _joinPolicy);
                            } catch (final RebalanceException rbe) {
                                rebalanceSplit(lc);
                                level++;
                                continue;
                            }
                            if (buffer1.isDataPage()) {
                                _tree.bumpChangeCount();
                            }

                            buffer1.setDirtyAtTimestamp(timestamp);
                            buffer2.setDirtyAtTimestamp(timestamp);

                            final long rightGarbagePage = buffer1.getRightSibling();

                            if (rightGarbagePage != leftGarbagePage) {
                                // here we just remember the page boundaries
                                // that will need to be deallocated.
                                lc._deallocLeftPage = leftGarbagePage;
                                lc._deallocRightPage = rightGarbagePage;
                                deallocationRequired = true;
                            }

                            if (rebalanced) {
                                //
                                // If the join operation was not able to
                                // coalesce the two pages into one, then we need
                                // to re-index the new first key of the second
                                // page.
                                //
                                // We have either a quick way to do this or a
                                // more complex way. If there is a single parent
                                // page in the index for the two re-balanced
                                // pages, and if the key to be reinserted fits
                                // in that parent page, then all we need to do
                                // is insert it. Otherwise, we will need to
                                // split the page above us, and that will
                                // potentially result in additional buffer
                                // reservations. Because that could force a
                                // retry at a bad time, in that case we defer
                                // the re-insertion of the index key until
                                // after all the current claims are released.
                                //
                                needsReindex = true;
                                if (level < _cacheDepth - 1) {
                                    final LevelCache parentLc = _levelCache[level + 1];
                                    final Buffer buffer = parentLc._leftBuffer;

                                    Debug.$assert0.t(buffer != null);
                                    if (parentLc._rightBuffer == buffer) {
                                        final int foundAt = buffer.findKey(_spareKey1);
                                        Debug.$assert0.t((foundAt & EXACT_MASK) == 0);
                                        // Try it the simple way
                                        _value.setPointerValue(buffer2.getPageAddress());
                                        _value.setPointerPageType(buffer2.getPageType());
                                        _rawValueWriter.init(_value);
                                        final int fit = buffer.putValue(_spareKey1, _rawValueWriter, foundAt, false);

                                        // If it worked then we're done.
                                        if (fit != -1) {
                                            needsReindex = false;
                                            buffer.setDirtyAtTimestamp(timestamp);
                                        }
                                    }
                                }
                                if (needsReindex) {
                                    _spareKey1.copyTo(_spareKey2);
                                    _value.setPointerValue(buffer2.getPageAddress());
                                    _value.setPointerPageType(buffer2.getPageType());

                                    storeInternal(_spareKey2, _value, level + 1, StoreOptions.NONE);
                                    needsReindex = false;
                                }
                            }

                            result = true;
                        } else if (foundAt1 != foundAt2) {
                            Debug.$assert0.t(foundAt2 > foundAt1);
                            _key.copyTo(_spareKey1);
                            //
                            // Before we remove these records, we need to
                            // recover any LONG_RECORD pointers that may be
                            // associated with keys in this range.
                            //
                            _volume.getStructure().harvestLongRecords(buffer1, foundAt1, foundAt2);
                            result |= buffer1.removeKeys(foundAt1, foundAt2, _spareKey1);
                            if (buffer1.isDataPage() && result) {
                                _tree.bumpChangeCount();
                            }
                            buffer1.setDirtyAtTimestamp(timestamp);
                        }

                        if (level < _cacheDepth - 1) {
                            removeKeyRangeReleaseLevel(level + 1);
                        }
                    }
                    break;
                } catch (final RetryException re) {
                    // handled below by releasing claims and retrying
                } finally {
                    //
                    // Release all buffers.
                    //
                    for (int level = _cacheDepth; --level >= depth;) {
                        removeKeyRangeReleaseLevel(level);
                    }

                    if (treeClaimAcquired) {
                        _treeHolder.release();
                        treeClaimAcquired = false;
                    }
                }
                /*
                 * Having released all prior claims, now acquire an exclusive
                 * claim on the Tree.
                 */
                if (treeWriterClaimRequired) {
                    if (!_treeHolder.claim(true)) {
                        Debug.$assert0.t(false);
                        throw new InUseException("Thread " + Thread.currentThread().getName()
                                + " failed to get reader claim on " + _tree);
                    }
                    treeClaimAcquired = true;
                }
            }

            while (deallocationRequired) {
                long left = -1;
                long right = -1;
                for (int level = _cacheDepth; --level >= 0;) {
                    final LevelCache lc = _levelCache[level];
                    left = lc._deallocLeftPage;
                    right = lc._deallocRightPage;
                    if (left != 0) {
                        sequence(DEALLOCATE_CHAIN_A);
                        _volume.getStructure().deallocateGarbageChain(left, right);
                        lc._deallocLeftPage = 0;
                        lc._deallocRightPage = 0;
                    }
                }
                // If we successfully finish the loop then we're done
                deallocationRequired = false;
                break;
            }
        } finally {
            if (treeClaimAcquired) {
                if (treeWriterClaimRequired) {
                    _tree.bumpGeneration();
                }
                _treeHolder.release();
                treeClaimAcquired = false;
            }
        }

        _volume.getStatistics().bumpRemoveCounter();
        _tree.getStatistics().bumpRemoveCounter();
        if (fetchFirst) {
            _volume.getStatistics().bumpFetchCounter();
            _tree.getStatistics().bumpFetchCounter();
        }
        return result;
    }

    /**
     * Handle the extremely rare case where removing a key from a pair of
     * adjacent pages requires the left page to be split. To split the page this
     * method inserts an empty record with key being deleted, allowing the
     * {@link Buffer#split(Buffer, Key, ValueHelper, int, Key, Sequence, SplitPolicy)}
     * method to be used.
     * <p>
     * On normal completion <code>lc._leftBuffer</code> refers to the newly
     * allocated page and <code>lc._leftFoundAt</code> to the position of the
     * key within it. The original left buffer is released before this method
     * returns, whether or not it completes normally.
     * </p>
     *
     * @param lc
     *            LevelCache set up by raw_removeKeyRangeInternal
     * @throws PersistitException
     */
    private void rebalanceSplit(final LevelCache lc) throws PersistitException {
        //
        // Allocate a new page
        //
        final int level = lc._level;
        final int foundAt = lc._leftFoundAt;
        final Buffer left = lc._leftBuffer;
        final Buffer inserted = _volume.getStructure().allocPage();
        try {
            final long timestamp = timestamp();
            left.writePageOnCheckpoint(timestamp);
            inserted.writePageOnCheckpoint(timestamp);

            Debug.$assert0.t(inserted.getPageAddress() != 0);
            Debug.$assert0.t(inserted != left);

            // The new page has the same page type as the page being split.
            inserted.init(left.getPageType());

            // An empty value is written for the record inserted to drive the
            // split.
            final Value value = _persistit.getThreadLocalValue();
            value.clear();
            _rawValueWriter.init(value);
            // The key used for the split is read from the right buffer,
            // starting at the page header offset.
            final Key key = _persistit.getThreadLocalKey();
            lc._rightBuffer.nextKey(key, Buffer.HEADER_SIZE);

            left.split(inserted, key, _rawValueWriter, foundAt | EXACT_MASK, _spareKey1, Sequence.NONE,
                    SplitPolicy.EVEN_BIAS);

            // Link the new page into the sibling chain immediately to the
            // right of the original left page.
            inserted.setRightSibling(left.getRightSibling());
            left.setRightSibling(inserted.getPageAddress());
            left.setDirtyAtTimestamp(timestamp);
            inserted.setDirtyAtTimestamp(timestamp);
            // From here on the new page acts as the left edge at this level.
            lc._leftBuffer = inserted;
            lc._leftFoundAt = inserted.findKey(key);

            // The new page is not indexed by this method; a cleanup action is
            // queued for it instead.
            _persistit.getCleanupManager().offer(
                    new CleanupManager.CleanupIndexHole(_tree.getHandle(), inserted.getPageAddress(), level));
        } finally {
            // Only the original left page is released here; the new page is
            // left claimed and is now referenced by lc._leftBuffer.
            left.releaseTouched();
        }
    }

    private void removeKeyRangeReleaseLevel(final int level) {

        final LevelCache lc = _levelCache[level];
        final Buffer buffer1 = lc._leftBuffer;
        final Buffer buffer2 = lc._rightBuffer;

        if (buffer2 != null && (lc._flags & RIGHT_CLAIMED) != 0) {
            buffer2.releaseTouched();
        }
        if (buffer1 != null && (lc._flags & LEFT_CLAIMED) != 0) {
            buffer1.releaseTouched();
        }

        lc._leftBuffer = null;
        lc._rightBuffer = null;
        lc._flags = 0;
    }

    private void removeFetchFirst(final Buffer buffer1, int foundAt1, final Buffer buffer2, final int foundAt2)
            throws PersistitException {
        if (buffer1 == buffer2) {
            if (buffer1.nextKeyBlock(foundAt1) == (foundAt2 & P_MASK)) {
                buffer1.fetch(foundAt1 | EXACT_MASK, _spareValue);
            }
        } else {
            if (buffer1.getRightSibling() == buffer2.getPageAddress() && buffer1.nextKeyBlock(foundAt1) == -1) {
                foundAt1 = buffer2.toKeyBlock(0);
                if (buffer2.nextKeyBlock(foundAt1) == (foundAt2 & P_MASK)) {
                    buffer2.fetch(foundAt1 | EXACT_MASK, _spareValue);
                }
            }
        }
        if (_spareValue.isDefined()) {
            fetchFixupForLongRecords(_spareValue, Integer.MAX_VALUE);
        }
    }

    private boolean isKeyRangeAntiValue(final Buffer buffer1, final int foundAt1, final Buffer buffer2,
            final int foundAt2) {
        if (buffer1.getKeyBlockEnd() != (foundAt1 & P_MASK) + KEYBLOCK_LENGTH) {
            return false;
        }
        if (buffer2.getKeyBlockStart() != (foundAt2 & P_MASK) - KEYBLOCK_LENGTH) {
            return false;
        }
        if (buffer1.getRightSibling() != buffer2.getPageAddress()) {
            return false;
        }
        return buffer2.isPrimordialAntiValue(Buffer.KEY_BLOCK_START);
    }

    /**
     * Prune using the current key of this Exchange. Delegates to
     * {@link #prune(Key)} with <code>_key</code> and discards its boolean
     * result.
     *
     * @throws PersistitException
     */
    void prune() throws PersistitException {
        prune(_key);
    }

    boolean prune(final Key key) throws PersistitException {
        Buffer buffer = null;
        Debug.$assert1.t(_tree.isLive());
        try {
            search(key, true);
            buffer = _levelCache[0]._buffer;
            if (buffer != null) {
                return buffer.pruneMvvValues(_tree, true, null);
            } else {
                return false;
            }
        } finally {
            if (buffer != null) {
                buffer.release();
            }
        }
    }

    boolean prune(final Key key1, final Key key2) throws PersistitException {
        Buffer buffer = null;
        boolean pruned = false;

        Debug.$assert1.t(_tree.isLive());
        try {
            search(key1, true);
            buffer = _levelCache[0]._buffer;

            while (buffer != null) {
                checkPageType(buffer, Buffer.PAGE_TYPE_DATA, false);
                pruned |= buffer.pruneMvvValues(_tree, true, null);
                final int foundAt = buffer.findKey(key2);
                if (!buffer.isAfterRightEdge(foundAt)) {
                    break;
                }
                final Buffer oldBuffer = buffer;
                final long rightPageAddress = buffer.getRightSibling();
                if (rightPageAddress == 0) {
                    break;
                }
                buffer = _pool.get(_volume, buffer.getRightSibling(), true, true);
                oldBuffer.release();
            }

        } finally {
            if (buffer != null) {
                buffer.release();
            }
        }
        return pruned;
    }

    boolean prune(final long page, final List<CleanupAction> consequentActions) throws PersistitException {
        Buffer buffer = null;
        try {
            buffer = _pool.get(_volume, page, true, true);
            return buffer.pruneMvvValues(_tree, true, consequentActions);
        } finally {
            if (buffer != null) {
                buffer.release();
            }
        }
    }

    boolean pruneLeftEdgeValue(final long page, final List<CleanupAction> consequentActions) throws PersistitException {
        _ignoreTransactions = true;
        Buffer buffer = null;
        try {
            buffer = _pool.get(_volume, page, false, true);
            buffer.clearEnqueuedForPruning();
            final long at = buffer.at(Buffer.KEY_BLOCK_START);
            if (at > 0) {
                final int offset = (int) (at >>> 32);
                final int size = (int) at;
                if (size == 1 && buffer.getBytes()[offset] == MVV.TYPE_ANTIVALUE) {
                    buffer.nextKey(_spareKey3, Buffer.KEY_BLOCK_START);
                    buffer.release();
                    buffer = null;
                    _spareKey3.copyTo(_spareKey4);
                    _spareKey4.nudgeDeeper();
                    raw_removeKeyRangeInternal(_spareKey3, _spareKey4, false, true);
                    return true;
                }
            }
            return false;
        } finally {
            if (buffer != null) {
                buffer.release();
            }
        }
    }

    /**
     * Store a pointer to the given page at the tree level above it. The key
     * stored is the first key of the page and the value is a pointer value
     * carrying the page address and the page's type. As a side effect this
     * method sets <code>_ignoreTransactions</code>, even when the tree holder
     * cannot be claimed.
     *
     * @param page
     *            address of the page to be pointed to
     * @param level
     *            level of <code>page</code>; the pointer is stored at
     *            <code>level + 1</code>
     * @return <code>true</code> if the pointer was stored, or
     *         <code>false</code> if the tree holder could not be claimed
     * @throws PersistitException
     */
    boolean fixIndexHole(final long page, final int level) throws PersistitException {
        _ignoreTransactions = true;
        Buffer buffer = null;
        if (!_treeHolder.claim(false, Persistit.SHORT_DELAY)) {
            return false;
        }
        try {
            buffer = _pool.get(_volume, page, false, true);
            // Read the first key of the page into _spareKey2.
            buffer.nextKey(_spareKey2, buffer.toKeyBlock(0));
            _value.setPointerValue(page);
            _value.setPointerPageType(buffer.getPageType());
            // Release the page before storing; null the reference so the
            // finally block does not release it a second time.
            buffer.release();
            buffer = null;
            storeInternal(_spareKey2, _value, level + 1, StoreOptions.NONE);
            return true;
        } finally {
            _treeHolder.release();
            if (buffer != null) {
                buffer.release();
            }
        }
    }

    private void checkPageType(final Buffer buffer, final int expectedType, final boolean releaseOnFailure)
            throws PersistitException {
        assert !buffer.isOwnedAsWriterByOther();
        final int type = buffer.getPageType();
        if (type != expectedType) {
            if (releaseOnFailure) {
                buffer.releaseTouched();
            }
            corrupt("Volume " + _volume + " page " + buffer.getPageAddress() + " invalid page type " + type
                    + ": should be " + expectedType);
        }
    }

    /**
     * Assert that the current thread matches the "owner" of the Exchange. The
     * owner is set when the Exchange is created or first used. To enable
     * pooling, the {@link #removeState(boolean)} method clears it.
     * <p>
     * The check is written as a Java <code>assert</code>, so
     * {@link #checkThread(boolean)} is only invoked, and the owner thread is
     * only recorded or cleared, when assertions are enabled.
     * </p>
     *
     * @param set
     *            Whether to set or clear the thread field for subsequent
     *            checks.
     */
    private void assertCorrectThread(final boolean set) {
        assert checkThread(set) : "Thread " + Thread.currentThread() + " must not use " + this + " owned by " + _thread;
    }

    /**
     * Ensure the this Exchange is compatible with the current Thread; if a
     * Thread was previously assigned then this thread must be the same one.
     *
     * @param set
     *            whether to assign the current thread
     * @return true if and only if there was no assigned Thread or the assigned
     *         Thread is same as the current Thread.
     */
    private boolean checkThread(final boolean set) {
        final Thread t = Thread.currentThread();
        if (_thread == t) {
            if (!set) {
                _thread = null;
            }
            return true;
        }
        if (_thread == null) {
            if (set) {
                _thread = t;
            }
            return true;
        }
        return false;
    }

    /**
     * The transaction context for this Exchange. By default, this is the
     * transaction context of the current thread, and by default, all
     * <code>Exchange</code>s created by a thread share the same transaction
     * context. When assertions are enabled, this call also checks that the
     * calling thread is the thread associated with this Exchange.
     *
     * @return The <code>Transaction</code> context for this Exchange.
     */
    @Override
    public Transaction getTransaction() {
        assertCorrectThread(true);
        return _transaction;
    }

    LongRecordHelper getLongRecordHelper() {
        if (_longRecordHelper == null) {
            _longRecordHelper = new LongRecordHelper(_persistit, this);
        }
        return _longRecordHelper;
    }

    /**
     * Allows for all MVV contents to be returned through the Value object
     * during fetch. This can then be displayed conveniently through
     * {@link Value#toString()} or as an array from {@link Value#get()}.
     * This method only records the setting in <code>_ignoreMVCCFetch</code>.
     *
     * @param doIgnore
     *            If <code>true</code>, return MVVs as described; otherwise
     *            return the appropriate single version.
     */
    void ignoreMVCCFetch(final boolean doIgnore) {
        _ignoreMVCCFetch = doIgnore;
    }

    /**
     * Set the <code>_ignoreTransactions</code> flag on this Exchange. This
     * method can only turn the flag on; it offers no way to clear it.
     */
    void ignoreTransactions() {
        _ignoreTransactions = true;
    }

    /**
     * Package-private method that indicates whether this
     * <code>Exchange</code> refers to the directory tree.
     *
     * @return <code>true</code> if this is a directory exchange, else
     *         <code>false</code>.
     */
    boolean isDirectoryExchange() {
        return _isDirectoryExchange;
    }

    /**
     * Set the {@link SplitPolicy} held by this <code>Exchange</code>. When
     * assertions are enabled, this call also checks that the calling thread
     * is the thread associated with this Exchange.
     *
     * @param policy
     *            the split policy to record
     */
    public void setSplitPolicy(final SplitPolicy policy) {
        assertCorrectThread(true);
        _splitPolicy = policy;
    }

    /**
     * Set the {@link JoinPolicy} held by this <code>Exchange</code>. When
     * assertions are enabled, this call also checks that the calling thread
     * is the thread associated with this Exchange.
     *
     * @param policy
     *            the join policy to record
     */
    public void setJoinPolicy(final JoinPolicy policy) {
        assertCorrectThread(true);
        _joinPolicy = policy;
    }

    /**
     * Build a {@link KeyHistogram} by traversing keys on the pages at one
     * level of the tree, moving rightward through sibling pages. This method
     * overwrites the current key (<code>_key</code>) of this Exchange.
     *
     * @param start
     *            key at which traversal begins, or <code>null</code> to begin
     *            from <code>LEFT_GUARD_KEY</code>
     * @param end
     *            if not <code>null</code>, traversal stops as soon as a
     *            traversed key is greater than this key
     * @param sampleSize
     *            passed to the <code>KeyHistogram</code> constructor
     * @param keyDepth
     *            passed to the <code>KeyHistogram</code> constructor
     * @param keyFilter
     *            if not <code>null</code>, only keys selected by this filter
     *            are added to the histogram
     * @param requestedTreeDepth
     *            tree level to traverse; a value larger than the tree's depth
     *            is replaced by the tree's depth
     * @return the histogram, after <code>cull()</code> has been applied
     * @throws IllegalArgumentException
     *             if the resulting tree depth is negative
     * @throws PersistitException
     */
    public KeyHistogram computeHistogram(final Key start, final Key end, final int sampleSize, final int keyDepth,
            final KeyFilter keyFilter, final int requestedTreeDepth) throws PersistitException {
        assertCorrectThread(true);
        _persistit.checkClosed();

        checkLevelCache();
        // Clamp the requested level to the depth of the tree.
        final int treeDepth = requestedTreeDepth > _tree.getDepth() ? _tree.getDepth() : requestedTreeDepth;
        if (treeDepth < 0) {
            throw new IllegalArgumentException("treeDepth out of bounds: " + treeDepth);
        }
        final KeyHistogram histogram = new KeyHistogram(getTree(), start, end, sampleSize, keyDepth, treeDepth);
        Buffer previousBuffer = null;
        LevelCache lc = null;
        Buffer buffer = null;
        // The first traversal step uses GTEQ when a start key is supplied and
        // GT when starting from the left guard key; later steps always use GT.
        Direction direction = GTEQ;
        if (start != null) {
            start.copyTo(_key);
        } else {
            LEFT_GUARD_KEY.copyTo(_key);
            direction = GT;
        }

        int foundAt = searchTree(_key, treeDepth, false);
        try {
            lc = _levelCache[treeDepth];
            buffer = lc._buffer;
            if (buffer != null) {
                // Pages at this level are expected to have page type
                // treeDepth + 1.
                checkPageType(buffer, treeDepth + 1, false);
            }

            while (foundAt != -1) {
                foundAt = buffer.traverse(_key, direction, foundAt);
                direction = GT;
                if (buffer.isAfterRightEdge(foundAt)) {
                    final long rightSiblingPage = buffer.getRightSibling();
                    if (rightSiblingPage > 0) {
                        // Acquire the right sibling before releasing the
                        // current page.
                        final Buffer rightSibling = _pool.get(_volume, rightSiblingPage, false, true, _timeoutMillis);
                        buffer.releaseTouched();
                        //
                        // Continue the traversal from the first key block
                        // of the right sibling page.
                        //
                        buffer = rightSibling;
                        checkPageType(buffer, treeDepth + 1, false);
                        foundAt = buffer.traverse(_key, GT, buffer.toKeyBlock(0));
                    } else {
                        // No right sibling: the traversal is finished.
                        foundAt = -1;
                        break;
                    }
                }
                if (end != null && end.compareTo(_key) < 0) {
                    break;
                }
                if (!_key.isLeftEdge()) {
                    // Account for each page once, the first time a key on it
                    // is visited.
                    if (buffer != previousBuffer) {
                        histogram.addPage(buffer.getBufferSize(), buffer.getBufferSize() - buffer.getAvailableSize());
                        previousBuffer = buffer;
                    }
                    if (keyFilter == null || keyFilter.selected(_key)) {
                        histogram.addKeyCopy(_key);
                    }
                }
            }
        } finally {
            if (buffer != null) {
                buffer.releaseTouched();
            }
        }

        histogram.cull();
        return histogram;
    }

    /**
     * Report a corrupt volume. Trips the <code>Debug.$assert0</code> check,
     * logs the error message together with {@link #toStringDetail()}, and
     * then throws. This method never returns normally.
     *
     * @param error
     *            description of the problem; used both in the log entry and
     *            as the exception message
     * @throws CorruptVolumeException
     *             always
     */
    void corrupt(final String error) throws CorruptVolumeException {
        Debug.$assert0.t(false);
        _persistit.getLogBase().corruptVolume.log(error + Util.NEW_LINE + toStringDetail());
        throw new CorruptVolumeException(error);
    }

    /**
     * Store an Object with this Exchange for the convenience of an
     * application. The object can be read back with {@link #getAppCache()}.
     *
     * @param appCache
     *            the object to be cached for application convenience.
     */
    public void setAppCache(final Object appCache) {
        assertCorrectThread(true);
        _appCache = appCache;
    }

    /**
     * Return the object most recently stored with
     * {@link #setAppCache(Object)}.
     *
     * @return the object cached for application convenience
     */
    public Object getAppCache() {
        assertCorrectThread(true);
        return _appCache;
    }

    /**
     * Return the standard timeout of this <code>Exchange</code>.
     *
     * @return The standard timeout setting in milliseconds for this
     *         <code>Exchange</code>
     * @see Exchange#setTimeoutMillis(long)
     */
    public long getTimeoutMillis() {
        assertCorrectThread(true);
        return _timeoutMillis;
    }

    /**
     * <p>
     * Set the standard timeout for this <code>Exchange</code>. The timeout
     * value represents an approximate upper bound on the wait time for various
     * methods that wait for actions by other threads. For example, if a thread
     * needs to read a value from a {@link Buffer} that is currently being
     * updated by another thread, the read operation waits up to
     * <code>timeout</code> milliseconds for the other thread to release the
     * <code>Buffer</code>.
     * </p>
     * <p>
     * The timeout value is advisory, and some operations may stall for a longer
     * period of time than specified. Setting a timeout does not guarantee
     * real-time behavior.
     * </p>
     * <p>
     * The supplied value must be greater than or equal to zero. Zero means do
     * not wait. The value is validated by <code>Util.rangeCheck</code> against
     * the range 0 through <code>Long.MAX_VALUE</code>.
     * </p>
     * <p>
     * Unlike {@link #getTimeoutMillis()}, this method does not assert that the
     * calling thread is the thread associated with this Exchange.
     * </p>
     *
     * @param timeout
     *            Standard timeout setting, in milliseconds, for operations that
     *            wait.
     */
    public void setTimeoutMillis(final long timeout) {
        // NOTE(review): rangeCheck presumably throws for an out-of-range
        // value; confirm against Util.
        _timeoutMillis = Util.rangeCheck(timeout, 0, Long.MAX_VALUE);
    }

    /**
     * Returns a copy of either the data page or a page on the index path to the
     * data page containing the current key. This method looks up the current
     * key, then copies and returns the page at the specified tree level in a
     * new Buffer. The resulting Buffer object is not part of the BufferPool and
     * can simply be discarded when the caller is finished with it.
     *
     * @param level
     *            The tree level, starting at zero for the data page.
     * @return copy of page on the key's index tree at that level.
     */
    public Buffer fetchBufferCopy(final int level) throws PersistitException {
        assertCorrectThread(true);
        if (level >= _tree.getDepth() || level <= -_tree.getDepth()) {
            throw new IllegalArgumentException("Tree depth is " + _tree.getDepth());
        }
        final int lvl = level >= 0 ? level : _tree.getDepth() + level;
        final int foundAt = searchTree(_key, lvl, false);
        final Buffer buffer = _levelCache[lvl]._buffer;
        try {
            if (foundAt == -1) {
                return null;
            } else {
                return new Buffer(buffer);
            }
        } finally {
            buffer.releaseTouched();
        }
    }

    String toStringDetail() {
        final StringBuilder sb = new StringBuilder(toString());
        for (int level = 0; level < MAX_TREE_DEPTH; level++) {
            final LevelCache lc = _levelCache[level];
            if (lc == null || lc._buffer == null) {
                break;
            } else {
                sb.append(Util.NEW_LINE);
                sb.append(level);
                sb.append(": ");
                sb.append(lc);
            }
        }
        return sb.toString();
    }

    /**
     * Intended to be a test method. Fetches the current _key and determines if
     * stored value is a LONG_RECORD. No other state, including the fetched
     * value, can be gotten from this method.
     *
     * @return <code>true</code> if the value is a LONG_RECORD
     * @throws PersistitException
     *             Any error during fetch
     */
    boolean isValueLongRecord() throws PersistitException {
        final boolean savedIgnore = _ignoreMVCCFetch;
        try {
            _ignoreMVCCFetch = true;
            searchAndFetchInternal(_spareValue, -1);
            final boolean wasLong = isLongRecord(_spareValue);
            _spareValue.clear();
            return wasLong;
        } finally {
            _ignoreMVCCFetch = savedIgnore;
        }
    }

    /**
     * Intended to be a test method. Fetches the current _key and determines if
     * stored value is a long MVV. No other state, including the fetched value,
     * can be gotten from this method.
     *
     * @return <code>true</code> if the value is a long MVV
     * @throws PersistitException
     *             Any error during fetch
     */
    boolean isValueLongMVV() throws PersistitException {
        final boolean savedIgnore = _ignoreMVCCFetch;
        try {
            _ignoreMVCCFetch = true;
            searchAndFetchInternal(_spareValue, -1);
            final boolean wasLong = isLongMVV(_spareValue);
            _spareValue.clear();
            return wasLong;
        } finally {
            _ignoreMVCCFetch = savedIgnore;
        }
    }

    private void throttle() throws PersistitInterruptedException {
        /*
         * Don't throttle operations on the directory tree since that makes some
         * unit tests very slow. This test is now necessary because a directory
         * tree update can now occur within the scope of a transaction.
         */
        if (!_ignoreTransactions && !_transaction.isActive() && !isDirectoryExchange()) {
            _persistit.getJournalManager().throttle();
        }
    }
}
TOP

Related Classes of com.persistit.Exchange$LevelCache

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.