/**
* Copyright 2011-2012 Akiban Technologies, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.persistit;

import static com.persistit.TransactionStatus.ABORTED;
import static com.persistit.util.SequencerConstants.PAGE_MAP_READ_INVALIDATE_A;
import static com.persistit.util.SequencerConstants.RECOVERY_PRUNING_B;
import static com.persistit.util.ThreadSequencer.sequence;
import static com.persistit.util.Util.NS_PER_MS;

import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.persistit.AlertMonitor.AlertLevel;
import com.persistit.AlertMonitor.Event;
import com.persistit.CheckpointManager.Checkpoint;
import com.persistit.JournalRecord.CP;
import com.persistit.JournalRecord.IT;
import com.persistit.JournalRecord.IV;
import com.persistit.JournalRecord.JE;
import com.persistit.JournalRecord.JH;
import com.persistit.JournalRecord.PA;
import com.persistit.JournalRecord.PM;
import com.persistit.JournalRecord.TM;
import com.persistit.JournalRecord.TX;
import com.persistit.Persistit.FatalErrorException;
import com.persistit.TransactionPlayer.TransactionPlayerListener;
import com.persistit.exception.CorruptJournalException;
import com.persistit.exception.PersistitException;
import com.persistit.exception.PersistitIOException;
import com.persistit.exception.PersistitInterruptedException;
import com.persistit.exception.RebalanceException;
import com.persistit.exception.VolumeNotFoundException;
import com.persistit.mxbeans.JournalManagerMXBean;
import com.persistit.util.Debug;
import com.persistit.util.Util;

/**
* Manages the disk-based I/O journal. The journal contains both committed
* transactions and images of updated pages.
*
* @author peter
*
*/
class JournalManager implements JournalManagerMXBean, VolumeHandleLookup {

    final static int URGENT = 10;
    final static int ALMOST_URGENT = 8;
    final static int HALF_URGENT = 5;
    final static int URGENT_COMMIT_DELAY_MILLIS = 50;
    final static int GENTLE_COMMIT_DELAY_MILLIS = 12;
    private final static int IO_MEASUREMENT_CYCLES = 8;
    private final static int TOO_MANY_WARN_THRESHOLD = 5;
    private final static int TOO_MANY_ERROR_THRESHOLD = 10;
    private final static long KILO = 1024;

    /**
 * Regular expression that matches the name of a journal file.
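 * For example, "/xxx/yyy/zzz.000000000012" matches with group(1) =
 * "/xxx/yyy/zzz" (the journal path) and group(2) = "000000000012"
 * (generation 12).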
     */
    final static Pattern PATH_PATTERN = Pattern.compile("(.+)\\.(\\d{12})");

    private long _journalCreatedTime;

    private final Map<PageNode, PageNode> _pageMap = new HashMap<PageNode, PageNode>();

    private final RangeRemovingArrayList<PageNode> _pageList = new RangeRemovingArrayList<PageNode>();

    private final Map<PageNode, PageNode> _branchMap = new HashMap<PageNode, PageNode>();

    private final Map<Volume, Integer> _volumeToHandleMap = new HashMap<Volume, Integer>();

    private final Map<Integer, Volume> _handleToVolumeMap = new HashMap<Integer, Volume>();

    private final Map<TreeDescriptor, Integer> _treeToHandleMap = new HashMap<TreeDescriptor, Integer>();

    private final Map<Integer, TreeDescriptor> _handleToTreeMap = new HashMap<Integer, TreeDescriptor>();
    private final Map<Long, TransactionMapItem> _liveTransactionMap = new HashMap<Long, TransactionMapItem>();

    private final Persistit _persistit;

    private long _blockSize;

    private volatile int _writeBufferSize = DEFAULT_BUFFER_SIZE;

    private ByteBuffer _writeBuffer;

    private long _writeBufferAddress = Long.MAX_VALUE;

    private JournalFlusher _flusher;

    private JournalCopier _copier;

    private final AtomicBoolean _closed = new AtomicBoolean();

    private final AtomicBoolean _copying = new AtomicBoolean();

    private final AtomicBoolean _copyFast = new AtomicBoolean();

    private final AtomicBoolean _flushing = new AtomicBoolean();

    private final AtomicBoolean _appendOnly = new AtomicBoolean();

    private final AtomicBoolean _ignoreMissingVolume = new AtomicBoolean();

    private String _journalFilePath;

    /**
     * Address of first available byte in the journal. This is usually the
     * address of the next record to be written, but if that next record
     * requires more space than is available in the current journal file, it
     * will advance to the start of the next journal file.
     */
    private volatile long _currentAddress;

    /**
     * Smallest journal address at which a record still needed is located.
     * Initially zero, increases as journal files are consumed and deleted.
     */
    private volatile long _baseAddress;

    private final Map<Long, FileChannel> _journalFileChannels = new HashMap<Long, FileChannel>();

    /**
     * Counter used to assign internal handle values to Volume and Tree records.
     */
    private int _handleCounter = 0;

    private Checkpoint _lastValidCheckpoint = new Checkpoint(0, 0);

    private long _lastValidCheckpointJournalAddress = 0;

    private long _lastValidCheckpointBaseAddress = 0;

    private long _deleteBoundaryAddress = 0;

    private int _lastReportedJournalFileCount = 0;

    private boolean _isNewEpoch = true;

    private volatile long _writePageCount = 0;

    private volatile long _readPageCount = 0;

    private volatile long _copiedPageCount = 0;

    private volatile long _droppedPageCount = 0;

    private final AtomicLong _totalCommits = new AtomicLong();

    private final AtomicLong _totalCommitWaitTime = new AtomicLong();

    private final AtomicLong _totalFlushCycles = new AtomicLong();

    private final AtomicLong _totalFlushIoTime = new AtomicLong();

    private volatile long _flushInterval = DEFAULT_FLUSH_INTERVAL_MS;

    private volatile long _slowIoAlertThreshold = DEFAULT_SLOW_IO_ALERT_THRESHOLD_MS;

    private final TransactionPlayer _player = new TransactionPlayer(new JournalTransactionPlayerSupport());

    private final TransactionPlayerListener _listener = new ProactiveRollbackListener();

    private final AtomicBoolean _writePagePruning = new AtomicBoolean(true);

    private final AtomicBoolean _rollbackPruning = new AtomicBoolean(true);

    /*
     * Tunable parameters that determine how vigorously the copyBack thread
     * performs I/O. Hopefully we can set good defaults and not expose these as
     * knobs.
     */
    private volatile long _copierInterval = DEFAULT_COPIER_INTERVAL_MS;

    private volatile int _copiesPerCycle = DEFAULT_COPIES_PER_CYCLE;

    private volatile long _copierTimestampLimit = Long.MAX_VALUE;

    private volatile long _earliestCommittedTimestamp = Long.MAX_VALUE;

    private volatile long _earliestAbortedTimestamp = Long.MAX_VALUE;

    private boolean _allowHandlesForTempVolumesAndTrees;

    private volatile int _urgentFileCountThreshold = DEFAULT_URGENT_FILE_COUNT_THRESHOLD;

    private volatile long _throttleSleepInterval;

    /**
     * <p>
 * Initialize the new journal. This method takes its information from the
 * supplied RecoveryManager if one is supplied and valid; otherwise it
 * starts a new journal at address 0.
     * </p>
     * <p>
     * If a RecoveryManager is supplied and has a valid keystone address, then
     * this method continues the existing journal. A new journal file will be
     * created with a generation number one larger than that of the keystone
     * file, and the new file is given the same journal create date as the
     * recovered journal. New journal files are also required to have the same
     * maximumSize and path name (not including generation suffix) as the
     * existing journal, so in the event <code>rman</code> is non-null and
     * contains a valid keystone, the <code>path</code> and
     * <code>maximumSize</code> parameters are ignored.
     * </p>
     * <p>
     * Otherwise, this method creates a new journal starting at journal address
     * 0 with the specified path and maximum file size. Journal file names are
     * created by appending a period followed by a generation number suffix to
     * the supplied path name. For example if the supplied path is
     * "/xxx/yyy/zzz" then journal file names will be
     * "/xxx/yyy/zzz.000000000000", "/xxx/yyy/zzz.000000000001", and so on. (The
     * suffix contains twelve digits.)
     * </p>
     *
 * @param rman
 *            RecoveryManager supplying the state of an existing journal to
 *            be continued, or <code>null</code> to start a new journal
 * @param path
 *            Journal file path name, not including the generation suffix
 * @param maximumSize
 *            Maximum size in bytes of each journal file
     * @throws PersistitException
     */
    public synchronized void init(final RecoveryManager rman, final String path, final long maximumSize)
            throws PersistitException {
        _writeBuffer = ByteBuffer.allocate(_writeBufferSize);
        if (rman != null && rman.getKeystoneAddress() != -1) {
            _journalFilePath = rman.getJournalFilePath();
            _blockSize = rman.getBlockSize();
            _currentAddress = rman.getKeystoneAddress() + _blockSize;
            _baseAddress = rman.getBaseAddress();
            _journalCreatedTime = rman.getJournalCreatedTime();
            _lastValidCheckpoint = rman.getLastValidCheckpoint();
            rman.collectRecoveredPages(_pageMap, _branchMap);
            rman.collectRecoveredVolumeMaps(_handleToVolumeMap, _volumeToHandleMap);
            rman.collectRecoveredTreeMaps(_handleToTreeMap, _treeToHandleMap);
            rman.collectRecoveredTransactionMap(_liveTransactionMap);
            /*
             * Set _handleCounter so that newly created handles do not conflict
             * with existing resources.
             */
            for (final Integer handle : _handleToTreeMap.keySet()) {
                _handleCounter = Math.max(_handleCounter, handle + 1);
            }
            for (final Integer handle : _handleToVolumeMap.keySet()) {
                _handleCounter = Math.max(_handleCounter, handle + 1);
            }
            /*
             * Populate page list in journal address order.
             */
            for (final PageNode root : _pageMap.values()) {
                for (PageNode pn = root; pn != null; pn = pn.getPrevious()) {
                    _pageList.add(pn);
                }
            }
            Collections.sort(_pageList, PageNode.READ_COMPARATOR);

        } else {
            _journalFilePath = journalPath(path).getAbsoluteFile().toString();
            _blockSize = maximumSize;
            _currentAddress = 0;
            _journalCreatedTime = System.currentTimeMillis();
        }
        _closed.set(false);
    }

    public void startJournal() throws PersistitException {
        synchronized (this) {
            prepareWriteBuffer(JH.OVERHEAD);
        }
        _flusher = new JournalFlusher();
        _copier = new JournalCopier();

        _copier.start();
        _flusher.start();
    }

    /**
     * Copy dynamic variables into a {@link Management.JournalInfo} structure.
     *
     * @param info
     */
    public synchronized void populateJournalInfo(final Management.JournalInfo info) {
        info.closed = _closed.get();
        if (_blockSize == 0) {
            return;
        }
        info.copiedPageCount = _copiedPageCount;
        info.droppedPageCount = _droppedPageCount;
        info.copying = _copying.get();
        info.currentGeneration = _currentAddress;
        info.currentJournalAddress = _writeBuffer == null ? 0 : _writeBufferAddress + _writeBuffer.position();
        info.currentJournalFile = addressToFile(_currentAddress).getPath();
        info.flushing = _flushing.get();
        info.journaledPageCount = _writePageCount;
        info.readPageCount = _readPageCount;
        if (_lastValidCheckpointJournalAddress != 0) {
            info.lastValidCheckpointSystemTime = _lastValidCheckpoint.getSystemTimeMillis();
            info.lastValidCheckpointTimestamp = _lastValidCheckpoint.getTimestamp();
            info.lastValidCheckpointJournalFile = addressToFile(_lastValidCheckpointJournalAddress).getPath();
            info.lastValidCheckpointJournalAddress = _lastValidCheckpointJournalAddress;
        } else {
            info.lastValidCheckpointSystemTime = 0;
            info.lastValidCheckpointTimestamp = 0;
            info.lastValidCheckpointJournalFile = null;
            info.lastValidCheckpointJournalAddress = 0;
        }
        info.blockSize = _blockSize;
        info.pageMapSize = _pageMap.size();
        info.baseAddress = _baseAddress;
        info.appendOnly = _appendOnly.get();
        info.fastCopying = _copyFast.get();
    }

    @Override
    public synchronized int getLiveTransactionMapSize() {
        return _liveTransactionMap.size();
    }

    @Override
    public synchronized int getPageMapSize() {
        return _pageMap.size();
    }

    @Override
    public synchronized int getPageListSize() {
        return _pageList.size();
    }

    @Override
    public synchronized long getBaseAddress() {
        return _baseAddress;
    }

    @Override
    public synchronized long getCurrentAddress() {
        return _currentAddress;
    }

    @Override
    public long getBlockSize() {
        return _blockSize;
    }

    @Override
    public boolean isAppendOnly() {
        return _appendOnly.get();
    }

    @Override
    public boolean isIgnoreMissingVolumes() {
        return _ignoreMissingVolume.get();
    }

    @Override
    public boolean isCopyingFast() {
        return _copyFast.get();
    }

    @Override
    public void setAppendOnly(final boolean appendOnly) {
        _appendOnly.set(appendOnly);
    }

    @Override
    public void setIgnoreMissingVolumes(final boolean ignore) {
        _ignoreMissingVolume.set(ignore);
    }

    @Override
    public void setCopyingFast(final boolean fast) {
        _copyFast.set(fast);
    }

    @Override
    public long getFlushInterval() {
        return _flusher.getPollInterval();
    }

    @Override
    public void setFlushInterval(final long flushInterval) {
        _flusher.setPollInterval(flushInterval);
    }

    @Override
    public long getCopierInterval() {
        return _copier.getPollInterval();
    }

    @Override
    public void setCopierInterval(final long copierInterval) {
        _copier.setPollInterval(copierInterval);
    }

    @Override
    public void setRollbackPruningEnabled(final boolean rollbackPruning) {
        _rollbackPruning.set(rollbackPruning);
    }

    @Override
    public void setWritePagePruningEnabled(final boolean writePruning) {
        _writePagePruning.set(writePruning);
    }

    public JournalManager(final Persistit persistit) {
        _persistit = persistit;
    }

    @Override
    public boolean isClosed() {
        return _closed.get();
    }

    @Override
    public boolean isCopying() {
        return _copying.get();
    }

    @Override
    public boolean isRollbackPruningEnabled() {
        return _rollbackPruning.get();
    }

    @Override
    public boolean isWritePagePruningEnabled() {
        return _writePagePruning.get();
    }

    @Override
    public String getJournalFilePath() {
        return _journalFilePath;
    }

    @Override
    public long getJournaledPageCount() {
        return _writePageCount;
    }

    @Override
    public long getReadPageCount() {
        return _readPageCount;
    }

    @Override
    public long getCopiedPageCount() {
        return _copiedPageCount;
    }

    @Override
    public long getDroppedPageCount() {
        return _droppedPageCount;
    }

    public long getEarliestCommittedTransactionTimestamp() {
        return _earliestCommittedTimestamp;
    }

    public long getEarliestAbortedTransactionTimestamp() {
        return _earliestAbortedTimestamp;
    }

    @Override
    public long getJournalCreatedTime() {
        return _journalCreatedTime;
    }

    public Checkpoint getLastValidCheckpoint() {
        return _lastValidCheckpoint;
    }

    @Override
    public long getLastValidCheckpointTimestamp() {
        return _lastValidCheckpoint.getTimestamp();
    }

    @Override
    public String getLastCopierException() {
        return Util.toString(_copier.getLastException());
    }

    @Override
    public String getLastFlusherException() {
        return Util.toString(_flusher.getLastException());
    }

    @Override
    public long getLastValidCheckpointTimeMillis() {
        return _lastValidCheckpoint.getSystemTimeMillis();
    }

    @Override
    public long getSlowIoAlertThreshold() {
        return _slowIoAlertThreshold;
    }

    @Override
    public long getTotalCompletedCommits() {
        return _totalCommits.get();
    }

    @Override
    public long getCommitCompletionWaitTime() {
        return _totalCommitWaitTime.get() / NS_PER_MS;
    }

    @Override
    public long getCurrentTimestamp() {
        return _persistit.getCurrentTimestamp();
    }

    @Override
    public void setSlowIoAlertThreshold(final long slowIoAlertThreshold) {
        Util.rangeCheck(slowIoAlertThreshold, MINIMUM_SLOW_ALERT_THRESHOLD_MS, MAXIMUM_SLOW_ALERT_THRESHOLD_MS);
        _slowIoAlertThreshold = slowIoAlertThreshold;
    }

    @Override
    public int getUrgentFileCountThreshold() {
        return _urgentFileCountThreshold;
    }

    @Override
    public void setUrgentFileCountThreshold(final int threshold) {
        Util.rangeCheck(threshold, MINIMUM_URGENT_FILE_COUNT_THRESHOLD, MAXIMUM_URGENT_FILE_COUNT_THRESHOLD);
        _urgentFileCountThreshold = threshold;
    }

    /**
     * Compute an "urgency" factor that determines how vigorously the
     * JOURNAL_COPIER thread should perform I/O. This number is computed on a
     * scale of 0 to 10; larger values are intended to make the thread work
     * harder.
     * A value of 10 suggests the copier should run flat-out.
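     * For example (illustrative values): with an urgent file count threshold
     * of 15 and 12 live journal files, remainingFiles is 3, so the urgency is
     * max(0, min(10 - 3, 10)) = 7.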
     *
     * @return the JOURNAL_COPIER urgency on a scale of 0 to 10
     */
    @Override
    public int urgency() {
        if (_copyFast.get()) {
            return URGENT;
        }
        final int remainingFiles = _urgentFileCountThreshold - getJournalFileCount();
        return Math.max(0, Math.min(URGENT - remainingFiles, URGENT));
    }

    /**
     * Introduce delay into an application thread when the JOURNAL_COPIER
     * thread is behind. The amount of delay depends on the value returned by
     * {@link #urgency()}. When that value is {@value #URGENT} then the delay is
     * {@value #URGENT_COMMIT_DELAY_MILLIS} milliseconds.
     *
     * @throws PersistitInterruptedException
     */
    public void throttle() throws PersistitInterruptedException {
        final long interval = _throttleSleepInterval;
        if (interval > 0) {
            Util.sleep(interval);
        }
    }

    int handleForVolume(final Volume volume) throws PersistitException {
        if (volume.getHandle() != 0) {
            return volume.getHandle();
        }
        if (!_allowHandlesForTempVolumesAndTrees && volume.isTemporary()) {
            throw new IllegalStateException("Creating handle for temporary volume " + volume);
        }
        if (volume.getHandle() != 0) {
            return volume.getHandle();
        }
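        /*
         * Double-checked locking: the unsynchronized tests above handle the
         * common case cheaply; the handle is re-tested below under
         * synchronization so that only one thread assigns it.
         */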
        synchronized (this) {
            if (volume.getHandle() != 0) {
                return volume.getHandle();
            }
            Integer handle = _volumeToHandleMap.get(volume);
            if (handle == null) {
                handle = Integer.valueOf(++_handleCounter);
                Debug.$assert0.t(!_handleToVolumeMap.containsKey(handle));
                writeVolumeHandleToJournal(volume, handle.intValue());
                _volumeToHandleMap.put(volume, handle);
                _handleToVolumeMap.put(handle, volume);
            }
            return volume.setHandle(handle.intValue());
        }
    }

    synchronized int handleForTree(final TreeDescriptor td, final boolean create) throws PersistitException {
        if (td.getVolumeHandle() == -1) {
            // Tree in transient volume -- don't journal updates to it
            return -1;
        }
        Integer handle = _treeToHandleMap.get(td);
        if (handle == null) {
            if (!create) {
                return -1;
            }
            handle = Integer.valueOf(++_handleCounter);
            Debug.$assert0.t(!_handleToTreeMap.containsKey(handle));
            if (td.getVolumeHandle() != Volume.LOCK_VOLUME_HANDLE) {
                writeTreeHandleToJournal(td, handle.intValue());
            }
            _treeToHandleMap.put(td, handle);
            _handleToTreeMap.put(handle, td);
        }
        return handle.intValue();
    }

    int handleForTree(final Tree tree) throws PersistitException {
        if (!_allowHandlesForTempVolumesAndTrees && tree.getVolume().isTemporary() && !tree.getVolume().isLockVolume()) {
            throw new IllegalStateException("Creating handle for temporary tree " + tree);
        }
        if (tree.getHandle() != 0) {
            return tree.getHandle();
        }
        synchronized (this) {
            if (tree.getHandle() != 0) {
                return tree.getHandle();
            }
            final TreeDescriptor td = new TreeDescriptor(handleForVolume(tree.getVolume()), tree.getName());
            return tree.setHandle(handleForTree(td, true));
        }
    }

    Tree treeForHandle(final int handle) throws PersistitException {
        final TreeDescriptor td = lookupTreeHandle(handle);
        if (td == null) {
            return null;
        }
        final Volume volume = volumeForHandle(td.getVolumeHandle());
        if (volume == null) {
            return null;
        }
        return volume.getStructure().getTreeInternal(td.getTreeName());
    }

    Volume volumeForHandle(final int handle) throws PersistitException {
        final Volume volume = lookupVolumeHandle(handle);
        if (volume == null) {
            if (handle == Volume.LOCK_VOLUME_HANDLE) {
                return _persistit.getLockVolume();
            } else {
                return null;
            }
        }
        if (!volume.isOpened()) {
            volume.open(_persistit);
        }
        return volume;
    }

    synchronized Volume getVolumeByName(final String volumeName) {
        for (final Volume v : _handleToVolumeMap.values()) {
            if (volumeName.equals(v.getName())) {
                return v;
            }
        }
        return null;
    }

    @Override
    public synchronized Volume lookupVolumeHandle(final int handle) {
        return _handleToVolumeMap.get(Integer.valueOf(handle));
    }

    public synchronized TreeDescriptor lookupTreeHandle(final int handle) {
        return _handleToTreeMap.get(Integer.valueOf(handle));
    }

    private void readFully(final ByteBuffer bb, final long address) throws PersistitIOException,
            CorruptJournalException {
        //
        // If necessary read the bytes out of the _writeBuffer
        // before they have been written out to the file. This code
        // requires the _writeBuffer to be a HeapByteBuffer.
        //
        final int position = bb.position();
        final int length = bb.remaining();
        synchronized (this) {
            if (address >= _writeBufferAddress && address + length <= _currentAddress) {
                assert _writeBufferAddress + _writeBuffer.position() == _currentAddress : String.format(
                        "writeBufferAddress=%,d position=%,d currentAddress=%,d", _writeBufferAddress,
                        _writeBuffer.position(), _currentAddress);
                final int wbPosition = _writeBuffer.position();
                final int wbLimit = _writeBuffer.limit();
                _writeBuffer.position((int) (address - _writeBufferAddress));
                _writeBuffer.limit((int) (address - _writeBufferAddress) + length);
                bb.put(_writeBuffer);
                _writeBuffer.limit(wbLimit);
                _writeBuffer.position(wbPosition);
                bb.position(position);
                return;
            }
        }

        final FileChannel fc = getFileChannel(address);

        long fileAddr = addressToOffset(address);
        while (bb.remaining() > 0) {
            int count;
            try {
                count = fc.read(bb, fileAddr);
            } catch (final IOException ioe) {
                throw new PersistitIOException(ioe);
            }
            if (count < 0) {
                final File file = addressToFile(address);
                throw new CorruptJournalException(String.format("End of file at %s:%d(%,d)", file, fileAddr, address));
            }
            fileAddr += count;
        }
        bb.limit(bb.position());
        bb.position(position);
    }

    boolean readPageFromJournal(final Buffer buffer) throws PersistitIOException {
        final int bufferSize = buffer.getBufferSize();
        final long pageAddress = buffer.getPageAddress();
        final ByteBuffer bb = buffer.getByteBuffer();

        final Volume volume = buffer.getVolume();
        final PageNode pn = lookupUpPageNode(pageAddress, volume);
        if (pn == null) {
            return false;
        }
        bb.position(0);
        final long recordPageAddress = readPageBufferFromJournal(pn, bb);
        _persistit.getIOMeter().chargeReadPageFromJournal(volume, pageAddress, bufferSize, pn.getJournalAddress(),
                buffer.getIndex());

        if (pageAddress != recordPageAddress) {
            throw new CorruptJournalException("Record at " + pn + " is not volume/page " + buffer.toString());
        }

        if (bb.limit() != bufferSize) {
            throw new CorruptJournalException("Record at " + pn + " is wrong size: expected/actual=" + bufferSize + "/"
                    + bb.limit());
        }
        _readPageCount++;
        buffer.getVolume().getStatistics().bumpReadCounter();
        return true;
    }

    PageNode lookupUpPageNode(final long pageAddress, final Volume volume) {
        PageNode pnLookup = null;
        synchronized (this) {
            final Integer volumeHandle = _volumeToHandleMap.get(volume);
            if (volumeHandle != null) {
                pnLookup = _pageMap.get(new PageNode(volumeHandle, pageAddress, -1, -1));
            }
        }

        if (pnLookup == null) {
            return null;
        }

        final PageNode pn = new PageNode(pnLookup.getVolumeHandle(), pnLookup.getPageAddress(),
                pnLookup.getJournalAddress(), pnLookup.getTimestamp());
        sequence(PAGE_MAP_READ_INVALIDATE_A);

        /*
         * If the page is still valid, use the values saved in pn so we don't
         * lose them mid-processing. We can use it because it was in the map
         * when we first looked, which means it is still in the journal. The
         * journal won't go away because the claim on the buffer prevents new
         * checkpoints, and that keeps the copier from deleting it.
         */
        if (pnLookup.isInvalid()) {
            return null;
        }
        return pn;
    }

    private long readPageBufferFromJournal(final PageNode pn, final ByteBuffer bb) throws PersistitIOException,
            CorruptJournalException {
        final int at = bb.position();
        bb.limit(at + PA.OVERHEAD);
        readFully(bb, pn.getJournalAddress());
        if (bb.remaining() < PA.OVERHEAD) {
            throw new CorruptJournalException("Record at " + pn.toStringJournalAddress(this) + " is incomplete");
        }
        final int type = JournalRecord.getType(bb);
        final int payloadSize = JournalRecord.getLength(bb) - PA.OVERHEAD;
        final int leftSize = PA.getLeftSize(bb);
        final int bufferSize = PA.getBufferSize(bb);
        final long pageAddress = PA.getPageAddress(bb);

        if (type != PA.TYPE) {
            throw new CorruptJournalException("Record at " + pn.toStringJournalAddress(this) + " is not a PAGE record");
        }

        if (leftSize < 0 || payloadSize < leftSize || payloadSize > bufferSize) {
            throw new CorruptJournalException("Record at " + pn.toStringJournalAddress(this)
                    + " invalid sizes: recordSize= " + payloadSize + " leftSize=" + leftSize + " bufferSize="
                    + bufferSize);
        }

        if (pageAddress != pn.getPageAddress() && pn.getPageAddress() != -1) {
            throw new CorruptJournalException("Record at " + pn.toStringJournalAddress(this)
                    + " mismatched page address: expected/actual=" + pn.getPageAddress() + "/" + pageAddress);
        }

        bb.limit(at + payloadSize).position(at);
        readFully(bb, pn.getJournalAddress() + PA.OVERHEAD);

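        /*
         * A PA record stores only the live portions of the page: the first
         * leftSize bytes and the last rightSize bytes. Reconstruct the full
         * page image by moving the right segment to the tail of the buffer
         * and zero-filling the gap between the two segments.
         */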
        final int rightSize = payloadSize - leftSize;
        System.arraycopy(bb.array(), leftSize + at, bb.array(), bufferSize - rightSize + at, rightSize);
        Arrays.fill(bb.array(), leftSize + at, bufferSize - rightSize + at, (byte) 0);
        bb.limit(bb.capacity()).position(at).limit(at + bufferSize);
        return pageAddress;
    }

    /**
     * Method used by diagnostic tools to attempt to read a page from the
     * journal.
     *
     * @param address
     *            journal address
     * @return a Buffer containing the page read from the specified location,
     *         or <code>null</code> if the address does not reference a valid
     *         PA (page) record
     * @throws PersistitException
     */
    Buffer readPageBuffer(final long address) throws PersistitException {
        ByteBuffer bb = ByteBuffer.allocate(PA.OVERHEAD);
        readFully(bb, address);
        if (bb.remaining() < PA.OVERHEAD) {
            return null;
        }
        final int type = JournalRecord.getType(bb);
        final int payloadSize = JournalRecord.getLength(bb) - PA.OVERHEAD;
        final int leftSize = PA.getLeftSize(bb);
        final int bufferSize = PA.getBufferSize(bb);
        final long pageAddress = PA.getPageAddress(bb);
        final int volumeHandle = PA.getVolumeHandle(bb);

        if (type != PA.TYPE || leftSize < 0 || payloadSize < leftSize || payloadSize > bufferSize) {
            return null;
        }

        final BufferPool pool = _persistit.getBufferPool(bufferSize);
        final Buffer buffer = new Buffer(bufferSize, -1, pool, _persistit);
        buffer.setPageAddressAndVolume(pageAddress, volumeForHandle(volumeHandle));
        bb = buffer.getByteBuffer();
        bb.limit(payloadSize).position(0);
        readFully(bb, address + PA.OVERHEAD);

        if (leftSize > 0) {
            final int rightSize = payloadSize - leftSize;
            System.arraycopy(bb.array(), leftSize, bb.array(), bufferSize - rightSize, rightSize);
            Arrays.fill(bb.array(), leftSize, bufferSize - rightSize, (byte) 0);
        }
        bb.limit(bufferSize).position(0);
        final boolean acquired = buffer.claim(true, 0);
        assert acquired : "buffer in use";
        buffer.load();
        buffer.release();
        return buffer;
    }

    private void advance(final int recordSize) {
        Debug.$assert1.t(recordSize > 0 && recordSize + _writeBuffer.position() <= _writeBuffer.capacity());
        _currentAddress += recordSize;
        _writeBuffer.position(_writeBuffer.position() + recordSize);
    }

    /**
     * Write a JH (journal header) record. This record must be written to the
     * beginning of the journal file. Note that this method does not call
     * {@link #prepareWriteBuffer(int)} - the write buffer needs to be ready to
     * receive the JH record.
     *
     * @throws PersistitException
     */
    synchronized void writeJournalHeader() throws PersistitException {
        JH.putType(_writeBuffer);
        JournalRecord.putTimestamp(_writeBuffer, epochalTimestamp());
        JH.putVersion(_writeBuffer, VERSION);
        JH.putBlockSize(_writeBuffer, _blockSize);
        JH.putBaseJournalAddress(_writeBuffer, _baseAddress);
        JH.putCurrentJournalAddress(_writeBuffer, _currentAddress);
        JH.putJournalCreatedTime(_writeBuffer, _journalCreatedTime);
        JH.putFileCreatedTime(_writeBuffer, System.currentTimeMillis());
        JH.putPath(_writeBuffer, addressToFile(_currentAddress).getPath());
        final int recordSize = JournalRecord.getLength(_writeBuffer);
        _persistit.getIOMeter().chargeWriteOtherToJournal(recordSize, _currentAddress);
        advance(recordSize);
    }

    /**
     * Write the JE (journal end) record. This record must be written to the end
     * of each complete journal file. Note that this method does not call
     * {@link #prepareWriteBuffer(int)} - the write buffer needs to be ready to
     * receive the JE record.
     *
     * @throws PersistitException
     */
    synchronized void writeJournalEnd() throws PersistitException {
        if (_writeBufferAddress != Long.MAX_VALUE) {
            //
            // prepareWriteBuffer contract guarantees there's always room in
            // the write buffer for this record.
            //
            JE.putType(_writeBuffer);
            JournalRecord.putTimestamp(_writeBuffer, epochalTimestamp());
            JournalRecord.putLength(_writeBuffer, JE.OVERHEAD);
            JE.putCurrentJournalAddress(_writeBuffer, _currentAddress);
            JE.putBaseAddress(_writeBuffer, _baseAddress);
            JE.putJournalCreatedTime(_writeBuffer, _journalCreatedTime);
            _persistit.getIOMeter().chargeWriteOtherToJournal(JE.OVERHEAD, _currentAddress);
            advance(JE.OVERHEAD);
        }
    }

    synchronized void writePageMap() throws PersistitException {
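        /*
         * First pass: count every PageNode on every chain in the page and
         * branch maps so that the total record length can be computed up
         * front.
         */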
        int count = 0;
        for (final PageNode lastPageNode : _pageMap.values()) {
            PageNode pageNode = lastPageNode;
            while (pageNode != null) {
                count++;
                pageNode = pageNode.getPrevious();
            }
        }
        for (final PageNode lastPageNode : _branchMap.values()) {
            PageNode pageNode = lastPageNode;
            while (pageNode != null) {
                count++;
                pageNode = pageNode.getPrevious();
            }
        }

        final int recordSize = PM.OVERHEAD + PM.ENTRY_SIZE * count;
        prepareWriteBuffer(recordSize);
        PM.putType(_writeBuffer);
        JournalRecord.putLength(_writeBuffer, recordSize);
        JournalRecord.putTimestamp(_writeBuffer, epochalTimestamp());
        advance(PM.OVERHEAD);
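        /*
         * The PM record may be larger than the write buffer: entries are
         * staged and the buffer is flushed whenever it fills. By the
         * prepareWriteBuffer contract, the remainder of the current journal
         * file is known to be large enough to hold the entire record.
         */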
        int offset = 0;
        for (final PageNode lastPageNode : _pageMap.values()) {
            PageNode pageNode = lastPageNode;
            while (pageNode != null) {
                PM.putEntry(_writeBuffer, offset / PM.ENTRY_SIZE, pageNode.getTimestamp(),
                        pageNode.getJournalAddress(), pageNode.getVolumeHandle(), pageNode.getPageAddress());

                offset += PM.ENTRY_SIZE;
                count--;
                if (count == 0 || offset + PM.ENTRY_SIZE >= _writeBuffer.remaining()) {
                    advance(offset);
                    offset = 0;
                }
                if (PM.ENTRY_SIZE >= _writeBuffer.remaining()) {
                    flush();
                }
                pageNode = pageNode.getPrevious();
            }
        }
        for (final PageNode lastPageNode : _branchMap.values()) {
            PageNode pageNode = lastPageNode;
            while (pageNode != null) {
                PM.putEntry(_writeBuffer, offset / PM.ENTRY_SIZE, pageNode.getTimestamp(),
                        pageNode.getJournalAddress(), pageNode.getVolumeHandle(), pageNode.getPageAddress());

                offset += PM.ENTRY_SIZE;
                count--;
                if (count == 0 || offset + PM.ENTRY_SIZE >= _writeBuffer.remaining()) {
                    advance(offset);
                    offset = 0;
                }
                if (PM.ENTRY_SIZE >= _writeBuffer.remaining()) {
                    flush();
                }
                pageNode = pageNode.getPrevious();
            }
        }
        Debug.$assert0.t(count == 0);
        _persistit.getIOMeter().chargeWriteOtherToJournal(recordSize, _currentAddress - recordSize);
    }

    synchronized void writeTransactionMap() throws PersistitException {
        int count = _liveTransactionMap.size();
        final int recordSize = TM.OVERHEAD + TM.ENTRY_SIZE * count;
        prepareWriteBuffer(recordSize);
        TM.putType(_writeBuffer);
        JournalRecord.putLength(_writeBuffer, recordSize);
        JournalRecord.putTimestamp(_writeBuffer, epochalTimestamp());
        advance(TM.OVERHEAD);
        int offset = 0;
        for (final TransactionMapItem ts : _liveTransactionMap.values()) {
            TM.putEntry(_writeBuffer, offset / TM.ENTRY_SIZE, ts.getStartTimestamp(), ts.getCommitTimestamp(),
                    ts.getStartAddress(), ts.getLastRecordAddress());
            offset += TM.ENTRY_SIZE;
            count--;
            if (count == 0 || offset + TM.ENTRY_SIZE >= _writeBuffer.remaining()) {
                advance(offset);
                offset = 0;
            }
            if (TM.ENTRY_SIZE >= _writeBuffer.remaining()) {
                flush();
            }
        }

        Debug.$assert0.t(count == 0);
        _persistit.getIOMeter().chargeWriteOtherToJournal(recordSize, _currentAddress - recordSize);
    }

    synchronized void writeCheckpointToJournal(final Checkpoint checkpoint) throws PersistitException {
        //
        // Make sure all prior journal entries are committed to disk before
        // writing this record.
        //
        force();
        //
        // Prepare room for CP.OVERHEAD bytes in the journal. If doing so
        // started a new journal file then there's no need to write another
        // CP record.
        //
        if (!prepareWriteBuffer(CP.OVERHEAD)) {
            final long address = _currentAddress;
            JournalRecord.putLength(_writeBuffer, CP.OVERHEAD);
            CP.putType(_writeBuffer);
            JournalRecord.putTimestamp(_writeBuffer, checkpoint.getTimestamp());
            CP.putSystemTimeMillis(_writeBuffer, checkpoint.getSystemTimeMillis());
            CP.putBaseAddress(_writeBuffer, _baseAddress);
            _persistit.getIOMeter().chargeWriteOtherToJournal(CP.OVERHEAD, _currentAddress);
            advance(CP.OVERHEAD);
            force();

            checkpointWritten(checkpoint);

            _persistit.getLogBase().checkpointWritten.log(checkpoint, address);
            _persistit.getIOMeter().chargeWriteOtherToJournal(CP.OVERHEAD, address);
        }

        _lastValidCheckpoint = checkpoint;
        _lastValidCheckpointJournalAddress = _currentAddress - CP.OVERHEAD;
        _lastValidCheckpointBaseAddress = _baseAddress;
    }

    void writePageToJournal(final Buffer buffer) throws PersistitException {

        final Volume volume;
        final int recordSize;

        synchronized (this) {

            if (!buffer.isTemporary() && buffer.getTimestamp() < _lastValidCheckpoint.getTimestamp()) {
                _persistit.getLogBase().lateWrite.log(_lastValidCheckpoint, buffer);
            }

            volume = buffer.getVolume();
            final int handle = handleForVolume(volume);
            int leftSize;
            int rightSize;
            if (buffer.isDataPage() || buffer.isIndexPage() || buffer.isGarbagePage()) {
                leftSize = buffer.getKeyBlockEnd();
                rightSize = buffer.getBufferSize() - buffer.getAlloc();
            } else {
                leftSize = 0;
                rightSize = buffer.getBufferSize();
            }

            recordSize = PA.OVERHEAD + leftSize + rightSize;

            prepareWriteBuffer(recordSize);
            Debug.$assert1.t(_writeBuffer.remaining() >= recordSize);

            final long address = _currentAddress;
            final int position = _writeBuffer.position();

            JournalRecord.putLength(_writeBuffer, recordSize);
            PA.putVolumeHandle(_writeBuffer, handle);
            PA.putType(_writeBuffer);
            JournalRecord.putTimestamp(_writeBuffer, buffer.isTemporary() ? -1 : buffer.getTimestamp());
            PA.putLeftSize(_writeBuffer, leftSize);
            PA.putBufferSize(_writeBuffer, buffer.getBufferSize());
            PA.putPageAddress(_writeBuffer, buffer.getPageAddress());
            advance(PA.OVERHEAD);

            if (leftSize > 0) {
                _writeBuffer.put(buffer.getBytes(), 0, leftSize);
                _writeBuffer.put(buffer.getBytes(), buffer.getBufferSize() - rightSize, rightSize);
            } else {
                _writeBuffer.put(buffer.getBytes());
            }
            Debug.$assert0.t(_writeBuffer.position() - position == recordSize);
            _currentAddress += recordSize - PA.OVERHEAD;

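            /*
             * Record the new version of the page in the page map. Each
             * PageNode chains (via setPrevious) to the prior version of the
             * same page. If both the superseded version and the new one are
             * newer than the proposed checkpoint, the old version can never
             * be needed for recovery, so it is invalidated and dropped from
             * the chain.
             */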
            final PageNode pageNode = new PageNode(handle, buffer.getPageAddress(), address, buffer.getTimestamp());
            _pageList.add(pageNode);
            PageNode oldPageNode = _pageMap.put(pageNode, pageNode);

            if (oldPageNode != null) {
                assert oldPageNode.getTimestamp() <= pageNode.getTimestamp();
            }
            final long checkpointTimestamp = _persistit.getTimestampAllocator().getProposedCheckpointTimestamp();
            if (oldPageNode != null && oldPageNode.getTimestamp() > checkpointTimestamp
                    && buffer.getTimestamp() > checkpointTimestamp) {
                oldPageNode.invalidate();
                oldPageNode = oldPageNode.getPrevious();
            }
            pageNode.setPrevious(oldPageNode);
            _writePageCount++;
        }
        _persistit.getIOMeter().chargeWritePageToJournal(volume, buffer.getPageAddress(), buffer.getBufferSize(),
                _currentAddress - recordSize, urgency(), buffer.getIndex());
    }

    /**
     * Package-private; for use by unit tests only.
     *
     * @param volume
     * @param handle
     * @throws PersistitException
     */
    synchronized void writeVolumeHandleToJournal(final Volume volume, final int handle) throws PersistitException {
        prepareWriteBuffer(IV.MAX_LENGTH);
        IV.putType(_writeBuffer);
        IV.putHandle(_writeBuffer, handle);
        IV.putVolumeId(_writeBuffer, volume.getId());
        JournalRecord.putTimestamp(_writeBuffer, epochalTimestamp());
        if (_persistit.getConfiguration().isUseOldVSpec()) {
            IV.putVolumeSpecification(_writeBuffer, volume.getName());
        } else {
            IV.putVolumeSpecification(_writeBuffer, volume.getSpecification().toString());
        }
        final int recordSize = JournalRecord.getLength(_writeBuffer);
        _persistit.getIOMeter().chargeWriteOtherToJournal(recordSize, _currentAddress);
        advance(recordSize);
    }

    synchronized void writeTreeHandleToJournal(final TreeDescriptor td, final int handle) throws PersistitException {
        prepareWriteBuffer(IT.MAX_LENGTH);
        IT.putType(_writeBuffer);
        IT.putHandle(_writeBuffer, handle);
        IT.putVolumeHandle(_writeBuffer, td.getVolumeHandle());
        JournalRecord.putTimestamp(_writeBuffer, epochalTimestamp());
        IT.putTreeName(_writeBuffer, td.getTreeName());
        final int recordSize = JournalRecord.getLength(_writeBuffer);
        _persistit.getIOMeter().chargeWriteOtherToJournal(recordSize, _currentAddress);
        advance(recordSize);
    }

    /**
     * <p>
     * Write a transaction or partial transaction to the journal as a TX record
     * containing a variable number of variable-length update records. The
     * supplied <code>buffer</code> contains the update records.
     * </p>
     * <p>
     * TX records typically represent a complete transaction, but in the case of
     * transactions with a large number of updates, there may be multiple TX
     * records. In that case each TX record but the last one written specifies a
     * commit timestamp value of zero indicating that the transaction has not
     * committed yet, and each TX record but the first one written specifies the
     * journal address of the previous one. These pointers allow the recovery
     * process to efficiently find all the updates of a transaction that needs
     * to be rolled back.
     * </p>
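     * <p>
     * For example (illustrative addresses), a transaction written as three TX
     * records might appear as:
     *
     * <pre>
     * TX@1000:  commitTimestamp=0    backchainAddress=0
     * TX@5000:  commitTimestamp=0    backchainAddress=1000
     * TX@9000:  commitTimestamp=tc   backchainAddress=5000
     * </pre>
     * </p>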
     *
     * @param buffer
     *            The buffer containing the update records
     * @param startTimestamp
     *            Transaction start timestamp
     * @param commitTimestamp
     *            Transaction commit timestamp, or 0 if the transaction has not
     *            committed yet
     * @param backchainAddress
     *            Journal address of previous TX record written by this
     *            transaction, or 0 if there is no previous record
     *
     * @return the journal address at which this TX record was written
     * @throws PersistitException
     */
    synchronized long writeTransactionToJournal(final ByteBuffer buffer, final long startTimestamp,
            final long commitTimestamp, final long backchainAddress) throws PersistitException {
        final int recordSize = TX.OVERHEAD + buffer.position();
        prepareWriteBuffer(recordSize);
        final long address = _currentAddress;
        TX.putLength(_writeBuffer, recordSize);
        TX.putType(_writeBuffer);
        TX.putTimestamp(_writeBuffer, startTimestamp);
        TX.putCommitTimestamp(_writeBuffer, commitTimestamp);
        TX.putBackchainAddress(_writeBuffer, backchainAddress);
        _persistit.getIOMeter().chargeWriteTXtoJournal(recordSize, _currentAddress);
        advance(TX.OVERHEAD);
        try {
            buffer.flip();
            _writeBuffer.put(buffer);
        } finally {
            buffer.clear();
        }
        _currentAddress += recordSize - TX.OVERHEAD;
        if (commitTimestamp != ABORTED) {
            final Long key = Long.valueOf(startTimestamp);
            TransactionMapItem item = _liveTransactionMap.get(key);
            if (item == null) {
                if (backchainAddress != 0) {
                    throw new IllegalStateException("Missing back-chained transaction for start timestamp "
                            + startTimestamp);
                }
                item = new TransactionMapItem(startTimestamp, address);
                _liveTransactionMap.put(key, item);
            } else {
                if (backchainAddress == 0) {
                    throw new IllegalStateException("Duplicate transaction " + item);
                }
                if (item.isCommitted()) {
                    throw new IllegalStateException("Transaction already committed " + item);
                }
                item.setLastRecordAddress(address);
            }
            item.setCommitTimestamp(commitTimestamp);
        }
        return address;
    }

    static File journalPath(final String path) {
        final File file = new File(path);
        if (file.isDirectory()) {
            return new File(file, DEFAULT_JOURNAL_FILE_NAME);
        } else {
            return file;
        }
    }

    static long fileToGeneration(final File file) {
        final Matcher matcher = PATH_PATTERN.matcher(file.getName());
        if (matcher.matches()) {
            return Long.parseLong(matcher.group(2));
        } else {
            return -1;
        }
    }

    static String fileToPath(final File file) {
        final Matcher matcher = PATH_PATTERN.matcher(file.getPath());
        if (matcher.matches()) {
            return matcher.group(1);
        } else {
            return null;
        }
    }

    static File generationToFile(final String path, final long generation) {
        return new File(String.format(PATH_FORMAT, path, generation));
    }

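    /*
     * A journal address is a linear byte offset over the life of the journal:
     * address / _blockSize is the file generation number and address %
     * _blockSize is the offset within that file. For example (illustrative
     * block size), with _blockSize = 1,000,000,000 the address 2,500,000,000
     * falls in generation 2 at offset 500,000,000.
     */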
    File addressToFile(final long address) {
        return generationToFile(_journalFilePath, address / _blockSize);
    }

    long addressToOffset(final long address) {
        return address % _blockSize;
    }

    void setWriteBufferSize(final int size) {
        if (size < MINIMUM_BUFFER_SIZE || size > MAXIMUM_BUFFER_SIZE) {
            throw new IllegalArgumentException("Invalid write buffer size: " + size);
        }
        _writeBufferSize = size;
    }

    public void close() throws PersistitException {
        _closed.set(true);
        rollover();

        final JournalCopier copier = _copier;
        _copier = null;
        if (copier != null) {
            _persistit.waitForIOTaskStop(copier);
        }

        final JournalFlusher flusher = _flusher;
        _flusher = null;
        if (flusher != null) {
            _persistit.waitForIOTaskStop(flusher);
        }

        synchronized (this) {
            try {
                closeAllChannels();
            } catch (final IOException ioe) {
                throw new PersistitIOException(ioe);
            } finally {
                _handleToTreeMap.clear();
                _handleToVolumeMap.clear();
                _volumeToHandleMap.clear();
                _treeToHandleMap.clear();
                _pageMap.clear();
                _pageList.clear();
                _writeBuffer = null;
            }
        }
    }

    private void closeAllChannels() throws IOException {
        synchronized (this) {
            try {
                for (final FileChannel channel : _journalFileChannels.values()) {
                    if (channel != null) {
                        channel.close();
                    }
                }

            } finally {
                _journalFileChannels.clear();
            }
        }
    }

    /**
     * Abruptly stop (using {@link Thread#stop()}) the copier and flusher
     * threads. This method should be used only by tests.
     */
    void crash() throws IOException {
        IOTaskRunnable.crash(_flusher);
        IOTaskRunnable.crash(_copier);
        //
        // Even when simulating a crash do this to release
        // channels and therefore allow disk space to be returned to
        // the OS when the files are deleted.
        //
        closeAllChannels();
    }

    /**
     * Flushes the write buffer to the current journal file.
     *
     * @return the write buffer address after the flush, or
     *         {@link Long#MAX_VALUE} if there was nothing to flush
     * @throws PersistitException
     */
    synchronized long flush() throws PersistitException {
        _persistit.checkFatal();
        final long address = _writeBufferAddress;
        if (address != Long.MAX_VALUE && _writeBuffer != null) {

            assert _writeBufferAddress + _writeBuffer.position() == _currentAddress : String.format(
                    "writeBufferAddress=%,d position=%,d currentAddress=%,d", _writeBufferAddress,
                    _writeBuffer.position(), _currentAddress);

            try {
                if (_writeBuffer.position() > 0) {
                    final FileChannel channel = getFileChannel(address);
                    final long size = channel.size();
                    if (size < addressToOffset(address)) {
                        throw new CorruptJournalException(String.format(
                                "Journal file %s size %,d does not match current address %,d", addressToFile(address),
                                size, address));
                    }

                    _writeBuffer.flip();
                    boolean writeComplete = false;
                    final int written;
                    try {
                        /*
                         * Note: contract for FileChannel requires write to
                         * return normally only when all bytes have been
                         * written. (See java.nio.channels.WritableByteChannel
                         * #write(ByteBuffer), statement
                         * "Unless otherwise specified...")
                         */
                        channel.write(_writeBuffer, _writeBufferAddress % _blockSize);
                        /*
                         * Surprise: FileChannel#write does not throw an
                         * Exception if it successfully writes some bytes and
                         * then encounters a disk full condition. (Found this
                         * out empirically.)
                         */
                        writeComplete = _writeBuffer.remaining() == 0;
                    } finally {
                        written = _writeBuffer.position();
                        _writeBufferAddress += written;
                        if (writeComplete) {
                            if (_writeBuffer.capacity() != _writeBufferSize) {
                                _writeBuffer = ByteBuffer.allocate(_writeBufferSize);
                            } else {
                                _writeBuffer.clear();
                            }
                        } else {
                            /*
                             * If the buffer didn't get written, perhaps due to
                             * an interrupt or disk-full condition, then compact
                             * to remove any bytes from the buffer that actually
                             * did get written and reset other measurements.
                             */
                            _writeBuffer.compact();
                        }
                        final long remaining = _blockSize - (_writeBufferAddress % _blockSize);
                        if (remaining < (_writeBuffer.limit())) {
                            _writeBuffer.limit((int) remaining);
                        }
                    }

                    assert _writeBufferAddress + _writeBuffer.position() == _currentAddress : String.format(
                            "writeBufferAddress=%,d position=%,d currentAddress=%,d", _writeBufferAddress,
                            _writeBuffer.position(), _currentAddress);

                    _persistit.getIOMeter().chargeFlushJournal(written, address);
                    return _writeBufferAddress;
                }
            } catch (final IOException e) {
                throw new PersistitIOException("Writing to file " + addressToFile(address), e);
            }
        }
        return Long.MAX_VALUE;
    }

    /**
     * Force all data written to the journal file to disk.
     */
    @Override
    public void force() throws PersistitException {
        long address = Long.MAX_VALUE;
        try {
            address = flush();
            if (address != Long.MAX_VALUE) {
                final FileChannel channel = getFileChannel(address);
                channel.force(false);
            }
        } catch (final IOException e) {
            throw new PersistitIOException("Writing to file " + addressToFile(address), e);
        }
    }

    /**
     * Prepare the write buffer to receive a record of the specified size so
     * that client methods can write their records. This method modifies
     * _writeBuffer and _writeBufferAddress, and if a new journal file is
     * prepared (a "roll-over" event) it also modifies _currentAddress to
     * reflect the current address in the new file.
     *
     * @param size
     *            Size of record to be written
     * @return <code>true</code> if and only if a new journal file was started
     * @throws PersistitException
     */
    private boolean prepareWriteBuffer(final int size) throws PersistitException {
        _persistit.checkFatal();
        boolean newJournalFile = false;
        if (getCurrentJournalSize() == 0) {
            flush();
            _writeBufferAddress = _currentAddress;
            startJournalFile();
            newJournalFile = true;
        }

        assert _writeBufferAddress + _writeBuffer.position() == _currentAddress : String.format(
                "writeBufferAddress=%,d position=%,d currentAddress=%,d", _writeBufferAddress, _writeBuffer.position(),
                _currentAddress);
        //
        // If the current journal file has room for the record, then return.
        //
        if (_writeBuffer.remaining() > size + JE.OVERHEAD) {
            return newJournalFile;
        }
        //
        // Otherwise, flush the write buffer and try again
        flush();

        if (_writeBuffer.remaining() > size + JE.OVERHEAD) {
            return newJournalFile;
        }
        //
        // In the special case of a record which may be longer than the
        // capacity of the buffer (e.g., the PageMap), check whether there is
        // enough room in the file to hold the entire map. If so, the buffer
        // is considered prepared because the PM and TM writers know how to
        // fill the buffer multiple times.
        //
        if (_writeBuffer.remaining() == _writeBuffer.capacity()) {
            final long remaining = _blockSize - getCurrentJournalSize();
            if (remaining > size + JE.OVERHEAD) {
                return newJournalFile;
            }
        }
        //
        // Finally, if there's still not enough room, we're committed to
        // rolling over the journal.
        //
        rolloverWithNewFile();
        return true;
    }
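
    /*
     * Decision ladder of prepareWriteBuffer, sketched for reference
     * (illustrative pseudo-code, not executable):
     *
     *   if buffer has room for size + JE.OVERHEAD       -> done
     *   flush()
     *   if buffer has room for size + JE.OVERHEAD       -> done
     *   if buffer is empty and the file has room        -> done; the PM and
     *                                                      TM writers refill
     *                                                      the buffer
     *   rolloverWithNewFile()                           -> start a new file
     */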

    void rollover() throws PersistitException {
        rollover(false, false);
    }

    void rolloverWithNewFile() throws PersistitException {
        rollover(false, true);
    }

    void rolloverWithNewBaseAndFile() throws PersistitException {
        rollover(true, true);
    }

    private synchronized void rollover(final boolean setBaseAddress, final boolean startNewFile)
            throws PersistitException {
        if (_writeBufferAddress != Long.MAX_VALUE) {
            writeJournalEnd();
            flush();

            try {
                final long length = getCurrentJournalSize();
                final boolean matches = length == (_writeBuffer.position() + _writeBufferAddress) % _blockSize;
                final FileChannel channel = getFileChannel(_currentAddress);
                Debug.$assert1.t(matches);
                if (matches) {
                    channel.truncate(length);
                }
                channel.force(true);
            } catch (final IOException ioe) {
                throw new PersistitIOException(ioe);
            }
            _currentAddress = ((_currentAddress / _blockSize) + 1) * _blockSize;
            _writeBuffer.clear();
            _writeBufferAddress = _currentAddress;
            _isNewEpoch = false;

            if (setBaseAddress) {
                _baseAddress = _currentAddress;
            }
            if (startNewFile) {
                prepareWriteBuffer(JH.OVERHEAD);
            }
        }
    }
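
    /*
     * Worked example of the rollover address arithmetic above (illustrative
     * only, assuming a 1,000,000,000-byte block size):
     *
     *   long blockSize = 1000000000L;
     *   long currentAddress = 3456789012L;  // somewhere inside file 3
     *   long next = ((currentAddress / blockSize) + 1) * blockSize;
     *   // next == 4000000000L, the first address of file 4
     */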

    /**
     * Timestamp marking the Page Map, Transaction Map and other records in the
     * journal header. This timestamp is used to discriminate between pages in a
     * "branch" history and the live history. See comments in
     * {@link RecoveryManager#scanLoadPageMap(long, long, int)} for details.
     *
     * @return either the current timestamp or the timestamp of the last valid
     *         checkpoint, depending on whether this journal file starts a new
     *         epoch.
     */
    private long epochalTimestamp() {
        return _isNewEpoch ? getLastValidCheckpointTimestamp() : _persistit.getCurrentTimestamp();
    }

    private void startJournalFile() throws PersistitException {
        //
        // Write the beginning of a new journal file.
        //
        // The information written here is designed to accelerate recovery.
        // The recovery process can simply read the JournalHeader and
        // subsequent records from the last journal file to load the page
        // map and live transaction map. The journal file is valid for
        // recovery only if the CP (checkpoint) record is present in the
        // recovered file.
        //
        writeJournalHeader();
        //
        // Write IV (identify volume) records for each volume in the handle
        // map
        //
        for (final Map.Entry<Integer, Volume> entry : _handleToVolumeMap.entrySet()) {
            writeVolumeHandleToJournal(entry.getValue(), entry.getKey().intValue());
        }
        //
        // Write IT (identify tree) records for each tree in the handle
        // map
        //
        for (final Map.Entry<Integer, TreeDescriptor> entry : _handleToTreeMap.entrySet()) {
            if (entry.getValue().getVolumeHandle() != Volume.LOCK_VOLUME_HANDLE) {
                writeTreeHandleToJournal(entry.getValue(), entry.getKey().intValue());
            }
        }
        //
        // Write the PM (Page Map) record
        //
        writePageMap();
        //
        // Write the TM (Transaction Map) record
        //
        writeTransactionMap();
        //
        // Finally, write the current CP (checkpoint) record.
        //
        writeCheckpointToJournal(_lastValidCheckpoint);
    }
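
    /*
     * Sketch of the resulting journal file prologue (one IV record per known
     * volume, one IT record per known tree):
     *
     *   JH            journal header
     *   IV ... IV     identify-volume records
     *   IT ... IT     identify-tree records
     *   PM            page map
     *   TM            transaction map
     *   CP            checkpoint record; the file is valid for recovery
     *                 only once this record is present
     */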

    /**
     * Return the <code>FileChannel</code> for the journal file containing the
     * supplied <code>address</code>. If necessary, create a new
     * {@link MediatedFileChannel}.
     *
     * @param address
     *            the journal address of a record in the journal for which the
     *            corresponding channel will be returned
     * @return the <code>FileChannel</code> for that journal file
     * @throws PersistitIOException
     *             if the <code>MediatedFileChannel</code> cannot be created
     */
    synchronized FileChannel getFileChannel(final long address) throws PersistitIOException {
        if (address < _deleteBoundaryAddress || address > _currentAddress + _blockSize) {
            throw new IllegalArgumentException("Invalid journal address " + address + " outside of range ("
                    + _deleteBoundaryAddress + ":" + (_currentAddress + _blockSize) + ")");
        }
        final long generation = address / _blockSize;
        FileChannel channel = _journalFileChannels.get(generation);
        if (channel == null) {
            try {
                channel = new MediatedFileChannel(addressToFile(address), "rw");
                _journalFileChannels.put(generation, channel);
            } catch (final IOException ioe) {
                throw new PersistitIOException(ioe);
            }
        }
        return channel;
    }
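
    /*
     * Worked example (illustrative only, assuming a 1,000,000,000-byte block
     * size): each journal address maps to a generation, and each generation
     * is a separate file whose name carries a 12-digit suffix (see
     * PATH_PATTERN and addressToFile):
     *
     *   long blockSize = 1000000000L;
     *   long address = 2500000000L;
     *   long generation = address / blockSize;  // == 2
     *   // the corresponding file name ends with ".000000000002"
     */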

    /**
     * Set the copyFast flag and then wait until all checkpointed pages have
     * been copied to their respective volumes, allowing the journal files to be
     * deleted. Pages modified after the last valid checkpoint cannot be copied.
     * <p>
     * Does nothing if the <code>appendOnly</code> flag is set.
     *
     * @throws PersistitException
     */
    @Override
    public void copyBack() throws Exception {
        if (!_appendOnly.get()) {
            _copyFast.set(true);
            final int exceptionCount = _copier.getExceptionCount();
            while (_copyFast.get()) {
                _copier.kick();
                Util.sleep(Persistit.SHORT_DELAY);
                if (_copier.getExceptionCount() != exceptionCount) {
                    throw _copier.getLastException();
                }
            }
        }
    }

    /**
     * Remove transactions and PageNode entries when possible due to completion
     * of a new checkpoint.
     *
     * @param checkpoint
     */
    private void checkpointWritten(final Checkpoint checkpoint) {

        //
        // Will become the earliest timestamp of any record that must be
        // retained for recovery. For transactions containing LONG_RECORD
        // pages, those pages may be written to the journal with timestamps
        // earlier than the commitTimestamp of the transaction but they are
        // guaranteed to be written with timestamp values later than the
        // transaction's startTimestamp. Therefore we can't cull PageMap entries
        // later than this recoveryTimestamp because the pages they refer to may
        // be needed for recovery.
        //
        long recoveryTimestamp = checkpoint.getTimestamp();
        recoveryTimestamp = Math.min(Math.min(recoveryTimestamp, _earliestCommittedTimestamp),
                _earliestAbortedTimestamp);
        //
        // Remove all but the most recent PageNode version before the
        // checkpoint.
        //
        for (final PageNode pageNode : _pageMap.values()) {
            for (PageNode pn = pageNode; pn != null; pn = pn.getPrevious()) {
                if (pn.getTimestamp() < recoveryTimestamp) {
                    pn.removeHistory();
                    break;
                }
            }
        }
        //
        // Remove any PageNode from the branchMap having a timestamp less
        // than the checkpoint. Generally all such entries are removed after
        // the first checkpoint that has been established after recovery.
        //
        for (final Iterator<PageNode> iterator = _branchMap.values().iterator(); iterator.hasNext();) {
            final PageNode pageNode = iterator.next();
            if (pageNode.getTimestamp() < recoveryTimestamp) {
                iterator.remove();
            }
        }

        checkpoint.completed();
    }
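
    /*
     * Worked example (illustrative timestamps only): with a checkpoint at
     * timestamp 100, _earliestCommittedTimestamp == 90 and
     * _earliestAbortedTimestamp == 95,
     *
     *   recoveryTimestamp = min(100, 90, 95) == 90
     *
     * so only PageNode history older than timestamp 90 may be culled, even
     * though the checkpoint itself is at timestamp 100.
     */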

    /**
     * Remove obsolete TransactionMapItem instances from the live transaction
     * map. An instance is obsolete if it refers to a transaction that committed
     * earlier than the last valid checkpoint (because all of the effects of
     * that transaction are now check-pointed into the B-Trees themselves) or if
     * it is from an aborted transaction that has no remaining MVV values.
     */
    void pruneObsoleteTransactions() {
        pruneObsoleteTransactions(isRollbackPruningEnabled());
    }

    void pruneObsoleteTransactions(final boolean rollbackPruningEnabled) {
        final long timestamp = _lastValidCheckpoint.getTimestamp();
        long earliestCommitted = Long.MAX_VALUE;
        long earliestAborted = Long.MAX_VALUE;
        final List<TransactionMapItem> toPrune = new ArrayList<TransactionMapItem>();
        /*
         * Remove any committed transactions that committed before the
         * checkpoint. No need to keep a record of such a transaction since its
         * updates are now fully written to the journal in modified page images.
         */
        synchronized (this) {
            for (final Iterator<TransactionMapItem> iterator = _liveTransactionMap.values().iterator(); iterator
                    .hasNext();) {
                final TransactionMapItem item = iterator.next();
                if (item.isCommitted()) {
                    if (item.getCommitTimestamp() < timestamp) {
                        iterator.remove();
                    } else if (item.getStartTimestamp() < earliestCommitted) {
                        earliestCommitted = item.getStartTimestamp();
                    }
                } else {
                    final TransactionStatus status;
                    status = _persistit.getTransactionIndex().getStatus(item.getStartTimestamp());
                    if (status == null || status.getTs() != item.getStartTimestamp()) {
                        iterator.remove();
                    } else if (status.getTc() == ABORTED && status.isNotified()) {
                        if (status.getMvvCount() == 0) {
                            iterator.remove();
                            sequence(RECOVERY_PRUNING_B);
                        } else {
                            if (item.getStartTimestamp() < earliestAborted) {
                                earliestAborted = item.getStartTimestamp();
                            }
                            if (rollbackPruningEnabled) {
                                toPrune.add(item);
                            }
                        }
                    }
                }
            }
            _earliestCommittedTimestamp = earliestCommitted;
            _earliestAbortedTimestamp = earliestAborted;
        }
        /*
         * Sort the toPrune list - since all members are aborted, the comparison
         * will be by startTimeStamp which is a good approximation of journal
         * address order.
         */
        Collections.sort(toPrune, TransactionMapItem.TRANSACTION_MAP_ITEM_COMPARATOR);
        for (final TransactionMapItem item : toPrune) {
            try {
                synchronized (_player) {
                    final TransactionStatus status;
                    status = _persistit.getTransactionIndex().getStatus(item.getStartTimestamp());
                    if (status != null && status.getTs() == item.getStartTimestamp() && status.getTc() == ABORTED
                            && status.isNotified() && status.getMvvCount() > 0) {
                        _player.applyTransaction(item, _listener);
                    }
                }
            } catch (final PersistitException e) {
                _persistit.getLogBase().pruneException.log(e, item);
            }
        }
    }

    /**
     * General method used to wait for durability. This method is used by all
     * three commit modes: SOFT, HARD and GROUP. The leadTime and stallTime
     * parameters represent time intervals in milliseconds.
     *
     * @param flushedTimestamp
     *            a timestamp taken after the transaction buffer belonging to
     *            the current transaction has been flushed.
     * @param leadTime
     *            time interval in milliseconds by which to anticipate I/O
     *            completion; the method will return as soon as the I/O
     *            operation that will flush the current generation of data is
     *            expected to complete within that time interval
     * @param stallTime
     *            time interval in milliseconds that this thread is willing to
     *            wait for I/O completion. If the JOURNAL_FLUSHER is
     *            currently pausing, the pause time may be shortened to try to
     *            complete the I/O when requested. In particular, a value of
     *            zero indicates the I/O should start immediately.
     * @throws PersistitInterruptedException
     */

    void waitForDurability(final long flushedTimestamp, final long leadTime, final long stallTime)
            throws PersistitException {
        final JournalFlusher flusher = _flusher;
        if (flusher != null) {
            flusher.waitForDurability(flushedTimestamp, leadTime, stallTime);
        } else {
            throw new IllegalStateException("JOURNAL_FLUSHER is not running");
        }
    }
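
    /*
     * Illustrative usage (hypothetical parameter values): a HARD commit
     * might pass zero leadTime and stallTime so the caller waits for the
     * actual force() to complete, while a SOFT commit might pass a non-zero
     * leadTime so the call returns as soon as the flush is expected to
     * finish within that window.
     *
     *   waitForDurability(flushedTimestamp, 0, 0);   // wait for real I/O
     *   waitForDurability(flushedTimestamp, 50, 0);  // accept a 50 ms window
     */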

    public static class TreeDescriptor {

        final int _volumeHandle;

        final String _treeName;

        TreeDescriptor(final int volumeHandle, final String treeName) {
            _volumeHandle = volumeHandle;
            _treeName = treeName;
        }

        public int getVolumeHandle() {
            return _volumeHandle;
        }

        public String getTreeName() {
            return _treeName;
        }

        @Override
        public boolean equals(final Object obj) {
            if (obj == null || !(obj instanceof TreeDescriptor)) {
                return false;
            }
            final TreeDescriptor td = (TreeDescriptor) obj;
            return td._treeName.equals(_treeName) && td._volumeHandle == _volumeHandle;
        }

        @Override
        public int hashCode() {
            return _treeName.hashCode() ^ _volumeHandle;
        }

        @Override
        public String toString() {
            return "{" + _volumeHandle + "}" + _treeName;
        }
    }

    /**
     * A PageNode represents the existence of a copy of a page in the journal.
     * It links to previously created PageNode objects which refer to earlier
     * versions of the same page. These earlier instances are truncated whenever
     * a later version of the same page has been checkpointed.
     *
     * PageNode instances are designed to serve as both Key and Value fields of
     * the _pageMap. The general rubric when adding a page to the journal is to
     * construct a PageNode representing the page image, and then use it to
     * perform a lookup in the _pageMap. If there is no matching PageNode
     * already in the map then simply add the new one. If there is a matching
     * PageNode, link it to the new one and then replace the entry in the map.
     * (A sketch of this rubric follows the class.)
     *
     * This class supplies Comparators (READ_COMPARATOR and WRITE_COMPARATOR)
     * used to sort PageNodes so that pages can be copied from the journal and
     * written to each Volume file in roughly sequential order.
     */
    public static class PageNode {

        final int _volumeHandle;

        final long _pageAddress;

        final long _timestamp;

        long _journalAddress;

        int _offset;

        PageNode _previous;

        PageNode(final int volumeHandle, final long pageAddress) {
            this(volumeHandle, pageAddress, Long.MIN_VALUE, -1);
        }

        PageNode(final int volumeHandle, final long pageAddress, final long journalAddress, final long timestamp) {
            this._volumeHandle = volumeHandle;
            this._pageAddress = pageAddress;
            this._journalAddress = journalAddress;
            this._timestamp = timestamp;
        }

        /**
         * Construct a copy, also copying members of the linked list. Used by
         * #queryPageNode and #queryBranchNode.
         */
        PageNode(final PageNode pageNode) {
            _volumeHandle = pageNode._volumeHandle;
            _pageAddress = pageNode._pageAddress;
            _journalAddress = pageNode._journalAddress;
            _timestamp = pageNode._timestamp;
            _offset = pageNode._offset;
            final PageNode previous = pageNode._previous;
            if (previous != null) {
                _previous = new PageNode(previous);
            }
        }

        /**
         * @return the previous
         */
        public PageNode getPrevious() {
            return _previous;
        }

        /**
         * @param previous
         *            the previous to set
         */
        public void setPrevious(final PageNode previous) {
            if (previous != null) {
                assert _timestamp >= previous._timestamp;
            }
            this._previous = previous;
        }

        /**
         * @return the volumeHandle
         */
        public int getVolumeHandle() {
            return _volumeHandle;
        }

        /**
         * @return the pageAddress
         */
        public long getPageAddress() {
            return _pageAddress;
        }

        /**
         * @return the journalAddress
         */
        public long getJournalAddress() {
            return _journalAddress;
        }

        /**
         * @return the timestamp
         */
        public long getTimestamp() {
            return _timestamp;
        }

        public void setOffset(final int offset) {
            _offset = offset;
        }

        public int getOffset() {
            return _offset;
        }

        @Override
        public int hashCode() {
            return _volumeHandle ^ (int) _pageAddress ^ (int) (_pageAddress >>> 32);
        }

        @Override
        public boolean equals(final Object obj) {
            if (obj == null || !(obj instanceof PageNode)) {
                return false;
            }
            final PageNode pn = (PageNode) obj;
            return _pageAddress == pn._pageAddress && _volumeHandle == pn._volumeHandle;
        }

        @Override
        public String toString() {
            return String.format("[%d]%d@%d{%d}%s", _volumeHandle, _pageAddress, _journalAddress, _timestamp,
                    _previous == null ? "" : "+");
        }

        public String toString(final JournalManager jman) {
            final Volume volume = jman._handleToVolumeMap.get(_volumeHandle);
            if (volume == null) {
                return toString();
            }
            return String.format("%s:%d@%d{%d}%s", volume, _pageAddress, _journalAddress, _timestamp,
                    _previous == null ? "" : "+");
        }

        public String toStringPageAddress(final VolumeHandleLookup lvh) {
            final Volume volume = lvh.lookupVolumeHandle(_volumeHandle);
            return String.format("%s:%d", volume == null ? String.valueOf(_volumeHandle) : volume.toString(),
                    _pageAddress);
        }

        public String toStringJournalAddress(final VolumeHandleLookup lvn) {
            return String.format("%d{%d}%s", _journalAddress, _timestamp, _previous == null ? "" : "+");

        }

        final static Comparator<PageNode> READ_COMPARATOR = new Comparator<PageNode>() {

            @Override
            public int compare(final PageNode a, final PageNode b) {
                if (!a.isInvalid() && !b.isInvalid()) {
                    return a.getJournalAddress() > b.getJournalAddress() ? 1 : a.getJournalAddress() < b
                            .getJournalAddress() ? -1 : 0;
                }
                if (a.isInvalid() && !b.isInvalid()) {
                    return -1;
                }
                if (!a.isInvalid() && b.isInvalid()) {
                    return 1;
                }
                if (a._volumeHandle != b._volumeHandle) {
                    return a._volumeHandle - b._volumeHandle;
                }
                return a._pageAddress > b._pageAddress ? 1 : a._pageAddress < b._pageAddress ? -1 : 0;
            }
        };

        final static Comparator<PageNode> WRITE_COMPARATOR = new Comparator<PageNode>() {

            @Override
            public int compare(final PageNode a, final PageNode b) {
                if (a.getVolumeHandle() != b.getVolumeHandle()) {
                    return a.getVolumeHandle() < b.getVolumeHandle() ? -1 : 1;
                }
                return a.getPageAddress() < b.getPageAddress() ? -1 : a.getPageAddress() > b.getPageAddress() ? 1 : 0;
            }
        };

        boolean isInvalid() {
            return _journalAddress == Long.MIN_VALUE;
        }

        void invalidate() {
            _journalAddress = Long.MIN_VALUE;
        }

        void removeHistory() {
            PageNode pn = getPrevious();
            setPrevious(null);
            while (pn != null) {
                final PageNode previous = pn.getPrevious();
                pn.invalidate();
                pn.setPrevious(null);
                pn = previous;
            }
        }
    }
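
    /*
     * Sketch of the page-map insertion rubric described in the PageNode
     * Javadoc (illustrative only; the real call sites also maintain
     * _pageList and synchronize on the JournalManager):
     *
     *   PageNode newNode = new PageNode(handle, pageAddr, journalAddr, ts);
     *   PageNode oldNode = _pageMap.get(newNode);
     *   if (oldNode != null) {
     *       newNode.setPrevious(oldNode);  // link the history, newest first
     *   }
     *   _pageMap.put(newNode, newNode);    // PageNode is both key and value
     */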

    public static class TransactionMapItem implements Comparable<TransactionMapItem> {

        private final long _startAddress;

        private final long _startTimestamp;

        private long _commitTimestamp;

        private long _lastRecordAddress;

        TransactionMapItem(final long startTimestamp, final long address) {
            _startTimestamp = startTimestamp;
            _commitTimestamp = 0;
            _startAddress = address;
            _lastRecordAddress = address;
        }

        TransactionMapItem(final TransactionMapItem item) {
            _startAddress = item._startAddress;
            _startTimestamp = item._startTimestamp;
            _commitTimestamp = item._commitTimestamp;
            _lastRecordAddress = item._lastRecordAddress;
        }

        public long getStartAddress() {
            return _startAddress;
        }

        public long getStartTimestamp() {
            return _startTimestamp;
        }

        public long getCommitTimestamp() {
            return _commitTimestamp;
        }

        public long getLastRecordAddress() {
            return _lastRecordAddress;
        }

        void setCommitTimestamp(final long commitTimestamp) {
            _commitTimestamp = commitTimestamp;
        }

        void setLastRecordAddress(final long address) {
            _lastRecordAddress = address;
        }

        public boolean isCommitted() {
            return _commitTimestamp > 0;
        }

        public boolean isAborted() {
            return _commitTimestamp == ABORTED;
        }

        @Override
        public String toString() {
            return String.format("TStatus %,d{%,d}%s", _startAddress, _commitTimestamp, isCommitted() ? "c" : "u");
        }

        @Override
        public int compareTo(final TransactionMapItem ts) {
            if (isCommitted()) {
                return ts.getCommitTimestamp() < _commitTimestamp ? 1 : ts.getCommitTimestamp() > _commitTimestamp ? -1
                        : 0;
            } else {
                return ts.isCommitted() ? -1 : ts.getStartTimestamp() < _startTimestamp ? 1
                        : ts.getStartTimestamp() > _startTimestamp ? -1 : 0;
            }
        }

        final static Comparator<TransactionMapItem> TRANSACTION_MAP_ITEM_COMPARATOR = new Comparator<TransactionMapItem>() {

            @Override
            public int compare(final TransactionMapItem a, final TransactionMapItem b) {
                return a.getLastRecordAddress() > b.getLastRecordAddress() ? 1 : a.getLastRecordAddress() < b
                        .getLastRecordAddress() ? -1 : 0;
            }
        };

    }

    private class JournalCopier extends IOTaskRunnable {

        private volatile boolean _shouldStop = false;
        private final ByteBuffer _bb = ByteBuffer.allocate(DEFAULT_COPY_BUFFER_SIZE);
        private final List<PageNode> _copyList = new ArrayList<PageNode>(_copiesPerCycle);
        int _lastCyclePagesWritten;

        JournalCopier() {
            super(JournalManager.this._persistit);
        }

        void start() {
            start("JOURNAL_COPIER", _copierInterval);
        }

        @Override
        public void runTask() throws Exception {

            _copying.set(true);
            try {
                _copyList.clear();
                if (!_appendOnly.get()) {
                    selectForCopy(_copyList);
                    if (!_copyList.isEmpty()) {
                        readForCopy(_copyList, _bb);
                    }
                    if (!_copyList.isEmpty()) {
                        writeForCopy(_copyList, _bb);
                    }
                }
                cleanupForCopy(_copyList);
                _lastCyclePagesWritten = _copyList.size();
                if (_copyList.isEmpty()) {
                    _copyFast.set(false);
                }
            } finally {
                _copying.set(false);
            }

            long throttleInterval = 0;
            if (!_appendOnly.get()) {
                final int urgency = urgency();
                if (urgency == URGENT) {
                    throttleInterval = URGENT_COMMIT_DELAY_MILLIS;
                } else if (urgency > ALMOST_URGENT) {
                    throttleInterval = GENTLE_COMMIT_DELAY_MILLIS;
                }
            }
            if (throttleInterval != _throttleSleepInterval) {
                _throttleSleepInterval = throttleInterval;
            }

        }

        @Override
        protected boolean shouldStop() {
            return _closed.get() || _shouldStop;
        }

        /**
         * Return a nice interval, in milliseconds, to wait between copierCycle
         * invocations. The interval decreases as urgency goes up, and becomes
         * zero when the urgency is greater than or equal to ALMOST_URGENT. The
         * interval is also reduced when there has been little recent I/O
         * activity invoked by other activities.
         */
        @Override
        public long pollInterval() {
            final IOMeter iom = _persistit.getIOMeter();
            final long pollInterval = super.getPollInterval();
            final int urgency = urgency();

            if (_lastCyclePagesWritten == 0) {
                return pollInterval;
            }

            if (urgency >= ALMOST_URGENT) {
                return 0;
            }

            int divisor = 1;

            if (iom.recentCharge() < iom.getQuiescentIOthreshold() * KILO) {
                divisor = HALF_URGENT;
            } else if (urgency > HALF_URGENT) {
                divisor = urgency - HALF_URGENT;
            }

            return super.getPollInterval() / divisor;
        }
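
        /*
         * Worked example (illustrative values): with a base poll interval of
         * 10000 ms, urgency 7 and recent I/O above the quiescent threshold,
         * the divisor is urgency - HALF_URGENT == 2, so the copier wakes
         * after 5000 ms; at urgency >= ALMOST_URGENT it wakes immediately.
         */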
    }

    private class JournalFlusher extends IOTaskRunnable {

        volatile long _lastExceptionTimestamp = 0;
        volatile Exception _lastException = null;

        long[] _ioTimes = new long[IO_MEASUREMENT_CYCLES];
        int _ioCycle;
        volatile long _expectedIoTime;
        volatile long _startTime;
        volatile long _endTime;
        volatile long _startTimestamp;
        volatile long _endTimestamp;

        JournalFlusher() {
            super(JournalManager.this._persistit);
        }

        void start() {
            start("JOURNAL_FLUSHER", _flushInterval);
        }

        /**
         * General method used to wait for durability. See
         * {@link JournalManager#waitForDurability(long, long, long)}.
         *
         * @throws PersistitInterruptedException
         */
        private void waitForDurability(final long flushedTimestamp, final long leadTime, final long stallTime)
                throws PersistitException {
            /*
             * Commit is known durable once the JOURNAL_FLUSHER thread has
             * posted an _endTimestamp larger than flushedTimestamp.
             */
            final long now = System.nanoTime();
            long remainingStallTime = stallTime;

            while (true) {
                /*
                 * Detect whether an I/O cycle is in progress; if so estimate
                 * how much more time (in nanoseconds) it will require to
                 * complete.
                 */
                long estimatedRemainingIoNanos = -1;
                long startTime;
                long endTime;
                long startTimestamp;
                long endTimestamp;

                /*
                 * Spin until values are stable
                 */
                while (true) {
                    startTimestamp = _startTimestamp;
                    endTimestamp = _endTimestamp;
                    startTime = _startTime;
                    endTime = _endTime;
                    if (startTimestamp == _startTimestamp && endTimestamp == _endTimestamp) {
                        if (flushedTimestamp > startTimestamp && startTimestamp > endTimestamp) {
                            estimatedRemainingIoNanos = Math.max(startTime + _expectedIoTime - now, 0);
                        }
                        break;
                    }
                    Util.spinSleep();
                }

                if (endTimestamp > flushedTimestamp && startTimestamp > flushedTimestamp) {
                    /*
                     * Done - commit is durable
                     */
                    break;
                }

                long remainingSleepNanos;
                if (estimatedRemainingIoNanos == -1) {
                    remainingSleepNanos = Math.max(0, _flushInterval - (now - endTime));
                } else {
                    remainingSleepNanos = _flushInterval;
                }

                long estimatedNanosToFinish;
                if (startTimestamp < flushedTimestamp) {
                    estimatedNanosToFinish = remainingSleepNanos + _expectedIoTime;
                } else {
                    estimatedNanosToFinish = estimatedRemainingIoNanos;
                }

                if (leadTime > 0 && leadTime * NS_PER_MS >= estimatedNanosToFinish) {
                    /*
                     * If the caller specified a leadTime interval larger than
                     * the estimated time remaining in the cycle, then return
                     * immediately. This handles the "soft" commit case.
                     */
                    break;
                } else if (estimatedRemainingIoNanos == -1) {
                    /*
                     * If there is no I/O in progress, then wait as long as
                     * possible (determined by stallTime) before kicking the
                     * JOURNAL_FLUSHER to write the caller's transaction.
                     */
                    if (remainingStallTime > 0) {
                        Util.sleep(remainingStallTime);
                        remainingStallTime = 0;
                    } else {
                        kick();
                        Util.spinSleep();
                    }
                } else {
                    /*
                     * Otherwise wait for concurrent I/O operation to finish. Do
                     * this by polling because our experiments with using locks
                     * here showed significant excess CPU consumption.
                     */
                    Util.spinSleep();
                }
            }
            if (_lastExceptionTimestamp > flushedTimestamp) {
                final Exception e = _lastException;
                if (e instanceof PersistitException) {
                    throw (PersistitException) e;
                } else {
                    throw new PersistitException(e);
                }
            }
            _totalCommits.incrementAndGet();
            _totalCommitWaitTime.addAndGet(System.nanoTime() - now);
        }

        @Override
        protected void runTask() {
            _flushing.set(true);
            try {
                try {
                    /*
                     * This lock is intended only to help other threads in
                     * waitForDurability to know when the I/O operation has
                     * finished.
                     */
                    try {
                        _startTimestamp = _persistit.getTimestampAllocator().updateTimestamp();
                        _startTime = System.nanoTime();
                        /*
                         * Flush the write buffer and call FileChannel.force().
                         */
                        force();

                    } finally {
                        _endTime = System.nanoTime();
                        _endTimestamp = _persistit.getTimestampAllocator().updateTimestamp();
                    }

                    final long elapsed = _endTime - _startTime;
                    _totalFlushCycles.incrementAndGet();
                    _totalFlushIoTime.addAndGet(elapsed);
                    _ioTimes[_ioCycle] = elapsed;
                    _ioCycle = (_ioCycle + 1) % IO_MEASUREMENT_CYCLES;

                    long avg = 0;
                    for (int index = 0; index < IO_MEASUREMENT_CYCLES; index++) {
                        avg += _ioTimes[index];
                    }
                    avg /= IO_MEASUREMENT_CYCLES;

                    _expectedIoTime = avg;
                    if (elapsed > _slowIoAlertThreshold * NS_PER_MS) {
                        _persistit.getLogBase().longJournalIO.log(elapsed / NS_PER_MS, IO_MEASUREMENT_CYCLES, avg
                                / NS_PER_MS);
                    }

                } catch (final Exception e) {
                    if (e instanceof InterruptedException || e instanceof FatalErrorException) {
                        _closed.set(true);
                    } else if (e instanceof PersistitException) {
                        _persistit.getAlertMonitor().post(
                                new Event(AlertLevel.ERROR, _persistit.getLogBase().journalWriteError, e,
                                        addressToFile(_writeBufferAddress), addressToOffset(_writeBufferAddress)),
                                AlertMonitor.JOURNAL_CATEGORY);
                    } else {
                        _persistit.getLogBase().journalWriteError.log(e, addressToFile(_writeBufferAddress),
                                addressToOffset(_writeBufferAddress));
                    }
                }
            } finally {
                _flushing.set(false);
            }

        }

        @Override
        protected boolean shouldStop() {
            return _closed.get();
        }
    }

    synchronized void selectForCopy(final List<PageNode> list) {
        list.clear();
        if (!_appendOnly.get()) {
            final long timeStampUpperBound = Math.min(getLastValidCheckpointTimestamp(), _copierTimestampLimit);
            for (final Iterator<PageNode> iterator = _pageList.iterator(); iterator.hasNext();) {
                final PageNode pageNode = iterator.next();
                for (PageNode pn = pageNode; pn != null && !pn.isInvalid(); pn = pn.getPrevious()) {
                    if (pn.getTimestamp() < timeStampUpperBound) {
                        list.add(pn);
                        break;
                    }
                }
                if (list.size() >= _copiesPerCycle) {
                    break;
                }
            }
        }
    }
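
    /*
     * Worked example (illustrative timestamps only): suppose a page's history
     * chain holds versions at timestamps 120, 80 and 50, newest first, and
     * the checkpoint bound is 100. The loop above selects the version at
     * timestamp 80: the newest copy safely below the bound and therefore
     * eligible to be copied back to its volume.
     */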

    void readForCopy(final List<PageNode> list, final ByteBuffer bb) throws PersistitException {
        Collections.sort(list, PageNode.READ_COMPARATOR);
        bb.clear();

        Volume volume = null;
        int handle = -1;

        for (final Iterator<PageNode> iterator = list.iterator(); iterator.hasNext();) {

            final PageNode pageNode = iterator.next();
            if (pageNode.isInvalid()) {
                iterator.remove();
                continue;
            }
            pageNode.setOffset(-1);
            if (pageNode.getVolumeHandle() != handle) {
                handle = -1;
                try {
                    volume = volumeForHandle(pageNode.getVolumeHandle());
                    handle = volume.getHandle();
                } catch (final VolumeNotFoundException vnfe) {
                    // Deal with this in writeForCopy
                    continue;
                }
            }
            if (volume == null) {
                // Deal with this in writeForCopy
                continue;
            }

            final int at = bb.position();
            final long pageAddress;
            try {
                final PageNode stablePageNode = new PageNode(pageNode);
                if (pageNode.isInvalid()) {
                    iterator.remove();
                    continue;
                }
                pageAddress = readPageBufferFromJournal(stablePageNode, bb);
                _persistit.getIOMeter().chargeCopyPageFromJournal(volume, pageAddress, volume.getPageSize(),
                        stablePageNode.getJournalAddress(), urgency());
            } catch (final PersistitException ioe) {
                _persistit
                        .getAlertMonitor()
                        .post(new Event(AlertLevel.ERROR, _persistit.getLogBase().copyException, ioe, volume,
                                pageNode.getPageAddress(), pageNode.getJournalAddress()), AlertMonitor.JOURNAL_CATEGORY);
                throw ioe;
            }

            Debug.$assert0.t(pageAddress == pageNode.getPageAddress());
            pageNode.setOffset(at);

            if (bb.limit() - at != volume.getStructure().getPageSize()) {
                throw new CorruptJournalException(pageNode.toStringPageAddress(this) + " bufferSize " + bb.limit()
                        + " does not match " + volume + " bufferSize " + volume.getPageSize() + " at "
                        + pageNode.toStringJournalAddress(this));
            }

            bb.position(bb.limit());
        }
    }

    void writeForCopy(final List<PageNode> list, final ByteBuffer bb) throws PersistitException {
        Collections.sort(list, PageNode.WRITE_COMPARATOR);
        Volume volume = null;
        int handle = -1;
        final Set<Volume> volumes = new HashSet<Volume>();

        for (final Iterator<PageNode> iterator = list.iterator(); iterator.hasNext();) {
            final PageNode pageNode = iterator.next();

            if (pageNode.getVolumeHandle() != handle) {
                handle = -1;
                volume = null;
                Volume candidate = null;
                try {
                    candidate = lookupVolumeHandle(pageNode.getVolumeHandle());
                    if (candidate != null) {
                        if (!candidate.isOpened()) {
                            candidate.open(_persistit);
                        }
                        handle = pageNode.getVolumeHandle();
                        volume = candidate;
                    }
                } catch (final VolumeNotFoundException vnfe) {
                    _persistit.getAlertMonitor().post(
                            new Event(AlertLevel.WARN, _persistit.getLogBase().missingVolume, candidate,
                                    pageNode.getJournalAddress()), AlertMonitor.MISSING_VOLUME_CATEGORY);
                    if (_ignoreMissingVolume.get()) {
                        _persistit.getLogBase().lostPageFromMissingVolume.log(pageNode.getPageAddress(), candidate,
                                pageNode.getJournalAddress());
                        // Leaving the page in the List here causes
                        // cleanupForCopy to remove it from the page map.
                        continue;
                    }
                }
            }
            if (volume == null || volume.isClosed()) {
                // Remove from the List so that below we won't remove it
                // from the pageMap.
                iterator.remove();
                continue;
            }

            final long pageAddress = pageNode.getPageAddress();
            volume.getStorage().extend(pageAddress);
            final int pageSize = volume.getPageSize();
            final int at = pageNode.getOffset();
            bb.limit(bb.capacity()).position(at).limit(at + pageSize);

            try {
                volume.getStorage().writePage(bb, pageAddress);
                volumes.add(volume);
            } catch (final PersistitException ioe) {
                _persistit.getLogBase().copyException.log(ioe, volume, pageNode.getPageAddress(),
                        pageNode.getJournalAddress());
                throw ioe;
            }

            _copiedPageCount++;
            _persistit.getIOMeter().chargeCopyPageToVolume(volume, pageAddress, volume.getPageSize(),
                    pageNode.getJournalAddress(), urgency());
        }

        for (final Volume vol : volumes) {
            vol.getStorage().force();
        }

    }

    private void cleanupForCopy(final List<PageNode> list) throws PersistitException {
        //
        // Files and FileChannels no longer needed for recovery.
        //
        final List<FileChannel> obsoleteFileChannels = new ArrayList<FileChannel>();
        final List<File> obsoleteFiles = new ArrayList<File>();

        // Address of the first file needed for recovery
        long deleteBoundary = 0;

        synchronized (this) {
            for (final PageNode copiedPageNode : list) {
                PageNode pageNode = _pageMap.get(copiedPageNode);
                if (pageNode.getJournalAddress() == copiedPageNode.getJournalAddress()) {
                    pageNode.removeHistory();
                    pageNode.invalidate();
                    final PageNode pn = _pageMap.remove(pageNode);
                    assert pn == copiedPageNode;
                } else {
                    PageNode previous = pageNode.getPrevious();
                    while (previous != null) {
                        if (previous.getJournalAddress() == copiedPageNode.getJournalAddress()) {
                            // No need to keep the previous entry, or any of
                            // its predecessors
                            pageNode.removeHistory();
                            break;
                        } else {
                            pageNode = previous;
                            previous = pageNode.getPrevious();
                        }
                    }
                }
            }
            _droppedPageCount += cleanupPageList() - list.size();
            //
            // Will hold the address of the first record containing information
            // not yet copied back into a Volume, and therefore required for
            // recovery.
            //
            long recoveryBoundary = _currentAddress;
            //
            // Detect first journal address holding a mapped page
            // required for recovery
            //

            for (final PageNode pageNode : _pageMap.values()) {
                //
                // If there are multiple versions, we need to keep
                // the most recent one that has been checkpointed.
                //
                for (PageNode pn = pageNode; pn != null; pn = pn.getPrevious()) {
                    if (!pn.isInvalid() && pn.getJournalAddress() < recoveryBoundary) {
                        recoveryBoundary = pn.getJournalAddress();
                    }
                }
            }
            //
            // Detect first journal address still holding an uncheckpointed
            // Transaction required for recovery.
            //
            for (final Iterator<TransactionMapItem> iterator = _liveTransactionMap.values().iterator(); iterator
                    .hasNext();) {
                final TransactionMapItem item = iterator.next();
                if (item.getStartAddress() < recoveryBoundary) {
                    recoveryBoundary = item.getStartAddress();
                }
            }

            if (recoveryBoundary < _baseAddress) {
                throw new IllegalStateException(String.format("Retrograde base address %,d is less than current %,d",
                        recoveryBoundary, _baseAddress));
            }

            _baseAddress = recoveryBoundary;
            for (deleteBoundary = _deleteBoundaryAddress; deleteBoundary + _blockSize <= _lastValidCheckpointBaseAddress; deleteBoundary += _blockSize) {
                final long generation = deleteBoundary / _blockSize;
                final FileChannel channel = _journalFileChannels.remove(generation);
                if (channel != null) {
                    obsoleteFileChannels.add(channel);
                }
                obsoleteFiles.add(addressToFile(deleteBoundary));
            }
            //
            // These conditions mean that there is no active content in the
            // journal and that the current journal file has more than RT
            // bytes in it, where RT is the "rolloverThreshold". When these
            // conditions are met we force a rollover and cause the current
            // journal file to be deleted. This behavior keeps the journal
            // small when there are no un-checkpointed pages or transactions.
            //
            if (_baseAddress == _currentAddress && _lastValidCheckpointBaseAddress >= _currentAddress - CP.OVERHEAD
                    && (getCurrentJournalSize() > rolloverThreshold())) {
                final FileChannel channel = _journalFileChannels.remove(_currentAddress / _blockSize);
                if (channel != null) {
                    obsoleteFileChannels.add(channel);
                }
                obsoleteFiles.add(addressToFile(_currentAddress));
                rolloverWithNewBaseAndFile();
            }
        }

        for (final FileChannel channel : obsoleteFileChannels) {
            if (channel != null) {
                try {
                    channel.close();
                } catch (final IOException e) {
                    // TODO - log this?
                    // Ignored for now - this simply means we can't close
                    // a file we don't need any more.
                }
            }
        }

        boolean deleted = true;
        for (final File file : obsoleteFiles) {
            if (!file.delete()) {
                deleted = false;
                // TODO - log this.
                // Ignored for now - this simply means we can't delete
                // a file we don't need any more.
            }
        }
        if (deleted) {
            _deleteBoundaryAddress = deleteBoundary;
        }
        reportJournalFileCount();
    }

    /**
     * Remove obsolete PageNodes from the page list.
     *
     * @return Count of removed PageNode instances.
     */
    int cleanupPageList() {
        final int size = _pageList.size();
        int from;
        // Skip the leading run of valid PageNodes; they are already in place.
        for (from = 0; from < size && !_pageList.get(from).isInvalid(); from++)
            ;
        int to = from;
        // Compact the remainder in place, keeping only valid PageNodes.
        for (from = from + 1; from < size; from++) {
            final PageNode pn = _pageList.get(from);
            if (!pn.isInvalid()) {
                _pageList.set(to++, pn);
            }
        }
        // Trim the now-unused tail of the list.
        if (size > to) {
            _pageList.removeRange(to, size);
        }
        return size - to;
    }
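
    /*
     * Worked example (illustrative): if _pageList holds [A, B, x, C, x, D],
     * where x marks invalidated PageNodes, cleanupPageList skips the valid
     * prefix [A, B], compacts the rest in place to [A, B, C, D], trims the
     * tail, and returns 2.
     */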

    synchronized void truncate(final Volume volume, final long timestamp) {
        for (final PageNode lastPageNode : _pageMap.values()) {
            PageNode pageNode = lastPageNode;
            while (pageNode != null) {
                if (volume.getHandle() == pageNode.getVolumeHandle() && pageNode.getTimestamp() < timestamp) {
                    pageNode.invalidate();
                }
                pageNode = pageNode.getPrevious();
            }
        }
    }

    private void reportJournalFileCount() {
        /*
         * Does not need synchronization since only the JOURNAL_COPIER thread
         * calls this
         */
        final int journalFileCount = getJournalFileCount();
        if (journalFileCount != _lastReportedJournalFileCount) {
            if (journalFileCount > TOO_MANY_ERROR_THRESHOLD + _urgentFileCountThreshold) {
                _persistit.getAlertMonitor()
                        .post(new Event(AlertLevel.ERROR, _persistit.getLogBase().tooManyJournalFilesError,
                                journalFileCount), AlertMonitor.MANY_JOURNAL_FILES);
            } else if (journalFileCount > TOO_MANY_WARN_THRESHOLD + _urgentFileCountThreshold) {
                _persistit.getAlertMonitor()
                        .post(new Event(AlertLevel.WARN, _persistit.getLogBase().tooManyJournalFilesWarning,
                                journalFileCount), AlertMonitor.MANY_JOURNAL_FILES);
            } else {
                _persistit.getAlertMonitor().post(
                        new Event(AlertLevel.NORMAL, _persistit.getLogBase().normalJournalFileCount, journalFileCount),
                        AlertMonitor.MANY_JOURNAL_FILES);
            }
            _lastReportedJournalFileCount = journalFileCount;
        }
    }

    private class JournalTransactionPlayerSupport implements TransactionPlayerSupport {

        final ByteBuffer _readBuffer = ByteBuffer.allocate(Transaction.TRANSACTION_BUFFER_SIZE
                + JournalRecord.TX.OVERHEAD);

        @Override
        public void read(final long address, final int size) throws PersistitIOException {
            _readBuffer.clear().limit(size);
            readFully(_readBuffer, address);
        }

        @Override
        public ByteBuffer getReadBuffer() {
            return _readBuffer;
        }

        @Override
        public void convertToLongRecord(final Value value, final int treeHandle, final long address,
                final long commitTimestamp) throws PersistitException {
            // Do nothing - long record value does not need to be recovered for
            // pruning
        }

        @Override
        public Persistit getPersistit() {
            return _persistit;
        }
    }

    class ProactiveRollbackListener implements TransactionPlayerListener {

        TransactionStatus status;

        @Override
        public void store(final long address, final long timestamp, final Exchange exchange) throws PersistitException {
            exchange.prune();
        }

        @Override
        public void removeKeyRange(final long address, final long timestamp, final Exchange exchange, final Key from,
                final Key to) throws PersistitException {
            try {
                exchange.prune(from, to);
            } catch (final RebalanceException e) {
                // ignore
            }
        }

        @Override
        public void removeTree(final long address, final long timestamp, final Exchange exchange)
                throws PersistitException {
            // TODO
        }

        @Override
        public void delta(final long address, final long timestamp, final Tree tree, final int index,
                final int accumulatorType, final long value) throws PersistitException {
            // Nothing to undo.
        }

        @Override
        public void startRecovery(final long address, final long timestamp) throws PersistitException {
            // Default: do nothing
        }

        @Override
        public void startTransaction(final long address, final long startTimestamp, final long commitTimestamp)
                throws PersistitException {
            // Default: do nothing
            status = _persistit.getTransactionIndex().getStatus(startTimestamp);
        }

        @Override
        public void endTransaction(final long address, final long timestamp) throws PersistitException {
            final TransactionStatus ts = _persistit.getTransactionIndex().getStatus(timestamp);
            /*
             * Can be null because the MVV count became zero and
             * TransactionIndex already removed it.
             */
            if (ts != null) {
                if (ts.getMvvCount() > 0 && _persistit.isInitialized()) {
                    _persistit.getLogBase().pruningIncomplete.log(ts,
                            TransactionPlayer.addressToString(address, timestamp));
                }
            }
        }

        @Override
        public void endRecovery(final long address, final long timestamp) throws PersistitException {
            // Default: do nothing
        }

        @Override
        public boolean requiresLongRecordConversion() {
            return false;
        }

        @Override
        public boolean createTree(final long timestamp) throws PersistitException {
            return false;
        }

    }

    /**
     * Extend ArrayList to export the removeRange method.
     */
    @SuppressWarnings("serial")
    static class RangeRemovingArrayList<T> extends ArrayList<T> {
        @Override
        public void removeRange(final int fromIndex, final int toIndex) {
            super.removeRange(fromIndex, toIndex);
        }
    }
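
    /*
     * Design note: java.util.ArrayList declares removeRange as protected, so
     * this subclass exists only to widen its visibility. That lets
     * cleanupPageList() trim the invalid tail of _pageList in a single bulk
     * call rather than removing elements one at a time.
     */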

    private long rolloverThreshold() {
        return _closed.get() ? 0 : ROLLOVER_THRESHOLD;
    }

    /**
     * @return number of internal handle values that have been assigned so far
     */
    public int getHandleCount() {
        return _handleCounter;
    }

    long getLastValidCheckpointBaseAddress() {
        return _lastValidCheckpointBaseAddress;
    }

    /**
     * For use only by unit tests that test page maps, etc.
     *
     * @param handleToVolumeMap
     */
    synchronized void unitTestInjectVolumes(final Map<Integer, Volume> handleToVolumeMap) {
        _handleToVolumeMap.putAll(handleToVolumeMap);
    }

    /**
     * For use only by unit tests that test page maps, etc.
     *
     * @param handleToVolumeMap
     */
    void unitTestInjectPageMap(final Map<PageNode, PageNode> pageMap) {
        _pageMap.putAll(pageMap);
    }

    void unitTestInjectTransactionMap(final Map<Long, TransactionMapItem> transactionMap) {
        _liveTransactionMap.putAll(transactionMap);
    }

    void unitTestClearTransactionMap() {
        _liveTransactionMap.clear();
    }

    long getCurrentJournalSize() {
        return _currentAddress % _blockSize;
    }

    long getWriteBufferAddress() {
        return _writeBufferAddress;
    }

    int getJournalFileCount() {
        return (int) (_currentAddress / _blockSize - _baseAddress / _blockSize) + 1;
    }

    synchronized boolean unitTestTxnExistsInLiveMap(final Long startTimestamp) {
        return _liveTransactionMap.containsKey(startTimestamp);
    }

    void unitTestInjectPageList(final List<PageNode> list) {
        _pageList.addAll(list);
    }

    boolean unitTestPageListEquals(final List<PageNode> list) {
        return list.equals(_pageList);
    }

    synchronized List<File> unitTestGetAllJournalFiles() {
        final List<File> files = new ArrayList<File>();
        for (final Long address : _journalFileChannels.keySet()) {
            files.add(addressToFile(address));
        }
        return files;
    }

    void unitTestAllowHandlesForTemporaryVolumesAndTrees() {
        _allowHandlesForTempVolumesAndTrees = true;
    }

    public PageNode queryPageNode(final int volumeHandle, final long pageAddress) {
        final PageNode pn = _pageMap.get(new PageNode(volumeHandle, pageAddress, -1, -1));
        if (pn != null) {
            return new PageNode(pn);
        } else {
            return null;
        }
    }

    public PageNode queryBranchNode(final int volumeHandle, final long pageAddress) {
        final PageNode pn = _branchMap.get(new PageNode(volumeHandle, pageAddress, -1, -1));
        if (pn != null) {
            return new PageNode(pn);
        } else {
            return null;
        }
    }

    public TransactionMapItem queryTransactionMap(final long timestamp) {
        final TransactionMapItem item = _liveTransactionMap.get(timestamp);
        if (item != null) {
            return new TransactionMapItem(item);
        } else {
            return null;
        }
    }

    public SortedMap<Integer, Volume> queryVolumeMap() {
        return new TreeMap<Integer, Volume>(_handleToVolumeMap);
    }

    public SortedMap<Integer, TreeDescriptor> queryTreeMap() {
        return new TreeMap<Integer, TreeDescriptor>(_handleToTreeMap);
    }
}