/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 2002, 2011 Oracle and/or its affiliates.  All rights reserved.
*
*/

package com.sleepycat.je.recovery;

import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_CHECKPOINTS;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_DELTA_IN_FLUSH;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_FULL_BIN_FLUSH;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_FULL_IN_FLUSH;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPTID;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPT_END;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPT_START;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.GROUP_DESC;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.GROUP_NAME;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.logging.Level;

import com.sleepycat.je.CacheMode;
import com.sleepycat.je.CheckpointConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.EnvironmentMutableConfig;
import com.sleepycat.je.StatsConfig;
import com.sleepycat.je.cleaner.Cleaner;
import com.sleepycat.je.cleaner.FileSelector.CheckpointStartCleanerState;
import com.sleepycat.je.config.EnvironmentParams;
import com.sleepycat.je.dbi.DatabaseId;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbConfigManager;
import com.sleepycat.je.dbi.DbTree;
import com.sleepycat.je.dbi.EnvConfigObserver;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.LogItem;
import com.sleepycat.je.log.LogManager;
import com.sleepycat.je.log.Provisional;
import com.sleepycat.je.log.ReplicationContext;
import com.sleepycat.je.log.entry.SingleItemEntry;
import com.sleepycat.je.tree.ChildReference;
import com.sleepycat.je.tree.IN;
import com.sleepycat.je.tree.INLogContext;
import com.sleepycat.je.tree.INLogItem;
import com.sleepycat.je.tree.SearchResult;
import com.sleepycat.je.tree.Tree;
import com.sleepycat.je.tree.WithRootLatched;
import com.sleepycat.je.utilint.DaemonThread;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.LSNStat;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.LongStat;
import com.sleepycat.je.utilint.StatGroup;
import com.sleepycat.je.utilint.TestHook;
import com.sleepycat.je.utilint.TestHookExecute;

/**
* The Checkpointer looks through the tree for internal nodes that must be
* flushed to the log. Checkpoint flushes must be done in ascending order from
* the bottom of the tree up.
*
* Checkpoint and IN Logging Rules
* -------------------------------
* The checkpoint must log, and make accessible via non-provisional ancestors,
* all INs that are dirty at CkptStart.  If we crash and recover from that
* CkptStart onward, any IN that became dirty (before the crash) after the
* CkptStart must become dirty again as the result of replaying the action that
* caused it to originally become dirty.
*
* Therefore, when an IN is dirtied at some point in the checkpoint interval,
* but is not logged by the checkpoint, the log entry representing the action
* that dirtied the IN must follow either the CkptStart or the FirstActiveLSN
* that is recorded in the CkptEnd entry.  The FirstActiveLSN is less than or
* equal to the CkptStart LSN.  Recovery will process LNs between the
* FirstActiveLSN and the end of the log.  Other entries are only processed
* from the CkptStart forward.  And provisional entries are not processed.
*
* Example: Non-transactional LN logging.  We take two actions: 1) log the LN
* and then 2) dirty the parent BIN.  What if the LN is logged before CkptStart
* and the BIN is dirtied after CkptStart?  How do we avoid breaking the rules?
* The answer is that we log the LN while holding the latch on the parent BIN,
* and we don't release the latch until after we dirty the BIN.   The
* construction of the checkpoint dirty map requires latching the BIN.  Since
* the LN was logged before CkptStart, the BIN will be dirtied before the
* checkpointer latches it during dirty map construction.  So the BIN will
* always be included in the dirty map and logged by the checkpoint.
*
* Example: Abort.  We take two actions: 1) log the abort and then 2) undo the
* changes, which modifies (dirties) the BIN parents of the undone LNs.  There
* is nothing to prevent logging CkptStart in between these two actions, so how
* do we avoid breaking the rules?  The answer is that we do not unregister the
* transaction until after the undo phase.  So although the BINs may be dirtied
* by the undo after CkptStart is logged, the FirstActiveLSN will be prior to
* CkptStart.  Therefore, we will process the Abort and replay the action that
* modifies the BINs.
*
* Exception: Lazy migration.  The log cleaner will make an IN dirty without
* logging an action that makes it dirty.  This is an exception to the general
* rule that actions should be logged when they cause dirtiness.   The reasons
* this is safe are:
* 1. The IN contents are not modified, so there is no information lost if the
*    IN is never logged, or is logged provisionally and no ancestor is logged
*    non-provisionally.
* 2. If the IN is logged non-provisionally, this will have the side effect of
*    recording the old LSN as being obsolete. However, the general rules for
*    checkpointing and recovery will ensure that the new version is used in
*    the Btree.  The new version will either be replayed by recovery or
*    referenced in the active Btree via a non-provisional ancestor.
*
* Checkpoint Algorithm
* --------------------
* The final checkpointDirtyMap field is used to hold (in addition to the dirty
* INs) the state of the checkpoint and highest flush levels.  Access to this
* object is synchronized so that eviction and checkpointing can access it
* concurrently.  When a checkpoint is not active, the state is CkptState.NONE
* and the dirty map is empty.  When a checkpoint runs, we do this:
*
* 1. Get set of files from cleaner that can be deleted after this checkpoint.
* 2. Set checkpointDirtyMap state to DIRTY_MAP_INCOMPLETE, meaning that dirty
*    map construction is in progress.
* 3. Log CkptStart
* 4. Construct dirty map, organized by Btree level, from dirty INs in INList.
*    The highest flush levels are calculated during dirty map construction.
*    Set checkpointDirtyMap state to DIRTY_MAP_COMPLETE.
* 5. Flush INs in dirty map.
*        + First, flush the bottom two levels a sub-tree at a time, where a
*          sub-tree is one IN at level two and all its BIN children.  Higher
*          levels (above level two) are logged strictly by level, not using
*          subtrees.
*              o If je.checkpointer.highPriority=false, we log one IN at a
*                time, whether or not the IN is logged as part of a subtree,
*                and do a Btree search for the parent of each IN.
*              o If je.checkpointer.highPriority=true, for the bottom two
*                levels we log each sub-tree in a single call to the
*                LogManager with the parent IN latched, and we only do one
*                Btree search for each level two IN.  Higher levels are logged
*                one IN at a time as with highPriority=false.
*        + The Provisional property is set as follows, depending on the level
*          of the IN:
*              o level is max flush level:  Provisional.NO
*              o level is bottom level: Provisional.YES
*              o Otherwise (middle levels): Provisional.BEFORE_CKPT_END
*  6. Flush VLSNIndex cache to make VLSNIndex recoverable.
*  7. Flush UtilizationTracker (write FileSummaryLNs) to persist all
*     tracked obsolete offsets and utilization summary info, to make this info
*     recoverable.
*  8. Log CkptEnd
*  9. Delete cleaned files from step 1.
* 10. Set checkpointDirtyMap state to NONE.
*
* Provisional.BEFORE_CKPT_END
* ---------------------------
* See Provisional.java for a description of the relationship between the
* checkpoint algorithm above and the BEFORE_CKPT_END property.
*
* Coordination of Eviction and Checkpointing
* ------------------------------------------
* Eviction can proceed concurrently with all phases of a checkpoint, and
* eviction may take place concurrently in multiple threads.  This concurrency
* is crucial to avoid blocking application threads that perform eviction and
* to reduce the amount of eviction required in application threads.
*
* Eviction calls Checkpointer.coordinateEvictionWithCheckpoint, which calls
* DirtyINMap.coordinateEvictionWithCheckpoint, just before logging an IN.
* coordinateEvictionWithCheckpoint returns whether the IN should be logged
* provisionally (Provisional.YES) or non-provisionally (Provisional.NO).
*
* Other coordination necessary depends on the state of the checkpoint:
*   + NONE: No additional action.
*      o return Provisional.NO
*   + DIRTY_MAP_INCOMPLETE: The parent IN is added to the dirty map, exactly
*     as if it were encountered as dirty in the INList during dirty map
*     construction.
*      o IN level GTE highest flush level: return Provisional.NO
*      o IN level LT highest flush level: return Provisional.YES
*   + DIRTY_MAP_COMPLETE:
*      o IN is root: return Provisional.NO
*      o IN is not root: return Provisional.YES
*
* In general this is designed so that eviction will use the same provisional
* value that would be used by the checkpoint, as if the checkpoint itself were
* logging the IN.  However, there are several conditions where this is not
* exactly the case.
*
* 1. Eviction may log an IN with Provisional.YES when the IN was not dirty at
*    the time of dirty map creation, if it became dirty afterwards.  In this
*    case, the checkpointer would not have logged the IN at all.  This is safe
*    because the actions that made that IN dirty are logged in the recovery
*    period.
* 2. Eviction may log an IN with Provisional.YES after the checkpoint has
*    logged it, if it becomes dirty again.  In this case the IN is logged
*    twice, which would not have been done by the checkpoint alone.  This is
*    safe because the actions that made that IN dirty are logged in the
*    recovery period.
* 3. An intermediate level IN (not bottom most and not the highest flush
*    level) will be logged by the checkpoint with Provisional.BEFORE_CKPT_END
*    but will be logged by eviction with Provisional.YES.  See below for why
*    this is safe.
* 4. Between checkpoint step 8 (log CkptEnd) and 10 (set checkpointDirtyMap
*    state to NONE), eviction may log an IN with Provisional.YES, although a
*    checkpoint is not strictly active during this interval.  See below for
*    why this is safe.
*
* It is safe for eviction to log an IN as Provisional.YES for the last two
* special cases, because this does not cause incorrect recovery behavior.  For
* recovery to work properly, it is only necessary that:
*
*  + Provisional.NO is used for INs at the max flush level during an active
*    checkpoint.
*  + Provisional.YES or BEFORE_CKPT_END is used for INs below the max flush
*    level, to avoid replaying an IN during recovery that may depend on a file
*    deleted as the result of the checkpoint.
*
* You may ask why we don't use Provisional.YES for eviction when a checkpoint
 * is not active.  There are two reasons, both related to performance:
*
* 1. This would be wasteful when an IN is evicted in between checkpoints, and
*    that portion of the log is processed by recovery later, in the event of a
*    crash.  The evicted INs would be ignored by recovery, but the actions
*    that caused them to be dirty would be replayed and the INs would be
*    logged again redundantly.
 * 2. Logging an IN provisionally will not count the old LSN as obsolete
 *    immediately, so cleaner utilization will be inaccurate until a
 *    non-provisional parent is logged, typically by the next checkpoint.  It
*    is always important to keep the cleaner from stalling and spiking, to
*    keep latency and throughput as level as possible.
*
* Therefore, it is safe to log with Provisional.YES in between checkpoints,
* but not desirable.
*
* Although we don't do this, it would be safe and optimal to evict with
* BEFORE_CKPT_END in between checkpoints, because it would be treated by
* recovery as if it were Provisional.NO.  This is because the interval between
* checkpoints is only processed by recovery if it follows the last CkptEnd,
* and BEFORE_CKPT_END is treated as Provisional.NO if the IN follows the last
* CkptEnd.
*
* However, it would not be safe to evict an IN with BEFORE_CKPT_END during a
* checkpoint, when logging of the IN's ancestors does not occur according to
* the rules of the checkpoint.  If this were done, then if the checkpoint
* completes and is used during a subsequent recovery, an obsolete offset for
* the old version of the IN will mistakenly be recorded.  Below are two cases
* where BEFORE_CKPT_END is used correctly and one showing how it could be used
* incorrectly.
*
* 1. Correct use of BEFORE_CKPT_END when the checkpoint does not complete.
*
*        050 BIN-A
*        060 IN-B parent of BIN-A
*        100 CkptStart
*        200 BIN-A logged with BEFORE_CKPT_END
*        300 FileSummaryLN with obsolete offset for BIN-A at 050
*        Crash and recover
*
*    Recovery will process BIN-A at 200 (it will be considered
*    non-provisional) because there is no following CkptEnd.  It is
*    therefore correct that BIN-A at 050 is obsolete.
*
* 2. Correct use of BEFORE_CKPT_END when the checkpoint does complete.
*
*        050 BIN-A
*        060 IN-B parent of BIN-A
*        100 CkptStart
*        200 BIN-A logged with BEFORE_CKPT_END
*        300 FileSummaryLN with obsolete offset for BIN-A at 050
*        400 IN-B parent of BIN-A, non-provisional
*        500 CkptEnd
*        Crash and recover
*
*    Recovery will not process BIN-A at 200 (it will be considered
*    provisional) because there is a following CkptEnd, but it will
*    process its parent IN-B at 400, and therefore the BIN-A at 200 will be
*    active in the tree.  It is therefore correct that BIN-A at 050 is
*    obsolete.
*
* 3. Incorrect use of BEFORE_CKPT_END when the checkpoint does complete.
*
*        050 BIN-A
*        060 IN-B parent of BIN-A
*        100 CkptStart
*        200 BIN-A logged with BEFORE_CKPT_END
*        300 FileSummaryLN with obsolete offset for BIN-A at 050
*        400 CkptEnd
*        Crash and recover
*
*    Recovery will not process BIN-A at 200 (it will be considered
*    provisional) because there is a following CkptEnd, but no parent
*    IN-B is logged, and therefore the IN-B at 060 and BIN-A at 050 will be
*    active in the tree.  It is therefore incorrect that BIN-A at 050 is
*    obsolete.
*
 * This last case is what caused the LFNF (log file not found error) in SR
 * [#19422], when BEFORE_CKPT_END was mistakenly used for logging evicted BINs
 * via CacheMode.EVICT_BIN.
* During the checkpoint, we evict BIN-A and log it with BEFORE_CKPT_END, yet
* neither it nor its parent are part of the checkpoint.  After being counted
* obsolete, we crash and recover.  Then the file containing the BIN (BIN-A at
* 050 above) is cleaned and deleted.  During cleaning, it is not migrated
* because an obsolete offset was previously recorded.  The LFNF occurs when
* trying to access this BIN during a user operation.
*
* CacheMode.EVICT_BIN
* -------------------
* Unlike in JE 4.0 where EVICT_BIN was first introduced, in JE 4.1 and later
* we do not use special rules when an IN is evicted.  Since concurrent
* eviction and checkpointing are supported in JE 4.1, the above rules apply to
* EVICT_BIN as well as all other types of eviction.
*/
public class Checkpointer extends DaemonThread implements EnvConfigObserver {

    /*
     * We currently use multi-logging whenever practical, but we're keeping an
     * option open to disable it, perhaps via a config param.
     */
    private static final boolean MULTI_LOG = true;

    /**
     * For unit testing only.  Called before we flush the max level.  This
     * field is static because it is called from the static flushIN method.
     */
    public static TestHook maxFlushLevelHook = null;

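    /**
     * For unit testing only.  Called after the dirty map has been constructed
     * and before INs are flushed (see doCheckpoint).
     */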
    public static TestHook beforeFlushHook = null;

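    /**
     * For unit testing only.  Not invoked in this class; judging by its name,
     * presumably called as each IN is examined during dirty map construction.
     */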
    public static TestHook<IN> examineINForCheckpointHook = null;

    private EnvironmentImpl envImpl;

    /* Checkpoint sequence, initialized at recovery. */
    private long checkpointId;

    /*
     * How much the log should grow between checkpoints. If 0, we're using
     * time-based checkpointing.
     */
    private final long logSizeBytesInterval;
    private final long logFileMax;
    private final long timeInterval;
    private long lastCheckpointMillis;

    /* Configured to true to minimize checkpoint duration. */
    private boolean highPriority;

    private long nCheckpoints;
    private long lastCheckpointStart;
    private long lastCheckpointEnd;
    private final FlushStats flushStats;

    /**
     * The DirtyINMap for checkpointing is created once and is reset after each
     * checkpoint is complete.  Access to this object is synchronized so that
     * eviction and checkpointing can access it concurrently.
     */
    private final DirtyINMap checkpointDirtyMap;

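    /**
     * Creates the checkpointer daemon.  The waitTime parameter is the
     * time-based wakeup interval; when CHECKPOINTER_BYTES_INTERVAL is
     * non-zero, byte-based checkpointing takes precedence (see
     * getWakeupPeriod).
     */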
    public Checkpointer(EnvironmentImpl envImpl,
                        long waitTime,
                        String name) {
        super(waitTime, name, envImpl);
        this.envImpl = envImpl;
        logSizeBytesInterval =
            envImpl.getConfigManager().getLong
                (EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL);
        logFileMax =
            envImpl.getConfigManager().getLong(EnvironmentParams.LOG_FILE_MAX);
        timeInterval = waitTime;
        lastCheckpointMillis = 0;

        nCheckpoints = 0;
        flushStats = new FlushStats();

        checkpointDirtyMap = new DirtyINMap(envImpl);

        /* Initialize mutable properties and register for notifications. */
        envConfigUpdate(envImpl.getConfigManager(), null);
        envImpl.addConfigObserver(this);
    }

    /**
     * Process notifications of mutable property changes.
     */
    public void envConfigUpdate(DbConfigManager cm,
                                EnvironmentMutableConfig ignore) {
        highPriority = cm.getBoolean
            (EnvironmentParams.CHECKPOINTER_HIGH_PRIORITY);
    }

    /**
     * Initializes the checkpoint intervals when no checkpoint is performed
     * while opening the environment.
     */
    public void initIntervals(long lastCheckpointStart,
                              long lastCheckpointEnd,
                              long lastCheckpointMillis) {
        this.lastCheckpointStart = lastCheckpointStart;
        this.lastCheckpointEnd = lastCheckpointEnd;
        this.lastCheckpointMillis = lastCheckpointMillis;
    }

    /**
     * Coordinates an eviction with an in-progress checkpoint and returns
     * whether provisional logging is needed.
     *
     * @return true if the target must be logged provisionally.
     */
    public boolean coordinateEvictionWithCheckpoint(IN target, IN parent) {
        return checkpointDirtyMap.
            coordinateEvictionWithCheckpoint(target, parent);
    }

    /**
     * Figure out the wakeup period. Supplied through this static method
     * because we need to pass the wakeup period to the superclass and need to
     * do the calculation outside the constructor.
     *
     * @throws IllegalArgumentException via Environment ctor and
     * setMutableConfig.
     */
    public static long getWakeupPeriod(DbConfigManager configManager)
        throws IllegalArgumentException {

        long wakeupPeriod = configManager.getDuration
            (EnvironmentParams.CHECKPOINTER_WAKEUP_INTERVAL);
        long bytePeriod = configManager.getLong
            (EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL);

        /* Checkpointing period must be set either by time or by log size. */
        if ((wakeupPeriod == 0) && (bytePeriod == 0)) {
            throw new IllegalArgumentException
                (EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL.getName() +
                 " and " +
                 EnvironmentParams.CHECKPOINTER_WAKEUP_INTERVAL.getName() +
                 " cannot both be 0. ");
        }

        /*
         * Checkpointing by log size takes precedence over time-based period.
         */
        if (bytePeriod == 0) {
            return wakeupPeriod;
        } else {
            return 0;
        }
    }

    /**
     * Set checkpoint id -- can only be done after recovery.
     */
    public synchronized void setCheckpointId(long lastCheckpointId) {
        checkpointId = lastCheckpointId;
    }

    /**
     * Load stats.
     */
    public StatGroup loadStats(StatsConfig config) {
        StatGroup stats = new StatGroup(GROUP_NAME, GROUP_DESC);
        new LongStat(stats, CKPT_LAST_CKPTID, checkpointId);
        new LongStat(stats, CKPT_CHECKPOINTS, nCheckpoints);
        new LSNStat(stats, CKPT_LAST_CKPT_START, lastCheckpointStart);
        new LSNStat(stats, CKPT_LAST_CKPT_END, lastCheckpointEnd);
        new LongStat(stats, CKPT_FULL_IN_FLUSH, flushStats.nFullINFlush);
        new LongStat(stats, CKPT_FULL_BIN_FLUSH, flushStats.nFullBINFlush);
        new LongStat(stats, CKPT_DELTA_IN_FLUSH, flushStats.nDeltaINFlush);

        if (config.getClear()) {
            nCheckpoints = 0;
            flushStats.nFullINFlush = 0;
            flushStats.nFullBINFlush = 0;
            flushStats.nDeltaINFlush = 0;
        }

        return stats;
    }

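    /**
     * Drops this daemon's reference to the environment.
     */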
    public synchronized void clearEnv() {
        envImpl = null;
    }

    /**
     * Return the number of retries when a deadlock exception occurs.
     */
    @Override
    protected long nDeadlockRetries() {
        return envImpl.getConfigManager().getInt
            (EnvironmentParams.CHECKPOINTER_RETRY);
    }

    /**
     * Called whenever the DaemonThread wakes up from a sleep.
     */
    @Override
    protected void onWakeup()
        throws DatabaseException {

        if (envImpl.isClosed()) {
            return;
        }

        doCheckpoint(CheckpointConfig.DEFAULT, "daemon");
    }

    /**
     * Wakes up the checkpointer if a checkpoint log interval is configured and
     * the number of bytes written since the last checkpoint exceeds the size
     * of the interval.
     */
    public void wakeupAfterWrite() {
        if (logSizeBytesInterval != 0) {
            long nextLsn = envImpl.getFileManager().getNextLsn();
            if (DbLsn.getNoCleaningDistance
                    (nextLsn, lastCheckpointStart, logFileMax) >=
                    logSizeBytesInterval) {
                wakeup();
            }
        }
    }

    /**
     * Determine whether a checkpoint should be run.
     *
     * 1. If the force parameter is specified, always checkpoint.
     *
     * 2. If the config object specifies time or log size, use that.
     *
     * 3. If the environment is configured to use log size based checkpointing,
     * check the log.
     *
     * 4. Lastly, use time-based checking.
     */
    private boolean isRunnable(CheckpointConfig config) {
        /* Figure out if we're using log size or time to determine interval.*/
        long useBytesInterval = 0;
        long useTimeInterval = 0;
        long nextLsn = DbLsn.NULL_LSN;
        boolean runnable = false;
        try {
            if (config.getForce()) {
                runnable = true;
                return runnable;
            } else if (config.getKBytes() != 0) {
                useBytesInterval = config.getKBytes() << 10;
            } else if (config.getMinutes() != 0) {
                // convert to millis
                useTimeInterval = config.getMinutes() * 60 * 1000;
            } else if (logSizeBytesInterval != 0) {
                useBytesInterval = logSizeBytesInterval;
            } else {
                useTimeInterval = timeInterval;
            }

            /*
             * If our checkpoint interval is defined by log size, check on how
             * much log has grown since the last checkpoint.
             */
            if (useBytesInterval != 0) {
                nextLsn = envImpl.getFileManager().getNextLsn();
                if (DbLsn.getNoCleaningDistance(nextLsn, lastCheckpointStart,
                                                logFileMax) >=
                    useBytesInterval) {
                    runnable = true;
                }
            } else if (useTimeInterval != 0) {

                /*
                 * Our checkpoint is determined by time.  If enough time has
                 * passed and some log data has been written, do a checkpoint.
                 */
                long lastUsedLsn = envImpl.getFileManager().getLastUsedLsn();
                if (((System.currentTimeMillis() - lastCheckpointMillis) >=
                     useTimeInterval) &&
                    (DbLsn.compareTo(lastUsedLsn, lastCheckpointEnd) != 0)) {
                    runnable = true;
                }
            }
            return runnable;
        } finally {
            StringBuilder sb = new StringBuilder();
            sb.append("size interval=").append(useBytesInterval);
            if (nextLsn != DbLsn.NULL_LSN) {
                sb.append(" nextLsn=").
                    append(DbLsn.getNoFormatString(nextLsn));
            }
            if (lastCheckpointEnd != DbLsn.NULL_LSN) {
                sb.append(" lastCkpt=");
                sb.append(DbLsn.getNoFormatString(lastCheckpointEnd));
            }
            sb.append(" time interval=").append(useTimeInterval);
            sb.append(" force=").append(config.getForce());
            sb.append(" runnable=").append(runnable);

            LoggerUtils.finest(logger, envImpl, sb.toString());
        }
    }

    /**
     * The real work to do a checkpoint. This may be called by the checkpoint
     * thread when waking up, or it may be invoked programmatically through
     * the API.
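     *
     * For example (an illustrative sketch only, using the public API rather
     * than calling this internal method directly), an application can force a
     * checkpoint as follows, where env is an open Environment handle:
     *
     *     CheckpointConfig config = new CheckpointConfig();
     *     config.setForce(true);
     *     env.checkpoint(config);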
     *
     * @param invokingSource a debug aid, to indicate who invoked this
     *       checkpoint. (e.g., recovery, the checkpointer daemon, the cleaner,
     *       or programmatically)
     */
    public synchronized void doCheckpoint(CheckpointConfig config,
                                          String invokingSource)
        throws DatabaseException {

        if (envImpl.isReadOnly()) {
            return;
        }

        if (!isRunnable(config)) {
            return;
        }

        /*
         * If minimizing recovery time is desired, then flush all the way to
         * the top of the dbtree instead of stopping at the highest level last
         * modified, so that only the root INs are processed by recovery.
         */
        final boolean flushAll = config.getMinimizeRecoveryTime();
       
        /*
         * Since writing deltas does not impact recovery time (they are
         * provisional and not processed), and LN replay is not impacted either
         * (it is impacted only by deltas written earlier), always allow
         * deltas.  This parameter is kept only in case we wish to add an
         * option later for disallowing deltas.
         */
        final boolean allowDeltas = true;

        /*
         * If there are cleaned files to be deleted, flush an extra level to
         * write out the parents of cleaned nodes.  This ensures that no node
         * will refer to an LSN in a cleaned file.
         */
        boolean flushExtraLevel = false;
        Cleaner cleaner = envImpl.getCleaner();
        CheckpointStartCleanerState cleanerState =
            cleaner.getFilesAtCheckpointStart();
        if (!cleanerState.isEmpty()) {
            flushExtraLevel = true;
        }

        lastCheckpointMillis = System.currentTimeMillis();
        flushStats.resetPerRunCounters();

        /* Get the next checkpoint id. */
        checkpointId++;
        nCheckpoints++;

        boolean success = false;
        boolean traced = false;

        LogManager logManager = envImpl.getLogManager();

        /*
         * Set the checkpoint state so that concurrent eviction can be
         * coordinated.
         */
        checkpointDirtyMap.beginCheckpoint(flushAll, flushExtraLevel);
        try {

            /*
             * Eviction can run during checkpoint as long as it follows the
             * same rules for using provisional logging and for propagating
             * logging of the checkpoint dirty set up the tree. We have to lock
             * out the evictor after the logging of checkpoint start until
             * we've selected the dirty set and decided on the highest level to
             * be flushed. See SR 11163, 11349.
             */
            long checkpointStart = DbLsn.NULL_LSN;
            long firstActiveLsn = DbLsn.NULL_LSN;

            /* Log the checkpoint start. */
            SingleItemEntry startEntry =
                new SingleItemEntry(LogEntryType.LOG_CKPT_START,
                                    new CheckpointStart(checkpointId,
                                                        invokingSource));
            checkpointStart =
                logManager.log(startEntry, ReplicationContext.NO_REPLICATE);

            /*
             * Note the first active LSN point. The definition of
             * firstActiveLsn is that all log entries for active transactions
             * are equal to or after that LSN.  This is the starting point for
             * replaying LNs during recovery and will be stored in the CkptEnd
             * entry.
             *
             * Use the checkpointStart as the firstActiveLsn if firstActiveLsn
             * is null, meaning that no txns are active.
             *
             * The current value must be retrieved from TxnManager after
             * logging CkptStart. If it were instead retrieved before logging
             * CkptStart, the following failure could occur.  [#20270]
             *
             *  ... getFirstActiveLsn returns NULL_LSN, will use 200 CkptStart
             *  100 LN-A in Txn-1
             *  200 CkptStart
             *  300 BIN-B refers to 100 LN-A
             *  400 CkptEnd
             *  ... Crash and recover.  Recovery does not undo 100 LN-A.
             *  ... Txn-1 is uncommitted, yet 100 LN-A takes effect.
             */
            firstActiveLsn = envImpl.getTxnManager().getFirstActiveLsn();
            if (firstActiveLsn == DbLsn.NULL_LSN) {
                firstActiveLsn = checkpointStart;
            }
               
            /*
             * In a replicated system, the checkpointer will be flushing out
             * the VLSNIndex, which is HA metadata. Check that the in-memory
             * version encompasses all metadata up to the point of the
             * CheckpointStart record. This is no-op for non-replicated
             * systems. [#19754]
             */
            envImpl.awaitVLSNConsistency();

            /* Find the set of dirty INs that must be logged. */
            checkpointDirtyMap.selectDirtyINsForCheckpoint();

            /* Call hook after dirty map creation and before flushing. */
            TestHookExecute.doHookIfSet(beforeFlushHook);

            /* Flush IN nodes. */
            flushDirtyNodes(envImpl, checkpointDirtyMap, allowDeltas,
                            checkpointStart, highPriority, flushStats);

            /*
             * Flush MapLNs if not already done by flushDirtyNodes.  Only flush
             * a database if it has not already been flushed since checkpoint
             * start.  Lastly, flush the DB mapping tree root.
             */
            checkpointDirtyMap.flushMapLNs(checkpointStart);
            checkpointDirtyMap.flushRoot(checkpointStart);

            /*
             * Flush replication information if necessary so that the VLSNIndex
             * cache is flushed and is recoverable.
             */
            envImpl.preCheckpointEndFlush();

            /*
             * Flush utilization info AFTER flushing IN nodes to reduce the
             * inaccuracies caused by the sequence FileSummaryLN-LN-BIN.
             */
            envImpl.getUtilizationProfile().flushFileUtilization
                (envImpl.getUtilizationTracker().getTrackedFiles());

            DbTree dbTree = envImpl.getDbTree();
            boolean willDeleteFiles = !cleanerState.isEmpty();
            CheckpointEnd ckptEnd = new CheckpointEnd
                (invokingSource, checkpointStart, envImpl.getRootLsn(),
                 firstActiveLsn,
                 envImpl.getNodeSequence().getLastLocalNodeId(),
                 envImpl.getNodeSequence().getLastReplicatedNodeId(),
                 dbTree.getLastLocalDbId(), dbTree.getLastReplicatedDbId(),
                 envImpl.getTxnManager().getLastLocalTxnId(),
                 envImpl.getTxnManager().getLastReplicatedTxnId(),
                 checkpointId, willDeleteFiles, cleaner.getLogSummary());

            SingleItemEntry endEntry =
                new SingleItemEntry(LogEntryType.LOG_CKPT_END, ckptEnd);

            /*
             * Log checkpoint end and update state kept about the last
             * checkpoint location. Send a trace message *before* the
             * checkpoint end log entry. This is done so that the normal trace
             * message doesn't affect the time-based isRunnable() calculation,
             * which only issues a checkpoint if a log record has been written
             * since the last checkpoint.
             */
            trace(envImpl, invokingSource, true);
            traced = true;

            /*
             * Always flush to ensure that cleaned files are not referenced,
             * and to ensure that this checkpoint is not wasted if we crash.
             */
            lastCheckpointEnd =
                logManager.logForceFlush(endEntry,
                                         true /*fsyncRequired*/,
                                         ReplicationContext.NO_REPLICATE);

            lastCheckpointStart = checkpointStart;

            success = true;
            cleaner.updateFilesAtCheckpointEnd(cleanerState);

        } catch (DatabaseException e) {
            LoggerUtils.traceAndLogException(envImpl, "Checkpointer",
                                             "doCheckpoint", "checkpointId=" +
                                             checkpointId, e);
            throw e;
        } finally {

            /*
             * Reset the checkpoint state so evictor activity knows there's no
             * further requirement for provisional logging. SR 11163.
             */
            checkpointDirtyMap.reset();

            if (!traced) {
                trace(envImpl, invokingSource, success);
            }
        }
    }

    private void trace(EnvironmentImpl envImpl,
                       String invokingSource,
                       boolean success) {
        StringBuilder sb = new StringBuilder();
        sb.append("Checkpoint ").append(checkpointId);
        sb.append(": source=").append(invokingSource);
        sb.append(" success=").append(success);
        sb.append(" nFullINFlushThisRun=");
        sb.append(flushStats.nFullINFlushThisRun);
        sb.append(" nDeltaINFlushThisRun=");
        sb.append(flushStats.nDeltaINFlushThisRun);
        LoggerUtils.logMsg(logger, envImpl, Level.CONFIG, sb.toString());
    }

    /**
     * Flush a given database to disk. Like checkpoint, log from the bottom
     * up so that parents properly represent their children.
     */
    public void syncDatabase(EnvironmentImpl envImpl,
                             DatabaseImpl dbImpl,
                             boolean flushLog)
        throws DatabaseException {

        if (envImpl.isReadOnly()) {
            return;
        }

        DirtyINMap dirtyMap = new DirtyINMap(envImpl);
        FlushStats fstats = new FlushStats();
        try {
            /* Find the dirty set. */
            dirtyMap.selectDirtyINsForDbSync(dbImpl);

            if (dirtyMap.getNumEntries() > 0) {
                /* Write all dirty INs out. */
                flushDirtyNodes
                    (envImpl,
                     dirtyMap,
                     false /*allowDeltas*/,
                     DbLsn.NULL_LSN /*ckptStart*/,
                     false /*highPriority*/,
                     fstats);

                /* Make changes durable. [#15254] */
                if (flushLog) {
                    envImpl.getLogManager().flush();
                }
            }
        } catch (DatabaseException e) {
            LoggerUtils.traceAndLogException
                (envImpl, "Checkpointer", "syncDatabase",
                 "of " + dbImpl.getDebugName(), e);
            throw e;
        } finally {
            dirtyMap.reset();
        }
    }

    /* For unit testing only. */
    public static void setMaxFlushLevelHook(TestHook hook) {
        maxFlushLevelHook = hook;
    }

    /* For unit testing only. */
    public static void setBeforeFlushHook(TestHook hook) {
        beforeFlushHook = hook;
    }

    /**
     * Flush the nodes in order, from the lowest level to highest level.  As a
     * flush dirties its parent, add it to the dirty map, thereby cascading the
     * writes up the tree. If flushAll wasn't specified, we need only cascade
     * up to the highest level set at the start of checkpointing.
     *
     * Note that all but the top level INs are logged provisionally. That's
     * because we don't need to process lower INs during recovery because the
     * higher INs will end up pointing at them.
     */
    private static void flushDirtyNodes(EnvironmentImpl envImpl,
                                        DirtyINMap dirtyMap,
                                        boolean allowDeltas,
                                        long checkpointStart,
                                        boolean highPriority,
                                        FlushStats fstats)
        throws DatabaseException {

        LogManager logManager = envImpl.getLogManager();
        DbTree dbTree = envImpl.getDbTree();

        Map<DatabaseId, DatabaseImpl> dbCache =
            new HashMap<DatabaseId, DatabaseImpl>();
        try {
            while (dirtyMap.getNumLevels() > 0) {

                /*
                 * Work on one level's worth of nodes in ascending level order.
                 */
                Integer currentLevel = dirtyMap.getLowestLevelSet();
                int currentLevelVal = currentLevel.intValue();

                /*
                 * Flush MapLNs just prior to flushing the first level of the
                 * mapping tree.  Only flush a database if it has not already
                 * been flushed since checkpoint start.
                 */
                if (currentLevelVal == IN.DBMAP_LEVEL) {
                    dirtyMap.flushMapLNs(checkpointStart);
                }

                /* Flush the nodes at the current level. */
                while (true) {
                    CheckpointReference targetRef =
                        dirtyMap.removeNextNode(currentLevel);
                    if (targetRef == null) {
                        break;
                    }

                    /*
                     * Check to make sure the DB was not deleted after putting
                     * it in the dirty map, and prevent the DB from being
                     * deleted while we're working with it.
                     */
                    DatabaseImpl db = dbTree.getDb
                        (targetRef.dbId, -1 /*lockTimeout*/, dbCache);
                    if (db != null && !db.isDeleted()) {

                        /* Flush if we're below maxFlushLevel. */
                        int maxFlushLevel = dirtyMap.getHighestFlushLevel(db);
                        if (currentLevelVal <= maxFlushLevel) {

                            /* Evict before each operation. */
                            envImpl.daemonEviction(true /*backgroundIO*/);

                            flushIN
                                (envImpl, db, logManager, targetRef, dirtyMap,
                                 currentLevelVal, maxFlushLevel, allowDeltas,
                                 highPriority, fstats,
                                 true /*allowLogSubtree*/);

                            /*
                             * Sleep if background read/write limit was
                             * exceeded.
                             */
                            envImpl.sleepAfterBackgroundIO();
                        }
                    }
                }

                /* We're done with this level. */
                dirtyMap.removeLevel(currentLevel);
            }
        } finally {
            dbTree.releaseDbs(dbCache);
        }

        /*
         * Do not flush FileSummaryLNs/MapLNs (do not call
         * UtilizationProfile.flushLocalTracker) here because that flushing is
         * already done by the checkpoint.
         */
    }

    /**
     * Flush the target IN.
     *
     * Where applicable, also attempt to flush the subtree that houses this
     * target, which means we flush the siblings of this target to promote
     * better cleaning throughput. The problem lies in the fact that
     * provisionally logged nodes are not available for log cleaning until
     * their parent is logged non-provisionally.  On the other hand, we want to
     * log nodes in provisional mode as much as possible, both for recovery
     * performance, and for correctness to avoid fetches against cleaned log
     * files. (See [#16037].) These conflicting goals are reconciled by
     * flushing nodes in subtree grouping, because writing the non-provisional
     * parent of a set of provisionally written nodes frees the cleaner to work
     * on that set of provisional nodes as soon as possible. For example, if a
     * tree consists of:
     *
     *             INa
     *       +------+-------+
     *      INb            INc
     * +-----+----+         +-----+
     * BINd BINe BINf      BINg BINh
     *
     * It is more efficient for cleaning throughput to log in this order:
     *       BINd, BINe, BINf, INb, BINg, BINh, INc, INa
     * rather than:
     *       BINd, BINe, BINf, BINg, BINh, INb, INc, INa
     *
     * Suppose the subtree in question is INb->{BINd, BINe, BINf}
     *
     * Suppose we see BINd in the dirty map first, before BINe and BINf.
     *  - flushIN(BINd) is called
     *  - we fetch and latch its parent, INb
     *
     * If this is a high priority checkpoint, we'll hold the INb latch across
     * the time it takes to flush all three children.  In flushIN(BINd), we
     * walk through INb, create a local map of all the siblings that can be
     * found in the dirty map, and then call logSiblings with that local map.
     * Then we'll write out INb.
     *
     * If high priority is false, we will not hold the INb latch across
     * multiple IOs. Instead, we
     *  - write BINd out, using logSiblings
     *  - while still holding the INb latch, we create a local map of dirty
     *    siblings
     *  - release the INb latch
     *  - call flushIN() recursively on each entry in the local sibling map,
     *    which will result in a search and write of each sibling.  These
     *    recursive calls to flushIN are called with the allowLogSubtree
     *    parameter of false to halt the recursion and prevent a repeat of the
     *    sibling examination.
     *  - write INb
     */
    private static void flushIN(EnvironmentImpl envImpl,
                                DatabaseImpl db,
                                LogManager logManager,
                                CheckpointReference targetRef,
                                DirtyINMap dirtyMap,
                                int currentLevel,
                                int maxFlushLevel,
                                boolean allowDeltas,
                                boolean highPriority,
                                FlushStats fstats,
                                boolean allowLogSubtree)
        throws DatabaseException {

        /* Call test hook when we reach the max level. */
        assert (currentLevel < maxFlushLevel) ||
            TestHookExecute.doHookIfSet(maxFlushLevelHook);

        Tree tree = db.getTree();
        boolean targetWasRoot = false;
        if (targetRef.isDbRoot) {

            /* We're trying to flush the root. */
            RootFlusher flusher =
                new RootFlusher(db, logManager, targetRef.nodeId);
            tree.withRootLatchedExclusive(flusher);
            boolean flushed = flusher.getFlushed();

            /*
             * If this target isn't the root anymore, we'll have to handle it
             * like a regular node.
             */
            targetWasRoot = flusher.stillRoot();

            /*
             * Update the tree's owner, whether it's the env root or the
             * dbmapping tree.
             */
            if (flushed) {
                DbTree dbTree = envImpl.getDbTree();
                dbTree.modifyDbRoot(db);
                fstats.nFullINFlushThisRun++;
                fstats.nFullINFlush++;
            }
        }

        /*
         * The following attempt to flush applies to two cases:
         *
         * (1) the target was not ever the root
         *
         * (2) the target was the root when the checkpoint dirty set was
         * assembled, but is not the root now.
         */
        if (!targetWasRoot) {

            /*
             * The "isRoot" param is used to stop a search in
             * BIN.descendOnParentSearch and is passed as false (never stop).
             */
            SearchResult result =
                tree.getParentINForChildIN(targetRef.nodeId,
                                           false,  // isRoot
                                           targetRef.treeKey,
                                           false,  // requireExactMatch
                                           CacheMode.UNCHANGED,
                                           -1,     // targetLevel
                                           null,   // trackingList
                                           false); // doFetch

            /*
             * We must make sure that every IN that was selected for the
             * checkpointer's dirty IN set at the beginning of checkpoint is
             * written into the log and can be properly accessed from
             * ancestors. However, we have to take care for cases where the
             * evictor has written out a member of this dirty set before the
             * checkpointer got to it. See SR 10249.
             *
             * If no possible parent is found, the compressor may have deleted
             * this item before we got to processing it.
             */
            if (result.parent != null) {
                IN parent = result.parent;
                int parentLevel = parent.getLevel();
                boolean mustLogParent = false;

                /*
                 * If bottomLevelTarget is true, the parent IN contains bottom
                 * level BINs.  The masking is used to normalize the level for
                 * ordinary DBs and the mapping tree DB.
                 */
                boolean bottomLevelTarget =
                    ((parentLevel & IN.LEVEL_MASK) == 2);

                /*
                 * INs at the max flush level are always non-provisional and
                 * INs at the bottom level (when this is not also the max flush
                 * level) are always provisional.  In-between INs are logged
                 * with Provisional.BEFORE_CKPT_END (see Provisional).
                 */
                Provisional provisional;
                if (currentLevel >= maxFlushLevel) {
                    provisional = Provisional.NO;
                } else if (bottomLevelTarget) {
                    provisional = Provisional.YES;
                } else {
                    provisional = Provisional.BEFORE_CKPT_END;
                }

                /*
                 * Log a sub-tree when the target is at the bottom level and
                 * this is not a recursive call to flushIN during sub-tree
                 * logging.
                 */
                boolean logSubtree = bottomLevelTarget && allowLogSubtree;

                /*
                 * Log sub-tree siblings with the latch held when highPriority
                 * is configured and this is not a DW DB.  For a DW DB, dirty
                 * LNs are logged for each BIN.  If we were to log a DW
                 * sub-tree with the parent latch held, the amount of logging
                 * may cause the latch to be held for too long a period.
                 */
                boolean logSiblingsWithParentLatchHeld =
                    logSubtree &&
                    highPriority &&
                    !db.isDurableDeferredWrite();

                /*
                 * If we log siblings with the parent latch held, we log the
                 * target along with other siblings so we can perform a single
                 * multi-log call for all siblings.
                 */
                boolean logTargetWithOtherSiblings = false;

                /*
                 * Map of node ID to parent index for each sibling to log.  We
                 * must process the siblings in node ID order during multi-log,
                 * so that latching order is deterministic and only in one
                 * direction.
                 */
                SortedMap<Long, Integer> siblingsToLog = null;

                try {
                    if (result.exactParentFound) {

                        /*
                         * If the child has already been evicted, don't
                         * refetch it.
                         */
                        IN renewedTarget = (IN) parent.getTarget(result.index);

                        if (renewedTarget == null) {
                            /* nAlreadyEvictedThisRun++;  -- for future */
                            mustLogParent |= true;
                        } else {
                            if (logSiblingsWithParentLatchHeld) {
                                logTargetWithOtherSiblings = true;
                            } else {
                                mustLogParent |= logSiblings
                                    (envImpl, dirtyMap, parent,
                                     Collections.singleton(result.index),
                                     allowDeltas, highPriority, provisional,
                                     fstats);
                            }
                        }
                    } else {
                        /* result.exactParentFound was false. */

                        /* Do not flush children of the inexact parent. */
                        logSubtree = false;

                        if (result.childNotResident) {

                            /*
                             * But it was because the child wasn't resident.
                             * To be on the safe side, we'll put the parent
                             * into the dirty set to be logged when that level
                             * is processed.
                             *
                             * Only do this if the parent we found is at a
                             * higher level than the child.  This ensures that
                             * the non-exact search does not find a sibling
                             * rather than a parent. [#11555]
                             */
                            if (parentLevel > currentLevel) {
                                mustLogParent |= true;
                            }
                            /* nAlreadyEvictedThisRun++; -- for future. */
                        }
                    }

                    if (logSubtree) {

                        /*
                         * Create a map of node ID to parent index for each
                         * sibling we intend to log.  Note that the dirty map
                         * does not contain targetRef (the sibling we're
                         * processing) because it was removed before calling
                         * this method, but it is added to the map below.
                         *
                         * A TreeMap (sorted map) is used so that siblings are
                         * latched in node ID order.  A deterministic order is
                         * needed to avoid deadlocks, if siblings are latched
                         * in multiple threads in the future.
                         */
                        siblingsToLog = new TreeMap<Long, Integer>();
                        for (int index = 0;
                             index < parent.getNEntries();
                             index += 1) {
                            IN child = (IN) parent.getTarget(index);
                            if (child != null) {
                                Long childId = child.getNodeId();
                                if ((logTargetWithOtherSiblings &&
                                     targetRef.nodeId ==
                                     childId.longValue()) ||
                                    dirtyMap.containsNode
                                        (child.getLevel(), childId)) {
                                    siblingsToLog.put(childId, index);
                                }
                            }
                        }

                        if (logSiblingsWithParentLatchHeld) {
                            if (MULTI_LOG) {
                                mustLogParent |= logSiblings
                                    (envImpl, dirtyMap, parent,
                                     siblingsToLog.values(), allowDeltas,
                                     highPriority, provisional, fstats);
                            } else {
                                for (int index : siblingsToLog.values()) {
                                    IN child = (IN) parent.getTarget(index);
                                    CheckpointReference childRef =
                                        (targetRef.nodeId ==
                                         child.getNodeId()) ? targetRef :
                                        dirtyMap.removeNode(child.getLevel(),
                                                            child.getNodeId());
                                    assert childRef != null;
                                    mustLogParent |= logSiblings
                                        (envImpl, dirtyMap, parent,
                                         Collections.singleton(index),
                                         allowDeltas, highPriority,
                                         provisional, fstats);
                                }
                            }
                            /* Siblings have been logged, do not log below. */
                            siblingsToLog = null;
                        }
                    }

                    if (mustLogParent) {
                        assert checkParentChildRelationship(result,
                                                            currentLevel) :
                               dumpParentChildInfo(result, parent,
                                                   targetRef.nodeId,
                                                   currentLevel, tree);
                        /*
                         * Add the parent IN to the dirty map unconditionally,
                         * even if not dirty, to cause changes to propagate
                         * upward even when a node has been evicted and
                         * refetched and is no longer dirty. [#16523]
                         */
                        dirtyMap.addIN(parent, true /*updateMemoryBudget*/);
                    }
                } finally {
                    parent.releaseLatch();
                }

                /*
                 * If highPriority is false, we don't hold the latch while
                 * logging the bottom level siblings.  We log them here with
                 * flushIN, performing a separate search for each one, after
                 * releasing the parent latch above.
                 */
                if (siblingsToLog != null) {
                    assert logSubtree;
                    assert !logSiblingsWithParentLatchHeld;
                    for (long childId : siblingsToLog.keySet()) {
                        assert targetRef.nodeId != childId;
                        CheckpointReference childRef =
                            dirtyMap.removeNode(currentLevel, childId);
                        if (childRef != null) {
                            flushIN
                                (envImpl, db, logManager, childRef,
                                 dirtyMap, currentLevel, maxFlushLevel,
                                 allowDeltas, highPriority, fstats,
                                 false /*allowLogSubtree*/);
                        }
                    }
                }

                /*
                 * Log the sub-tree parent, which will be logged
                 * non-provisionally, in order to update cleaner utilization.
                 * This must be done with flushIN after releasing the parent
                 * latch above, since we must search and acquire the
                 * grandparent latch.
                 */
                if (logSubtree && parentLevel <= maxFlushLevel) {
                    CheckpointReference parentRef = dirtyMap.removeNode
                        (parentLevel, parent.getNodeId());
                    if (parentRef != null) {
                        flushIN
                            (envImpl, db, logManager, parentRef, dirtyMap,
                             parentLevel, maxFlushLevel, allowDeltas,
                             highPriority, fstats, false /*allowLogSubtree*/);
                    }
                }
            }
        }
    }

    /**
     * @return true if this parent is appropriately 1 level above the child.
     */
    private static boolean checkParentChildRelationship(SearchResult result,
                                                        int childLevel) {

        if (result.childNotResident && !result.exactParentFound) {

            /*
             * This might be coming from the #11555 clause, in which case we
             * are logging over-cautiously, but intentionally, and the levels
             * might not pass the test below.
             */
            return true;
        }

        /* The parent must be child level + 1 */
        return result.parent.getLevel() == (childLevel + 1);
    }

    private static String dumpParentChildInfo(SearchResult result,
                                               IN parent,
                                               long childNodeId,
                                               int currentLevel,
                                               Tree tree) {
        StringBuilder sb = new StringBuilder();
        /*        sb.append("ckptId=").append(checkpointId); */
        sb.append(" result=").append(result);
        sb.append(" parent node=").append(parent.getNodeId());
        sb.append(" level=").append(parent.getLevel());
        sb.append(" child node=").append(childNodeId);
        sb.append(" level=").append(currentLevel);
        return sb.toString();
    }

    private static boolean logSiblings(EnvironmentImpl envImpl,
                                       DirtyINMap dirtyMap,
                                       IN parent,
                                       Collection<Integer> indicesToLog,
                                       boolean allowDeltas,
                                       boolean highPriority,
                                       Provisional provisional,
                                       FlushStats fstats)
        throws DatabaseException {

        LogManager logManager = envImpl.getLogManager();

        INLogContext context = new INLogContext();
        context.nodeDb = parent.getDatabase();
        context.backgroundIO = true;
        context.allowDeltas = allowDeltas;
        /* Allow compression of deleted slots in full version BINs.  */
        context.allowCompress = true;

        boolean mustLogParent = false;
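
        /*
         * Holds one item per latched child to be logged; the latches are
         * released in the finally block below.
         */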
        List<INLogItem> itemList = new ArrayList<INLogItem>();

        try {
            for (int index : indicesToLog) {
                IN child = (IN) parent.getTarget(index);

                /* Remove it from dirty map if it is present. */
                dirtyMap.removeNode(child.getLevel(), child.getNodeId());

                /*
                 * Latch and add item with valid parentIndex, so we will
                 * release the latch in the finally statement.
                 */
                child.latch(CacheMode.UNCHANGED);
                INLogItem item = new INLogItem();
                item.parentIndex = index;
                itemList.add(item);

                if (child.getDirty()) {

                    if (child.getDatabase().isDurableDeferredWrite()) {

                        /*
                         * Find dirty descendants to avoid logging nodes with
                         * never-logged children. See [#13936] and
                         * IN.logDirtyChildren for description of the case.
                         *
                         * Note that we must log both dirty and never-logged
                         * descendants to be sure to have a consistent view of
                         * the split. If we didn't, we could end up with the
                         * post-split version of a new sibling and the
                         * pre-split version of a split sibling in the log,
                         * which could result in a recovery where descendants
                         * are incorrectly duplicated, because they appear in
                         * both the pre-split sibling and the post-split
                         * version of the new sibling.
                         */
                        child.logDirtyChildren();
                    }

                    /* Set default params. */
                    item.provisional = provisional;
                    item.repContext = ReplicationContext.NO_REPLICATE;
                    item.parent = parent;

                    /*
                     * Allow child to perform "before log" processing.  Note
                     * that child decides whether to log a delta. Only BINs
                     * that fall into the required percentages and have not
                     * been cleaned will be logged with a delta.
                     */
                    child.beforeLog(logManager, item, context);
                } else {
                    /* Do not process if not dirty.  Unlatch now. */
                    itemList.remove(itemList.size() - 1);
                    child.releaseLatch();

                    /* Log parent if child has already been flushed. */
                    mustLogParent = true;
                }
            }

            /*
             * Log all siblings at once.  Limitations of Java generics prevent
             * conversion from List<INLogItem> to List<LogItem> even by
             * casting, so we convert to an array instead.
             */
            LogItem[] itemArray = new LogItem[itemList.size()];
            logManager.multiLog(itemList.toArray(itemArray), context);

            for (INLogItem item : itemList) {
                IN child = (IN) parent.getTarget(item.parentIndex);

                /* Allow child to perform "after log" processing. */
                child.afterLog(logManager, item, context);

                /* Update the parent slot's LSN. */
                assert (item.newLsn != DbLsn.NULL_LSN);
                parent.updateEntry(item.parentIndex, item.newLsn);

                /* Increment stats. */
                if (item.isDelta) {
                    fstats.nDeltaINFlushThisRun++;
                    fstats.nDeltaINFlush++;
                } else {
                    fstats.nFullINFlushThisRun++;
                    fstats.nFullINFlush++;
                    if (child.isBIN()) {
                        fstats.nFullBINFlush++;
                        fstats.nFullBINFlushThisRun++;
                    }
                }

                /* Parent slot has changed, must log parent. */
                mustLogParent = true;
            }
            return mustLogParent;
        } finally {
            for (INLogItem item : itemList) {
                IN child = (IN) parent.getTarget(item.parentIndex);
                child.releaseLatch();
            }
        }
    }

    /*
     * RootFlusher lets us write out the root IN while holding the root latch.
     */
    private static class RootFlusher implements WithRootLatched {
        private final DatabaseImpl db;
        /* True if the target root IN was dirty and was logged by doWork. */
        private boolean flushed;
        /* True if the target node is still the root of the tree. */
        private boolean stillRoot;
        private final LogManager logManager;
        private final long targetNodeId;

        RootFlusher(DatabaseImpl db,
                    LogManager logManager,
                    long targetNodeId) {
            this.db = db;
            flushed = false;
            this.logManager = logManager;
            this.targetNodeId = targetNodeId;
            stillRoot = false;
        }

        /**
         * Flush the rootIN if dirty.
         */
        public IN doWork(ChildReference root)
            throws DatabaseException {

            if (root == null) {
                return null;
            }
            IN rootIN = (IN) root.fetchTarget(db, null);
            rootIN.latch(CacheMode.UNCHANGED);
            try {
                if (rootIN.getNodeId() == targetNodeId) {

                    /*
                     * Find dirty descendants to avoid logging nodes with
                     * never-logged children. See [#13936]
                     */
                    if (rootIN.getDatabase().isDurableDeferredWrite()) {
                        rootIN.logDirtyChildren();
                    }

                    /*
                     * stillRoot handles the situation where the root was split
                     * after it was placed in the checkpointer's dirty set.
                     */
                    stillRoot = true;
                    if (rootIN.getDirty()) {
                        long newLsn = rootIN.log(logManager);
                        root.setLsn(newLsn);
                        flushed = true;
                    }
                }
            } finally {
                rootIN.releaseLatch();
            }
            return null;
        }

        boolean getFlushed() {
            return flushed;
        }

        boolean stillRoot() {
            return stillRoot;
        }
    }

    /*
     * CheckpointReferences are used to identify nodes that must be flushed as
     * part of the checkpoint. We don't keep an actual reference to the node
     * because that would prevent the node from being GC'ed during the
     * checkpoint.
     *
     * Using a CheckpointReference introduces a window between the point when
     * the checkpoint dirty set is created and when the node is flushed. Some
     * of the fields saved in the reference are immutable: db, nodeId.  The
     * others are not and we have to handle potential change:
     *
     * isDbRoot: it's possible for isDbRoot to go from true->false, but not
     *         false->true. True->false is handled by the flushIN method
     *         by finding the root and checking if it is the target.
     * treeKey: This can change only in the event of a split. If it does, there
     *         is the chance that the checkpointer will find the wrong node to
     *         flush, but that's okay because the split guarantees flushing to
     *         the root, so the target will be properly logged within the
     *         checkpoint period.
     *
     * The class and ctor are public for the Sizeof program.
     */
    public static class CheckpointReference {
        DatabaseId dbId;
        long nodeId;
        boolean isDbRoot;
        byte[] treeKey;

        public CheckpointReference(DatabaseId dbId,
                                   long nodeId,
                                   boolean isDbRoot,
                                   byte[] treeKey) {
            this.dbId = dbId;
            this.nodeId = nodeId;
            this.isDbRoot = isDbRoot;
            this.treeKey = treeKey;
        }

        @Override
        public boolean equals(Object o) {
            if (!(o instanceof CheckpointReference)) {
                return false;
            }

            CheckpointReference other = (CheckpointReference) o;
            return nodeId == other.nodeId;
        }

        @Override
        public int hashCode() {
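            /* Consistent with equals: identity is the nodeId alone. */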
            return (int) nodeId;
        }

        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("db=").append(dbId);
            sb.append(" nodeId=").append(nodeId);
            return sb.toString();
        }
    }

    /**
     * A struct to hold log flushing stats for checkpoint and database sync.
     */
    public static class FlushStats {

        /* Cumulative counters, maintained across checkpoint runs. */
        public long nFullINFlush;
        public long nFullBINFlush;
        public long nDeltaINFlush;

        /* Per-run counters, reset by resetPerRunCounters. */
        public long nFullINFlushThisRun;
        public long nFullBINFlushThisRun;
        public long nDeltaINFlushThisRun;

        /* For future addition to stats:
           private int nAlreadyEvictedThisRun;
        */

        /* Reset per-run counters. */
        void resetPerRunCounters() {
            nFullINFlushThisRun = 0;
            nFullBINFlushThisRun = 0;
            nDeltaINFlushThisRun = 0;
            /* nAlreadyEvictedThisRun = 0; -- for future */
        }
    }
}
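
For context, the cumulative FlushStats counters above are what surface in the checkpointer's public statistics. Below is a minimal sketch of reading them through the JE API; the EnvironmentStats accessor names (getNFullINFlush, getNFullBINFlush, getNDeltaINFlush) and the environment home path are assumptions to verify against the JE release in use.

import java.io.File;

import com.sleepycat.je.CheckpointConfig;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.EnvironmentStats;
import com.sleepycat.je.StatsConfig;

public class CheckpointFlushStatsExample {

    public static void main(String[] args) throws Exception {

        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setAllowCreate(true);

        /* Assumed environment home; the directory must already exist. */
        Environment env = new Environment(new File("/tmp/je-env"), envConfig);
        try {
            /* Force a checkpoint so the flush counters are populated. */
            CheckpointConfig ckptConfig = new CheckpointConfig();
            ckptConfig.setForce(true);
            env.checkpoint(ckptConfig);

            EnvironmentStats stats = env.getStats(new StatsConfig());

            /* Accessor names assumed; they mirror the FlushStats fields. */
            System.out.println("full IN flushes:  " + stats.getNFullINFlush());
            System.out.println("full BIN flushes: " + stats.getNFullBINFlush());
            System.out.println("delta IN flushes: " + stats.getNDeltaINFlush());
        } finally {
            env.close();
        }
    }
}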