/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 2002, 2011 Oracle and/or its affiliates.  All rights reserved.
*
*/

package com.sleepycat.je.recovery;

import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_CHECKPOINTS;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_DELTA_IN_FLUSH;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_FULL_BIN_FLUSH;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_FULL_IN_FLUSH;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPTID;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPT_END;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPT_START;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.GROUP_DESC;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.GROUP_NAME;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.logging.Level;

import com.sleepycat.je.CacheMode;
import com.sleepycat.je.CheckpointConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.EnvironmentMutableConfig;
import com.sleepycat.je.StatsConfig;
import com.sleepycat.je.cleaner.Cleaner;
import com.sleepycat.je.cleaner.FileSelector.CheckpointStartCleanerState;
import com.sleepycat.je.config.EnvironmentParams;
import com.sleepycat.je.dbi.DatabaseId;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbConfigManager;
import com.sleepycat.je.dbi.DbTree;
import com.sleepycat.je.dbi.EnvConfigObserver;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.LogItem;
import com.sleepycat.je.log.LogManager;
import com.sleepycat.je.log.Provisional;
import com.sleepycat.je.log.ReplicationContext;
import com.sleepycat.je.log.entry.SingleItemEntry;
import com.sleepycat.je.tree.ChildReference;
import com.sleepycat.je.tree.IN;
import com.sleepycat.je.tree.INLogContext;
import com.sleepycat.je.tree.INLogItem;
import com.sleepycat.je.tree.SearchResult;
import com.sleepycat.je.tree.Tree;
import com.sleepycat.je.tree.WithRootLatched;
import com.sleepycat.je.utilint.DaemonThread;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.LSNStat;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.LongStat;
import com.sleepycat.je.utilint.StatGroup;
import com.sleepycat.je.utilint.TestHook;
import com.sleepycat.je.utilint.TestHookExecute;

/**
* The Checkpointer looks through the tree for internal nodes that must be
* flushed to the log. Checkpoint flushes must be done in ascending order from
* the bottom of the tree up.
*
* Checkpoint and IN Logging Rules
* -------------------------------
* The checkpoint must log, and make accessible via non-provisional ancestors,
* all INs that are dirty at CkptStart.  If we crash and recover from that
* CkptStart onward, any IN that became dirty (before the crash) after the
* CkptStart must become dirty again as the result of replaying the action that
* caused it to originally become dirty.
*
* Therefore, when an IN is dirtied at some point in the checkpoint interval,
* but is not logged by the checkpoint, the log entry representing the action
* that dirtied the IN must follow either the CkptStart or the FirstActiveLSN
* that is recorded in the CkptEnd entry.  The FirstActiveLSN is less than or
* equal to the CkptStart LSN.  Recovery will process LNs between the
* FirstActiveLSN and the end of the log.  Other entries are only processed
* from the CkptStart forward.  And provisional entries are not processed.
*
* Example: Non-transactional LN logging.  We take two actions: 1) log the LN
* and then 2) dirty the parent BIN.  What if the LN is logged before CkptStart
* and the BIN is dirtied after CkptStart?  How do we avoid breaking the rules?
* The answer is that we log the LN while holding the latch on the parent BIN,
* and we don't release the latch until after we dirty the BIN.   The
* construction of the checkpoint dirty map requires latching the BIN.  Since
* the LN was logged before CkptStart, the BIN will be dirtied before the
* checkpointer latches it during dirty map construction.  So the BIN will
* always be included in the dirty map and logged by the checkpoint.
*
* Example: Abort.  We take two actions: 1) log the abort and then 2) undo the
* changes, which modifies (dirties) the BIN parents of the undone LNs.  There
* is nothing to prevent logging CkptStart in between these two actions, so how
* do we avoid breaking the rules?  The answer is that we do not unregister the
* transaction until after the undo phase.  So although the BINs may be dirtied
* by the undo after CkptStart is logged, the FirstActiveLSN will be prior to
* CkptStart.  Therefore, we will process the Abort and replay the action that
* modifies the BINs.
*
* Exception: Lazy migration.  The log cleaner will make an IN dirty without
* logging an action that makes it dirty.  This is an exception to the general
* rule that actions should be logged when they cause dirtiness.   The reasons
* this is safe are:
* 1. The IN contents are not modified, so there is no information lost if the
*    IN is never logged, or is logged provisionally and no ancestor is logged
*    non-provisionally.
* 2. If the IN is logged non-provisionally, this will have the side effect of
*    recording the old LSN as being obsolete. However, the general rules for
*    checkpointing and recovery will ensure that the new version is used in
*    the Btree.  The new version will either be replayed by recovery or
*    referenced in the active Btree via a non-provisional ancestor.
*
* Checkpoint Algorithm
* --------------------
* The final checkpointDirtyMap field is used to hold (in addition to the dirty
* INs) the state of the checkpoint and highest flush levels.  Access to this
* object is synchronized so that eviction and checkpointing can access it
* concurrently.  When a checkpoint is not active, the state is CkptState.NONE
* and the dirty map is empty.  When a checkpoint runs, we do this:
*
* 1. Get set of files from cleaner that can be deleted after this checkpoint.
* 2. Set checkpointDirtyMap state to DIRTY_MAP_INCOMPLETE, meaning that dirty
*    map construction is in progress.
* 3. Log CkptStart
* 4. Construct dirty map, organized by Btree level, from dirty INs in INList.
*    The highest flush levels are calculated during dirty map construction.
*    Set checkpointDirtyMap state to DIRTY_MAP_COMPLETE.
* 5. Flush INs in dirty map.
*        + First, flush the bottom two levels a sub-tree at a time, where a
*          sub-tree is one IN at level two and all its BIN children.  Higher
*          levels (above level two) are logged strictly by level, not using
*          subtrees.
*              o If je.checkpointer.highPriority=false, we log one IN at a
*                time, whether or not the IN is logged as part of a subtree,
*                and do a Btree search for the parent of each IN.
*              o If je.checkpointer.highPriority=true, for the bottom two
*                levels we log each sub-tree in a single call to the
*                LogManager with the parent IN latched, and we only do one
*                Btree search for each level two IN.  Higher levels are logged
*                one IN at a time as with highPriority=false.
*        + The Provisional property is set as follows, depending on the level
*          of the IN:
*              o level is max flush level:  Provisional.NO
*              o level is bottom level: Provisional.YES
*              o Otherwise (middle levels): Provisional.BEFORE_CKPT_END
*  6. Flush VLSNIndex cache to make VLSNIndex recoverable.
*  7. Flush UtilizationTracker (write FileSummaryLNs) to persist all
*     tracked obsolete offsets and utilization summary info, to make this info
*     recoverable.
*  8. Log CkptEnd
*  9. Delete cleaned files from step 1.
* 10. Set checkpointDirtyMap state to NONE.
*
* Provisional.BEFORE_CKPT_END
* ---------------------------
* See Provisional.java for a description of the relationship between the
* checkpoint algorithm above and the BEFORE_CKPT_END property.
*
* Coordination of Eviction and Checkpointing
* ------------------------------------------
* Eviction can proceed concurrently with all phases of a checkpoint, and
* eviction may take place concurrently in multiple threads.  This concurrency
* is crucial to avoid blocking application threads that perform eviction and
* to reduce the amount of eviction required in application threads.
*
* Eviction calls Checkpointer.coordinateEvictionWithCheckpoint, which calls
* DirtyINMap.coordinateEvictionWithCheckpoint, just before logging an IN.
* coordinateEvictionWithCheckpoint returns whether the IN should be logged
* provisionally (Provisional.YES) or non-provisionally (Provisional.NO).
*
* Other coordination necessary depends on the state of the checkpoint:
*   + NONE: No additional action.
*      o return Provisional.NO
*   + DIRTY_MAP_INCOMPLETE: The parent IN is added to the dirty map, exactly
*     as if it were encountered as dirty in the INList during dirty map
*     construction.
*      o IN level GTE highest flush level: return Provisional.NO
*      o IN level LT highest flush level: return Provisional.YES
*   + DIRTY_MAP_COMPLETE:
*      o IN is root: return Provisional.NO
*      o IN is not root: return Provisional.YES
*
* In general this is designed so that eviction will use the same provisional
* value that would be used by the checkpoint, as if the checkpoint itself were
* logging the IN.  However, there are several conditions where this is not
* exactly the case.
*
* 1. Eviction may log an IN with Provisional.YES when the IN was not dirty at
*    the time of dirty map creation, if it became dirty afterwards.  In this
*    case, the checkpointer would not have logged the IN at all.  This is safe
*    because the actions that made that IN dirty are logged in the recovery
*    period.
* 2. Eviction may log an IN with Provisional.YES after the checkpoint has
*    logged it, if it becomes dirty again.  In this case the IN is logged
*    twice, which would not have been done by the checkpoint alone.  This is
*    safe because the actions that made that IN dirty are logged in the
*    recovery period.
* 3. An intermediate level IN (not bottom most and not the highest flush
*    level) will be logged by the checkpoint with Provisional.BEFORE_CKPT_END
*    but will be logged by eviction with Provisional.YES.  See below for why
*    this is safe.
* 4. Between checkpoint step 8 (log CkptEnd) and 10 (set checkpointDirtyMap
*    state to NONE), eviction may log an IN with Provisional.YES, although a
*    checkpoint is not strictly active during this interval.  See below for
*    why this is safe.
*
* It is safe for eviction to log an IN as Provisional.YES for the last two
* special cases, because this does not cause incorrect recovery behavior.  For
* recovery to work properly, it is only necessary that:
*
*  + Provisional.NO is used for INs at the max flush level during an active
*    checkpoint.
*  + Provisional.YES or BEFORE_CKPT_END is used for INs below the max flush
*    level, to avoid replaying an IN during recovery that may depend on a file
*    deleted as the result of the checkpoint.
*
* You may ask why we don't use Provisional.YES for eviction when a checkpoint
 * is not active.  There are two reasons, both related to performance:
*
* 1. This would be wasteful when an IN is evicted in between checkpoints, and
*    that portion of the log is processed by recovery later, in the event of a
*    crash.  The evicted INs would be ignored by recovery, but the actions
*    that caused them to be dirty would be replayed and the INs would be
*    logged again redundantly.
 * 2. Logging an IN provisionally will not count the old LSN as obsolete
 *    immediately, so cleaner utilization will be inaccurate until a
 *    non-provisional parent is logged, typically by the next checkpoint.  It
*    is always important to keep the cleaner from stalling and spiking, to
*    keep latency and throughput as level as possible.
*
* Therefore, it is safe to log with Provisional.YES in between checkpoints,
* but not desirable.
*
* Although we don't do this, it would be safe and optimal to evict with
* BEFORE_CKPT_END in between checkpoints, because it would be treated by
* recovery as if it were Provisional.NO.  This is because the interval between
* checkpoints is only processed by recovery if it follows the last CkptEnd,
* and BEFORE_CKPT_END is treated as Provisional.NO if the IN follows the last
* CkptEnd.
*
* However, it would not be safe to evict an IN with BEFORE_CKPT_END during a
* checkpoint, when logging of the IN's ancestors does not occur according to
* the rules of the checkpoint.  If this were done, then if the checkpoint
* completes and is used during a subsequent recovery, an obsolete offset for
* the old version of the IN will mistakenly be recorded.  Below are two cases
* where BEFORE_CKPT_END is used correctly and one showing how it could be used
* incorrectly.
*
* 1. Correct use of BEFORE_CKPT_END when the checkpoint does not complete.
*
*        050 BIN-A
*        060 IN-B parent of BIN-A
*        100 CkptStart
*        200 BIN-A logged with BEFORE_CKPT_END
*        300 FileSummaryLN with obsolete offset for BIN-A at 050
*        Crash and recover
*
*    Recovery will process BIN-A at 200 (it will be considered
*    non-provisional) because there is no following CkptEnd.  It is
*    therefore correct that BIN-A at 050 is obsolete.
*
* 2. Correct use of BEFORE_CKPT_END when the checkpoint does complete.
*
*        050 BIN-A
*        060 IN-B parent of BIN-A
*        100 CkptStart
*        200 BIN-A logged with BEFORE_CKPT_END
*        300 FileSummaryLN with obsolete offset for BIN-A at 050
*        400 IN-B parent of BIN-A, non-provisional
*        500 CkptEnd
*        Crash and recover
*
*    Recovery will not process BIN-A at 200 (it will be considered
*    provisional) because there is a following CkptEnd, but it will
*    process its parent IN-B at 400, and therefore the BIN-A at 200 will be
*    active in the tree.  It is therefore correct that BIN-A at 050 is
*    obsolete.
*
* 3. Incorrect use of BEFORE_CKPT_END when the checkpoint does complete.
*
*        050 BIN-A
*        060 IN-B parent of BIN-A
*        100 CkptStart
*        200 BIN-A logged with BEFORE_CKPT_END
*        300 FileSummaryLN with obsolete offset for BIN-A at 050
*        400 CkptEnd
*        Crash and recover
*
*    Recovery will not process BIN-A at 200 (it will be considered
*    provisional) because there is a following CkptEnd, but no parent
*    IN-B is logged, and therefore the IN-B at 060 and BIN-A at 050 will be
*    active in the tree.  It is therefore incorrect that BIN-A at 050 is
*    obsolete.
*
 * This last case is what caused the LFNF (log file not found error) in SR
 * [#19422], when BEFORE_CKPT_END was mistakenly used for logging evicted BINs
 * via CacheMode.EVICT_BIN.
* During the checkpoint, we evict BIN-A and log it with BEFORE_CKPT_END, yet
* neither it nor its parent are part of the checkpoint.  After being counted
* obsolete, we crash and recover.  Then the file containing the BIN (BIN-A at
* 050 above) is cleaned and deleted.  During cleaning, it is not migrated
* because an obsolete offset was previously recorded.  The LFNF occurs when
* trying to access this BIN during a user operation.
*
* CacheMode.EVICT_BIN
* -------------------
* Unlike in JE 4.0 where EVICT_BIN was first introduced, in JE 4.1 and later
* we do not use special rules when an IN is evicted.  Since concurrent
* eviction and checkpointing are supported in JE 4.1, the above rules apply to
* EVICT_BIN as well as all other types of eviction.
*/
public class Checkpointer extends DaemonThread implements EnvConfigObserver {

    /*
     * We currently use multi-logging whenever practical, but we're keeping an
     * option open to disable it, perhaps via a config param.
     */
    private static final boolean MULTI_LOG = true;

    /**
     * For unit testing only.  Called before we flush the max level.  This
     * field is static because it is called from the static flushIN method.
     */
    public static TestHook maxFlushLevelHook = null;

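    /**
     * For unit testing only.  Called after the dirty map has been constructed
     * and before INs are flushed (see doCheckpoint).
     */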
    public static TestHook beforeFlushHook = null;

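    /**
     * For unit testing only.  Not invoked in this class; judging by its name,
     * presumably called as each IN is examined during dirty map construction.
     */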
    public static TestHook<IN> examineINForCheckpointHook = null;

    private EnvironmentImpl envImpl;

    /* Checkpoint sequence, initialized at recovery. */
    private long checkpointId;

    /*
     * How much the log should grow between checkpoints. If 0, we're using
     * time-based checkpointing.
     */
    private final long logSizeBytesInterval;
    private final long logFileMax;
    private final long timeInterval;
    private long lastCheckpointMillis;

    /* Configured to true to minimize checkpoint duration. */
    private boolean highPriority;

    private long nCheckpoints;
    private long lastCheckpointStart;
    private long lastCheckpointEnd;
    private final FlushStats flushStats;

    /**
     * The DirtyINMap for checkpointing is created once and is reset after each
     * checkpoint is complete.  Access to this object is synchronized so that
     * eviction and checkpointing can access it concurrently.
     */
    private final DirtyINMap checkpointDirtyMap;

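    /**
     * Creates the checkpointer daemon.  The waitTime parameter is the
     * time-based wakeup interval; when CHECKPOINTER_BYTES_INTERVAL is
     * non-zero, byte-based checkpointing takes precedence (see
     * getWakeupPeriod).
     */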
    public Checkpointer(EnvironmentImpl envImpl,
                        long waitTime,
                        String name) {
        super(waitTime, name, envImpl);
        this.envImpl = envImpl;
        logSizeBytesInterval =
            envImpl.getConfigManager().getLong
                (EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL);
        logFileMax =
            envImpl.getConfigManager().getLong(EnvironmentParams.LOG_FILE_MAX);
        timeInterval = waitTime;
        lastCheckpointMillis = 0;

        nCheckpoints = 0;
        flushStats = new FlushStats();

        checkpointDirtyMap = new DirtyINMap(envImpl);

        /* Initialize mutable properties and register for notifications. */
        envConfigUpdate(envImpl.getConfigManager(), null);
        envImpl.addConfigObserver(this);
    }

    /**
     * Process notifications of mutable property changes.
     */
    public void envConfigUpdate(DbConfigManager cm,
                                EnvironmentMutableConfig ignore) {
        highPriority = cm.getBoolean
            (EnvironmentParams.CHECKPOINTER_HIGH_PRIORITY);
    }

    /**
     * Initializes the checkpoint intervals when no checkpoint is performed
     * while opening the environment.
     */
    public void initIntervals(long lastCheckpointStart,
                              long lastCheckpointEnd,
                              long lastCheckpointMillis) {
        this.lastCheckpointStart = lastCheckpointStart;
        this.lastCheckpointEnd = lastCheckpointEnd;
        this.lastCheckpointMillis = lastCheckpointMillis;
    }

    /**
     * Coordinates an eviction with an in-progress checkpoint and returns
     * whether provisional logging is needed.
     *
     * @return true if the target must be logged provisionally.
     */
    public boolean coordinateEvictionWithCheckpoint(IN target, IN parent) {
        return checkpointDirtyMap.
            coordinateEvictionWithCheckpoint(target, parent);
    }

    /**
     * Figure out the wakeup period. Supplied through this static method
     * because we need to pass the wakeup period to the superclass and need to
     * do the calculation outside the constructor.
     *
     * @throws IllegalArgumentException via Environment ctor and
     * setMutableConfig.
     */
    public static long getWakeupPeriod(DbConfigManager configManager)
        throws IllegalArgumentException {

        long wakeupPeriod = configManager.getDuration
            (EnvironmentParams.CHECKPOINTER_WAKEUP_INTERVAL);
        long bytePeriod = configManager.getLong
            (EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL);

        /* Checkpointing period must be set either by time or by log size. */
        if ((wakeupPeriod == 0) && (bytePeriod == 0)) {
            throw new IllegalArgumentException
                (EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL.getName() +
                 " and " +
                 EnvironmentParams.CHECKPOINTER_WAKEUP_INTERVAL.getName() +
                 " cannot both be 0. ");
        }

        /*
         * Checkpointing by log size takes precedence over time-based period.
         */
        if (bytePeriod == 0) {
            return wakeupPeriod;
        } else {
            return 0;
        }
    }

    /**
     * Set checkpoint id -- can only be done after recovery.
     */
    public synchronized void setCheckpointId(long lastCheckpointId) {
        checkpointId = lastCheckpointId;
    }

    /**
     * Load stats.
     */
    public StatGroup loadStats(StatsConfig config) {
        StatGroup stats = new StatGroup(GROUP_NAME, GROUP_DESC);
        new LongStat(stats, CKPT_LAST_CKPTID, checkpointId);
        new LongStat(stats, CKPT_CHECKPOINTS, nCheckpoints);
        new LSNStat(stats, CKPT_LAST_CKPT_START, lastCheckpointStart);
        new LSNStat(stats, CKPT_LAST_CKPT_END, lastCheckpointEnd);
        new LongStat(stats, CKPT_FULL_IN_FLUSH, flushStats.nFullINFlush);
        new LongStat(stats, CKPT_FULL_BIN_FLUSH, flushStats.nFullBINFlush);
        new LongStat(stats, CKPT_DELTA_IN_FLUSH, flushStats.nDeltaINFlush);

        if (config.getClear()) {
            nCheckpoints = 0;
            flushStats.nFullINFlush = 0;
            flushStats.nFullBINFlush = 0;
            flushStats.nDeltaINFlush = 0;
        }

        return stats;
    }

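    /**
     * Drops this daemon's reference to the environment.
     */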
    public synchronized void clearEnv() {
        envImpl = null;
    }

    /**
     * Return the number of retries when a deadlock exception occurs.
     */
    @Override
    protected long nDeadlockRetries() {
        return envImpl.getConfigManager().getInt
            (EnvironmentParams.CHECKPOINTER_RETRY);
    }

    /**
     * Called whenever the DaemonThread wakes up from a sleep.
     */
    @Override
    protected void onWakeup()
        throws DatabaseException {

        if (envImpl.isClosed()) {
            return;
        }

        doCheckpoint(CheckpointConfig.DEFAULT, "daemon");
    }

    /**
     * Wakes up the checkpointer if a checkpoint log interval is configured and
     * the number of bytes written since the last checkpoint exceeds the size
     * of the interval.
     */
    public void wakeupAfterWrite() {
        if (logSizeBytesInterval != 0) {
            long nextLsn = envImpl.getFileManager().getNextLsn();
            if (DbLsn.getNoCleaningDistance
                    (nextLsn, lastCheckpointStart, logFileMax) >=
                    logSizeBytesInterval) {
                wakeup();
            }
        }
    }

    /**
     * Determine whether a checkpoint should be run.
     *
     * 1. If the force parameter is specified, always checkpoint.
     *
     * 2. If the config object specifies time or log size, use that.
     *
     * 3. If the environment is configured to use log size based checkpointing,
     * check the log.
     *
     * 4. Lastly, use time-based checking.
     */
    private boolean isRunnable(CheckpointConfig config) {
        /* Figure out if we're using log size or time to determine interval.*/
        long useBytesInterval = 0;
        long useTimeInterval = 0;
        long nextLsn = DbLsn.NULL_LSN;
        boolean runnable = false;
        try {
            if (config.getForce()) {
                runnable = true;
                return runnable;
            } else if (config.getKBytes() != 0) {
                useBytesInterval = config.getKBytes() << 10;
            } else if (config.getMinutes() != 0) {
                // convert to millis
                useTimeInterval = config.getMinutes() * 60 * 1000;
            } else if (logSizeBytesInterval != 0) {
                useBytesInterval = logSizeBytesInterval;
            } else {
                useTimeInterval = timeInterval;
            }

            /*
             * If our checkpoint interval is defined by log size, check on how
             * much log has grown since the last checkpoint.
             */
            if (useBytesInterval != 0) {
                nextLsn = envImpl.getFileManager().getNextLsn();
                if (DbLsn.getNoCleaningDistance(nextLsn, lastCheckpointStart,
                                                logFileMax) >=
                    useBytesInterval) {
                    runnable = true;
                }
            } else if (useTimeInterval != 0) {

                /*
                 * Our checkpoint is determined by time.  If enough time has
                 * passed and some log data has been written, do a checkpoint.
                 */
                long lastUsedLsn = envImpl.getFileManager().getLastUsedLsn();
                if (((System.currentTimeMillis() - lastCheckpointMillis) >=
                     useTimeInterval) &&
                    (DbLsn.compareTo(lastUsedLsn, lastCheckpointEnd) != 0)) {
                    runnable = true;
                }
            }
            return runnable;
        } finally {
            StringBuilder sb = new StringBuilder();
            sb.append("size interval=").append(useBytesInterval);
            if (nextLsn != DbLsn.NULL_LSN) {
                sb.append(" nextLsn=").
                    append(DbLsn.getNoFormatString(nextLsn));
            }
            if (lastCheckpointEnd != DbLsn.NULL_LSN) {
                sb.append(" lastCkpt=");
                sb.append(DbLsn.getNoFormatString(lastCheckpointEnd));
            }
            sb.append(" time interval=").append(useTimeInterval);
            sb.append(" force=").append(config.getForce());
            sb.append(" runnable=").append(runnable);

            LoggerUtils.finest(logger, envImpl, sb.toString());
        }
    }

    /**
     * The real work to do a checkpoint. This may be called by the checkpoint
     * thread when waking up, or it may be invoked programmatically through
     * the API.
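     *
     * For example (an illustrative sketch only, using the public API rather
     * than calling this internal method directly), an application can force a
     * checkpoint as follows, where env is an open Environment handle:
     *
     *     CheckpointConfig config = new CheckpointConfig();
     *     config.setForce(true);
     *     env.checkpoint(config);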
     *
     * @param invokingSource a debug aid, to indicate who invoked this
     *       checkpoint. (e.g., recovery, the checkpointer daemon, the cleaner,
     *       or programmatically)
     */
    public synchronized void doCheckpoint(CheckpointConfig config,
                                          String invokingSource)
        throws DatabaseException {

        if (envImpl.isReadOnly()) {
            return;
        }

        if (!isRunnable(config)) {
            return;
        }

        /*
         * If minimizing recovery time is desired, then flush all the way to
         * the top of the dbtree instead of stopping at the highest level last
         * modified, so that only the root INs are processed by recovery.
         */
        final boolean flushAll = config.getMinimizeRecoveryTime();
       
        /*
         * Since writing deltas does not impact recovery time (they are
         * provisional and not processed), and LN replay is not impacted either
         * (it is impacted only by deltas written earlier), always allow
         * deltas.  This parameter is kept only in case we wish to add an
         * option later for disallowing deltas.
         */
        final boolean allowDeltas = true;

        /*
         * If there are cleaned files to be deleted, flush an extra level to
         * write out the parents of cleaned nodes.  This ensures that no node
         * will refer to an LSN in a cleaned file.
         */
        boolean flushExtraLevel = false;
        Cleaner cleaner = envImpl.getCleaner();
        CheckpointStartCleanerState cleanerState =
            cleaner.getFilesAtCheckpointStart();
        if (!cleanerState.isEmpty()) {
            flushExtraLevel = true;
        }

        lastCheckpointMillis = System.currentTimeMillis();
        flushStats.resetPerRunCounters();

        /* Get the next checkpoint id. */
        checkpointId++;
        nCheckpoints++;

        boolean success = false;
        boolean traced = false;

        LogManager logManager = envImpl.getLogManager();

        /*
         * Set the checkpoint state so that concurrent eviction can be
         * coordinated.
         */
        checkpointDirtyMap.beginCheckpoint(flushAll, flushExtraLevel);
        try {

            /*
             * Eviction can run during checkpoint as long as it follows the
             * same rules for using provisional logging and for propagating
             * logging of the checkpoint dirty set up the tree. We have to lock
             * out the evictor after the logging of checkpoint start until
             * we've selected the dirty set and decided on the highest level to
             * be flushed. See SR 11163, 11349.
             */
            long checkpointStart = DbLsn.NULL_LSN;
            long firstActiveLsn = DbLsn.NULL_LSN;

            /* Log the checkpoint start. */
            SingleItemEntry startEntry =
                new SingleItemEntry(LogEntryType.LOG_CKPT_START,
                                    new CheckpointStart(checkpointId,
                                                        invokingSource));
            checkpointStart =
                logManager.log(startEntry, ReplicationContext.NO_REPLICATE);

            /*
             * Note the first active LSN point. The definition of
             * firstActiveLsn is that all log entries for active transactions
             * are equal to or after that LSN.  This is the starting point for
             * replaying LNs during recovery and will be stored in the CkptEnd
             * entry.
             *
             * Use the checkpointStart as the firstActiveLsn if firstActiveLsn
             * is null, meaning that no txns are active.
             *
             * The current value must be retrieved from TxnManager after
             * logging CkptStart. If it were instead retrieved before logging
             * CkptStart, the following failure could occur.  [#20270]
             *
             *  ... getFirstActiveLsn returns NULL_LSN, will use 200 CkptStart
             *  100 LN-A in Txn-1
             *  200 CkptStart
             *  300 BIN-B refers to 100 LN-A
             *  400 CkptEnd
             *  ... Crash and recover.  Recovery does not undo 100 LN-A.
             *  ... Txn-1 is uncommitted, yet 100 LN-A takes effect.
             */
            firstActiveLsn = envImpl.getTxnManager().getFirstActiveLsn();
            if (firstActiveLsn == DbLsn.NULL_LSN) {
                firstActiveLsn = checkpointStart;
            }
               
            /*
             * In a replicated system, the checkpointer will be flushing out
             * the VLSNIndex, which is HA metadata. Check that the in-memory
             * version encompasses all metadata up to the point of the
             * CheckpointStart record. This is no-op for non-replicated
             * systems. [#19754]
             */
            envImpl.awaitVLSNConsistency();

            /* Find the set of dirty INs that must be logged. */
            checkpointDirtyMap.selectDirtyINsForCheckpoint();

            /* Call hook after dirty map creation and before flushing. */
            TestHookExecute.doHookIfSet(beforeFlushHook);

            /* Flush IN nodes. */
            flushDirtyNodes(envImpl, checkpointDirtyMap, allowDeltas,
                            checkpointStart, highPriority, flushStats);

            /*
             * Flush MapLNs if not already done by flushDirtyNodes.  Only flush
             * a database if it has not already been flushed since checkpoint
             * start.  Lastly, flush the DB mapping tree root.
             */
            checkpointDirtyMap.flushMapLNs(checkpointStart);
            checkpointDirtyMap.flushRoot(checkpointStart);

            /*
             * Flush replication information if necessary so that the VLSNIndex
             * cache is flushed and is recoverable.
             */
            envImpl.preCheckpointEndFlush();

            /*
             * Flush utilization info AFTER flushing IN nodes to reduce the
             * inaccuracies caused by the sequence FileSummaryLN-LN-BIN.
             */
            envImpl.getUtilizationProfile().flushFileUtilization
                (envImpl.getUtilizationTracker().getTrackedFiles());

            DbTree dbTree = envImpl.getDbTree();
            boolean willDeleteFiles = !cleanerState.isEmpty();
            CheckpointEnd ckptEnd = new CheckpointEnd
                (invokingSource, checkpointStart, envImpl.getRootLsn(),
                 firstActiveLsn,
                 envImpl.getNodeSequence().getLastLocalNodeId(),
                 envImpl.getNodeSequence().getLastReplicatedNodeId(),
                 dbTree.getLastLocalDbId(), dbTree.getLastReplicatedDbId(),
                 envImpl.getTxnManager().getLastLocalTxnId(),
                 envImpl.getTxnManager().getLastReplicatedTxnId(),
                 checkpointId, willDeleteFiles, cleaner.getLogSummary());

            SingleItemEntry endEntry =
                new SingleItemEntry(LogEntryType.LOG_CKPT_END, ckptEnd);

            /*
             * Log checkpoint end and update state kept about the last
             * checkpoint location. Send a trace message *before* the
             * checkpoint end log entry. This is done so that the normal trace
             * message doesn't affect the time-based isRunnable() calculation,
             * which only issues a checkpoint if a log record has been written
             * since the last checkpoint.
             */
            trace(envImpl, invokingSource, true);
            traced = true;

            /*
             * Always flush to ensure that cleaned files are not referenced,
             * and to ensure that this checkpoint is not wasted if we crash.
             */
            lastCheckpointEnd =
                logManager.logForceFlush(endEntry,
                                         true /*fsyncRequired*/,
                                         ReplicationContext.NO_REPLICATE);

            lastCheckpointStart = checkpointStart;

            success = true;
            cleaner.updateFilesAtCheckpointEnd(cleanerState);

        } catch (DatabaseException e) {
            LoggerUtils.traceAndLogException(envImpl, "Checkpointer",
                                             "doCheckpoint", "checkpointId=" +
                                             checkpointId, e);
            throw e;
        } finally {

            /*
             * Reset the checkpoint state so evictor activity knows there's no
             * further requirement for provisional logging. SR 11163.
             */
            checkpointDirtyMap.reset();

            if (!traced) {
                trace(envImpl, invokingSource, success);
            }
        }
    }

    private void trace(EnvironmentImpl envImpl,
                       String invokingSource,
                       boolean success) {
        StringBuilder sb = new StringBuilder();
        sb.append("Checkpoint ").append(checkpointId);
        sb.append(": source=").append(invokingSource);
        sb.append(" success=").append(success);
        sb.append(" nFullINFlushThisRun=");
        sb.append(flushStats.nFullINFlushThisRun);
        sb.append(" nDeltaINFlushThisRun=");
        sb.append(flushStats.nDeltaINFlushThisRun);
        LoggerUtils.logMsg(logger, envImpl, Level.CONFIG, sb.toString());
    }

    /**
     * Flush a given database to disk. Like checkpoint, log from the bottom
     * up so that parents properly represent their children.
     */
    public void syncDatabase(EnvironmentImpl envImpl,
                             DatabaseImpl dbImpl,
                             boolean flushLog)
        throws DatabaseException {

        if (envImpl.isReadOnly()) {
            return;
        }

        DirtyINMap dirtyMap = new DirtyINMap(envImpl);
        FlushStats fstats = new FlushStats();
        try {
            /* Find the dirty set. */
            dirtyMap.selectDirtyINsForDbSync(dbImpl);

            if (dirtyMap.getNumEntries() > 0) {
                /* Write all dirty INs out. */
                flushDirtyNodes
                    (envImpl,
                     dirtyMap,
                     false /*allowDeltas*/,
                     DbLsn.NULL_LSN /*ckptStart*/,
                     false /*highPriority*/,
                     fstats);

                /* Make changes durable. [#15254] */
                if (flushLog) {
                    envImpl.getLogManager().flush();
                }
            }
        } catch (DatabaseException e) {
            LoggerUtils.traceAndLogException
                (envImpl, "Checkpointer", "syncDatabase",
                 "of " + dbImpl.getDebugName(), e);
            throw e;
        } finally {
            dirtyMap.reset();
        }
    }

    /* For unit testing only. */
    public static void setMaxFlushLevelHook(TestHook hook) {
        maxFlushLevelHook = hook;
    }

    /* For unit testing only. */
    public static void setBeforeFlushHook(TestHook hook) {
        beforeFlushHook = hook;
    }

    /**
     * Flush the nodes in order, from the lowest level to highest level.  As a
     * flush dirties its parent, add it to the dirty map, thereby cascading the
     * writes up the tree. If flushAll wasn't specified, we need only cascade
     * up to the highest level set at the start of checkpointing.
     *
     * Note that all but the top level INs are logged provisionally. That's
     * because we don't need to process lower INs during recovery because the
     * higher INs will end up pointing at them.
     */
    private static void flushDirtyNodes(EnvironmentImpl envImpl,
                                        DirtyINMap dirtyMap,
                                        boolean allowDeltas,
                                        long checkpointStart,
                                        boolean highPriority,
                                        FlushStats fstats)
        throws DatabaseException {

        LogManager logManager = envImpl.getLogManager();
        DbTree dbTree = envImpl.getDbTree();

        Map<DatabaseId, DatabaseImpl> dbCache =
            new HashMap<DatabaseId, DatabaseImpl>();
        try {
            while (dirtyMap.getNumLevels() > 0) {

                /*
                 * Work on one level's worth of nodes in ascending level order.
                 */
                Integer currentLevel = dirtyMap.getLowestLevelSet();
                int currentLevelVal = currentLevel.intValue();

                /*
                 * Flush MapLNs just prior to flushing the first level of the
                 * mapping tree.  Only flush a database if it has not already
                 * been flushed since checkpoint start.
                 */
                if (currentLevelVal == IN.DBMAP_LEVEL) {
                    dirtyMap.flushMapLNs(checkpointStart);
                }

                /* Flush the nodes at the current level. */
                while (true) {
                    CheckpointReference targetRef =
                        dirtyMap.removeNextNode(currentLevel);
                    if (targetRef == null) {
                        break;
                    }

                    /*
                     * Check to make sure the DB was not deleted after putting
                     * it in the dirty map, and prevent the DB from being
                     * deleted while we're working with it.
                     */
                    DatabaseImpl db = dbTree.getDb
                        (targetRef.dbId, -1 /*lockTimeout*/, dbCache);
                    if (db != null && !db.isDeleted()) {

                        /* Flush if we're below maxFlushLevel. */
                        int maxFlushLevel = dirtyMap.getHighestFlushLevel(db);
                        if (currentLevelVal <= maxFlushLevel) {

                            /* Evict before each operation. */
                            envImpl.daemonEviction(true /*backgroundIO*/);

                            flushIN
                                (envImpl, db, logManager, targetRef, dirtyMap,
                                 currentLevelVal, maxFlushLevel, allowDeltas,
                                 highPriority, fstats,
                                 true /*allowLogSubtree*/);

                            /*
                             * Sleep if background read/write limit was
                             * exceeded.
                             */
                            envImpl.sleepAfterBackgroundIO();
                        }
                    }
                }

                /* We're done with this level. */
                dirtyMap.removeLevel(currentLevel);
            }
        } finally {
            dbTree.releaseDbs(dbCache);
        }

        /*
         * Do not flush FileSummaryLNs/MapLNs (do not call
         * UtilizationProfile.flushLocalTracker) here because that flushing is
         * already done by the checkpoint.
         */
    }

    /**
     * Flush the target IN.
     *
     * Where applicable, also attempt to flush the subtree that houses this
     * target, which means we flush the siblings of this target to promote
     * better cleaning throughput. The problem lies in the fact that
     * provisionally logged nodes are not available for log cleaning until
     * their parent is logged non-provisionally.  On the other hand, we want to
     * log nodes in provisional mode as much as possible, both for recovery
     * performance, and for correctness to avoid fetches against cleaned log
     * files. (See [#16037].) These conflicting goals are reconciled by
     * flushing nodes in subtree grouping, because writing the non-provisional
     * parent of a set of provisionally written nodes frees the cleaner to work
     * on that set of provisional nodes as soon as possible. For example, if a
     * tree consists of:
     *
     *             INa
     *       +------+-------+
     *      INb            INc
     * +-----+----+         +-----+
     * BINd BINe BINf      BINg BINh
     *
     * It is more efficient for cleaning throughput to log in this order:
     *       BINd, BINe, BINf, INb, BINg, BINh, INc, INa
     * rather than:
     *       BINd, BINe, BINf, BINg, BINh, INb, INc, INa
     *
     * Suppose the subtree in question is INb->{BINd, BINe, BINf}
     *
     * Suppose we see BINd in the dirty map first, before BINe and BINf.
     *  - flushIN(BINd) is called
     *  - we fetch and latch its parent, INb
     *
     * If this is a high priority checkpoint, we'll hold the INb latch across
     * the time it takes to flush all three children.  In flushIN(BINd), we
     * walk through INb, create a local map of all the siblings that can be
     * found in the dirty map, and then call logSiblings with that local map.
     * Then we'll write out INb.
     *
     * If high priority is false, we will not hold the INb latch across
     * multiple IOs. Instead, we
     *  - write BINd out, using logSiblings
     *  - while still holding the INb latch, we create a local map of dirty
     *    siblings
     *  - release the INb latch
     *  - call flushIN() recursively on each entry in the local sibling map,
     *    which will result in a search and write of each sibling.  These
     *    recursive calls to flushIN are called with the allowLogSubtree
     *    parameter of false to halt the recursion and prevent a repeat of the
     *    sibling examination.
     *  - write INb
     */
    private static void flushIN(EnvironmentImpl envImpl,
                                DatabaseImpl db,
                                LogManager logManager,
                                CheckpointReference targetRef,
                                DirtyINMap dirtyMap,
                                int currentLevel,
                                int maxFlushLevel,
                                boolean allowDeltas,
                                boolean highPriority,
                                FlushStats fstats,
                                boolean allowLogSubtree)
        throws DatabaseException {

        /* Call test hook when we reach the max level. */
        assert (currentLevel < maxFlushLevel) ||
            TestHookExecute.doHookIfSet(maxFlushLevelHook);

        Tree tree = db.getTree();
        boolean targetWasRoot = false;
        if (targetRef.isDbRoot) {

            /* We're trying to flush the root. */
            RootFlusher flusher =
                new RootFlusher(db, logManager, targetRef.nodeId);
            tree.withRootLatchedExclusive(flusher);
            boolean flushed = flusher.getFlushed();

            /*
             * If this target isn't the root anymore, we'll have to handle it
             * like a regular node.
             */
            targetWasRoot = flusher.stillRoot();

            /*
             * Update the tree's owner, whether it's the env root or the
             * dbmapping tree.
             */
            if (flushed) {
                DbTree dbTree = envImpl.getDbTree();
                dbTree.modifyDbRoot(db);
                fstats.nFullINFlushThisRun++;
                fstats.nFullINFlush++;
            }
        }

        /*
         * The following attempt to flush applies to two cases:
         *
         * (1) the target was not ever the root
         *
         * (2) the target was the root when the checkpoint dirty set was
         * assembled, but is not the root now.
         */
        if (!targetWasRoot) {

            /*
             * The "isRoot" param is used to stop a search in
             * BIN.descendOnParentSearch and is passed as false (never stop).
             */
            SearchResult result =
                tree.getParentINForChildIN(targetRef.nodeId,
                                           false,  // isRoot
                                           targetRef.treeKey,
                                           false,  // requireExactMatch
                                           CacheMode.UNCHANGED,
                                           -1,     // targetLevel
                                           null,   // trackingList
                                           false); // doFetch

            /*
             * We must make sure that every IN that was selected for the
             * checkpointer's dirty IN set at the beginning of checkpoint is
             * written into the log and can be properly accessed from
             * ancestors. However, we have to take care for cases where the
             * evictor has written out a member of this dirty set before the
             * checkpointer got to it. See SR 10249.
             *
             * If no possible parent is found, the compressor may have deleted
             * this item before we got to processing it.
             */
            if (result.parent != null) {
                IN parent = result.parent;
                int parentLevel = parent.getLevel();
                boolean mustLogParent = false;

                /*
                 * If bottomLevelTarget is true, the parent IN contains bottom
                 * level BINs.  The masking is used to normalize the level for
                 * ordinary DBs and the mapping tree DB.
                 */
                boolean bottomLevelTarget =
                    ((parentLevel & IN.LEVEL_MASK) == 2);

                /*
                 * INs at the max flush level are always non-provisional and
                 * INs at the bottom level (when this is not also the max flush
                 * level) are always provisional.  In-between INs are logged
                 * with Provisional.BEFORE_CKPT_END (see Provisional).
                 */
                Provisional provisional;
                if (currentLevel >= maxFlushLevel) {
                    provisional = Provisional.NO;
                } else if (bottomLevelTarget) {
                    provisional = Provisional.YES;
                } else {
                    provisional = Provisional.BEFORE_CKPT_END;
                }

                /*
                 * Log a sub-tree when the target is at the bottom level and
                 * this is not a recursive call to flushIN during sub-tree
                 * logging.
                 */
                boolean logSubtree = bottomLevelTarget && allowLogSubtree;

                /*
                 * Log sub-tree siblings with the latch held when highPriority
                 * is configured and this is not a DW DB.  For a DW DB, dirty
                 * LNs are logged for each BIN.  If we were to log a DW
                 * sub-tree with the parent latch held, the amount of logging
                 * may cause the latch to be held for too long a period.
                 */
                boolean logSiblingsWithParentLatchHeld =
                    logSubtree &&
                    highPriority &&
                    !db.isDurableDeferredWrite();

                /*
                 * If we log siblings with the parent latch held, we log the
                 * target along with other siblings so we can perform a single
                 * multi-log call for all siblings.
                 */
                boolean logTargetWithOtherSiblings = false;

                /*
                 * Map of node ID to parent index for each sibling to log.  We
                 * must process the siblings in node ID order during multi-log,
                 * so that latching order is deterministic and only in one
                 * direction.
                 */
                SortedMap<Long, Integer> siblingsToLog = null;

                try {
                    if (result.exactParentFound) {

                        /*
                         * If the child has already been evicted, don't
                         * refetch it.
                         */
                        IN renewedTarget = (IN) parent.getTarget(result.index);

                        if (renewedTarget == null) {
                            /* nAlreadyEvictedThisRun++;  -- for future */
                            mustLogParent |= true;
                        } else {
                            if (logSiblingsWithParentLatchHeld) {
                                logTargetWithOtherSiblings = true;
                            } else {
                                mustLogParent |= logSiblings
                                    (envImpl, dirtyMap, parent,
                                     Collections.singleton(result.index),
                                     allowDeltas, highPriority, provisional,
                                     fstats);
                            }
                        }
                    } else {
                        /* result.exactParentFound was false. */

                        /* Do not flush children of the inexact parent. */
                        logSubtree = false;

                        if (result.childNotResident) {

                            /*
                             * But it was because the child wasn't resident.
                             * To be on the safe side, we'll put the parent
                             * into the dirty set to be logged when that level
                             * is processed.
                             *
                             * Only do this if the parent we found is at a
                             * higher level than the child.  This ensures that
                             * the non-exact search does not find a sibling
                             * rather than a parent. [#11555]
                             */
                            if (parentLevel > currentLevel) {
                                mustLogParent |= true;
                            }
                            /* nAlreadyEvictedThisRun++; -- for future. */
                        }
                    }

                    if (logSubtree) {

                        /*
                         * Create a map of node ID to parent index for each
                         * sibling we intend to log.  Note that the dirty map
                         * does not contain targetRef (the sibling we're
                         * processing) because it was removed before calling
                         * this method, but it is added to the map below.
                         *
                         * A TreeMap (sorted map) is used so that siblings are
                         * latched in node ID order.  A deterministic order is
                         * needed to avoid deadlocks, if siblings are latched
                         * in multiple threads in the future.
                         */
                        siblingsToLog = new TreeMap<Long, Integer>();
                        for (int index = 0;
                             index < parent.getNEntries();
                             index += 1) {
                            IN child = (IN) parent.getTarget(index);
                            if (child != null) {
                                Long childId = child.getNodeId();
                                if ((logTargetWithOtherSiblings &&
                                     targetRef.nodeId ==
                                     childId.longValue()) ||
                                    dirtyMap.containsNode
                                        (child.getLevel(), childId)) {
                                    siblingsToLog.put(childId, index);
                                }
                            }
                        }

                        if (logSiblingsWithParentLatchHeld) {
                            if (MULTI_LOG) {
                                mustLogParent |= logSiblings
                                    (envImpl, dirtyMap, parent,
                                     siblingsToLog.values(), allowDeltas,
                                     highPriority, provisional, fstats);
                            } else {
                                for (int index : siblingsToLog.values()) {
                                    IN child = (IN) parent.getTarget(index);
                                    CheckpointReference childRef =
                                        (targetRef.nodeId ==
                                         child.getNodeId()) ? targetRef :
                                        dirtyMap.removeNode(child.getLevel(),
                                                            child.getNodeId());
                                    assert childRef != null;
                                    mustLogParent |= logSiblings
                                        (envImpl, dirtyMap, parent,
                                         Collections.singleton(index),
                                         allowDeltas, highPriority,
                                         provisional, fstats);
                                }
                            }
                            /* Siblings have been logged, do not log below. */
                            siblingsToLog = null;
                        }
                    }

                    if (mustLogParent) {
                        assert checkParentChildRelationship(result,
                                                            currentLevel) :
                               dumpParentChildInfo(result, parent,
                                                   targetRef.nodeId,
                                                   currentLevel, tree);
                        /*
                         * Add the parent IN to the dirty map unconditionally,
                         * even if not dirty, to cause changes to propagate
                         * upward even when a node has been evicted and
                         * refetched and is no longer dirty. [#16523]
                         */
                        dirtyMap.addIN(parent, true /*updateMemoryBudget*/);
                    }
                } finally {
                    parent.releaseLatch();
                }

                /*
                 * If highPriority is false, we don't hold the latch while
                 * logging the bottom level siblings.  We log them here with
                 * flushIN, performing a separate search for each one, after
                 * releasing the parent latch above.
                 */
                if (siblingsToLog != null) {
                    assert logSubtree;
                    assert !logSiblingsWithParentLatchHeld;
                    for (long childId : siblingsToLog.keySet()) {
                        assert targetRef.nodeId != childId;
                        CheckpointReference childRef =
                            dirtyMap.removeNode(currentLevel, childId);
                        if (childRef != null) {
                            flushIN
                                (envImpl, db, logManager, childRef,
                                 dirtyMap, currentLevel, maxFlushLevel,
                                 allowDeltas, highPriority, fstats,
                                 false /*allowLogSubtree*/);
                        }
                    }
                }

                /*
                 * Log the sub-tree parent, which will be logged
                 * non-provisionally, in order to update cleaner utilization.
                 * This must be done with flushIN after releasing the parent
                 * latch above, since we must search and acquire the
                 * grandparent latch.
                 */
                if (logSubtree && parentLevel <= maxFlushLevel) {
                    CheckpointReference parentRef = dirtyMap.removeNode
                        (parentLevel, parent.getNodeId());
                    if (parentRef != null) {
                        flushIN
                            (envImpl, db, logManager, parentRef, dirtyMap,
                             parentLevel, maxFlushLevel, allowDeltas,
                             highPriority, fstats, false /*allowLogSubtree*/);
                    }
                }
            }
        }
    }

    /**
     * @return true if this parent is appropriately 1 level above the child.
     */
    private static boolean checkParentChildRelationship(SearchResult result,
                                                        int childLevel) {

        if (result.childNotResident && !result.exactParentFound) {

            /*
             * This might be coming from the #11555 clause, in which case we
             * are logging over-cautiously, but intentionally, and the levels
             * might not pass the test below.
             */
            return true;
        }

        /* The parent must be child level + 1 */
        return result.parent.getLevel() == (childLevel + 1);
    }

    private static String dumpParentChildInfo(SearchResult result,
                                               IN parent,
                                               long childNodeId,
                                               int currentLevel,
                                               Tree tree) {
        StringBuilder sb = new StringBuilder();
        /*        sb.append("ckptId=").append(checkpointId); */
        sb.append(" result=").append(result);
        sb.append(" parent node=").append(parent.getNodeId());
        sb.append(" level=").append(parent.getLevel());
        sb.append(" child node=").append(childNodeId);
        sb.append(" level=").append(currentLevel);
        return sb.toString();
    }

    private static boolean logSiblings(EnvironmentImpl envImpl,
                                       DirtyINMap dirtyMap,
                                       IN parent,
                                       Collection<Integer> indicesToLog,
                                       boolean allowDeltas,
                                       boolean highPriority,
                                       Provisional provisional,
                                       FlushStats fstats)
        throws DatabaseException {

        LogManager logManager = envImpl.getLogManager();

        INLogContext context = new INLogContext();
        context.nodeDb = parent.getDatabase();
        context.backgroundIO = true;
        context.allowDeltas = allowDeltas;
        /* Allow compression of deleted slots in full version BINs.  */
        context.allowCompress = true;

        boolean mustLogParent = false;
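
        /*
         * Holds one item per latched child to be logged; the latches are
         * released in the finally block below.
         */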
        List<INLogItem> itemList = new ArrayList<INLogItem>();

        try {
            for (int index : indicesToLog) {
                IN child = (IN) parent.getTarget(index);

                /* Remove it from dirty map if it is present. */
                dirtyMap.removeNode(child.getLevel(), child.getNodeId());

                /*
                 * Latch and add item with valid parentIndex, so we will
                 * release the latch in the finally statement.
                 */
                child.latch(CacheMode.UNCHANGED);
                INLogItem item = new INLogItem();
                item.parentIndex = index;
                itemList.add(item);

                if (child.getDirty()) {

                    if (child.getDatabase().isDurableDeferredWrite()) {

                        /*
                         * Find dirty descendants to avoid logging nodes with
                         * never-logged children. See [#13936] and
                         * IN.logDirtyChildren for description of the case.
                         *
                         * Note that we must log both dirty and never-logged
                         * descendants to be sure to have a consistent view of
                         * the split. If we didn't, we could end up with the
                         * post-split version of a new sibling and the
                         * pre-split version of a split sibling in the log,
                         * which could result in a recovery where descendants
                         * are incorrectly duplicated, because they appear in
                         * both the pre-split sibling and the post-split
                         * version of the new sibling.
                         */
                        child.logDirtyChildren();
                    }

                    /* Set default params. */
                    item.provisional = provisional;
                    item.repContext = ReplicationContext.NO_REPLICATE;
                    item.parent = parent;

                    /*
                     * Allow child to perform "before log" processing.  Note
                     * that child decides whether to log a delta. Only BINs
                     * that fall into the required percentages and have not
                     * been cleaned will be logged with a delta.
                     */
                    child.beforeLog(logManager, item, context);
                } else {
                    /* Do not process if not dirty.  Unlatch now. */
                    itemList.remove(itemList.size() - 1);
                    child.releaseLatch();

                    /* Log parent if child has already been flushed. */
                    mustLogParent = true;
                }
            }

            /*
             * Log all siblings at once.  Limitations of Java generics prevent
             * conversion from List<INLogItem> to List<LogItem> even by
             * casting, so we convert to an array instead.
             */
            LogItem[] itemArray = new LogItem[itemList.size()];
            logManager.multiLog(itemList.toArray(itemArray), context);

            for (INLogItem item : itemList) {
                IN child = (IN) parent.getTarget(item.parentIndex);

                /* Allow child to perform "after log" processing. */
                child.afterLog(logManager, item, context);

                /* Update the parent slot's LSN. */
                assert (item.newLsn != DbLsn.NULL_LSN);
                parent.updateEntry(item.parentIndex, item.newLsn);

                /* Increment stats. */
                if (item.isDelta) {
                    fstats.nDeltaINFlushThisRun++;
                    fstats.nDeltaINFlush++;
                } else {
                    fstats.nFullINFlushThisRun++;
                    fstats.nFullINFlush++;
                    if (child.isBIN()) {
                        fstats.nFullBINFlush++;
                        fstats.nFullBINFlushThisRun++;
                    }
                }

                /* Parent slot has changed, must log parent. */
                mustLogParent = true;
            }
            return mustLogParent;
        } finally {
            for (INLogItem item : itemList) {
                IN child = (IN) parent.getTarget(item.parentIndex);
                child.releaseLatch();
            }
        }
    }

    /*
     * RootFlusher lets us write out the root IN while holding the root latch.
     */
    private static class RootFlusher implements WithRootLatched {
        private final DatabaseImpl db;
        /* True if the target root IN was dirty and was logged by doWork. */
        private boolean flushed;
        /* True if the target node is still the root of the tree. */
        private boolean stillRoot;
        private final LogManager logManager;
        private final long targetNodeId;

        RootFlusher(DatabaseImpl db,
                    LogManager logManager,
                    long targetNodeId) {
            this.db = db;
            flushed = false;
            this.logManager = logManager;
            this.targetNodeId = targetNodeId;
            stillRoot = false;
        }

        /**
         * Flush the rootIN if dirty.
         */
        public IN doWork(ChildReference root)
            throws DatabaseException {

            if (root == null) {
                return null;
            }
            IN rootIN = (IN) root.fetchTarget(db, null);
            rootIN.latch(CacheMode.UNCHANGED);
            try {
                if (rootIN.getNodeId() == targetNodeId) {

                    /*
                     * Find dirty descendants to avoid logging nodes with
                     * never-logged children. See [#13936]
                     */
                    if (rootIN.getDatabase().isDurableDeferredWrite()) {
                        rootIN.logDirtyChildren();
                    }

                    /*
                     * stillRoot handles the situation where the root was split
                     * after it was placed in the checkpointer's dirty set.
                     */
                    stillRoot = true;
                    if (rootIN.getDirty()) {
                        long newLsn = rootIN.log(logManager);
                        root.setLsn(newLsn);
                        flushed = true;
                    }
                }
            } finally {
                rootIN.releaseLatch();
            }
            return null;
        }

        boolean getFlushed() {
            return flushed;
        }

        boolean stillRoot() {
            return stillRoot;
        }
    }

    /*
     * CheckpointReferences are used to identify nodes that must be flushed as
     * part of the checkpoint. We don't keep an actual reference to the node
     * because that would prevent the node from being GC'ed during the
     * checkpoint.
     *
     * Using a CheckpointReference introduces a window between the point when
     * the checkpoint dirty set is created and when the node is flushed. Some
     * of the fields saved in the reference are immutable: db, nodeId.  The
     * others are not and we have to handle potential change:
     *
     * isDbRoot: it's possible for isDbRoot to go from true->false, but not
     *         false->true. True->false is handled by the flushIN method
     *         by finding the root and checking if it is the target.
     * treeKey: This can change only in the event of a split. If it does, there
     *         is the chance that the checkpointer will find the wrong node to
     *         flush, but that's okay because the split guarantees flushing to
     *         the root, so the target will be properly logged within the
     *         checkpoint period.
     *
     * The class and ctor are public for the Sizeof program.
     */
    public static class CheckpointReference {
        DatabaseId dbId;
        long nodeId;
        boolean isDbRoot;
        byte[] treeKey;

        public CheckpointReference(DatabaseId dbId,
                                   long nodeId,
                                   boolean isDbRoot,
                                   byte[] treeKey) {
            this.dbId = dbId;
            this.nodeId = nodeId;
            this.isDbRoot = isDbRoot;
            this.treeKey = treeKey;
        }

        @Override
        public boolean equals(Object o) {
            if (!(o instanceof CheckpointReference)) {
                return false;
            }

            CheckpointReference other = (CheckpointReference) o;
            return nodeId == other.nodeId;
        }

        @Override
        public int hashCode() {
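            /* Consistent with equals: identity is the nodeId alone. */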
            return (int) nodeId;
        }

        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("db=").append(dbId);
            sb.append(" nodeId=").append(nodeId);
            return sb.toString();
        }
    }

    /**
     * A struct to hold log flushing stats for checkpoint and database sync.
     */
    public static class FlushStats {

        /* Cumulative counters, maintained across checkpoint runs. */
        public long nFullINFlush;
        public long nFullBINFlush;
        public long nDeltaINFlush;

        /* Per-run counters, reset by resetPerRunCounters. */
        public long nFullINFlushThisRun;
        public long nFullBINFlushThisRun;
        public long nDeltaINFlushThisRun;

        /* For future addition to stats:
           private int nAlreadyEvictedThisRun;
        */

        /* Reset per-run counters. */
        void resetPerRunCounters() {
            nFullINFlushThisRun = 0;
            nFullBINFlushThisRun = 0;
            nDeltaINFlushThisRun = 0;
            /* nAlreadyEvictedThisRun = 0; -- for future */
        }
    }
}
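
For context, the cumulative FlushStats counters above are what surface in the checkpointer's public statistics. Below is a minimal sketch of reading them through the JE API; the EnvironmentStats accessor names (getNFullINFlush, getNFullBINFlush, getNDeltaINFlush) and the environment home path are assumptions to verify against the JE release in use.

import java.io.File;

import com.sleepycat.je.CheckpointConfig;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
import com.sleepycat.je.EnvironmentStats;
import com.sleepycat.je.StatsConfig;

public class CheckpointFlushStatsExample {

    public static void main(String[] args) throws Exception {

        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setAllowCreate(true);

        /* Assumed environment home; the directory must already exist. */
        Environment env = new Environment(new File("/tmp/je-env"), envConfig);
        try {
            /* Force a checkpoint so the flush counters are populated. */
            CheckpointConfig ckptConfig = new CheckpointConfig();
            ckptConfig.setForce(true);
            env.checkpoint(ckptConfig);

            EnvironmentStats stats = env.getStats(new StatsConfig());

            /* Accessor names assumed; they mirror the FlushStats fields. */
            System.out.println("full IN flushes:  " + stats.getNFullINFlush());
            System.out.println("full BIN flushes: " + stats.getNFullBINFlush());
            System.out.println("delta IN flushes: " + stats.getNDeltaINFlush());
        } finally {
            env.close();
        }
    }
}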