Package com.sleepycat.je.rep.impl.node

Source Code of com.sleepycat.je.rep.impl.node.Replay

/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 2002, 2011 Oracle and/or its affiliates.  All rights reserved.
*
*/

package com.sleepycat.je.rep.impl.node;

import static com.sleepycat.je.log.LogEntryType.LOG_NAMELN_TRANSACTIONAL;
import static com.sleepycat.je.log.LogEntryType.LOG_TXN_ABORT;
import static com.sleepycat.je.log.LogEntryType.LOG_TXN_COMMIT;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.MAX_COMMIT_PROCESSING_NANOS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.MIN_COMMIT_PROCESSING_NANOS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_ABORTS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMITS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_ACKS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_NO_SYNCS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_SYNCS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_WRITE_NO_SYNCS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_ELAPSED_TXN_TIME;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_LNS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_NAME_LNS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.TOTAL_COMMIT_PROCESSING_NANOS;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.concurrent.TimeUnit.NANOSECONDS;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.logging.Level;
import java.util.logging.Logger;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DatabaseNotFoundException;
import com.sleepycat.je.DbInternal;
import com.sleepycat.je.Durability.SyncPolicy;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.StatsConfig;
import com.sleepycat.je.TransactionConfig;
import com.sleepycat.je.dbi.CursorImpl.SearchMode;
import com.sleepycat.je.dbi.DatabaseId;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbTree.TruncateDbResult;
import com.sleepycat.je.dbi.DbType;
import com.sleepycat.je.dbi.EnvironmentFailureReason;
import com.sleepycat.je.dbi.PutMode;
import com.sleepycat.je.dbi.TriggerManager;
import com.sleepycat.je.log.DbOpReplicationContext;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.LogManager;
import com.sleepycat.je.log.ReplicationContext;
import com.sleepycat.je.log.entry.DbOperationType;
import com.sleepycat.je.log.entry.LNLogEntry;
import com.sleepycat.je.log.entry.LogEntry;
import com.sleepycat.je.log.entry.NameLNLogEntry;
import com.sleepycat.je.log.entry.SingleItemEntry;
import com.sleepycat.je.recovery.RecoveryInfo;
import com.sleepycat.je.recovery.RollbackTracker;
import com.sleepycat.je.rep.SyncupProgress;
import com.sleepycat.je.rep.impl.RepGroupDB;
import com.sleepycat.je.rep.impl.RepImpl;
import com.sleepycat.je.rep.impl.RepParams;
import com.sleepycat.je.rep.stream.InputWireRecord;
import com.sleepycat.je.rep.stream.MasterStatus.MasterSyncException;
import com.sleepycat.je.rep.stream.Protocol;
import com.sleepycat.je.rep.txn.ReplayTxn;
import com.sleepycat.je.rep.utilint.LongMaxZeroStat;
import com.sleepycat.je.rep.utilint.LongMinZeroStat;
import com.sleepycat.je.rep.utilint.NamedChannel;
import com.sleepycat.je.rep.vlsn.VLSNRange;
import com.sleepycat.je.tree.LN;
import com.sleepycat.je.tree.NameLN;
import com.sleepycat.je.txn.RollbackEnd;
import com.sleepycat.je.txn.RollbackStart;
import com.sleepycat.je.txn.Txn;
import com.sleepycat.je.txn.TxnAbort;
import com.sleepycat.je.txn.TxnCommit;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.LongMaxStat;
import com.sleepycat.je.utilint.LongMinStat;
import com.sleepycat.je.utilint.LongStat;
import com.sleepycat.je.utilint.StatGroup;
import com.sleepycat.je.utilint.VLSN;

/**
* Replays log records from the replication stream, and manages the
* transactions for those records.
*
* The Replay module has a lifetime equivalent to the environment owned by
* this replicator. Its lifetime is longer than the feeder/replica stream.
* For example, suppose this is nodeX:
*
* t1 - Node X is a replica, node A is master. Replay X is alive
* t2 - Node X is a replica, node B takes over as master. X's Replay module
*      is still alive and has the same set of active txns. It doesn't matter
*      to X that the master has changed.
* t3 - Node X becomes the master. Now its Replay unit is cleared, because
*      anything managed by the Replay is defunct.
*/
public class Replay {

    /* These are strings for the rollback logging. */
    private static final String RBSTATUS_START =
        "Started Rollback";
    private static final String RBSTATUS_NO_ACTIVE =
        "No active txns, nothing to rollback";
    private static final String RBSTATUS_RANGE_EQUALS =
        "End of range equals matchpoint, nothing to rollback";
    private static final String RBSTATUS_LOG_RBSTART =
        "Logged RollbackStart entry";
    private static final String RBSTATUS_MEM_ROLLBACK =
        "Finished in-memory rollback";
    private static final String RBSTATUS_INVISIBLE =
        "Finished invisible setting";
    private static final String RBSTATUS_FINISH =
        "Finished rollback";

    private final RepImpl repImpl;

    /*
     *  If a commit replay operation takes more than this threshold, it's
     *  logged. This information helps determine whether ack timeouts on the
     *  master are due to a slow replica, or the network.
     */
    private final long ackTimeoutLogThresholdInNanos;

    /**
     * ActiveTxns is a collection of txn objects used for applying replicated
     * transactions. This collection should be empty if the node is a master.
     *
     * Note that there is an interesting relationship between ActiveTxns and
     * the txn collection managed by the environment TxnManager. ActiveTxns is
     * effectively a subset of the set of txns held by the
     * TxnManager. ReplayTxns must be sure to register and unregister
     * themselves from ActiveTxns, just as all Txns must register and
     * unregister with the TxnManager's set. One implementation alternative to
     * having an ActiveTxns map here is to search the TxnManager set (which is
     * a set, not a map) for a given ReplayTxn. Another is to subclass
     * TxnManager so that replicated nodes have their own replayTxn map, just
     * as XATxns have a XID->Txn map.
     *
     * Both alternatives seemed too costly in terms of performance or elaborate
     * in terms of code to do for the current function. It seems clearer to
     * make the ActiveTxns a map in the one place that it is currently
     * used. This choice may change over time, and should be reevaluated if the
     * implementation changes.
     *
     * The ActiveTxns key is the transaction id. These transactions are closed
     * when:
     * - the replay unit executes a commit received over the replication stream
     * - the replay unit executes an abort received over the replication stream
     * - the replication node realizes that it has just become the new master,
     *   and was not previously the master.
     *
     * Note that the Replay class has a lifetime that is longer than that of a
     * RepNode. This means in particular, that transactions may be left open,
     * and will be resumed when a replica switches from one master to another,
     * creating a new RepNode in the process. Because of that, part of the
     * transaction may be implemented by the rep stream from one master and
     * another part by another.
     */
    private final ConcurrentMap<Long, ReplayTxn> activeTxns;

    /*
     * The entry representing the last replayed txn commit. Supports the
     * replica's notion of consistency.
     */
    private volatile TxnInfo lastReplayedTxn = null;

    /*
     * The last replayed entry of any kind. Supports PointConsistencyPolicy.
     */
    private volatile VLSN lastReplayedVLSN = null;

    /*
     * The sync policy to be used in the absence of an ACK request. The replica
     * in this case has some latitude about how it syncs the commit.
     */
    private final SyncPolicy noAckSyncPolicy = SyncPolicy.NO_SYNC;

    /**
     *  The RepParams.REPLAY_LOGGING_THRESHOLD configured logging threshold.
     */
    private final long replayLoggingThresholdNs;

    /* Maintains the statistics associated with stream replay. */
    private final StatGroup statistics;
    private final LongStat nCommits;
    private final LongStat nCommitAcks;
    private final LongStat nCommitSyncs;
    private final LongStat nCommitNoSyncs;
    private final LongStat nCommitWriteNoSyncs;
    private final LongStat nAborts;
    private final LongStat nNameLNs;
    private final LongStat nLNs;
    private final LongStat nElapsedTxnTime;
    private final LongMinStat minCommitProcessingNanos;
    private final LongMaxStat maxCommitProcessingNanos;
    private final LongStat totalCommitProcessingNanos;

    private final Logger logger;

    public Replay(RepImpl repImpl,
                  @SuppressWarnings("unused") NameIdPair nameIdPair) {

        /*
         * This should have already been caught in
         * ReplicatedEnvironment.setupRepConfig, but it is checked here anyway
         * as an added sanity check. [#17643]
         */
        if (repImpl.isReadOnly()) {
            throw EnvironmentFailureException.unexpectedState
                ("Replay created with readonly ReplicatedEnvironment");
        }

        this.repImpl = repImpl;
        ackTimeoutLogThresholdInNanos =
            repImpl.getConfigManager().
            getDuration(RepParams.REPLICA_ACK_TIMEOUT) * 1000000L;

        /*
         * The Replay module manages all write transactions and mimics a
         * writing application thread. When the node comes up, it populates
         * the activeTxn collection with ReplayTxns that were resurrected
         * at recovery time.
         */
        activeTxns = new ConcurrentHashMap<Long,ReplayTxn>();

        logger = LoggerUtils.getLogger(getClass());
        statistics = new StatGroup(ReplayStatDefinition.GROUP_NAME,
                                   ReplayStatDefinition.GROUP_DESC);

        nCommits = new LongStat(statistics, N_COMMITS);
        nCommitAcks = new LongStat(statistics, N_COMMIT_ACKS);
        nCommitSyncs = new LongStat(statistics, N_COMMIT_SYNCS);
        nCommitNoSyncs = new LongStat(statistics, N_COMMIT_NO_SYNCS);
        nCommitWriteNoSyncs =
            new LongStat(statistics, N_COMMIT_WRITE_NO_SYNCS);
        nAborts = new LongStat(statistics, N_ABORTS);
        nNameLNs = new LongStat(statistics, N_NAME_LNS);
        nLNs = new LongStat(statistics, N_LNS);
        nElapsedTxnTime = new LongStat(statistics, N_ELAPSED_TXN_TIME);
        minCommitProcessingNanos =
            new LongMinZeroStat(statistics, MIN_COMMIT_PROCESSING_NANOS);
        maxCommitProcessingNanos =
            new LongMaxZeroStat(statistics, MAX_COMMIT_PROCESSING_NANOS);
        totalCommitProcessingNanos =
            new LongStat(statistics, TOTAL_COMMIT_PROCESSING_NANOS);

        replayLoggingThresholdNs =
                MILLISECONDS.toNanos(repImpl.getConfigManager().
                getDuration(RepParams.REPLAY_LOGGING_THRESHOLD));
    }

    /**
     * Actions that must be taken before the recovery checkpoint, whether
     * the environment is read/write or read/only.
     */
    public void preRecoveryCheckpointInit(RecoveryInfo recoveryInfo) {
        for (Txn txn : recoveryInfo.replayTxns.values()) {

            /*
             * ReplayTxns need to know about their owning activeTxn map,
             * so they can remove themselves at close. We are casting upwards,
             * because we the non-HA code is prohibited from referencing
             * Replication classes, and the RecoveryInfo.replayTxns collection
             * doesn't know that it's got ReplayTxns.
             */
            ((ReplayTxn) txn).registerWithActiveTxns(activeTxns);
        }
        lastReplayedVLSN = repImpl.getVLSNIndex().getRange().getLast();
    }

    public TxnInfo getLastReplayedTxn() {
        return lastReplayedTxn;
    }

    public VLSN getLastReplayedVLSN() {
        return lastReplayedVLSN;
    }

    /**
     * When mastership changes, all inflight replay transactions are aborted.
     * Replay transactions are only aborted by the new master (who was
     * previously a Replica); the replay transactions on the other replicas are
     * resolved by the abort record issued by said new master.
     */
    public void abortOldTxns()
        throws DatabaseException {

        final int masterNodeId = repImpl.getNodeId();
        for (ReplayTxn replayTxn : activeTxns.values()) {
            replayTxn.abort(ReplicationContext.MASTER, masterNodeId);
        }
        assert activeTxns.size() == 0 : "Unexpected txns in activeTxns = " +
            activeTxns;
    }

    private void updateCommitStats(boolean needsAck,
                                   SyncPolicy syncPolicy,
                                   long startTimeNanos) {

        final long now = System.nanoTime();
        final long commitNanos = now - startTimeNanos;

        if (commitNanos > ackTimeoutLogThresholdInNanos &&
            logger.isLoggable(Level.INFO)) {
            LoggerUtils.info
                (logger, repImpl,
                 "Replay commit time: " + (commitNanos / 1000000) +
                 " ms exceeded log threshold: " +
                 (ackTimeoutLogThresholdInNanos / 1000000));
        }

        nCommits.increment();

        if (needsAck) {
            nCommitAcks.increment();
        }

        if (syncPolicy == SyncPolicy.SYNC) {
            nCommitSyncs.increment();
        } else if (syncPolicy == SyncPolicy.NO_SYNC) {
            nCommitNoSyncs.increment();
        } else if (syncPolicy == SyncPolicy.WRITE_NO_SYNC) {
            nCommitWriteNoSyncs.increment();
        } else {
            throw EnvironmentFailureException.unexpectedState
                ("Unknown sync policy: " + syncPolicy);
        }

        totalCommitProcessingNanos.add(commitNanos);
        minCommitProcessingNanos.setMin(commitNanos);
        maxCommitProcessingNanos.setMax(commitNanos);
    }

    /**
     * Apply the operation represented by this log entry on this replica node.
     * @throws InterruptedException
     * @throws MasterSyncException
     */
    public void replayEntry(NamedChannel namedChannel,
                            Protocol protocol,
                            Protocol.Entry entry)
        throws DatabaseException,
               IOException,
               InterruptedException,
               MasterSyncException {

        final long startNs = System.nanoTime();
        final InputWireRecord wireRecord = entry.getWireRecord();
        final LogEntry logEntry = wireRecord.getLogEntry();

        /*
         * Sanity check that the replication stream is in sequence. We want to
         * forestall any possible corruption from replaying invalid entries.
         */
        if (!wireRecord.getVLSN().follows(lastReplayedVLSN)) {
            throw new EnvironmentFailureException
                (repImpl,
                 EnvironmentFailureReason.UNEXPECTED_STATE,
                 "Rep stream not sequential. Current VLSN: " +
                 lastReplayedVLSN +
                 " next log entry VLSN: " + wireRecord.getVLSN());
        }

        if (logger.isLoggable(Level.FINEST)) {
            LoggerUtils.finest(logger, repImpl, "Replaying " + wireRecord);
        }

        final ReplayTxn repTxn = getReplayTxn(logEntry.getTransactionId());
        updateReplicaSequences(logEntry);
        final byte entryType = wireRecord.getEntryType();

        lastReplayedVLSN = wireRecord.getVLSN();

        final RepNode repNode = repImpl.getRepNode();
        try {
            if (LOG_TXN_COMMIT.equalsType(entryType)) {
                Protocol.Commit commitEntry = (Protocol.Commit) entry;

                final boolean needsAck = commitEntry.getNeedsAck();
                final SyncPolicy syncPolicy =
                    needsAck ?
                    commitEntry.getReplicaSyncPolicy() :
                    noAckSyncPolicy;

                logReplay(repTxn, needsAck, syncPolicy);

                final TxnCommit masterCommit =
                        (TxnCommit) logEntry.getMainItem();

                if (needsAck) {

                    /*
                     * Only wait if the replica is not lagging and the
                     * durability requires it.
                     */
                    repNode.getVLSNFreezeLatch().awaitThaw();
                    repNode.getMasterStatus().assertSync();
                }

                repTxn.commit(syncPolicy,
                              new ReplicationContext(lastReplayedVLSN),
                              masterCommit.getMasterNodeId());

                final long masterCommitTimeMs =
                        masterCommit.getTime().getTime();
                lastReplayedTxn = new TxnInfo(lastReplayedVLSN,
                                              masterCommitTimeMs);

                updateCommitStats(needsAck, syncPolicy, startNs);

                /* Respond to the feeder. */
                if (needsAck) {
                    protocol.write(protocol.new Ack(repTxn.getId()),
                                   namedChannel);
                }

                /*
                 * The group refresh and recalculation can be expensive, since
                 * it may require a database read. Do it after the ack.
                 */
                if (repTxn.getRepGroupDbChange() && canRefreshGroup(repTxn)) {
                    repNode.refreshCachedGroup();
                    repNode.recalculateGlobalCBVLSN();
                }

                nElapsedTxnTime.add(repTxn.elapsedTime());
            } else if (LOG_TXN_ABORT.equalsType(entryType)) {

                nAborts.increment();
                final TxnAbort masterAbort = (TxnAbort) logEntry.getMainItem();
                final ReplicationContext abortContext =
                    new ReplicationContext(wireRecord.getVLSN());
                if (logger.isLoggable(Level.FINEST)) {
                    LoggerUtils.finest(logger, repImpl,
                                       "abort called for " + repTxn.getId() +
                                       " masterId=" +
                                       masterAbort.getMasterNodeId() +
                                       " repContext=" + abortContext);
                }
                repTxn.abort(abortContext, masterAbort.getMasterNodeId());
                lastReplayedTxn = new TxnInfo(lastReplayedVLSN,
                                              masterAbort.getTime().getTime());
                if (repTxn.getRepGroupDbChange() && canRefreshGroup(repTxn)) {

                    /*
                     * Refresh is the safe thing to do on an abort, since a
                     * refresh may have been held back from an earlier commit
                     * due to this active transaction.
                     */
                    repNode.refreshCachedGroup();
                }
                nElapsedTxnTime.add(repTxn.elapsedTime());
            } else if (LOG_NAMELN_TRANSACTIONAL.equalsType(entryType)) {

                repNode.getReplica().clearDbTreeCache();
                nNameLNs.increment();
                applyNameLN(repTxn, wireRecord);
            } else {

                nLNs.increment();
                /* A data operation. */
                assert wireRecord.getLogEntry() instanceof LNLogEntry;
                applyLN(repTxn, wireRecord);
            }
            /* Remember the last VLSN applied by this txn. */
            repTxn.setLastAppliedVLSN(lastReplayedVLSN);
        } catch (DatabaseException e) {
            e.addErrorMessage("Problem seen replaying entry " + wireRecord);
            throw e;
        } finally {
            final long elapsedNs = System.nanoTime() - startNs;
            if (elapsedNs > replayLoggingThresholdNs) {
                LoggerUtils.info(logger, repImpl,
                                 "Replay time for entry type:" +
                                 LogEntryType.findType(entryType) + " " +
                                 NANOSECONDS.toMillis(elapsedNs) + "ms " +
                                 "exceeded threshold:" +
                                 NANOSECONDS.
                                     toMillis(replayLoggingThresholdNs) +
                                 "ms");
            }
        }
    }

    /**
     * Logs information associated with the replay of the txn commit
     */
    private void logReplay(ReplayTxn repTxn,
                           boolean needsAck,
                           SyncPolicy syncPolicy) {

        if (!logger.isLoggable(Level.FINE)) {
            return;
        }

        if (needsAck) {
            LoggerUtils.fine(logger, repImpl,
                             "Replay: got commit for txn=" + repTxn.getId() +
                             ", ack needed, replica sync policy=" +
                             syncPolicy +
                             " vlsn=" + lastReplayedVLSN);
        } else {
            LoggerUtils.fine(logger, repImpl,
                             "Replay: got commit for txn=" + repTxn.getId() +
                             " ack not needed" +
                             " vlsn=" + lastReplayedVLSN);
        }
    }

    /**
     * Returns true if there are no other activeTxns that have also modified
     * the membership database and are still open, since they could potentially
     * hold write locks that would block the read locks acquired during the
     * refresh operation.
     *
     * @param txn the current txn being committed or aborted
     *
     * @return true if there are no open transactions that hold locks on the
     * membership database.
     */
    private boolean canRefreshGroup(ReplayTxn txn) {
        for (ReplayTxn atxn : activeTxns.values()) {
            if (atxn == txn) {
                continue;
            }
            if (atxn.getRepGroupDbChange()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Update this replica's node, txn and database sequences with any ids in
     * this log entry. We can call update, even if the replay id doesn't
     * represent a new lowest-id point, or if the apply is not successful,
     * because the apply checks that the replay id is < the sequence on the
     * replica. We just want to ensure that if this node becomes the master,
     * its sequences are in sync with what came before in the replication
     * stream, and ids are not incorrectly reused.
     */
    private void updateReplicaSequences(LogEntry logEntry) {

        /* For now, we assume all replay entries have a txn id. */
        repImpl.getTxnManager().updateFromReplay(logEntry.getTransactionId());

        /* If it's a database operation, update the database id. */
        if (logEntry instanceof NameLNLogEntry) {
            NameLNLogEntry nameLogEntry = (NameLNLogEntry) logEntry;
            nameLogEntry.postFetchInit(false /*isDupDb*/);
            NameLN nameLN = (NameLN) nameLogEntry.getLN();
            repImpl.getDbTree().updateFromReplay(nameLN.getId());
        }
    }

    /**
     * Obtain a ReplayTxn to represent the incoming operation.
     */
    private ReplayTxn getReplayTxn(long txnId)
        throws DatabaseException {

        /*
         * We are using a ConcurrentHashMap, but not using the atomic
         * putIfAbsent method. That's because putIfAbsent requires creating a
         * candidate ReplayTxn for insertion, which runs afoul of two
         * issues. (a) ReplayTxn construction may be heavyweight, because of
         * the need to adjust the MemoryBudget and create various collections,
         * and (b) more importantly, Txn construction has the side effect of
         * inserting a Txn into the txnManager's txn table.  Although ReplayTxn
         * construction may be made cheaper with lazy initializion in order to
         * reduce memory cost, factor (b) prohibits using putIfAbsent().
         */
        ReplayTxn useTxn = null;
        synchronized (activeTxns) {
            useTxn = activeTxns.get(txnId);
            if (useTxn == null) {

                /*
                 * Durability will be explicitly specified when
                 * ReplayTxn.commit is called, so TransactionConfig.DEFAULT is
                 * fine.
                 */
                useTxn = new ReplayTxn(repImpl,
                                       TransactionConfig.DEFAULT,
                                       txnId,
                                       activeTxns,
                                       logger);
            }
        }
        return useTxn;
    }

    /**
     * Replays the NameLN.
     *
     * Note that the operations: remove, rename and truncate need to establish
     * write locks on the database. Any open handles are closed by this
     * operation by virtue of the ReplayTxn's importunate property.  The
     * application will receive a LockPreemptedException if it subsequently
     * accesses the database handle.
     */
    private void applyNameLN(ReplayTxn repTxn,
                             InputWireRecord wireRecord)
        throws DatabaseException {

        NameLNLogEntry nameLNEntry = (NameLNLogEntry) wireRecord.getLogEntry();
        final NameLN nameLN = (NameLN) nameLNEntry.getLN();

        String databaseName;
        try {
            databaseName = new String(nameLNEntry.getKey(), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw EnvironmentFailureException.unexpectedException(e);
        }

        final DbOpReplicationContext repContext =
            new DbOpReplicationContext(wireRecord.getVLSN(), nameLNEntry);

        DbOperationType opType = repContext.getDbOperationType();
        DatabaseImpl dbImpl = null;
        try {
            switch (opType) {
                case CREATE:
                {
                    DatabaseConfig dbConfig =
                        repContext.getCreateConfig().getReplicaConfig(repImpl);

                    dbImpl = repImpl.getDbTree().createReplicaDb
                      (repTxn, databaseName, dbConfig, nameLN, repContext);

                    /*
                     * We rely on the RepGroupDB.DB_ID value, so make sure
                     * it's what we expect for this internal replicated
                     * database.
                     */
                    if ((dbImpl.getId().getId() == RepGroupDB.DB_ID) &&
                        !DbType.REP_GROUP.getInternalName().equals
                        (databaseName)) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database: " +
                             DbType.REP_GROUP.getInternalName() +
                             " is associated with id: " +
                             dbImpl.getId().getId() +
                             " and not the reserved database id: " +
                             RepGroupDB.DB_ID);
                    }

                    TriggerManager.runOpenTriggers(repTxn, dbImpl, true);
                    break;
                }

                case REMOVE: {
                    dbImpl = repImpl.getDbTree().getDb(nameLN.getId());
                    try {
                        repImpl.getDbTree().removeReplicaDb
                            (repTxn, databaseName, nameLN.getId(), repContext);
                        TriggerManager.runRemoveTriggers(repTxn, dbImpl);
                    } catch (DatabaseNotFoundException e) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database: " + dbImpl.getName() +
                             " Id: " + nameLN.getId() +
                             " not found on the Replica.");
                    }
                    break;
                }

                case TRUNCATE: {
                    dbImpl = repImpl.getDbTree().getDb
                        (repContext.getTruncateOldDbId());
                    try {
                        TruncateDbResult result =
                        repImpl.getDbTree().truncateReplicaDb
                            (repTxn, databaseName, false, nameLN, repContext);
                        TriggerManager.runTruncateTriggers(repTxn, result.newDb);
                    } catch (DatabaseNotFoundException e) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database: " + dbImpl.getName() +
                             " Id: " + nameLN.getId() +
                             " not found on the Replica.");
                    }

                    break;
                }

                case RENAME: {
                    dbImpl = repImpl.getDbTree().getDb(nameLN.getId());
                    try {
                        dbImpl =
                        repImpl.getDbTree().renameReplicaDb
                            (repTxn, dbImpl.getName(), databaseName, nameLN,
                             repContext);
                        TriggerManager.runRenameTriggers(repTxn, dbImpl,
                                                         databaseName);
                    } catch (DatabaseNotFoundException e) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database rename from: " + dbImpl.getName() +
                             " to " + databaseName +
                             " failed, name not found on the Replica.");
                    }
                    break;
                }

                case UPDATE_CONFIG: {
                    /* Get the replicated database configurations. */
                    DatabaseConfig dbConfig =
                        repContext.getCreateConfig().getReplicaConfig(repImpl);

                    /* Update the NameLN and write it to the log. */
                    dbImpl = repImpl.getDbTree().getDb(nameLN.getId());
                    final String dbName = dbImpl.getName();
                    repImpl.getDbTree().updateNameLN
                        (repTxn, dbName, repContext);

                    /* Set the new configurations to DatabaseImpl. */
                    dbImpl.setConfigProperties
                        (repTxn, dbName, dbConfig, repImpl);

                    repImpl.getDbTree().modifyDbRoot(dbImpl);

                    break;
                }

                default:
                    throw EnvironmentFailureException.unexpectedState
                        ("Illegal database op type of " + opType.toString() +
                         " from " + wireRecord + " database=" + databaseName);
            }
        } finally {
            if (dbImpl != null) {
                repImpl.getDbTree().releaseDb(dbImpl);
            }
        }
    }

    private void applyLN(final ReplayTxn repTxn,
                         final InputWireRecord wireRecord)
        throws DatabaseException {

        final LNLogEntry lnEntry = (LNLogEntry) wireRecord.getLogEntry();
        final DatabaseId dbId = lnEntry.getDbId();

        /*
         * If this is a change to the rep group db, remember at commit time,
         * and refresh this node's group metadata.
         */
        if (dbId.getId() == RepGroupDB.DB_ID) {
            repTxn.noteRepGroupDbChange();
        }

        /*
         * Note that we don't have to worry about serializable isolation when
         * applying a replicated txn; serializable isolation in only an issue
         * for txns that take read locks, and a replicated txn consists only of
         * write operations.
         */
        final DatabaseImpl dbImpl =
            repImpl.getRepNode().getReplica().getDbCache().get(dbId, repTxn);
        lnEntry.postFetchInit(dbImpl);
        final ReplicationContext repContext =
            new ReplicationContext(wireRecord.getVLSN());
        final Cursor cursor = DbInternal.makeCursor(dbImpl, repTxn,
                                                    null /*cursorConfig*/);
        try {
            OperationStatus status;
            final LN ln = lnEntry.getLN();

            if (ln.isDeleted()) {

                /*
                 * Perform an exact search by key.  Use read-uncommitted and
                 * partial data entry to avoid reading old data.
                 */
                final DatabaseEntry key = new DatabaseEntry(lnEntry.getKey());
                final DatabaseEntry data = new DatabaseEntry();
                data.setPartial(0, 0, true);
                status = DbInternal.searchForReplay(cursor, key, data,
                                                    LockMode.READ_UNCOMMITTED,
                                                    SearchMode.SET);
                if (status == OperationStatus.SUCCESS) {
                    status = DbInternal.deleteInternal(cursor, repContext);
                }
            } else {
                status = DbInternal.putForReplay(cursor, lnEntry.getKey(), ln,
                                                 PutMode.OVERWRITE,
                                                 repContext);
            }

            if (status != OperationStatus.SUCCESS) {
                throw new EnvironmentFailureException
                    (repImpl,
                     EnvironmentFailureReason.LOG_INCOMPLETE,
                     "Replicated operation could  not be applied. Status= " +
                     status + ' ' + wireRecord);
            }
        } finally {
            cursor.close();
        }
    }

    /**
     * Go through all active txns and rollback up to but not including the log
     * entry represented by the matchpoint VLSN.
     *
     * Effectively truncate these rolled back log entries by making them
     * invisible. Flush the log first, to make sure these log entries are out
     * of the log buffers and are on disk, so we can reliably find them through
     * the FileManager.
     *
     * Rollback steps are described in
     * https://sleepycat.oracle.com/trac/wiki/Logging#Recoverysteps. In
     * summary,
     *
     * 1. Log and fsync a new RollbackStart record
     * 2. Do the rollback in memory. There is no need to explicitly
     *    log INs made dirty by the rollback operation.
     * 3. Do invisibility masking by overwriting LNs.
     * 4. Fsync all overwritten log files at this point.
     * 5. Write a RollbackEnd record, for ease of debugging
     *
     * Note that application read txns  can continue to run during syncup.
     * Reader txns cannot access records that are being rolled back, because
     * they are in txns that are not committed, i.e, they are write locked.
     * The rollback interval never includes committed txns, and we do a hard
     * recovery if it would include them.
     */
    public void rollback(VLSN matchpointVLSN, long matchpointLsn) {

        String rollbackStatus = RBSTATUS_START;

        try {
            if (activeTxns.size() == 0) {
                /* no live read/write txns, nothing to do. */
                rollbackStatus = RBSTATUS_NO_ACTIVE;
                return;
            }

            VLSNRange range = repImpl.getVLSNIndex().getRange();
            if (range.getLast().equals(matchpointVLSN)) {
                /* nothing to roll back. */
                rollbackStatus = RBSTATUS_RANGE_EQUALS;
                return;
            }

            repImpl.setSyncupProgress(SyncupProgress.DO_ROLLBACK);

            /*
             * Stop the log file backup service, since the files will be in an
             * inconsistent state while the rollback is in progress.
             */
            repImpl.getRepNode().shutdownNetworkBackup();

            /*
             * Set repImpl's isRollingBack to true, and invalidate all the in
             * progress DbBackup.
             */
            repImpl.setBackupProhibited(true);
            repImpl.invalidateBackups(DbLsn.getFileNumber(matchpointLsn));

            /*
             * 1. Log RollbackStart. The fsync guarantees that this marker will
             * be present in the log for recovery. It also ensures that all log
             * entries will be flushed to disk and the TxnChain will not have
             * to worry about entries that are in log buffers when constructing
             * the rollback information.
             */
            LogManager logManager = repImpl.getLogManager();
            LogEntry rollbackStart =
                new SingleItemEntry(LogEntryType.LOG_ROLLBACK_START,
                                    new RollbackStart(matchpointVLSN,
                                                      matchpointLsn,
                                                      activeTxns.keySet()));
            long rollbackStartLsn =
                logManager.logForceFlush(rollbackStart,
                                         true, // fsyncRequired,
                                         ReplicationContext.NO_REPLICATE);
            rollbackStatus = RBSTATUS_LOG_RBSTART;

            /*
             * 2. Do rollback in memory. Undo any operations that were logged
             * after the matchpointLsn, and save the LSNs for those log
             * entries.. There should be something to undo, because we checked
             * earlier that there were log entries after the matchpoint.
             */
            List<Long> rollbackLsns = new ArrayList<Long>();
            for (ReplayTxn replayTxn : activeTxns.values()) {
                Collection<Long> txnRollbackLsns =
                    replayTxn.rollback(matchpointLsn);

                /*
                 * Txns that were entirely rolled back should have been removed
                 * from the activeTxns map.
                 */
                assert checkRemoved(replayTxn) :
                    "Should have removed " + replayTxn;

                rollbackLsns.addAll(txnRollbackLsns);
            }
            rollbackStatus = RBSTATUS_MEM_ROLLBACK;
            assert rollbackLsns.size() != 0;

            /*
             * 3 & 4 - Mark the rolled back log entries as invisible. After all
             * are done, fsync the set of files. By waiting, some may have made
             * it out on their own.
             */
            RollbackTracker.makeInvisible(repImpl, rollbackLsns);
            rollbackStatus = RBSTATUS_INVISIBLE;

            /*
             * 5. Log RollbackEnd. Flush it so that we can use it to optimize
             * recoveries later on. If the RollbackEnd exists, we can skip the
             * step of re-making LNs invisible.
             */
            logManager.logForceFlush
                (new SingleItemEntry(LogEntryType.LOG_ROLLBACK_END,
                                     new RollbackEnd(matchpointLsn,
                                                     rollbackStartLsn)),
                 true, // fsyncRequired
                 ReplicationContext.NO_REPLICATE);

            /*
             * Restart the backup service only if all the steps of the
             * rollback were successful.
             */
            repImpl.getRepNode().restartNetworkBackup();
            repImpl.setBackupProhibited(false);
            rollbackStatus = RBSTATUS_FINISH;
        } finally {

            /* Reset the lastReplayedVLSN so it's correct when we resume. */
            lastReplayedVLSN = matchpointVLSN;
            LoggerUtils.info(logger, repImpl,
                             "Rollback to matchpoint " + matchpointVLSN +
                             " at " + DbLsn.getNoFormatString(matchpointLsn) +
                             " status=" + rollbackStatus);
        }
    }

    private boolean checkRemoved(ReplayTxn txn) {
        if (txn.isClosed()) {
            if (activeTxns.containsKey(txn.getId())) {
                return false;
            }
    }

        return true;
    }

    /**
     * Release all transactions, database handles, etc held by the replay
     * unit. The Replicator is closing down and Replay will not be invoked
     * again.
     */
    public void close() {
        for (ReplayTxn replayTxn : activeTxns.values()) {
            try {
                if (logger.isLoggable(Level.FINE)) {
                    LoggerUtils.fine(logger, repImpl,
                                     "Unregistering open replay txn: " +
                                     replayTxn.getId());
                }
                replayTxn.cleanup();
            } catch (DatabaseException e) {
                LoggerUtils.fine(logger, repImpl,
                                 "Replay txn: " + replayTxn.getId() +
                                 " unregistration failed: " + e.getMessage());
            }
        }
        assert activeTxns.size() == 0;
    }

    /**
     * Returns a copy of the statistics associated with Replay
     */
    public StatGroup getStats(StatsConfig config) {
        StatGroup ret = statistics.cloneGroup(config.getClear());

        return ret;
    }

    public void resetStats() {
        statistics.clear();
    }

    /* For unit tests */
    public Map<Long, ReplayTxn> getActiveTxns() {
        return activeTxns;
    }

    public String dumpState() {
        StringBuilder sb = new StringBuilder();
        sb.append("lastReplayedTxn=").append(lastReplayedTxn);
        sb.append(" lastReplayedVLSN=").append(lastReplayedVLSN);
        sb.append(" numActiveReplayTxns=").append(activeTxns.size());
        sb.append("\n");
        return sb.toString();
    }

    /**
     * Simple helper class to package a Txn vlsn and its associated commit
     * time.
     */
    public static class TxnInfo {
        final VLSN txnVLSN;
        final long masterTxnEndTime;

        private TxnInfo(VLSN txnVLSN, long masterTxnEndTime) {
            this.txnVLSN = txnVLSN;
            this.masterTxnEndTime = masterTxnEndTime;
        }

        public VLSN getTxnVLSN() {
            return txnVLSN;
        }

        public long getMasterTxnEndTime() {
            return masterTxnEndTime;
        }

        @Override
        public String toString() {
            return " VLSN: " + txnVLSN +
                " masterTxnEndTime=" + new Date(masterTxnEndTime);
        }
    }
}
TOP

Related Classes of com.sleepycat.je.rep.impl.node.Replay

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.