/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 2002, 2011 Oracle and/or its affiliates. All rights reserved.
*
*/
package com.sleepycat.je.rep.impl.node;
import static com.sleepycat.je.rep.ReplicatedEnvironment.State.DETACHED;
import static com.sleepycat.je.rep.ReplicatedEnvironment.State.MASTER;
import static com.sleepycat.je.rep.ReplicatedEnvironment.State.REPLICA;
import static com.sleepycat.je.rep.ReplicatedEnvironment.State.UNKNOWN;
import static com.sleepycat.je.rep.impl.RepParams.CATCHUP_MASTER_PHASE2_TIMEOUT;
import static com.sleepycat.je.rep.impl.RepParams.CATCHUP_MASTER_TIMEOUT;
import static com.sleepycat.je.rep.impl.RepParams.DBTREE_CACHE_CLEAR_COUNT;
import static com.sleepycat.je.rep.impl.RepParams.ENV_CONSISTENCY_TIMEOUT;
import static com.sleepycat.je.rep.impl.RepParams.HEARTBEAT_INTERVAL;
import static com.sleepycat.je.rep.impl.RepParams.LOG_FLUSH_TASK_INTERVAL;
import static com.sleepycat.je.rep.impl.RepParams.RUN_LOG_FLUSH_TASK;
import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.util.HashSet;
import java.util.Set;
import java.util.Timer;
import java.util.UUID;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;
import com.sleepycat.je.CheckpointConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Durability;
import com.sleepycat.je.Durability.ReplicaAckPolicy;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.RecoveryProgress;
import com.sleepycat.je.ReplicaConsistencyPolicy;
import com.sleepycat.je.StatsConfig;
import com.sleepycat.je.dbi.DbConfigManager;
import com.sleepycat.je.dbi.StartupTracker.Phase;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.LogManager;
import com.sleepycat.je.rep.AppStateMonitor;
import com.sleepycat.je.rep.GroupShutdownException;
import com.sleepycat.je.rep.MasterStateException;
import com.sleepycat.je.rep.MasterTransferFailureException;
import com.sleepycat.je.rep.MemberNotFoundException;
import com.sleepycat.je.rep.NoConsistencyRequiredPolicy;
import com.sleepycat.je.rep.QuorumPolicy;
import com.sleepycat.je.rep.RepInternal;
import com.sleepycat.je.rep.ReplicaConsistencyException;
import com.sleepycat.je.rep.ReplicaStateException;
import com.sleepycat.je.rep.ReplicatedEnvironment;
import com.sleepycat.je.rep.ReplicatedEnvironmentStats;
import com.sleepycat.je.rep.RestartRequiredException;
import com.sleepycat.je.rep.UnknownMasterException;
import com.sleepycat.je.rep.elections.Elections;
import com.sleepycat.je.rep.elections.Learner;
import com.sleepycat.je.rep.elections.MasterValue;
import com.sleepycat.je.rep.elections.Proposer.Proposal;
import com.sleepycat.je.rep.elections.Proposer.WinningProposal;
import com.sleepycat.je.rep.elections.TimebasedProposalGenerator;
import com.sleepycat.je.rep.impl.BinaryNodeStateService;
import com.sleepycat.je.rep.impl.GroupService;
import com.sleepycat.je.rep.impl.NodeStateService;
import com.sleepycat.je.rep.impl.PointConsistencyPolicy;
import com.sleepycat.je.rep.impl.RepGroupDB;
import com.sleepycat.je.rep.impl.RepGroupImpl;
import com.sleepycat.je.rep.impl.RepImpl;
import com.sleepycat.je.rep.impl.RepNodeImpl;
import com.sleepycat.je.rep.impl.RepParams;
import com.sleepycat.je.rep.monitor.LeaveGroupEvent.LeaveReason;
import com.sleepycat.je.rep.stream.FeederTxns;
import com.sleepycat.je.rep.stream.MasterChangeListener;
import com.sleepycat.je.rep.stream.MasterStatus;
import com.sleepycat.je.rep.stream.MasterSuggestionGenerator;
import com.sleepycat.je.rep.util.ldiff.LDiffService;
import com.sleepycat.je.rep.utilint.RepUtils;
import com.sleepycat.je.rep.utilint.RepUtils.ExceptionAwareCountDownLatch;
import com.sleepycat.je.rep.utilint.ServiceDispatcher;
import com.sleepycat.je.rep.vlsn.VLSNIndex;
import com.sleepycat.je.sync.impl.LogChangeSet;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.StoppableThread;
import com.sleepycat.je.utilint.TestHook;
import com.sleepycat.je.utilint.TestHookExecute;
import com.sleepycat.je.utilint.VLSN;
/**
* Represents a replication node. This class is the locus of operations that
* manage the state of the node, master, replica, etc. Once the state of a node
* has been established the thread of control passes over to the Replica or
* FeederManager instances.
*
* Note that both Feeders and the Replica instance may be active in future when
* we support r2r replication, in addition to m2r replication. For now however,
* either the FeederManager is active, or the Replica is and the same common
* thread control can be shared between the two.
*/
public class RepNode extends StoppableThread {
/*
* The unique node name and internal id that identifies the node within
* the rep group. There is a canonical instance of this that's updated
* when the node joins the group.
*/
private final NameIdPair nameIdPair;
/*
* The socket address on which Replicas connect to me, were this node
* to become the master.
*/
private final InetSocketAddress mySocket;
/* The service dispatcher used by this replication node. */
private final ServiceDispatcher serviceDispatcher;
/* The election instance for this node */
private Elections elections;
/* The locus of operations when the node is a replica. */
private final Replica replica;
/* Used when the node is a feeder. */
private final FeederManager feederManager;
/*
* The status of the Master. Note that this is the leading state as
* communicated to this node via the Listener. The node itself may not as
* yet have responded to this state change announced by the Listener. That
* is, nodeState, may reflect a different state until the transition to
* this state has been completed.
*/
private final MasterStatus masterStatus;
private final MasterChangeListener changeListener;
private final MasterSuggestionGenerator suggestionGenerator;
/*
* Represents the application visible state of this node. It may lag the
* state as described by masterStatus.
*/
private final NodeState nodeState;
/*
* Determines whether a node designated as a Primary is actually active
* as a Master as a direct result of this designation. If this is true,
* it indicates that this node is currently the master in a two node group
* and that it's not in communication with the Secondary node.
*/
private volatile boolean activePrimary = false;
/*
* If non-zero, this value overrides the normal group size calculations.
*/
private int electableGroupSizeOverride;
private final RepImpl repImpl;
/* The encapsulated internal replication group database. */
final RepGroupDB repGroupDB;
/*
* The latch used to indicate that the node has a well defined state as a
* Master or Replica and has finished the node-specific initialization that
* will permit it to function immediately in that capacity.
*
* For a Master it means that it's ready to start accepting connections
* from Replicas.
*
* For a Replica, it means that it has established a connection with a
* Feeder, completed the handshake process that validates it as being a
* legitimate member of the group, established a sync point, and is ready
* to start replaying the replication stream.
*/
private volatile ExceptionAwareCountDownLatch readyLatch = null;
/*
* Latch used to freeze txn commit VLSN advancement during an election.
*/
private final CommitFreezeLatch vlsnFreezeLatch = new CommitFreezeLatch();
/*
* Describes the nodes that form the group. This information is dynamic:
* it's initialized at startup and subsequently updated as a result of
* changes made either directly to it, when the node is a master, or via
* the replication stream, when it is a Replica.
*/
private RepGroupImpl group;
/*
* Represents the VLSN of the TxnAbort/TxnCommit entry of the last finished
* transaction. It's volatile so that reads initiated by the feeders get
* the most up-to-date values.
*/
volatile private VLSN currentTxnEndVLSN = null;
/*
* Determines the election policy to use when the node holds its very first
* elections
*/
private QuorumPolicy electionQuorumPolicy = QuorumPolicy.SIMPLE_MAJORITY;
/*
* Amount of time, in milliseconds, to sleep between retries when a new node
* tries to locate a master.
*/
private static final int MASTER_QUERY_INTERVAL = 10000;
/* Number of times to retry joining on a retryable exception. */
private static final int JOIN_RETRIES = 10;
/*
* Encapsulates access to current time, to arrange for testing of clock
* skews.
*/
private final Clock clock;
private com.sleepycat.je.rep.impl.networkRestore.FeederManager
logFeederManager;
private LDiffService ldiff;
private NodeStateService nodeStateService;
private BinaryNodeStateService binaryNodeStateService;
/* tracks the local CBVLSN for this node. */
final LocalCBVLSNTracker cbvlsnTracker;
/* calculates and manages the global, cached CBVLSN */
final GlobalCBVLSN globalCBVLSN;
/* Determines how long to wait for a replica to catch up on a close. */
private long replicaCloseCatchupMs = -1;
/* Manage and notify MonitorChangeEvents fired by this RepNode. */
private MonitorEventManager monitorEventManager;
/* The user defined AppStateMonitor which gets the application state. */
private AppStateMonitor appStateMonitor;
/* A timer used to track inactive socket channels used by the RepNode. */
private final Timer timer;
private final ChannelTimeoutTask channelTimeoutTask;
private LogFlusher logFlusher;
final Logger logger;
/* Used by tests only. */
private TestHook<Integer> versionHook;
private TestHook<Thread> masterTransferHook;
/**
 * Creates a replication node bound to the given replicated environment.
 *
 * Construction starts the service dispatcher on the environment's socket,
 * creates the Replica and FeederManager components, starts the utility
 * services and configures the log flusher. The node's own thread is not
 * started by this constructor.
 *
 * @param repImpl the replicated environment implementation this node
 * belongs to; it supplies the name/id pair, socket and configuration
 * @param replay handed to ReplicaFactory when creating the Replica
 * @param nodeState the application visible state of this node
 *
 * @throws IOException
 * @throws DatabaseException
 */
public RepNode(RepImpl repImpl,
Replay replay,
NodeState nodeState)
throws IOException, DatabaseException {
super(repImpl, "RepNode " + repImpl.getNameIdPair());
this.repImpl = repImpl;
readyLatch = new ExceptionAwareCountDownLatch(repImpl, 1);
nameIdPair = repImpl.getNameIdPair();
logger = LoggerUtils.getLogger(getClass());
this.mySocket = repImpl.getSocket();
this.serviceDispatcher = new ServiceDispatcher(mySocket, repImpl);
/* The dispatcher is started before any service is registered with it. */
serviceDispatcher.start();
clock = new Clock(RepImpl.getClockSkewMs());
this.repGroupDB = new RepGroupDB(repImpl);
masterStatus = new MasterStatus(nameIdPair);
replica = ReplicaFactory.create(this, replay);
feederManager = new FeederManager(this);
changeListener = new MasterChangeListener(this);
suggestionGenerator = new MasterSuggestionGenerator(this);
this.nodeState = nodeState;
electableGroupSizeOverride = repImpl.getConfigManager().
getInt(RepParams.ELECTABLE_GROUP_SIZE_OVERRIDE);
/* An override is unusual enough to be worth a warning in the log. */
if (electableGroupSizeOverride > 0) {
LoggerUtils.warning(logger, repImpl,
"Electable group size override set to:" +
electableGroupSizeOverride);
}
utilityServicesStart();
this.cbvlsnTracker = new LocalCBVLSNTracker(this);
this.globalCBVLSN = new GlobalCBVLSN(this);
this.monitorEventManager = new MonitorEventManager(this);
/*
 * A daemon timer. It must exist before configLogFlusher() runs, since
 * the LogFlusher created there is handed this timer.
 */
timer = new Timer(true);
channelTimeoutTask = new ChannelTimeoutTask(timer);
configLogFlusher(getConfigManager());
}
/**
 * Creates the auxiliary services that are layered on this node's service
 * dispatcher: LDiff, the network restore feeder manager and the two node
 * state services. Only the NodeStateService is explicitly registered with
 * the dispatcher here.
 */
private void utilityServicesStart() {
    this.ldiff = new LDiffService(serviceDispatcher, repImpl);

    this.logFeederManager =
        new com.sleepycat.je.rep.impl.networkRestore.FeederManager(
            serviceDispatcher, repImpl, nameIdPair);

    /* The node state querying service needs an explicit registration. */
    this.nodeStateService = new NodeStateService(serviceDispatcher, this);
    serviceDispatcher.register(this.nodeStateService);

    this.binaryNodeStateService =
        new BinaryNodeStateService(serviceDispatcher, this);
}
/**
 * Creates a placeholder node without a service dispatcher, for test
 * purposes only.
 *
 * @param nameIdPair the name/id pair identifying the placeholder node
 */
public RepNode(NameIdPair nameIdPair) {
    this(nameIdPair, (ServiceDispatcher) null);
}
/**
 * Creates a placeholder node identified by NameIdPair.NULL.
 */
public RepNode() {
    this(NameIdPair.NULL);
}
/**
 * Creates a placeholder node: only the name/id pair, the clock, the master
 * status and the supplied service dispatcher are populated. Every other
 * final component (repImpl, replica, feeder manager, logger, timer, ...)
 * is left null, so methods that dereference them cannot be used on such an
 * instance.
 *
 * @param nameIdPair the name/id pair identifying the node
 * @param serviceDispatcher the dispatcher to associate with the node, may
 * be null
 */
public RepNode(NameIdPair nameIdPair,
ServiceDispatcher serviceDispatcher) {
super("RepNode " + nameIdPair);
repImpl = null;
/* No clock skew for placeholder nodes. */
clock = new Clock(0);
this.nameIdPair = nameIdPair;
mySocket = null;
this.serviceDispatcher = serviceDispatcher;
this.repGroupDB = null;
/* Note: the master status is created with the NULL pair, not nameIdPair. */
masterStatus = new MasterStatus(NameIdPair.NULL);
replica = null;
feederManager = null;
changeListener = null;
suggestionGenerator = null;
nodeState = null;
cbvlsnTracker = null;
globalCBVLSN = null;
logger = null;
timer = null;
channelTimeoutTask = null;
}
/**
 * Returns the logger used by this node; null for placeholder nodes.
 */
@Override
public Logger getLogger() {
    return this.logger;
}
/**
 * Returns the timer owned by this RepNode; null for placeholder nodes.
 */
public Timer getTimer() {
    return this.timer;
}
/**
 * Returns the service dispatcher used by this replication node.
 */
public ServiceDispatcher getServiceDispatcher() {
    return this.serviceDispatcher;
}
/**
 * Returns the accumulated statistics for this node, built from its
 * FeederManager, its Replica and the environment's VLSN index.
 *
 * @param config the stats configuration passed through to the stats
 * factory
 */
public ReplicatedEnvironmentStats getStats(StatsConfig config) {
    final VLSNIndex vlsnIndex = repImpl.getVLSNIndex();
    return RepInternal.makeReplicatedEnvironmentStats(
        feederManager, replica, vlsnIndex, config);
}
/**
 * Resets the statistics of the feeder manager first and then those of the
 * replica.
 */
public void resetStats() {
    this.feederManager.resetStats();
    this.replica.resetStats();
}
/**
 * Returns the current ready latch. The latch instance is replaced by
 * resetReadyLatch(), so callers should not cache it across resets.
 */
public ExceptionAwareCountDownLatch getReadyLatch() {
    return this.readyLatch;
}
/**
 * Returns the latch used to freeze txn commit VLSN advancement during an
 * election.
 */
public CommitFreezeLatch getVLSNFreezeLatch() {
    return this.vlsnFreezeLatch;
}
/**
 * Installs a fresh ready latch. If the current latch has not yet been
 * counted down, its waiters are released first with the supplied
 * exception.
 *
 * @param exception handed to waiters of the old latch when it is released
 * early
 */
public void resetReadyLatch(Exception exception) {
    final boolean stillPending = (readyLatch.getCount() != 0);
    if (stillPending) {
        /* Some error situation: don't leave the waiters hanging. */
        readyLatch.releaseAwait(exception);
    }
    readyLatch = new ExceptionAwareCountDownLatch(repImpl, 1);
}
/* The methods below return the components of the rep node. */

/**
 * Returns the feeder manager of this node; null for placeholder nodes.
 */
public FeederManager feederManager() {
    return this.feederManager;
}
/**
 * Returns the Replica component of this node; null for placeholder nodes.
 */
public Replica replica() {
    return this.replica;
}
/**
 * Returns the clock through which this node accesses the current time.
 */
public Clock getClock() {
    return this.clock;
}
/**
 * Package-level accessor for the Replica component; returns the same
 * object as replica().
 */
Replica getReplica() {
    return this.replica;
}
/**
 * Returns the encapsulated internal replication group database.
 */
public RepGroupDB getRepGroupDB() {
    return this.repGroupDB;
}
/**
 * Returns the cached description of the group; null until the cache has
 * been populated by refreshCachedGroup().
 */
public RepGroupImpl getGroup() {
    return this.group;
}
/**
 * Returns the UUID associated with the replicated environment, as recorded
 * in the cached group.
 *
 * @throws EnvironmentFailureException if the group information has not
 * been cached yet
 */
public UUID getUUID() {
    if (group != null) {
        return group.getUUID();
    }
    throw EnvironmentFailureException.unexpectedState
        ("Group info is not available");
}
/**
 * Returns the name part of this node's name/id pair.
 *
 * @return the nodeName
 */
public String getNodeName() {
    final String name = nameIdPair.getName();
    return name;
}
/**
 * Returns the id part of this node's name/id pair.
 *
 * @return the nodeId
 */
public int getNodeId() {
    final int id = nameIdPair.getId();
    return id;
}
/**
 * Returns the name/id pair that identifies this node within the group.
 */
public NameIdPair getNameIdPair() {
    return this.nameIdPair;
}
/**
 * Returns the socket address on which Replicas would connect to this node;
 * null for placeholder nodes.
 */
public InetSocketAddress getSocket() {
    return this.mySocket;
}
/**
 * Returns the master status tracked by this node.
 */
public MasterStatus getMasterStatus() {
    return this.masterStatus;
}
/**
 * Gives a definitive answer on whether this node is currently the master.
 * The node must both be the group master according to its master status
 * and be backed by a simple majority: the active replicas connected to its
 * feeders, plus the node itself, must reach the SIMPLE_MAJORITY quorum
 * size. This guards against a master that is isolated on the minority side
 * of a network partition.
 *
 * @return true if the node is definitely the master. False if it's not or
 * we cannot be sure.
 */
public boolean isAuthoritativeMaster() {
    final boolean groupMaster = getMasterStatus().isGroupMaster();
    /* The "+ 1" accounts for this node itself. */
    return groupMaster &&
        ((feederManager.activeReplicaCount() + 1) >=
         getElectionQuorumSize(QuorumPolicy.SIMPLE_MAJORITY));
}
/**
 * Returns the configured HEARTBEAT_INTERVAL parameter value.
 */
public int getHeartbeatInterval() {
    final DbConfigManager configManager = getConfigManager();
    return configManager.getInt(HEARTBEAT_INTERVAL);
}
/**
 * For unit testing only. Installs a hook whose value replaces the log
 * version reported by getLogVersion().
 */
public void setVersionHook(TestHook<Integer> versionHook) {
    this.versionHook = versionHook;
}
/**
 * For unit testing only. Installs the hook that transferMaster() runs
 * after the catch-up phases and before the result is broadcast.
 */
public void setMasterTransferHook(TestHook<Thread> masterTransferHook) {
    this.masterTransferHook = masterTransferHook;
}
/**
 * Returns the log version of this node: LogEntryType.LOG_VERSION, unless
 * a test hook has been installed to supply a different value.
 */
public int getLogVersion() {
    if (versionHook == null) {
        return LogEntryType.LOG_VERSION;
    }
    /* A test has overridden the log version. */
    return versionHook.getHookValue();
}
/**
 * Returns the priority this node uses in elections. A designated primary
 * whose NODE_PRIORITY was left at the default has its priority raised by
 * one; an explicitly configured priority is always returned intact.
 */
public int getElectionPriority() {
    final DbConfigManager configManager = getConfigManager();
    final int configured = configManager.getInt(RepParams.NODE_PRIORITY);
    final int defaultValue =
        Integer.parseInt(RepParams.NODE_PRIORITY.getDefault());
    final boolean designatedPrimary =
        configManager.getBoolean(RepParams.DESIGNATED_PRIMARY);

    if (designatedPrimary && (configured == defaultValue)) {
        /* Raise its priority. */
        return defaultValue + 1;
    }
    /* Explicit priority, leave it intact. */
    return configured;
}
/**
 * Returns the amount of time to wait for a thread to finish on a shutdown.
 * It is four heartbeat intervals, since a thread typically polls for a
 * shutdown once per heartbeat.
 */
public int getThreadWaitInterval() {
    return 4 * getHeartbeatInterval();
}
/**
 * Returns the configured DBTREE_CACHE_CLEAR_COUNT parameter value.
 */
int getDbTreeCacheClearingOpCount() {
    final DbConfigManager configManager = getConfigManager();
    return configManager.getInt(DBTREE_CACHE_CLEAR_COUNT);
}
/**
 * Returns the replicated environment implementation this node belongs to;
 * null for placeholder nodes.
 */
public RepImpl getRepImpl() {
    return this.repImpl;
}
/**
 * Returns the log manager of the underlying environment.
 */
public LogManager getLogManager() {
    final LogManager logManager = repImpl.getLogManager();
    return logManager;
}
/**
 * Returns the configuration manager of the underlying environment.
 */
DbConfigManager getConfigManager() {
    final DbConfigManager configManager = repImpl.getConfigManager();
    return configManager;
}
/**
 * Returns the VLSN index of the underlying environment.
 */
public VLSNIndex getVLSNIndex() {
    final VLSNIndex vlsnIndex = repImpl.getVLSNIndex();
    return vlsnIndex;
}
/**
 * Returns the FeederTxns instance of the underlying environment.
 */
public FeederTxns getFeederTxns() {
    final FeederTxns feederTxns = repImpl.getFeederTxns();
    return feederTxns;
}
/**
 * Returns the election instance for this node; it is created during
 * startup and is null before that.
 */
public Elections getElections() {
    return this.elections;
}
/**
 * Returns the generator of master suggestions used by this node.
 */
public MasterSuggestionGenerator getSuggestionGenerator() {
    return this.suggestionGenerator;
}
/**
 * Used by unit tests only. Returns the quorum policy currently recorded
 * for elections initiated by this node.
 */
public QuorumPolicy getElectionPolicy() {
    return this.electionQuorumPolicy;
}
/**
 * Returns the nodes suitable for feeding log files for a network restore:
 * all electable members of the cached group.
 *
 * @return an array holding every electable member, in the iteration order
 * of the group's member set
 */
public RepNodeImpl[] getLogProviders() {
    final Set<RepNodeImpl> electable = getGroup().getAllElectableMembers();
    return electable.toArray(new RepNodeImpl[electable.size()]);
}
/**
 * Used by unit tests only. Returns the log flusher, or null if the flush
 * task has never been enabled.
 */
public LogFlusher getLogFlusher() {
    return this.logFlusher;
}
/**
 * Configures the log flusher according to the given configuration.
 *
 * When RUN_LOG_FLUSH_TASK is off, any existing flush task is cancelled.
 * Otherwise the LogFlusher is created on first use and its task is
 * (re)configured with LOG_FLUSH_TASK_INTERVAL.
 *
 * @param configMgr the configuration to read both parameters from
 */
public void configLogFlusher(DbConfigManager configMgr) {
    final boolean taskEnabled = configMgr.getBoolean(RUN_LOG_FLUSH_TASK);
    final int interval = configMgr.getDuration(LOG_FLUSH_TASK_INTERVAL);

    if (taskEnabled) {
        /* Lazily create the flusher, then (re)configure its task. */
        if (logFlusher == null) {
            logFlusher = new LogFlusher(this, timer);
        }
        logFlusher.configFlushTask(interval);
        return;
    }

    /* The task is disabled: cancel it if it was ever set up. */
    if (logFlusher != null) {
        logFlusher.cancelTask();
    }
}
/**
 * Returns the task, scheduled on this node's timer, that tracks inactive
 * socket channels.
 */
public ChannelTimeoutTask getChannelTimeoutTask() {
    return this.channelTimeoutTask;
}
/**
 * Returns true if the master status reports this node as the master.
 */
public boolean isMaster() {
    final boolean nodeIsMaster = masterStatus.isNodeMaster();
    return nodeIsMaster;
}
/**
 * Records the VLSN associated with the latest commit or abort. Callers
 * are expected to supply the values in ascending order; no check is made
 * here.
 *
 * @param txnEndVLSN the VLSN of a TxnAbort/TxnCommit
 */
public void currentTxnEndVLSN(VLSN txnEndVLSN) {
    this.currentTxnEndVLSN = txnEndVLSN;
}
/**
 * Returns the manager of the MonitorChangeEvents fired by this RepNode.
 */
public MonitorEventManager getMonitorEventManager() {
    return this.monitorEventManager;
}
/**
 * Registers an AppStateMonitor with this RepNode, replacing any monitor
 * registered earlier.
 *
 * @param stateMonitor the monitor consulted by getAppState()
 */
public void registerAppStateMonitor(AppStateMonitor stateMonitor) {
    appStateMonitor = stateMonitor;
}
/**
 * Returns the application state supplied by the user specified
 * AppStateMonitor.
 *
 * The monitor is consulted exactly once per call, so that the value that
 * is validated is the very value that is returned, even if the
 * application's state changes concurrently.
 *
 * @return the application state, or null if no AppStateMonitor is
 * registered or the monitor currently has no state to report
 *
 * @throws IllegalStateException if the monitor returns a zero length
 * byte array
 */
public byte[] getAppState() {
    /* Read the field once; it can be replaced by another thread. */
    final AppStateMonitor monitor = appStateMonitor;
    if (monitor == null) {
        return null;
    }

    final byte[] appState = monitor.getAppState();
    if (appState == null) {
        return null;
    }

    /* Application state shouldn't be a zero length byte array. */
    if (appState.length == 0) {
        throw new IllegalStateException
            ("Application state should be a byte array larger than 0.");
    }
    return appState;
}
/**
 * Returns the name of the current group master.
 *
 * The master's name/id pair is read once, so that the id check and the
 * returned name always refer to the same master even if the master
 * changes concurrently.
 *
 * @return the master's node name, or null if no master is currently known
 */
public String getMasterName() {
    final NameIdPair groupMaster = masterStatus.getGroupMasterNameId();
    if (groupMaster.getId() == NameIdPair.NULL_NODE_ID) {
        return null;
    }
    return groupMaster.getName();
}
/**
 * Returns the VLSN most recently recorded via currentTxnEndVLSN(VLSN), or
 * null if none has been recorded yet.
 */
public VLSN getCurrentTxnEndVLSN() {
    return this.currentTxnEndVLSN;
}
/**
 * Testing API used to force this node as a master. The mastership is
 * communicated upon election completion via the Listener. It's the
 * responsibility of the caller to ensure that only one node is forced
 * at a time via this API.
 *
 * @param force true to force this node as the master, false reverts back
 * to use of normal (non-preemptive) elections.
 *
 * @throws InterruptedException
 * @throws DatabaseException
 */
public void forceMaster(boolean force)
    throws InterruptedException, DatabaseException {

    suggestionGenerator.forceMaster(force);

    /*
     * An election is what makes the changed proposal heard; it is held
     * against a freshly read copy of the group.
     */
    refreshCachedGroup();
    elections.initiateElection(this.group, this.electionQuorumPolicy);
}
/**
 * Starts up the thread in which the node does its processing as a master
 * or replica. It is a no-op if the thread is already alive.
 *
 * Before the thread is started, the Elections instance is created and,
 * within the FIND_MASTER startup phase, the node either re-initializes the
 * group with itself as the sole member (RESET_REP_GROUP) or looks for a
 * master, and then begins participating in elections.
 *
 * NOTE(review): earlier documentation stated that this method waits for
 * the new thread to transition the node out of the DETACHED state; no such
 * wait is performed in this method itself -- confirm whether the caller
 * does it.
 *
 * @param initialElectionPolicy the quorum policy recorded for the node's
 * elections once a master has been searched for
 *
 * @throws IOException
 * @throws DatabaseException
 */
private void startup(QuorumPolicy initialElectionPolicy)
throws IOException, DatabaseException {
if (isAlive()) {
return;
}
assert(nodeState.getRepEnvState().isDetached());
elections = new Elections(this,
changeListener,
suggestionGenerator);
repImpl.getStartupTracker().start(Phase.FIND_MASTER);
try {
if (repImpl.getConfigManager().
getBoolean(RepParams.RESET_REP_GROUP)) {
/* Invoked by DbResetRepGroup utility */
reinitSelfElect();
} else {
findMaster();
}
/* The policy is only recorded after the master search has completed. */
this.electionQuorumPolicy = initialElectionPolicy;
elections.participate();
} finally {
/* Close the startup phase even if the master search failed. */
repImpl.getStartupTracker().stop(Phase.FIND_MASTER);
}
start();
}
/**
 * Re-reads the group from the group database into the cache and passes it
 * on to the elections instance.
 * <p>
 * This method must be invoked when a RepNode is first initialized and
 * subsequently every time there is a change to the replication group: the
 * Master invokes it each time a member is added or removed, and a replica
 * each time it detects the commit of a transaction that modifies the
 * membership database.
 * <p>
 * It must also be invoked after a syncup operation, since a syncup may
 * revert changes made to the membership table.
 * <p>
 * As a side effect, a node that does not know its own id yet picks it up
 * from the refreshed group, if the group already lists it.
 *
 * @return the refreshed group
 *
 * @throws DatabaseException
 */
public RepGroupImpl refreshCachedGroup()
    throws DatabaseException {

    group = repGroupDB.getGroup(new NoConsistencyRequiredPolicy());
    elections.updateRepGroup(group);

    if (!nameIdPair.hasNullId()) {
        /* The node already knows its id, nothing more to do. */
        return group;
    }

    final RepNodeImpl self = group.getMember(nameIdPair.getName());
    if (self != null) {
        /* May not be sufficiently current in the rep stream. */
        nameIdPair.update(self.getNameIdPair());
    }
    return group;
}
/**
 * Removes a node so that it's no longer a member of the group.
 *
 * Note that names referring to deleted nodes cannot be reused.
 *
 * @param nodeName identifies the node to be deleted.
 *
 * @throws MemberNotFoundException if the node denoted by
 * <code>nodeName</code> is not a member of the replication group.
 *
 * @throws MasterStateException if the member being removed is currently
 * the Master
 *
 * @see <a href="https://sleepycat.oracle.com/trac/wiki/DynamicGroupMembership#DeletingMembers">Member Deletion</a>
 */
public void removeMember(String nodeName) {
    checkValidity(nodeName, "Removing member");

    /*
     * The cached group goes first: dropping the member there effectively
     * sets new durability requirements for the group db updates that
     * follow.
     */
    final RepNodeImpl removed = group.removeMember(nodeName);

    /*
     * Stop any feeder still serving the replica. Later attempts by the
     * replica to rejoin the group will fail.
     */
    feederManager.shutdownFeeder(removed);

    /* Finally, make the removal persistent. */
    repGroupDB.removeMember(removed);
}
/**
 * Updates the network address of a node.
 *
 * The address of a node that is alive, that is, one with an active feeder
 * on this master, cannot be updated; a ReplicaStateException is thrown in
 * that case.
 *
 * @param nodeName identifies the node to be updated
 * @param newHostName the new host name of this node
 * @param newPort the new port of this node
 */
public void updateAddress(String nodeName,
                          String newHostName,
                          int newPort) {
    checkValidity(nodeName, "Updating node's address");

    /* A node with a feeder is considered to be alive. */
    final boolean hasLiveFeeder = (feederManager.getFeeder(nodeName) != null);
    if (hasLiveFeeder) {
        throw new ReplicaStateException
            ("Can't update the network address for a live node.");
    }

    /* Change the cached node and persist it in the group database. */
    final RepNodeImpl target = group.getNode(nodeName);
    target.setHostName(newHostName);
    target.setPort(newPort);
    repGroupDB.updateMember(target);
}
/**
 * Transfer the mastership to a specified replica.
 *
 * The transfer runs in two catch-up phases, each bounded by its own
 * timeout (CATCHUP_MASTER_TIMEOUT and CATCHUP_MASTER_PHASE2_TIMEOUT), and
 * ends by broadcasting an election result that names the replica as the
 * new master.
 *
 * @param nodeName the name of the node that will be elected as the
 * master
 *
 * @throws MasterTransferFailureException if the replica has no active
 * feeder, or if either catch-up phase fails
 */
public synchronized void transferMaster(String nodeName) {
checkValidity(nodeName, "Transferring mastership");
/* Check if the replica is alive. */
Feeder feeder = feederManager.getFeeder(nodeName);
if (feeder == null) {
throw new MasterTransferFailureException
("Replica is going to be the master is not alive, transfer " +
"fails, master doesn't change.");
}
long catchupTimeout =
getConfigManager().getDuration(CATCHUP_MASTER_TIMEOUT);
long catchupPhase2Timeout =
getConfigManager().getDuration(CATCHUP_MASTER_PHASE2_TIMEOUT);
try {
/*
* First phase: the replica catches up with the master.
*
* Transactions commits are not blocked at this time. Even though
* application writes are blocked, note that internal transactions
* may be in active and may cause the currentTxnEndVLSN to advance.
*
* In this first phase, the replica only needs to catch up with the
* currentTxnEndVLSN value in order for the phase to complete.
*/
replicaCatchupMaster(feeder, catchupTimeout, true);
/*
* Second phase: transferring the state.
*
* During this phase, MasterTxn commits/aborts are blocked.
*/
repImpl.createAndSetBlockLatch();
/*
* Now we stop all the transactions, both internal and those
* started by users. In this case, we expect the feederVLSN to be
* the same as the currentTxnEndVLSN value after the catch up.
*/
replicaCatchupMaster(feeder, catchupPhase2Timeout, false);
} catch (Exception e) {
/*
* Release the transaction blocking latch if the catching up phase
* is not successfully finished.
*
* NOTE(review): this also runs when the first phase fails, i.e.
* before createAndSetBlockLatch() was called -- confirm that
* countDownBlockLatch() tolerates that. Also note that only the
* message of the original exception survives; its type and stack
* trace are dropped.
*/
repImpl.countDownBlockLatch();
throw new MasterTransferFailureException(e.getMessage());
}
/*
* On success the block latch is not released in this method;
* presumably it is released once the master change takes effect --
* TODO confirm.
*/
TestHookExecute.doHookIfSet(masterTransferHook);
/* Broadcast the result message to elect a new master. */
broadcastMessage(nodeName);
}
/**
 * Waits for the replica that is to become the new master to catch up with
 * this master's replication stream.
 *
 * A latch is handed to the replica's feeder and awaited for at most
 * catchTime milliseconds. Whether the latch was released or the wait timed
 * out is not examined; success is decided solely by comparing the feeder's
 * VLSN with the target VLSN once the wait is over.
 *
 * NOTE(review): currentTxnEndVLSN is initially null; if no transaction has
 * ended on this node yet, this method fails with a NullPointerException --
 * confirm whether that state is reachable here.
 *
 * @param feeder the feeder serving the target replica
 * @param catchTime the maximum time to wait, in milliseconds
 * @param useOldTxnEndVLSN if true, the target is the transaction end VLSN
 * sampled before the wait; if false, the one sampled after it
 *
 * @throws MasterTransferFailureException if the replica has not reached
 * the target VLSN, or if the wait is interrupted
 */
private void replicaCatchupMaster(Feeder feeder,
                                  long catchTime,
                                  boolean useOldTxnEndVLSN) {
    try {
        long targetVLSN = currentTxnEndVLSN.getSequence();
        CountDownLatch syncLatch = new CountDownLatch(1);
        feeder.setSyncLatch(syncLatch);
        syncLatch.await(catchTime, TimeUnit.MILLISECONDS);

        if (!useOldTxnEndVLSN) {
            targetVLSN = currentTxnEndVLSN.getSequence();
        }

        /*
         * Sample the feeder's position once, so that the value reported
         * in the failure message is the value that was compared.
         */
        final VLSN feederVLSN = feeder.getFeederVLSN();
        if (feederVLSN.getSequence() < targetVLSN) {
            throw new MasterTransferFailureException
                ("Replica can't catch up the master, replica VLSN: " +
                 feederVLSN + " while master's latest " +
                 "transaction end VLSN: " + targetVLSN +
                 ". Transfer fails, master doesn't change");
        }
    } catch (InterruptedException e) {
        /*
         * The interrupt is translated into a transfer failure; restore
         * the thread's interrupt status so that callers further up the
         * stack can still observe it.
         */
        Thread.currentThread().interrupt();
        throw new MasterTransferFailureException
            ("Replica catching up with master is interrupted, transfer " +
             "fails, master doesn't change.");
    }
}
/**
 * Broadcasts a Result message that names the given node as the master, so
 * that the group re-elects it. The message is sent to all learner sockets
 * of the cached group.
 *
 * @param nodeName the name of the node to be announced as the new master
 */
private void broadcastMessage(String nodeName) {
    final RepNodeImpl target = group.getNode(nodeName);

    /* The value being "elected": the target's address and identity. */
    final MasterValue designated = new MasterValue(
        target.getSocketAddress().getAddress().getHostAddress(),
        target.getSocketAddress().getPort(),
        target.getNameIdPair());
    final Proposal proposal =
        new TimebasedProposalGenerator().nextProposal();

    elections.getLearner();
    final WinningProposal result =
        new WinningProposal(proposal, designated, null);
    Learner.informLearners(group.getLearnerSockets(),
                           result,
                           elections.getProtocol(),
                           elections.getThreadPool(),
                           elections.getLogger(),
                           repImpl,
                           null);
}
/**
 * Checks that this node and the node an administrative action is aimed at
 * are in suitable states:
 * - this node must currently be the master
 * - the target node must be a known member of the group
 * - the target node must not have been removed (with the removal
 *   acknowledged by a quorum)
 * - the target node must not be this node itself
 *
 * @param nodeName the name of the target node
 * @param actionName describes the action being attempted; only used in
 * the error message when this node is not the master
 *
 * @throws EnvironmentFailureException if this node is not the master
 * @throws MemberNotFoundException if the target is unknown or has been
 * removed from the group
 * @throws MasterStateException if the target is this node, the master
 */
private void checkValidity(String nodeName, String actionName)
    throws MemberNotFoundException {

    if (!nodeState.getRepEnvState().isMaster()) {
        throw EnvironmentFailureException.unexpectedState
            ("Not currently a master. " + actionName + " must be " +
             "invoked on the node that's currently the master.");
    }

    RepNodeImpl node = group.getNode(nodeName);
    if (node == null) {
        /* Leading space keeps the node name apart from the message. */
        throw new MemberNotFoundException("Node:" + nodeName +
                                          " is not a member of the group:" +
                                          group.getName());
    }

    if (node.isRemoved() && node.isQuorumAck()) {
        throw new MemberNotFoundException("Node:" + nodeName +
                                          " is not currently a member of " +
                                          "the group:" + group.getName() +
                                          " It had been removed.");
    }

    /* Check if the node is the master itself. */
    if (nodeName.equals(getNodeName())) {
        throw new MasterStateException(getRepImpl().
                                       getStateChangeEvent());
    }
}
/**
 * Updates the cached group info for the node, avoiding a database read,
 * and recalculates the global CBVLSN from the updated group. Nothing is
 * done if the node is not in the cached group.
 *
 * @param updateNameIdPair the node whose localCBVLSN must be updated.
 * @param barrierState the new node syncup state
 */
public void updateGroupInfo(NameIdPair updateNameIdPair,
                            RepGroupImpl.BarrierState barrierState) {

    final RepNodeImpl member = group.getMember(updateNameIdPair.getName());
    if (member == null) {
        /* A subsequent refresh will get it, along with the new node. */
        return;
    }

    final String message =
        "LocalCBVLSN for " + updateNameIdPair +
        " updated to " + barrierState +
        " from " + member.getBarrierState().getLastCBVLSN();
    LoggerUtils.fine(logger, repImpl, message);

    member.setBarrierState(barrierState);
    globalCBVLSN.recalculate(group);
}
/**
 * Recalculates the Global CBVLSN from the cached group. It is provoked by
 * Replay, to ensure that the replica's global CBVLSN is up to date.
 */
void recalculateGlobalCBVLSN() {
    this.globalCBVLSN.recalculate(this.group);
}
/**
 * Returns the tracker of the local CBVLSN for this node.
 */
LocalCBVLSNTracker getCBVLSNTracker() {
    return this.cbvlsnTracker;
}
/**
 * Increments the freeze counter of the local CBVLSN tracker. Each call is
 * expected to be paired with a call to unfreezeLocalCBVLSN().
 */
public void freezeLocalCBVLSN() {
    this.cbvlsnTracker.incrementFreezeCounter();
}
/**
 * Decrements the freeze counter of the local CBVLSN tracker, undoing one
 * earlier call to freezeLocalCBVLSN().
 */
public void unfreezeLocalCBVLSN() {
    this.cbvlsnTracker.decrementFreezeCounter();
}
/**
 * Finds a master node.
 *
 * The cached group is refreshed and the learner started. Then:
 * - a node that is not yet in the group database either elects itself, if
 *   the group is empty and the node is its own sole helper, or queries
 *   the group for a master until one is found;
 * - a node that is already in the group database queries the configured
 *   helpers and all known learners for the current master, once.
 *
 * @throws EnvironmentFailureException if this node is in the group
 * database but was previously deleted
 * @throws IOException
 * @throws DatabaseException
 */
private void findMaster()
    throws IOException,
           DatabaseException {

    refreshCachedGroup();
    elections.startLearner();
    LoggerUtils.info(logger, repImpl, "Current group size: " +
                     group.getElectableGroupSize());
    RepNodeImpl thisNode = group.getNode(nameIdPair.getName());
    if (thisNode == null) {
        LoggerUtils.info(logger, repImpl, "New node " + nameIdPair +
                         " unknown to rep group");
        Set<InetSocketAddress> helperSockets = repImpl.getHelperSockets();

        /*
         * Not present in the replication group. Use the helper, to get
         * to a master and enter the group.
         */
        if ((group.getElectableGroupSize() == 0) &&
            (helperSockets.size() == 1) &&
            serviceDispatcher.getSocketAddress().
                equals(helperSockets.iterator().next())) {
            /* A startup situation, should this node become master. */
            selfElect();
            elections.updateRepGroup(group);
            return;
        }
        queryGroupForMaster();
    } else {
        /* The node is in the group database. */
        if (thisNode.isRemoved()) {
            throw EnvironmentFailureException.unexpectedState
                ("Node: " + nameIdPair.getName() +
                 " was previously deleted.");
        }
        LoggerUtils.info(logger, repImpl,
                         "Existing node " + nameIdPair.getName() +
                         " querying for a current master.");

        /*
         * The group has other members, see if they know of a master,
         * along with any helpers that were also supplied. The learner
         * sockets are added to a private copy, as queryGroupForMaster()
         * does, so that the set handed out by repImpl is never modified.
         */
        Set<InetSocketAddress> helperSockets =
            new HashSet<InetSocketAddress>(repImpl.getHelperSockets());
        helperSockets.addAll(group.getLearnerSockets());
        elections.getLearner().queryForMaster(helperSockets);
    }
}
/**
 * Enforces the requirement that the addresses within a replication group
 * are either all loopback addresses or all non-local ip addresses. When
 * the two kinds are mixed, the node with a loopback address cannot be
 * contacted by a different node.
 *
 * @param helperSockets the helper nodes used by this node when contacting
 * the master.
 *
 * @throws IllegalArgumentException at the first helper whose address kind
 * differs from that of this node's own address
 */
private void checkLoopbackAddresses(Set<InetSocketAddress> helperSockets) {

    final boolean isLoopback = mySocket.getAddress().isLoopbackAddress();
    final String mine = isLoopback ? "is " : "is not ";
    final String theirs = isLoopback ?
        " which is not a loopback address." :
        " which is a loopback address.";

    for (InetSocketAddress socketAddress : helperSockets) {
        final boolean helperIsLoopback =
            socketAddress.getAddress().isLoopbackAddress();
        if (helperIsLoopback != isLoopback) {
            final StringBuilder message = new StringBuilder();
            message.append(mySocket);
            message.append(" the address associated with this node, ");
            message.append(mine);
            message.append("a loopback address.");
            message.append(" It conflicts with an existing use, by a " +
                           "different node ");
            message.append(" of the address:");
            message.append(socketAddress);
            message.append(theirs);
            message.append(" Such mixing of addresses within a group " +
                           "is not allowed, ");
            message.append("since the nodes will not be able to " +
                           "communicate with ");
            message.append("each other.");
            throw new IllegalArgumentException(message.toString());
        }
    }
}
/**
 * Used by a new node (that is not a self-elected master) to identify a
 * master. The node, which is not as yet in the group database, queries
 * both the helpers named in its configuration and the learners currently
 * in the member database, casting the widest possible net. The query is
 * repeated, with a pause of MASTER_QUERY_INTERVAL between attempts, until
 * a master is known.
 *
 * @throws EnvironmentFailureException if there is nobody to query, or if
 * the thread is interrupted while pausing between attempts
 */
private void queryGroupForMaster() {
    final Set<InetSocketAddress> helpers = repImpl.getHelperSockets();
    checkLoopbackAddresses(helpers);

    /*
     * Not in the rep group. Use the designated helpers and other members
     * of the group to detect a master.
     */
    final Set<InetSocketAddress> learners =
        new HashSet<InetSocketAddress>(helpers);
    learners.addAll(group.getLearnerSockets());
    if (learners.isEmpty()) {
        throw EnvironmentFailureException.unexpectedState
            ("Need a helper to add a new node into the group");
    }

    elections.getLearner().queryForMaster(learners);
    while (masterStatus.getGroupMasterNameId().getId() ==
           NameIdPair.NULL_NODE_ID) {
        /* Still no master: pause, then ask again. */
        try {
            Thread.sleep(MASTER_QUERY_INTERVAL);
        } catch (InterruptedException e) {
            throw EnvironmentFailureException.unexpectedException(e);
        }
        elections.getLearner().queryForMaster(learners);
    }

    LoggerUtils.info(logger, repImpl, "New node " + nameIdPair.getName() +
                     " located master: " +
                     masterStatus.getGroupMasterNameId());
}
/**
 * Elects this node as the master. The operation is only valid when the
 * group consists of just this node.
 * <p>
 * The node takes the first node id, hands a locally generated proposal to
 * the learner, transitions to the MASTER state, and then adds itself to
 * the group database as the first node.
 *
 * @throws DatabaseException
 */
private void selfElect()
throws DatabaseException {
/* The founding node takes the id reserved for the first node. */
nameIdPair.setId(RepGroupImpl.getFirstNodeId());
/* Master by default of a nascent group. */
Proposal proposal = new TimebasedProposalGenerator().nextProposal();
/*
 * Pass the locally generated proposal, and the suggestion generated for
 * it, to the learner via processResult.
 */
elections.getLearner().processResult(proposal,
suggestionGenerator.get(proposal));
LoggerUtils.info(logger, repImpl, "Nascent group. " +
nameIdPair.getName() +
" is master by virtue of being the first node.");
/*
 * Move through UNKNOWN to MASTER. masterStatus is synced in between and
 * then supplies the master name id used for the MASTER transition.
 */
nodeState.changeAndNotify(UNKNOWN, NameIdPair.NULL);
masterStatus.sync();
nodeState.changeAndNotify(MASTER, masterStatus.getNodeMasterNameId());
repImpl.getVLSNIndex().initAsMaster();
/* Record this node as the first group member and reload the group. */
repGroupDB.addFirstNode();
refreshCachedGroup();
/* Unsync so that the run loop does not call for an election. */
masterStatus.unSync();
}
/**
 * Establishes this node as the master, after re-initializing the group
 * with this as the sole node in the group. This method is used solely
 * as part of the DbResetRepGroup utility.
 * <p>
 * The steps are strictly ordered: become MASTER, flip to a new log file,
 * force a checkpoint, rewrite the group on disk, and finally truncate the
 * VLSN index so that the pre-reset log is never replayed.
 *
 * @throws IOException presumably from the log file flip or the VLSN index
 * truncation; the throwing call is not identifiable from this method alone
 */
void reinitSelfElect()
throws IOException {
/* Establish an empty group so transaction commits can proceed. */
group = repGroupDB.emptyGroup;
LoggerUtils.info(logger, repImpl, "Reinitializing group to node " +
nameIdPair);
/*
 * Unilaterally transition the nodeState to Master, so that write
 * transactions needed to reset the group and establish this node can
 * be issued against the environment.
 */
nodeState.changeAndNotify(UNKNOWN, NameIdPair.NULL);
nodeState.changeAndNotify(MASTER, masterStatus.getNodeMasterNameId());
repImpl.getVLSNIndex().initAsMaster();
/*
 * Start using new log files. The file flip ensures that we can safely
 * truncate the past VLSNs.
 */
repImpl.forceLogFileFlip();
CheckpointConfig ckptConfig = new CheckpointConfig();
ckptConfig.setForce(true);
/*
 * The checkpoint ensures that we do not have to replay VLSNs from the
 * prior group and that we have a complete VLSN index on disk.
 */
repImpl.getCheckpointer().doCheckpoint(ckptConfig,
"Reinit of RepGroup");
/* The last VLSN belonging to the old group, captured before the reset. */
VLSN lastOldVLSN = repImpl.getVLSNIndex().getRange().getLast();
/* Now create the new rep group on disk. */
repGroupDB.reinitFirstNode(lastOldVLSN);
refreshCachedGroup();
/* The file number passed to the truncation below, looked up for lastOldVLSN. */
long lastOldFile =
repImpl.getVLSNIndex().getLTEFileNumber(lastOldVLSN);
/*
 * Discard the VLSN index covering the pre group reset VLSNs, to ensure
 * that the pre reset part of the log is never replayed. We don't want
 * to replay this part of the log, since it contains references to
 * repnodes via node ids that are no longer part of the reset rep
 * group. Note that we do not reuse rep node ids, that is, the rep node
 * id sequence continues across the reset operation and is not itself
 * reset. Nodes joining the new group will need to do a network restore
 * when they join the group.
 */
repImpl.getVLSNIndex().truncateFromHead(lastOldVLSN, lastOldFile);
elections.startLearner();
/* Unsync so that the run loop does not call for an election. */
masterStatus.unSync();
}
/**
 * The top level Master/Feeder or Replica loop in support of replication.
 * It's responsible for driving the node level state changes resulting
 * from elections initiated either by this node, or by other members of the
 * group.
 * <p>
 * Each iteration of the loop: resets the node to UNKNOWN, holds an
 * election if no usable master is known, syncs to the resulting master,
 * and then runs either the feeders (as MASTER) or the replica loop (as
 * REPLICA) until that role ends.
 * <p>
 * The thread is terminated via an orderly shutdown initiated as a result
 * of an interrupt issued by the shutdown() method. Any exception that is
 * not handled by the run method itself is caught by the thread's uncaught
 * exception handler, and results in the RepImpl being made invalid. In
 * that case, the application is responsible for closing the Replicated
 * Environment, which will provoke the shutdown.
 * <p>
 * Note: This method currently runs either the feeder loop or the replica
 * loop. With R to R support, it would be possible for a Replica to run
 * both. This will be a future feature.
 */
@Override
public void run() {
/* A detached node enters the loop below in the UNKNOWN state. */
if (nodeState.getRepEnvState().isDetached()) {
nodeState.changeAndNotify(UNKNOWN, NameIdPair.NULL);
}
/* Set to indicate an error-initiated shutdown. */
Error repNodeError = null;
try {
LoggerUtils.info(logger, repImpl,
"Node " + nameIdPair.getName() + " started");
while (!isShutdown()) {
if (nodeState.getRepEnvState() != UNKNOWN) {
/* Avoid unnecessary state changes. */
nodeState.changeAndNotify(UNKNOWN, NameIdPair.NULL);
}
/*
 * Initiate elections if we don't have a group master, or there
 * is a master, but we were unable to use it.
 */
if (masterStatus.getGroupMasterNameId().hasNullId() ||
masterStatus.inSync()) {
elections.initiateElection(group, electionQuorumPolicy);
/*
 * Subsequent elections must always use a simple majority.
 */
electionQuorumPolicy = QuorumPolicy.SIMPLE_MAJORITY;
/* In case elections were shutdown. */
if (isShutdown()) {
/* The finally block below still runs on this return. */
return;
}
}
/* Start syncing this node to the new group master */
masterStatus.sync();
/* Copy status to hold it stable against concurrent updates. */
MasterStatus status = (MasterStatus) masterStatus.clone();
if (status.isNodeMaster()) {
repImpl.getVLSNIndex().initAsMaster();
replica.masterTransitionCleanup();
try {
/* The GroupService is registered only while this node is master. */
serviceDispatcher.register
(new GroupService(serviceDispatcher, this));
/* Master is ready for business. */
nodeState.changeAndNotify
(MASTER, status.getNodeMasterNameId());
feederManager.runFeeders();
} finally {
/* Always withdraw the GroupService when the master role ends. */
serviceDispatcher.cancel(GroupService.SERVICE_NAME);
}
} else {
nodeState.changeAndNotify
(REPLICA, status.getNodeMasterNameId());
replica.runReplicaLoop();
}
}
} catch (InterruptedException e) {
LoggerUtils.fine(logger, repImpl,
"RepNode main thread interrupted - " +
" forced shutdown.");
} catch (GroupShutdownException e) {
/* Saved, but not rethrown: the finally block performs the shutdown. */
saveShutdownException(e);
} catch (RuntimeException e) {
/* Saved and rethrown, after the finally block has run. */
saveShutdownException(e);
throw e;
} catch (Error e) {
/* Remembered so that the finally block can log state and rethrow it. */
repNodeError = e;
repImpl.invalidate(e);
} finally {
try {
LoggerUtils.info(logger, repImpl,
"RepNode main thread shutting down.");
if (repNodeError != null) {
LoggerUtils.info(logger, repImpl,
"Node state at shutdown:\n"+
repImpl.dumpState());
/*
 * The Error is not caught below, so it leaves this finally
 * block at once: shutdown(), the DETACHED transition and
 * cleanup() are all skipped on this path.
 */
throw repNodeError;
}
Throwable exception = getSavedShutdownException();
if (exception == null) {
LoggerUtils.fine(logger, repImpl,
"Node state at shutdown:\n"+
repImpl.dumpState());
} else {
LoggerUtils.info(logger, repImpl,
"RepNode shutdown exception:\n" +
exception.getMessage() +
repImpl.dumpState());
}
try {
shutdown();
} catch (DatabaseException e) {
/* Chain the saved shutdown exception (may be null) to e. */
RepUtils.chainExceptionCause(e, exception);
LoggerUtils.severe(logger, repImpl,
"Unexpected exception during shutdown" +
e);
throw e;
}
} catch (InterruptedException e1) {
// Ignore exceptions on exit
}
/* Reached only if the inner try completed or was interrupted. */
nodeState.changeAndNotify(DETACHED, NameIdPair.NULL);
cleanup();
}
}
/**
 * Used to shutdown all activity associated with this replication stream.
 * If method is invoked from different thread of control, it will wait
 * until the rep node thread exits. If it's from the same thread, it's the
 * caller's responsibility to exit the thread upon return from this method.
 * <p>
 * The call is a no-op if shutdownDone() reports true.
 *
 * @throws InterruptedException
 * @throws DatabaseException
 */
public void shutdown()
throws InterruptedException, DatabaseException {
/* Nothing further to do when shutdownDone() returns true. */
if (shutdownDone()) {
return;
}
LoggerUtils.info(logger, repImpl, "Shutting down node " + nameIdPair);
/* Fire a LeaveGroup if this RepNode is valid. */
if (repImpl.isValid()) {
monitorEventManager.notifyLeaveGroup(getLeaveReason());
}
/* Stop accepting any new network requests. */
serviceDispatcher.preShutdown();
if (elections != null) {
elections.shutdown();
}
/* Initiate the FeederManager soft shutdown if it's active. */
feederManager.shutdownQueue();
if ((getReplicaCloseCatchupMs() >= 0) &&
(nodeState.getRepEnvState().isMaster())) {
/*
 * A group shutdown. Shutting down the queue will cause the
 * FeederManager to shutdown its feeders and exit.
 *
 * NOTE(review): join() waits for the RepNode thread itself; confirm
 * that this branch cannot be reached from that same thread.
 */
this.join();
}
/* Shutdown the replica, if it's active. */
replica.shutdown();
shutdownThread(logger);
LoggerUtils.info(logger, repImpl,
"RepNode main thread: " + this.getName() + " exited.");
/* Shut down all other services. */
utilityServicesShutdown();
/* Shutdown all the services before shutting down the dispatcher. */
serviceDispatcher.shutdown();
LoggerUtils.info(logger, repImpl,
nameIdPair + " shutdown completed.");
/* Clear the group master and release any waiters on the ready latch. */
masterStatus.setGroupMaster(null, NameIdPair.NULL);
readyLatch.releaseAwait(getSavedShutdownException());
/* Cancel the TimerTasks. */
channelTimeoutTask.cancel();
if (logFlusher != null) {
logFlusher.cancelTask();
}
timer.cancel();
}
/**
 * Soft shutdown hook for the RepNode thread. The thread is shared by the
 * FeederManager and the Replica, whose own soft shutdown actions should
 * already have been carried out earlier, so there is nothing to initiate
 * here.
 *
 * @return the value of getThreadWaitInterval()
 */
@Override
protected int initiateSoftShutdown() {
    final int waitInterval = getThreadWaitInterval();
    return waitInterval;
}
/**
 * Determines the reason for this node leaving the group, based upon the
 * saved shutdown exception.
 *
 * @return NORMAL_SHUTDOWN when no exception was saved,
 * MASTER_SHUTDOWN_GROUP when the saved exception is a
 * GroupShutdownException, and ABNORMAL_TERMINATION otherwise
 */
private LeaveReason getLeaveReason() {
    final Exception shutdownCause = getSavedShutdownException();

    if (shutdownCause == null) {
        return LeaveReason.NORMAL_SHUTDOWN;
    }
    if (shutdownCause instanceof GroupShutdownException) {
        return LeaveReason.MASTER_SHUTDOWN_GROUP;
    }
    return LeaveReason.ABNORMAL_TERMINATION;
}
/**
 * Shuts down the utility services of this node, skipping any whose field
 * is null.
 */
private void utilityServicesShutdown() {
if (ldiff != null) {
ldiff.shutdown();
}
/* May be null, for example after shutdownNetworkBackup(). */
if (logFeederManager != null) {
logFeederManager.shutdown();
}
if (binaryNodeStateService != null) {
binaryNodeStateService.shutdown();
}
/*
 * Unlike the services above, the NodeStateService is cancelled at the
 * dispatcher by its service name.
 */
if (nodeStateService != null) {
serviceDispatcher.cancel(NodeStateService.SERVICE_NAME);
}
}
/**
 * Requests that the Replicas be shut down when the environment handle is
 * subsequently closed. Must be invoked on the Master via the last open
 * handle.
 *
 * The method does not itself shut down the group. It only records
 * replicaCloseCatchupMs, which signals that the ensuing handle close
 * should shut down the Replicas. The coordination with the closing of the
 * handle is implemented by ReplicatedEnvironment.shutdownGroup().
 *
 * @param timeoutMs the catchup time to record; a negative value is
 * recorded as zero
 *
 * @throws IllegalStateException if this node is not currently the Master
 *
 * @see ReplicatedEnvironment#shutdownGroup(long, TimeUnit)
 */
public void shutdownGroupOnClose(long timeoutMs)
    throws IllegalStateException {

    final boolean invokedOnMaster = nodeState.getRepEnvState().isMaster();
    if (!invokedOnMaster) {
        final String problem =
            "Node state must be " + MASTER +
            ", not " + nodeState.getRepEnvState();
        throw new IllegalStateException(problem);
    }

    /* Clamp negative timeouts to zero. */
    replicaCloseCatchupMs = Math.max(0L, timeoutMs);
}
/**
 * JoinGroup ensures that a RepNode is actively participating in a
 * replication group. It's invoked each time a replicated environment
 * handle is created.
 *
 * If the node is already participating in a replication group, because
 * it's not the first handle to the environment, it will return without
 * having to wait. Otherwise it will wait until a master is elected and
 * this node is active, either as a Master, or as a Replica.
 *
 * If the node joins as a replica, it will wait further until it has become
 * sufficiently consistent as defined by its consistency argument. By
 * default it uses PointConsistencyPolicy to ensure that it is at least as
 * consistent as the master as of the time the handle was opened.
 *
 * A node can also join in the Unknown state if it has been configured to
 * do so via ENV_UNKNOWN_STATE_TIMEOUT.
 *
 * @param consistency the consistency policy handed to joinAsReplica() when
 * the node joins as a Replica; may be null, in which case joinAsReplica()
 * supplies a default policy
 *
 * @param initialElectionPolicy the quorum policy passed to startup()
 *
 * @throws UnknownMasterException If a master cannot be established within
 * ENV_SETUP_TIMEOUT, unless ENV_UNKNOWN_STATE_TIMEOUT has
 * been set to allow the creation of a handle while in the UNKNOWN state.
 *
 * @throws ReplicaConsistencyException if the node is a Replica and the
 * setup timeout expires before the ready latch is released
 *
 * @return MASTER, REPLICA, or UNKNOWN (if ENV_UNKNOWN_STATE_TIMEOUT
 * is set)
 */
public ReplicatedEnvironment.State
joinGroup(ReplicaConsistencyPolicy consistency,
QuorumPolicy initialElectionPolicy)
throws ReplicaConsistencyException, DatabaseException, IOException {
final JoinGroupTimeouts timeouts =
new JoinGroupTimeouts(getConfigManager());
startup(initialElectionPolicy);
LoggerUtils.finest(logger, repImpl, "joinGroup " +
nodeState.getRepEnvState());
/*
 * The most recent retryable (ConnectRetryException caused) failure. It is
 * thrown after the loop if the retries are exhausted.
 */
DatabaseException exitException = null;
int retries=0;
repImpl.getStartupTracker().start(Phase.BECOME_CONSISTENT);
repImpl.getStartupTracker().setProgress
(RecoveryProgress.BECOME_CONSISTENT);
try {
for (retries=0; retries < JOIN_RETRIES; retries++ ) {
try {
/* Wait for Feeder/Replica to be fully initialized. */
boolean done = getReadyLatch().awaitOrException
(timeouts.getTimeout(), TimeUnit.MILLISECONDS);
/*
 * Save the state, and use it from this point forward,
 * since the node's state may change again.
 */
final ReplicatedEnvironment.State finalState =
nodeState.getRepEnvState();
if (!done) {
/* An election or setup, timeout. */
if (finalState.isReplica()) {
if (timeouts.timeoutIsForUnknownState()) {
/*
 * Replica syncing up; move onwards to the
 * setup timeout and continue with the syncup.
 */
timeouts.setSetupTimeout();
continue;
}
throw new ReplicaConsistencyException
(String.format("Setup time exceeded %,d ms",
timeouts.getSetupTimeout()),
null);
}
if (finalState.isUnknown() &&
timeouts.timeoutIsForUnknownState()) {
return UNKNOWN;
}
/*
 * Timed out in a state that cannot be returned to the
 * caller; leave the loop and fail below.
 */
break;
}
switch (finalState) {
case UNKNOWN:
/*
 * State flipped between release of ready latch and
 * nodeState.getRepEnvState() above; retry for a
 * Master/Replica state.
 */
continue;
case REPLICA:
joinAsReplica(consistency);
break;
case MASTER:
LoggerUtils.info(logger, repImpl,
"Joining group as master");
break;
case DETACHED:
throw EnvironmentFailureException.
unexpectedState("Node in DETACHED state " +
"while joining group.");
}
return finalState;
} catch (InterruptedException e) {
throw EnvironmentFailureException.unexpectedException(e);
} catch (MasterStateException e) {
/* Transition to master while establishing consistency. */
LoggerUtils.warning(logger, repImpl,
"Join retry due to master transition: "
+ e.getMessage());
continue;
} catch (RestartRequiredException e) {
LoggerUtils.warning(logger, repImpl,
"Environment needs to be restarted: " +
e.getMessage());
throw e;
} catch (DatabaseException e) {
/* Only failures caused by a ConnectRetryException are retried. */
Throwable cause = e.getCause();
if ((cause != null) &&
(cause.getClass() ==
Replica.ConnectRetryException.class)) {
/*
 * The master may have changed. Retry if there is time
 * left to do so. It may result in a new master.
 */
exitException = e;
if (timeouts.getTimeout() > 0) {
LoggerUtils.warning(logger, repImpl,
"Join retry due to exception: "
+ cause.getMessage());
continue;
}
}
/* Not retryable, or no time left in which to retry. */
throw e;
}
}
} finally {
repImpl.getStartupTracker().stop(Phase.BECOME_CONSISTENT);
}
/* Timed out or exceeded retries. */
if (exitException != null) {
LoggerUtils.warning(logger, repImpl, "Exiting joinGroup after " +
retries + " retries." + exitException);
throw exitException;
}
throw new UnknownMasterException(null, repImpl.getStateChangeEvent());
}
/**
 * Joins the group as a Replica, waiting until the node is sufficiently
 * consistent as defined by the consistency policy.
 *
 * @param consistency the consistency policy to use when joining
 * initially. If null, a PointConsistencyPolicy is used instead, built
 * from the replica's master transaction end VLSN and the
 * ENV_CONSISTENCY_TIMEOUT duration.
 *
 * @throws InterruptedException
 */
private void joinAsReplica(ReplicaConsistencyPolicy consistency)
    throws InterruptedException {

    final ReplicaConsistencyPolicy joinPolicy;
    if (consistency != null) {
        joinPolicy = consistency;
    } else {
        final int timeoutMs =
            getConfigManager().getDuration(ENV_CONSISTENCY_TIMEOUT);
        final VLSN masterTxnEnd = new VLSN(replica.getMasterTxnEndVLSN());
        joinPolicy = new PointConsistencyPolicy
            (masterTxnEnd, timeoutMs, TimeUnit.MILLISECONDS);
    }

    /* Block until the replica has caught up as required by the policy. */
    joinPolicy.ensureConsistency(repImpl);

    /*
     * Flush changes to the file system, so that any member database
     * updates defining this node itself are not lost in case of a process
     * crash. See SR 20607.
     */
    repImpl.getLogManager().flushNoSync();

    LoggerUtils.info(logger, repImpl,
                     "Joined group as a replica. " +
                     " join consistencyPolicy=" + joinPolicy +
                     " " + repImpl.getVLSNIndex().getRange());
}
/**
 * Should be called whenever a new VLSN is associated with a log entry
 * suitable for Replica/Feeder syncup. The pair is handed to the
 * cbvlsnTracker.
 *
 * @param syncableVLSN the VLSN to be tracked
 * @param lsn the lsn tracked with the VLSN; presumably the location of
 * the associated log entry (confirm against the tracker)
 */
public void trackSyncableVLSN(VLSN syncableVLSN, long lsn) {
cbvlsnTracker.track(syncableVLSN, lsn);
}
/**
 * Returns the CBVLSN currently held by globalCBVLSN.
 *
 * @return the group CBVLSN; may be NULL_VLSN
 */
public VLSN getGroupCBVLSN() {
    final VLSN groupCBVLSN = globalCBVLSN.getCBVLSN();
    return groupCBVLSN;
}
/**
 * Returns the number of nodes needed to form a quorum for elections.
 * <p>
 * A positive electable group size override takes precedence and is used
 * as the group size. Failing that, an active Primary needs only itself
 * for a SIMPLE_MAJORITY quorum. In all other cases the quorum is computed
 * from the electable size of the current group.
 *
 * @param quorumPolicy the policy used to compute the quorum
 * @return the number of nodes required for a quorum
 */
public int getElectionQuorumSize(QuorumPolicy quorumPolicy) {
    final int quorumSize;

    if (electableGroupSizeOverride > 0) {
        quorumSize = quorumPolicy.quorumSize(electableGroupSizeOverride);
    } else if (activePrimary &&
               QuorumPolicy.SIMPLE_MAJORITY.equals(quorumPolicy)) {
        /* The active Primary is a quorum all by itself. */
        quorumSize = 1;
    } else {
        quorumSize =
            quorumPolicy.quorumSize(group.getElectableGroupSize());
    }
    return quorumSize;
}
/**
 * Returns the minimum number of replication nodes required to implement
 * the ReplicaAckPolicy.
 * <p>
 * A positive electable group size override takes precedence and is used
 * as the group size. Failing that, an active Primary requires just one
 * node for the SIMPLE_MAJORITY policy. In all other cases the number is
 * computed from the electable size of the current group.
 *
 * @param ackPolicy the ack policy whose requirement is being computed
 * @return the number of nodes that are needed
 */
public int minAckNodes(ReplicaAckPolicy ackPolicy) {
    final int requiredNodes;

    if (electableGroupSizeOverride > 0) {
        requiredNodes = ackPolicy.minAckNodes(electableGroupSizeOverride);
    } else if (activePrimary &&
               ReplicaAckPolicy.SIMPLE_MAJORITY.equals(ackPolicy)) {
        /* The active Primary does not wait on any other node. */
        requiredNodes = 1;
    } else {
        requiredNodes =
            ackPolicy.minAckNodes(group.getElectableGroupSize());
    }
    return requiredNodes;
}
/**
 * Convenience overloading that extracts the replica ack policy from the
 * durability and delegates to {@link #minAckNodes(ReplicaAckPolicy)}.
 *
 * @param durability the durability supplying the replica ack policy
 * @return the number of nodes that are needed
 */
public int minAckNodes(Durability durability) {
    final ReplicaAckPolicy ackPolicy = durability.getReplicaAck();
    return minAckNodes(ackPolicy);
}
/**
 * Notes the start of a syncup by forwarding the notification to the
 * globalCBVLSN. The globalCBVLSN can't be changed when a syncup is in
 * progress; the matching notification that the syncup is over is made via
 * syncupEnded().
 */
public void syncupStarted() {
globalCBVLSN.syncupStarted();
}
/**
 * Notes the end of a syncup by forwarding the notification to the
 * globalCBVLSN.
 *
 * The globalCBVLSN can't be changed when a syncup is in progress. A feeder
 * may have multiple syncups in action.
 */
public void syncupEnded() {
globalCBVLSN.syncupEnded();
}
/**
 * Returns the file number that forms a barrier for the cleaner's file
 * deletion activities. Files with numbers >= this file number cannot be
 * deleted by the cleaner without disrupting the replication stream.
 * <p>
 * The barrier is the minimum of the SyncCleanerBarrier and the
 * GlobalCBVLSN: the sync start position is used only when one exists and
 * it is strictly below the global CBVLSN.
 *
 * @return the file number that's the barrier for cleaner file deletion
 *
 * @throws DatabaseException
 */
public long getCleanerBarrierFile()
    throws DatabaseException {

    final long minSyncStart =
        repImpl.getSyncCleanerBarrier().getMinSyncStart();

    if (minSyncStart == LogChangeSet.NULL_POSITION) {
        /* No sync barrier, only the global CBVLSN applies. */
        return globalCBVLSN.getCleanerBarrierFile();
    }

    final VLSN syncVLSN = new VLSN(minSyncStart);
    if (syncVLSN.compareTo(globalCBVLSN.getCBVLSN()) >= 0) {
        /* The global CBVLSN is the lower of the two. */
        return globalCBVLSN.getCleanerBarrierFile();
    }

    return repImpl.getVLSNIndex().getLTEFileNumber(syncVLSN);
}
/**
 * Returns the replica close catchup time, which is set by
 * shutdownGroupOnClose(). A value >= 0 on a Master is treated by
 * shutdown() as a group shutdown.
 */
long getReplicaCloseCatchupMs() {
return replicaCloseCatchupMs;
}
/**
 * Returns true if the node is a designated Primary that has been
 * activated.
 *
 * @return the current value of the activePrimary flag, which is set by
 * tryActivatePrimary() and cleared by passivatePrimary()
 */
public boolean isActivePrimary() {
return activePrimary;
}
/**
 * Tries to activate this node as a Primary. Activation succeeds only if
 * the node has been configured as the designated Primary and the
 * electable group size is two.
 *
 * This method is invoked when an operation falls short of quorum
 * requirements and is ready to trade durability for availability. More
 * specifically it's invoked when an election fails, or there is an
 * insufficient number of replicas during a begin transaction or a
 * transaction commit.
 *
 * The Primary is passivated again when the Secondary contacts it.
 *
 * @return true if the primary was activated -- the quorum value is 1
 */
public boolean tryActivatePrimary() {

    if (repImpl == null) {
        /* A dummy test rep node. */
        return false;
    }
    if (!repImpl.isDesignatedPrimary()) {
        return false;
    }
    if (getGroup().getElectableGroupSize() != 2) {
        return false;
    }

    LoggerUtils.info(logger, repImpl,
                     "Primary activated; quorum is one.");
    activePrimary = true;
    return true;
}
/**
 * Passivates the Primary, logging the change if the Primary was active.
 *
 * Invoked whenever there is an opportunity to passivate a node. Typically
 * when this node is contacted by another node and the other node is
 * sufficiently current so as to be able to respond to ack requests. Or
 * when the node is no longer the designated primary.
 */
final public void passivatePrimary() {
    final boolean wasActive = activePrimary;
    if (wasActive) {
        LoggerUtils.info(logger, repImpl, "Primary passivated.");
    }
    activePrimary = false;
}
/**
 * Shuts down the Network backup service *before* a rollback is initiated
 * as part of syncup, thus ensuring that NetworkRestore does not see an
 * inconsistent set of log files. Any network backup operations that are in
 * progress at this node are aborted. The client of the service will
 * experience network connection failures and will retry with this node
 * (when the service is re-established at this node), or with some other
 * node.
 * <p>
 * restartNetworkBackup() is then used to restart the service after it was
 * shut down.
 * <p>
 * NOTE(review): there is no null check, so a second call without an
 * intervening restartNetworkBackup() fails with a NullPointerException.
 */
final public void shutdownNetworkBackup() {
logFeederManager.shutdown();
/* Cleared so that restartNetworkBackup() can create a new manager. */
logFeederManager = null;
}
/**
 * Restarts the network backup service *after* a rollback has been
 * completed and the log files are once again in a consistent state. The
 * service must have been shut down beforehand: it is an unexpected state
 * for a log feeder manager to be in place already.
 */
final public void restartNetworkBackup() {
    final boolean stillRunning = (logFeederManager != null);
    if (stillRunning) {
        throw EnvironmentFailureException.unexpectedState(repImpl);
    }

    logFeederManager =
        new com.sleepycat.je.rep.impl.networkRestore.FeederManager
        (serviceDispatcher, repImpl, nameIdPair);
}
/**
 * A clock whose readings are offset by a fixed skew. It is used to create
 * deliberate clock skews for testing purposes. Replicator code should use
 * it instead of invoking System.currentTimeMillis() directly.
 */
public static class Clock {

    /* The fixed amount added to every reading of the system clock. */
    private final int skewMs;

    private Clock(int clockSkewMs) {
        skewMs = clockSkewMs;
    }

    /**
     * Returns the system time in milliseconds, adjusted by the skew.
     */
    public long currentTimeMillis() {
        final long systemNow = System.currentTimeMillis();
        return systemNow + skewMs;
    }
}
/**
 * Returns a description of the node's replication state: the states of
 * any active Feeders, the group CBVLSN, and the composition of the group
 * itself, each on a new line.
 */
public String dumpState() {
    final StringBuilder dump = new StringBuilder();
    dump.append("\n").append(feederManager.dumpState());
    dump.append("\nGlobalCBVLSN=").append(getGroupCBVLSN());
    dump.append("\n").append(getGroup());
    return dump.toString();
}
/**
 * Returns the state associated with all active Feeders, with a newline
 * before and after it.
 */
public String dumpFeederState() {
    final StringBuilder dump = new StringBuilder("\n");
    dump.append(feederManager.dumpState());
    dump.append("\n");
    return dump.toString();
}
/**
 * Sets the override value for the Electable Group size. A warning is
 * logged when the new value differs from the current one.
 *
 * @param override the new override value
 */
public void setElectableGroupSizeOverride(int override) {
    final boolean valueChanged = (override != electableGroupSizeOverride);
    if (valueChanged) {
        LoggerUtils.warning(logger, repImpl,
                            "Electable group size override changed to:" +
                            override);
    }
    electableGroupSizeOverride = override;
}
}