/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.service;
import java.io.IOError;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.*;
import javax.management.MBeanServer;
import javax.management.ObjectName;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.concurrent.RetryingScheduledThreadPoolExecutor;
import org.apache.cassandra.concurrent.Stage;
import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.config.*;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.commitlog.CommitLog;
import org.apache.cassandra.db.migration.AddKeyspace;
import org.apache.cassandra.db.migration.Migration;
import org.apache.cassandra.dht.BootStrapper;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.gms.*;
import org.apache.cassandra.io.DeletionService;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.locator.AbstractReplicationStrategy;
import org.apache.cassandra.locator.DynamicEndpointSnitch;
import org.apache.cassandra.locator.IEndpointSnitch;
import org.apache.cassandra.locator.TokenMetadata;
import org.apache.cassandra.net.IAsyncResult;
import org.apache.cassandra.net.Message;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.net.ResponseVerbHandler;
import org.apache.cassandra.service.AntiEntropyService.TreeRequestVerbHandler;
import org.apache.cassandra.streaming.*;
import org.apache.cassandra.thrift.Constants;
import org.apache.cassandra.thrift.UnavailableException;
import org.apache.cassandra.utils.*;
import org.apache.log4j.Level;
import org.yaml.snakeyaml.Dumper;
import org.yaml.snakeyaml.DumperOptions;
import org.yaml.snakeyaml.Yaml;
import org.yaml.snakeyaml.nodes.Tag;
/*
* This abstraction contains the token/identifier of this node
* on the identifier space. This token gets gossiped around.
* This class will also maintain histograms of the load information
* of other nodes in the cluster.
*/
public class StorageService implements IEndpointStateChangeSubscriber, StorageServiceMBean
{
/** Logger shared by every method of this service; declared final because it is never reassigned. */
private static final Logger logger_ = LoggerFactory.getLogger(StorageService.class);
/** Delay, in milliseconds, after which we assume the ring has stabilized. */
public static final int RING_DELAY = 30 * 1000;
/*
 * All verb handler identifiers.
 *
 * Verbs are serialized by ordinal (see the note at the end of the enum), so the
 * declaration order below must never change: new verbs may only be appended.
 */
public enum Verb
{
MUTATION,
BINARY,
READ_REPAIR,
READ,
REQUEST_RESPONSE, // client-initiated reads and writes
STREAM_INITIATE, // Deprecated
STREAM_INITIATE_DONE, // Deprecated
STREAM_REPLY,
STREAM_REQUEST,
RANGE_SLICE,
BOOTSTRAP_TOKEN,
TREE_REQUEST,
TREE_RESPONSE,
JOIN, // Deprecated
GOSSIP_DIGEST_SYN,
GOSSIP_DIGEST_ACK,
GOSSIP_DIGEST_ACK2,
DEFINITIONS_ANNOUNCE,
DEFINITIONS_UPDATE_RESPONSE,
TRUNCATE,
SCHEMA_CHECK,
INDEX_SCAN,
REPLICATION_FINISHED,
INTERNAL_RESPONSE, // responses to internal calls
COUNTER_MUTATION,
// Padding for backwards compatibility: lets a previous version validate a verb sent by a future version.
UNUSED_1,
UNUSED_2,
UNUSED_3,
;
// remember to add new verbs at the end, since we serialize by ordinal
}
/** Snapshot of {@code Verb.values()}, indexable by ordinal. */
public static final Verb[] VERBS = Verb.values();

/**
 * Maps each verb to the stage assigned to it.
 * The deprecated verbs (STREAM_INITIATE, STREAM_INITIATE_DONE, JOIN) have no entry.
 */
public static final EnumMap<StorageService.Verb, Stage> verbStages = buildVerbStages();

/**
 * Builds the verb-to-stage table as a plain {@link EnumMap}. A dedicated builder is used
 * instead of double-brace initialization so no anonymous EnumMap subclass is created.
 *
 * @return a new, fully populated map
 */
private static EnumMap<StorageService.Verb, Stage> buildVerbStages()
{
    EnumMap<StorageService.Verb, Stage> stages = new EnumMap<StorageService.Verb, Stage>(StorageService.Verb.class);
    stages.put(Verb.MUTATION, Stage.MUTATION);
    stages.put(Verb.BINARY, Stage.MUTATION);
    stages.put(Verb.READ_REPAIR, Stage.MUTATION);
    stages.put(Verb.READ, Stage.READ);
    stages.put(Verb.REQUEST_RESPONSE, Stage.REQUEST_RESPONSE);
    stages.put(Verb.STREAM_REPLY, Stage.MISC); // TODO does this really belong on misc? I've just copied old behavior here
    stages.put(Verb.STREAM_REQUEST, Stage.STREAM);
    stages.put(Verb.RANGE_SLICE, Stage.READ);
    stages.put(Verb.BOOTSTRAP_TOKEN, Stage.MISC);
    stages.put(Verb.TREE_REQUEST, Stage.ANTI_ENTROPY);
    stages.put(Verb.TREE_RESPONSE, Stage.ANTI_ENTROPY);
    stages.put(Verb.GOSSIP_DIGEST_ACK, Stage.GOSSIP);
    stages.put(Verb.GOSSIP_DIGEST_ACK2, Stage.GOSSIP);
    stages.put(Verb.GOSSIP_DIGEST_SYN, Stage.GOSSIP);
    stages.put(Verb.DEFINITIONS_ANNOUNCE, Stage.READ);
    stages.put(Verb.DEFINITIONS_UPDATE_RESPONSE, Stage.READ);
    stages.put(Verb.TRUNCATE, Stage.MUTATION);
    stages.put(Verb.SCHEMA_CHECK, Stage.MIGRATION);
    stages.put(Verb.INDEX_SCAN, Stage.READ);
    stages.put(Verb.REPLICATION_FINISHED, Stage.MISC);
    stages.put(Verb.INTERNAL_RESPONSE, Stage.INTERNAL_RESPONSE);
    stages.put(Verb.COUNTER_MUTATION, Stage.MUTATION);
    stages.put(Verb.UNUSED_1, Stage.INTERNAL_RESPONSE);
    stages.put(Verb.UNUSED_2, Stage.INTERNAL_RESPONSE);
    stages.put(Verb.UNUSED_3, Stage.INTERNAL_RESPONSE);
    return stages;
}
/**
* This pool is used for periodic short (sub-second) tasks.
*/
public static final RetryingScheduledThreadPoolExecutor scheduledTasks = new RetryingScheduledThreadPoolExecutor("ScheduledTasks");
/**
* This pool is used by tasks that can have longer execution times, and usually are non periodic.
*/
public static final RetryingScheduledThreadPoolExecutor tasks = new RetryingScheduledThreadPoolExecutor("NonPeriodicTasks");
/* This abstraction maintains the token/endpoint metadata information */
private TokenMetadata tokenMetadata_ = new TokenMetadata();
/* Partitioner obtained from DatabaseDescriptor when this object is constructed. */
private IPartitioner partitioner = DatabaseDescriptor.getPartitioner();
/* Factory for the values this node gossips; built from the partitioner, so it must be declared after it. */
public VersionedValue.VersionedValueFactory valueFactory = new VersionedValue.VersionedValueFactory(partitioner);
/* The singleton, created during class initialization after the static fields above. */
public static final StorageService instance = new StorageService();
/**
 * @return the partitioner held by the singleton {@link #instance}
 */
public static IPartitioner getPartitioner()
{
    final StorageService service = instance;
    return service.partitioner;
}
/**
 * Looks up the ranges of the local address for one table.
 *
 * @param table table (keyspace) name passed through to getRangesForEndpoint
 * @return whatever getRangesForEndpoint reports for the local address
 */
public Collection<Range> getLocalRanges(String table)
{
    final InetAddress self = FBUtilities.getLocalAddress();
    return getRangesForEndpoint(table, self);
}
/**
 * @return the primary range of the local address, as computed by getPrimaryRangeForEndpoint
 */
public Range getLocalPrimaryRange()
{
    final InetAddress self = FBUtilities.getLocalAddress();
    return getPrimaryRangeForEndpoint(self);
}
/* Synchronized set of endpoints; NOTE(review): presumably nodes we still await replication confirmation from -- confirm against the removetoken code. */
private Set<InetAddress> replicatingNodes = Collections.synchronizedSet(new HashSet<InetAddress>());
/* Daemon whose RPC server the start/stop/isRunning RPC methods delegate to; set by registerDaemon, null until then. */
private CassandraDaemon daemon;
private InetAddress removingNode;
/* Are we starting this node in bootstrap mode? */
private boolean isBootstrapMode;
/* When initialized as a client, we shouldn't write to the system table. */
private boolean isClientMode;
/* Set by initClient/initServer/startGossiping and cleared by stopGossiping. */
private boolean initialized;
/* Set to true once joinTokenRing has been entered. */
private volatile boolean joined = false;
/* Most recent mode string passed to setMode. */
private String operationMode;
/* Recomputed by updateEfficientCrossDCWriteMode whenever a RELEASE_VERSION state change is seen. */
private volatile boolean efficientCrossDCWrites;
/* Registered with the Gossiper in joinTokenRing and unregistered in stopClient. */
private MigrationManager migrationManager = new MigrationManager();
/* Used for tracking drain progress */
private volatile int totalCFs, remainingCFs;
/**
 * Leaves bootstrap mode: clears the bootstrap flag, records the node as bootstrapped
 * in the system table and then calls {@code setToken} with the local token.
 */
public void finishBootstrapping()
{
    isBootstrapMode = false;
    SystemTable.setBootstrapped(true);

    final Token localToken = getLocalToken();
    setToken(localToken);

    logger_.info("Bootstrap/move completed! Now serving reads.");
}
/**
 * Persists the given token for the local node, records it as a normal token in the
 * token metadata, gossips a "normal" STATUS and switches the operation mode to "Normal".
 *
 * @param token the token to store for the local address
 */
public void setToken(Token token)
{
    // parameterized logging: the message is only formatted when debug is enabled
    logger_.debug("Setting token to {}", token);

    SystemTable.updateToken(token);

    final InetAddress self = FBUtilities.getLocalAddress();
    tokenMetadata_.updateNormalToken(token, self);

    final VersionedValue normalStatus = valueFactory.normal(getLocalToken());
    Gossiper.instance.addLocalApplicationState(ApplicationState.STATUS, normalStatus);

    setMode("Normal", false);
}
public StorageService()
{
MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
try
{
mbs.registerMBean(this, new ObjectName("org.apache.cassandra.db:type=StorageService"));
}
catch (Exception e)
{
throw new RuntimeException(e);
}
/* register the verb handlers */
MessagingService.instance().registerVerbHandlers(Verb.BINARY, new BinaryVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.MUTATION, new RowMutationVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.READ_REPAIR, new ReadRepairVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.READ, new ReadVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.RANGE_SLICE, new RangeSliceVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.INDEX_SCAN, new IndexScanVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.COUNTER_MUTATION, new CounterMutationVerbHandler());
// see BootStrapper for a summary of how the bootstrap verbs interact
MessagingService.instance().registerVerbHandlers(Verb.BOOTSTRAP_TOKEN, new BootStrapper.BootstrapTokenVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.STREAM_REQUEST, new StreamRequestVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.STREAM_REPLY, new StreamReplyVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.REPLICATION_FINISHED, new ReplicationFinishedVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.REQUEST_RESPONSE, new ResponseVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.INTERNAL_RESPONSE, new ResponseVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.TREE_REQUEST, new TreeRequestVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.TREE_RESPONSE, new AntiEntropyService.TreeResponseVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.GOSSIP_DIGEST_SYN, new GossipDigestSynVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.GOSSIP_DIGEST_ACK, new GossipDigestAckVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.GOSSIP_DIGEST_ACK2, new GossipDigestAck2VerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.DEFINITIONS_ANNOUNCE, new DefinitionsAnnounceVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.DEFINITIONS_UPDATE_RESPONSE, new DefinitionsUpdateResponseVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.TRUNCATE, new TruncateVerbHandler());
MessagingService.instance().registerVerbHandlers(Verb.SCHEMA_CHECK, new SchemaCheckVerbHandler());
// spin up the streaming serivice so it is available for jmx tools.
if (StreamingService.instance == null)
throw new RuntimeException("Streaming service is unavailable.");
}
/**
 * Remembers the daemon that the RPC-server control methods delegate to.
 *
 * @param daemon the daemon to use for subsequent RPC server start/stop/status calls
 */
public void registerDaemon(CassandraDaemon daemon)
{
    this.daemon = daemon;
}
/**
 * Stops the gossiper and clears the initialized flag; does nothing when the service is
 * not currently marked initialized. Should only be called via JMX.
 */
public void stopGossiping()
{
    if (!initialized)
        return;

    logger_.warn("Stopping gossip by operator request");
    Gossiper.instance.stop();
    initialized = false;
}
/**
 * Starts the gossiper, using the current time in seconds as the generation, and sets the
 * initialized flag; does nothing when the service is already marked initialized.
 * Should only be called via JMX.
 */
public void startGossiping()
{
    if (initialized)
        return;

    logger_.warn("Starting gossip by operator request");
    final int generation = (int) (System.currentTimeMillis() / 1000);
    Gossiper.instance.start(generation);
    initialized = true;
}
/**
 * Asks the registered daemon to start its RPC server. Should only be called via JMX.
 *
 * @throws IllegalStateException if no daemon has been registered
 */
public void startRPCServer()
{
    final CassandraDaemon rpcDaemon = daemon;
    if (rpcDaemon == null)
        throw new IllegalStateException("No configured RPC daemon");

    rpcDaemon.startRPCServer();
}
/**
 * Asks the registered daemon to stop its RPC server. Should only be called via JMX.
 *
 * @throws IllegalStateException if no daemon has been registered
 */
public void stopRPCServer()
{
    final CassandraDaemon rpcDaemon = daemon;
    if (rpcDaemon == null)
        throw new IllegalStateException("No configured RPC daemon");

    rpcDaemon.stopRPCServer();
}
/**
 * @return what the registered daemon reports for {@code isRPCServerRunning()}
 * @throws IllegalStateException if no daemon has been registered
 */
public boolean isRPCServerRunning()
{
    final CassandraDaemon rpcDaemon = daemon;
    if (rpcDaemon == null)
        throw new IllegalStateException("No configured RPC daemon");

    return rpcDaemon.isRPCServerRunning();
}
/**
 * Tears down a client-mode instance: unregisters the gossip subscribers, stops gossip,
 * shuts down messaging and finally shuts down all stages immediately.
 */
public void stopClient()
{
    final Gossiper gossiper = Gossiper.instance;
    gossiper.unregister(migrationManager);
    gossiper.unregister(this);
    gossiper.stop();

    MessagingService.instance().shutdown();
    StageManager.shutdownNow();
}
/**
 * @return the current value of the {@code initialized} flag
 */
public boolean isInitialized()
{
    final boolean started = initialized;
    return started;
}
/**
 * Initializes this service in client mode: starts gossip and messaging, waits five
 * seconds for gossip to warm up, then announces the local schema version to the seeds.
 * A repeated call is a no-op when the service is already running in client mode.
 *
 * @throws UnsupportedOperationException if the service was already initialized in server mode
 * @throws IOError if the warm-up sleep is interrupted; the thread's interrupt status is
 *         restored before the error is thrown
 */
public synchronized void initClient() throws IOException, ConfigurationException
{
    if (initialized)
    {
        if (!isClientMode)
            throw new UnsupportedOperationException("StorageService does not support switching modes.");
        return;
    }
    initialized = true;
    isClientMode = true;
    logger_.info("Starting up client gossip");
    setMode("Client", false);
    Gossiper.instance.register(this);
    Gossiper.instance.start((int)(System.currentTimeMillis() / 1000)); // needed for node-ring gathering.
    MessagingService.instance().listen(FBUtilities.getLocalAddress());

    // sleep a while to allow gossip to warm up (the other nodes need to know about this one before they can reply).
    try
    {
        Thread.sleep(5000L);
    }
    catch (InterruptedException ex)
    {
        // Thread.sleep can only fail with InterruptedException; keep the interrupt visible to callers
        Thread.currentThread().interrupt();
        throw new IOError(ex);
    }
    MigrationManager.announce(DatabaseDescriptor.getDefsVersion(), DatabaseDescriptor.getSeeds());
}
/**
 * Initializes this service in server mode. Logs the version banner, optionally reloads the
 * persisted ring state (system property {@code cassandra.load_ring_state}, default true),
 * installs a shutdown hook that drains the mutation stage and the commit log, and joins the
 * ring unless {@code cassandra.join_ring} is set to false. A repeated call is a no-op when
 * the service is already running in server mode.
 *
 * @throws UnsupportedOperationException if the service was already initialized in client mode
 */
public synchronized void initServer() throws IOException, org.apache.cassandra.config.ConfigurationException
{
    logger_.info("Cassandra version: " + FBUtilities.getReleaseVersionString());
    logger_.info("Thrift API version: " + Constants.VERSION);

    if (initialized)
    {
        if (!isClientMode)
            return;
        throw new UnsupportedOperationException("StorageService does not support switching modes.");
    }
    initialized = true;
    isClientMode = false;

    final boolean loadRingState = Boolean.parseBoolean(System.getProperty("cassandra.load_ring_state", "true"));
    if (loadRingState)
    {
        logger_.info("Loading persisted ring state");
        for (Map.Entry<Token, InetAddress> saved : SystemTable.loadTokens().entrySet())
        {
            final InetAddress owner = saved.getValue();
            tokenMetadata_.updateNormalToken(saved.getKey(), owner);
            Gossiper.instance.addSavedEndpoint(owner);
        }
    }

    // daemon threads, like our executors', continue to run while shutdown hooks are invoked
    final Runnable drainTask = new WrappedRunnable()
    {
        public void runMayThrow() throws ExecutionException, InterruptedException, IOException
        {
            ThreadPoolExecutor mutations = StageManager.getStage(Stage.MUTATION);
            if (mutations.isShutdown())
                return;

            mutations.shutdown();
            mutations.awaitTermination(1, TimeUnit.SECONDS);
            CommitLog.instance.shutdownBlocking();
        }
    };
    Runtime.getRuntime().addShutdownHook(new Thread(drainTask));

    final boolean shouldJoin = Boolean.parseBoolean(System.getProperty("cassandra.join_ring", "true"));
    if (!shouldJoin)
    {
        logger_.info("Not joining ring as requested. Use JMX (StorageService->joinRing()) to initiate ring joining");
        return;
    }
    joinTokenRing();
}
/**
 * Starts server gossip and makes this node part of the token ring.
 *
 * The token is chosen in one of two ways:
 * - if auto-bootstrap is enabled and this node is neither a seed nor already bootstrapped,
 *   a bootstrap token is obtained and (when non-system tables exist) bootstrap is run;
 * - otherwise the saved token is used, falling back to the configured initial token and
 *   finally to a random token.
 * In both cases the node is then marked bootstrapped and the token is set via setToken.
 *
 * @throws UnsupportedOperationException if bootstrap is attempted while this address is
 *         already a member of the token ring
 */
private void joinTokenRing() throws IOException, org.apache.cassandra.config.ConfigurationException
{
logger_.info("Starting up server gossip");
// flag is set up front, so joinRing() will not re-enter even if a later step throws
joined = true;
// have to start the gossip service before we can see any info on other nodes. this is necessary
// for bootstrap to get the load info it needs.
// (we won't be part of the storage ring though until we add a nodeId to our state, below.)
Gossiper.instance.register(this);
Gossiper.instance.register(migrationManager);
Gossiper.instance.start(SystemTable.incrementAndGetGeneration()); // needed for node-ring gathering.
MessagingService.instance().listen(FBUtilities.getLocalAddress());
StorageLoadBalancer.instance.startBroadcasting();
MigrationManager.announce(DatabaseDescriptor.getDefsVersion(), DatabaseDescriptor.getSeeds());
Gossiper.instance.addLocalApplicationState(ApplicationState.RELEASE_VERSION, valueFactory.releaseVersion());
HintedHandOffManager.instance.registerMBean();
// informational only: explain why a not-yet-bootstrapped seed skips the bootstrap branch below
if (DatabaseDescriptor.isAutoBootstrap()
&& DatabaseDescriptor.getSeeds().contains(FBUtilities.getLocalAddress())
&& !SystemTable.isBootstrapped())
logger_.info("This node will not auto bootstrap because it is configured to be a seed node.");
Token token;
// bootstrap only when auto-bootstrap is on and we are neither a seed nor already bootstrapped
if (DatabaseDescriptor.isAutoBootstrap()
&& !(DatabaseDescriptor.getSeeds().contains(FBUtilities.getLocalAddress()) || SystemTable.isBootstrapped()))
{
setMode("Joining: getting load and schema information", true);
StorageLoadBalancer.instance.waitForLoadInfo();
if (logger_.isDebugEnabled())
logger_.debug("... got load + schema info");
if (tokenMetadata_.isMember(FBUtilities.getLocalAddress()))
{
String s = "This node is already a member of the token ring; bootstrap aborted. (If replacing a dead node, remove the old one from the ring first.)";
throw new UnsupportedOperationException(s);
}
setMode("Joining: getting bootstrap token", true);
token = BootStrapper.getBootstrapToken(tokenMetadata_, StorageLoadBalancer.instance.getLoadInfo());
// don't bootstrap if there are no tables defined.
if (DatabaseDescriptor.getNonSystemTables().size() > 0)
{
bootstrap(token);
assert !isBootstrapMode; // bootstrap will block until finished
}
// else nothing to do, go directly to participating in ring
}
else
{
// token preference order: saved token, then configured initial token, then a random one
token = SystemTable.getSavedToken();
if (token == null)
{
String initialToken = DatabaseDescriptor.getInitialToken();
if (initialToken == null)
{
token = partitioner.getRandomToken();
logger_.warn("Generated random token " + token + ". Random tokens will result in an unbalanced ring; see http://wiki.apache.org/cassandra/Operations");
}
else
{
token = partitioner.getTokenFactory().fromString(initialToken);
logger_.info("Saved token not found. Using " + token + " from configuration");
}
}
else
{
logger_.info("Using saved token " + token);
}
}
SystemTable.setBootstrapped(true); // first startup is only chance to bootstrap
setToken(token);
// setToken records the local token as a normal token, so the ring must now be non-empty
assert tokenMetadata_.sortedTokens().size() > 0;
}
/**
 * Joins the token ring on operator request; does nothing when the node has already joined.
 */
public synchronized void joinRing() throws IOException, org.apache.cassandra.config.ConfigurationException
{
    if (joined)
        return;

    logger_.info("Joining ring by operator request");
    joinTokenRing();
}
/**
 * @return the current value of the {@code joined} flag
 */
public boolean isJoined()
{
    final boolean hasJoined = joined;
    return hasJoined;
}
/**
 * Forwards the new compaction throughput setting to {@code DatabaseDescriptor}.
 *
 * @param value the value handed to DatabaseDescriptor.setCompactionThroughputMbPerSec
 */
public void setCompactionThroughputMbPerSec(int value)
{
    DatabaseDescriptor.setCompactionThroughputMbPerSec(value);
}
/**
 * Records the new operation mode and logs it.
 *
 * @param m   the mode description to store and log
 * @param log true to log at info level, false to log at debug level
 */
private void setMode(String m, boolean log)
{
    operationMode = m;
    if (!log)
    {
        logger_.debug(m);
        return;
    }
    logger_.info(m);
}
/**
 * Runs the bootstrap sequence for the given token: persists the token, gossips a
 * "bootstrapping" STATUS, sleeps RING_DELAY milliseconds for pending range setup and then
 * hands over to {@code BootStrapper}.
 *
 * @param token the token this node is bootstrapping to
 * @throws AssertionError if the sleep is interrupted
 */
private void bootstrap(Token token) throws IOException
{
    isBootstrapMode = true;

    // DON'T use setToken here: that would make us part of the ring locally, which is
    // incorrect until we are done bootstrapping
    SystemTable.updateToken(token);

    final VersionedValue bootstrappingStatus = valueFactory.bootstrapping(token);
    Gossiper.instance.addLocalApplicationState(ApplicationState.STATUS, bootstrappingStatus);

    final String sleepNotice = "Joining: sleeping " + RING_DELAY + " ms for pending range setup";
    setMode(sleepNotice, true);
    try
    {
        Thread.sleep(RING_DELAY);
    }
    catch (InterruptedException e)
    {
        throw new AssertionError(e);
    }

    setMode("Bootstrapping", true);
    final BootStrapper bootStrapper = new BootStrapper(FBUtilities.getLocalAddress(), token, tokenMetadata_);
    bootStrapper.bootstrap(); // handles token update
}
/**
 * @return the current value of the {@code isBootstrapMode} flag
 */
public boolean isBootstrapMode()
{
    final boolean bootstrapping = isBootstrapMode;
    return bootstrapping;
}
/**
 * @return the token metadata instance owned by this service (the live object, not a copy)
 */
public TokenMetadata getTokenMetadata()
{
    final TokenMetadata metadata = tokenMetadata_;
    return metadata;
}
/**
 * For a keyspace, returns the ranges and the corresponding hosts as strings.
 *
 * @param keyspace the keyspace to inspect; may be null, in which case the first
 *                 non-system table is used
 * @return a new map from range to stringified endpoints; empty when keyspace is null and
 *         no non-system table exists
 */
public Map<Range, List<String>> getRangeToEndpointMap(String keyspace)
{
    Map<Range, List<String>> map = new HashMap<Range, List<String>>();

    // some people just want to get a visual representation of things. Allow null and set it to the first
    // non-system table.
    if (keyspace == null)
    {
        List<String> tables = DatabaseDescriptor.getNonSystemTables();
        // nothing to describe yet; avoids an IndexOutOfBoundsException from get(0)
        if (tables.isEmpty())
            return map;
        keyspace = tables.get(0);
    }

    /* All the ranges for the tokens */
    for (Map.Entry<Range,List<InetAddress>> entry : getRangeToAddressMap(keyspace).entrySet())
    {
        map.put(entry.getKey(), stringify(entry.getValue()));
    }
    return map;
}
/**
 * For a keyspace, returns the pending ranges and the corresponding hosts as strings.
 *
 * @param keyspace the keyspace to inspect; may be null, in which case the first
 *                 non-system table is used
 * @return a new map from pending range to stringified endpoints; empty when keyspace is
 *         null and no non-system table exists
 */
public Map<Range, List<String>> getPendingRangeToEndpointMap(String keyspace)
{
    Map<Range, List<String>> map = new HashMap<Range, List<String>>();

    // some people just want to get a visual representation of things. Allow null and set it to the first
    // non-system table.
    if (keyspace == null)
    {
        List<String> tables = DatabaseDescriptor.getNonSystemTables();
        // nothing to describe yet; avoids an IndexOutOfBoundsException from get(0)
        if (tables.isEmpty())
            return map;
        keyspace = tables.get(0);
    }

    for (Map.Entry<Range, Collection<InetAddress>> entry : tokenMetadata_.getPendingRanges(keyspace).entrySet())
    {
        // stringify takes a List, so copy the collection first
        List<InetAddress> l = new ArrayList<InetAddress>(entry.getValue());
        map.put(entry.getKey(), stringify(l));
    }
    return map;
}
/**
 * Builds the range-to-endpoints mapping for a keyspace from the currently sorted ring tokens.
 *
 * @param keyspace the keyspace whose replication strategy decides the endpoints
 * @return mapping from each range to its endpoints
 */
public Map<Range, List<InetAddress>> getRangeToAddressMap(String keyspace)
{
    return constructRangeToEndpointMap(keyspace, getAllRanges(tokenMetadata_.sortedTokens()));
}
/**
 * @return a new map from each token known to the token metadata to the host address
 *         string of its endpoint
 */
public Map<Token, String> getTokenToEndpointMap()
{
    final Map<Token, InetAddress> byToken = tokenMetadata_.getTokenToEndpointMap();
    final Map<Token, String> hostByToken = new HashMap<Token, String>(byToken.size());
    for (Map.Entry<Token, InetAddress> owner : byToken.entrySet())
    {
        final String host = owner.getValue().getHostAddress();
        hostByToken.put(owner.getKey(), host);
    }
    return hostByToken;
}
/**
 * Construct the range to endpoint mapping based on the true view
 * of the world.
 *
 * @param keyspace the keyspace whose replication strategy is consulted
 * @param ranges the ranges to map; each is looked up by its right token
 * @return mapping of ranges to the replicas responsible for them.
 */
private Map<Range, List<InetAddress>> constructRangeToEndpointMap(String keyspace, List<Range> ranges)
{
    Map<Range, List<InetAddress>> rangeToEndpointMap = new HashMap<Range, List<InetAddress>>();
    // with no ranges the keyspace is never opened, exactly as before
    if (ranges.isEmpty())
        return rangeToEndpointMap;

    // the strategy depends only on the keyspace, so resolve it once instead of once per range
    AbstractReplicationStrategy strategy = Table.open(keyspace).getReplicationStrategy();
    for (Range range : ranges)
    {
        rangeToEndpointMap.put(range, strategy.getNaturalEndpoints(range.right));
    }
    return rangeToEndpointMap;
}
/*
 * Gossip callback. onChange only ever sees one ApplicationState piece change at a time, so
 * this method acts as a small state machine. Two events matter: learning the token that
 * belongs to an endpoint, and learning the endpoint's operation mode. A node may start in
 * bootstrap or normal mode, and may go from bootstrap mode to normal mode. A bootstrapping
 * node needs pending ranges set in TokenMetadata; a normal node belongs in the token ring.
 *
 * Normal MOVE_STATE progression of a node:
 *   STATE_BOOTSTRAPPING,token
 *     if bootstrapping; stays this way until all files are received.
 *   STATE_NORMAL,token
 *     ready to serve reads and writes.
 *   STATE_NORMAL,token,REMOVE_TOKEN,token
 *     specialized normal state in which this node acts as a proxy to tell the cluster about
 *     a dead node whose token is being removed. This value becomes the permanent state of
 *     this node (unless it coordinates another removetoken in the future).
 *   STATE_LEAVING,token
 *     get ready to leave the cluster as part of a decommission or move.
 *   STATE_LEFT,token
 *     set after decommission or move is completed.
 *   STATE_MOVE,token
 *     set if the node is currently moving to a new token in the ring.
 *
 * Note: any time a node state changes from STATE_NORMAL, it will not be visible to new
 * nodes. So it follows that you should never bootstrap a new node during a removetoken,
 * decommission or move.
 */
public void onChange(InetAddress endpoint, ApplicationState state, VersionedValue value)
{
    switch (state)
    {
        case RELEASE_VERSION:
        {
            updateEfficientCrossDCWriteMode();
            break;
        }
        case STATUS:
        {
            // the first piece names the status, the remaining pieces are its arguments
            final String[] pieces = value.value.split(VersionedValue.DELIMITER_STR, -1);
            assert (pieces.length > 0);
            final String status = pieces[0];

            if (VersionedValue.STATUS_BOOTSTRAPPING.equals(status))
            {
                handleStateBootstrap(endpoint, pieces);
            }
            else if (VersionedValue.STATUS_NORMAL.equals(status))
            {
                handleStateNormal(endpoint, pieces);
            }
            else if (VersionedValue.STATUS_LEAVING.equals(status))
            {
                handleStateLeaving(endpoint, pieces);
            }
            else if (VersionedValue.STATUS_LEFT.equals(status))
            {
                handleStateLeft(endpoint, pieces);
            }
            else if (VersionedValue.STATUS_MOVING.equals(status))
            {
                handleStateMoving(endpoint, pieces);
            }
        }
    }
}
/**
 * Enables efficient cross-DC writes only when every endpoint known to the gossiper
 * advertises a RELEASE_VERSION state.
 * We can remove this in 0.8, since mixing 0.7.0 with 0.8 is not supported (0.7.1 is required).
 */
private void updateEfficientCrossDCWriteMode()
{
    boolean everyoneGossipsVersion = true;
    for (Map.Entry<InetAddress, EndpointState> known : Gossiper.instance.getEndpointStates())
    {
        final VersionedValue release = known.getValue().getApplicationState(ApplicationState.RELEASE_VERSION);
        // no version means it's old code that doesn't gossip version, < 0.7.1.
        if (release == null)
        {
            everyoneGossipsVersion = false;
            break;
        }
    }
    efficientCrossDCWrites = everyoneGossipsVersion;
}
/**
 * Handles a node announcing that it is bootstrapping.
 *
 * @param endpoint bootstrapping node
 * @param pieces STATE_BOOTSTRAPPING,bootstrap token as string
 */
private void handleStateBootstrap(InetAddress endpoint, String[] pieces)
{
    assert pieces.length == 2;
    final Token bootstrapToken = getPartitioner().getTokenFactory().fromString(pieces[1]);

    if (logger_.isDebugEnabled())
        logger_.debug("Node " + endpoint + " state bootstrapping, token " + bootstrapToken);

    // A node that is already present in token metadata means we either missed intermediate
    // states or the node crashed. Warn if needed, drop the obsolete entry and continue.
    final boolean alreadyMember = tokenMetadata_.isMember(endpoint);
    if (alreadyMember)
    {
        // Not leaving: we missed both LEAVING and LEFT. Leaving: we only missed LEFT, which
        // is common because the gap between finishing a leave and re-bootstrapping is short
        // (gossip has not spread yet). Only the former is worth reporting.
        final boolean wasLeaving = tokenMetadata_.isLeaving(endpoint);
        if (!wasLeaving)
            logger_.info("Node " + endpoint + " state jump to bootstrap");
        tokenMetadata_.removeEndpoint(endpoint);
    }

    tokenMetadata_.addBootstrapToken(bootstrapToken, endpoint);
    calculatePendingRanges();
}
/**
 * Handle node move to normal state. That is, node is entering token ring and participating
 * in reads.
 *
 * Token ownership is resolved as follows: an unowned token is assigned to the endpoint; a
 * token already owned by the endpoint is refreshed; on a conflict the endpoint wins only if
 * the gossiper reports a later startup for it than for the current owner.
 *
 * @param endpoint node
 * @param pieces STATE_NORMAL,token[,other_state,token]
 */
private void handleStateNormal(InetAddress endpoint, String[] pieces)
{
    assert pieces.length >= 2;
    Token token = getPartitioner().getTokenFactory().fromString(pieces[1]);

    if (logger_.isDebugEnabled())
        logger_.debug("Node " + endpoint + " state normal, token " + token);

    if (tokenMetadata_.isMember(endpoint))
        logger_.info("Node " + endpoint + " state jump to normal");

    // we don't want to update if this node is responsible for the token and it has a later startup time than endpoint.
    InetAddress currentOwner = tokenMetadata_.getEndpoint(token);
    if (currentOwner == null)
    {
        // guarded like the other debug statements so the message is not built when debug is off
        if (logger_.isDebugEnabled())
            logger_.debug("New node " + endpoint + " at token " + token);
        tokenMetadata_.updateNormalToken(token, endpoint);
        if (!isClientMode)
            SystemTable.updateToken(endpoint, token);
    }
    else if (endpoint.equals(currentOwner))
    {
        // set state back to normal, since the node may have tried to leave, but failed and is now back up
        // no need to persist, token/ip did not change
        tokenMetadata_.updateNormalToken(token, endpoint);
    }
    else if (Gossiper.instance.compareEndpointStartup(endpoint, currentOwner) > 0)
    {
        logger_.info(String.format("Nodes %s and %s have the same token %s. %s is the new owner",
                                   endpoint, currentOwner, token, endpoint));
        tokenMetadata_.updateNormalToken(token, endpoint);
        if (!isClientMode)
            SystemTable.updateToken(endpoint, token);
    }
    else
    {
        logger_.info(String.format("Nodes %s and %s have the same token %s. Ignoring %s",
                                   endpoint, currentOwner, token, endpoint));
    }

    // the optional trailing pair carries a removetoken state and the token being removed
    if (pieces.length > 2)
    {
        assert pieces.length == 4;
        handleStateRemoving(endpoint, getPartitioner().getTokenFactory().fromString(pieces[3]), pieces[2]);
    }

    if (tokenMetadata_.isMoving(endpoint)) // if endpoint was moving to a new token
        tokenMetadata_.removeFromMoving(endpoint);

    calculatePendingRanges();
}
/**
 * Handles a node announcing that it is preparing to leave the ring.
 *
 * @param endpoint node
 * @param pieces STATE_LEAVING,token
 */
private void handleStateLeaving(InetAddress endpoint, String[] pieces)
{
    assert pieces.length == 2;
    final Token leavingToken = getPartitioner().getTokenFactory().fromString(pieces[1]);

    if (logger_.isDebugEnabled())
        logger_.debug("Node " + endpoint + " state leaving, token " + leavingToken);

    // If the node is previously unknown or its token does not match, record it as 'normal'
    // with this token first (it must have been using it before the leave), so that the
    // pending ranges come out right.
    final boolean knownMember = tokenMetadata_.isMember(endpoint);
    if (!knownMember)
    {
        logger_.info("Node " + endpoint + " state jump to leaving");
        tokenMetadata_.updateNormalToken(leavingToken, endpoint);
    }
    else if (!tokenMetadata_.getToken(endpoint).equals(leavingToken))
    {
        logger_.warn("Node " + endpoint + " 'leaving' token mismatch. Long network partition?");
        tokenMetadata_.updateNormalToken(leavingToken, endpoint);
    }

    // the endpoint is now certainly a member with this token, so proceed normally
    tokenMetadata_.addLeavingEndpoint(endpoint);
    calculatePendingRanges();
}
/**
 * Handles a node that has left the ring, which happens when a node is decommissioned.
 *
 * @param endpoint If reason for leaving is decommission, endpoint is the leaving node.
 * @param pieces STATE_LEFT,token
 */
private void handleStateLeft(InetAddress endpoint, String[] pieces)
{
    assert pieces.length == 2;
    final Token leftToken = getPartitioner().getTokenFactory().fromString(pieces[1]);

    if (logger_.isDebugEnabled())
        logger_.debug("Node " + endpoint + " state left, token " + leftToken);

    excise(leftToken, endpoint);
}
/**
 * Handles a node that is moving to a new token inside the ring.
 *
 * @param endpoint moving endpoint address
 * @param pieces STATE_MOVING, token
 */
private void handleStateMoving(InetAddress endpoint, String[] pieces)
{
    assert pieces.length == 2;
    final Token targetToken = getPartitioner().getTokenFactory().fromString(pieces[1]);

    if (logger_.isDebugEnabled())
        logger_.debug("Node " + endpoint + " state moving, new token " + targetToken);

    tokenMetadata_.addMovingEndpoint(targetToken, endpoint);
    calculatePendingRanges();
}
/**
 * Handles notification that a node is being actively removed from the ring via 'removetoken'.
 * Nothing happens when the token has no known owner, when the owner is the local address,
 * or when the state is neither of the two recognised values.
 *
 * @param endpoint node that gossiped the removal
 * @param removeToken token being removed; its current owner is looked up in the token metadata
 * @param state either REMOVED_TOKEN (node is gone) or REMOVING_TOKEN (replicas need to be restored)
 */
private void handleStateRemoving(InetAddress endpoint, Token removeToken, String state)
{
    final InetAddress removeEndpoint = tokenMetadata_.getEndpoint(removeToken);
    if (removeEndpoint == null)
        return;

    if (removeEndpoint.equals(FBUtilities.getLocalAddress()))
    {
        logger_.info("Received removeToken gossip about myself. Is this node a replacement for a removed one?");
        return;
    }

    if (VersionedValue.REMOVED_TOKEN.equals(state))
    {
        excise(removeToken, removeEndpoint);
        return;
    }

    if (!VersionedValue.REMOVING_TOKEN.equals(state))
        return;

    if (logger_.isDebugEnabled())
        logger_.debug("Token " + removeToken + " removed manually (endpoint was " + removeEndpoint + ")");

    // Note that the endpoint is being removed
    tokenMetadata_.addLeavingEndpoint(removeEndpoint);
    calculatePendingRanges();

    // grab any data we are now responsible for and notify responsible node
    restoreReplicaCount(removeEndpoint, endpoint);
}
/**
 * Removes every trace of an endpoint: drops it from the gossiper and the token metadata,
 * deletes its hints, removes the bootstrap token and recalculates pending ranges. Outside
 * client mode the token is also removed from the system table.
 *
 * @param token the token that belonged to the endpoint
 * @param endpoint the endpoint to remove
 */
private void excise(Token token, InetAddress endpoint)
{
    Gossiper.instance.removeEndpoint(endpoint);
    tokenMetadata_.removeEndpoint(endpoint);
    HintedHandOffManager.instance.deleteHintsForEndpoint(endpoint);
    tokenMetadata_.removeBootstrapToken(token);
    calculatePendingRanges();

    // clients must not write to the system table
    if (isClientMode)
        return;

    logger_.info("Removing token " + token + " for " + endpoint);
    SystemTable.removeToken(token);
}
/**
 * Recalculates pending ranges for every non-system table, according to bootstrapping and
 * leaving nodes. Reasoning is:
 *
 * (1) When in doubt, it is better to write too much to a node than too little. That is, if
 *     there are multiple nodes moving, calculate the biggest ranges a node could have.
 *     Cleaning up unneeded data afterwards is better than missing writes during movement.
 * (2) When a node leaves, ranges for other nodes can only grow (a node might get additional
 *     ranges, but it will not lose any of its current ranges as a result of a leave).
 *     Therefore we first remove _all_ leaving tokens for the sake of calculation and then
 *     check what ranges would go where if all nodes are to leave. This gives the biggest
 *     possible ranges with regard to current leave operations, covering all subsets of
 *     possible final range values.
 * (3) When a node bootstraps, ranges of other nodes can only get smaller. Without doing
 *     complex calculations to see if multiple bootstraps overlap, we simply base the
 *     calculation on the same token ring used before (reflecting the situation after all
 *     leave operations have completed). Bootstrapping nodes are added to and removed from
 *     that metadata one by one, checking what their ranges would be. This gives the biggest
 *     possible ranges the node could have. Other bootstraps may make the actual final
 *     ranges smaller, but that does not matter as the data can be cleaned up afterwards.
 *
 * NOTE: This is a heavy and inefficient operation. It is only done once when a node
 * changes state in the cluster, so it should be manageable.
 */
private void calculatePendingRanges()
{
    for (String tableName : DatabaseDescriptor.getNonSystemTables())
    {
        final AbstractReplicationStrategy strategy = Table.open(tableName).getReplicationStrategy();
        calculatePendingRanges(strategy, tableName);
    }
}
// public & static for testing purposes
/**
 * Computes the pending ranges of one keyspace and stores them in the token metadata of
 * {@code StorageService.instance}.
 *
 * Leaving endpoints are handled first, then bootstrapping ones, then moving ones; the last
 * two are evaluated one at a time against a copy of the ring in which all leaves are done.
 *
 * @param strategy replication strategy used to compute endpoints and address ranges
 * @param table    name of the keyspace whose pending ranges are stored
 */
public static void calculatePendingRanges(AbstractReplicationStrategy strategy, String table)
{
    TokenMetadata tm = StorageService.instance.getTokenMetadata();
    Map<Token, InetAddress> joining = tm.getBootstrapTokens();
    Set<InetAddress> leaving = tm.getLeavingEndpoints();
    Multimap<Range, InetAddress> pending = HashMultimap.create();

    boolean ringIsChanging = !joining.isEmpty() || !leaving.isEmpty() || !tm.getMovingEndpoints().isEmpty();
    if (!ringIsChanging)
    {
        if (logger_.isDebugEnabled())
            logger_.debug("No bootstrapping, leaving or moving nodes -> empty pending ranges for {}", table);
        tm.setPendingRanges(table, pending);
        return;
    }

    Multimap<InetAddress, Range> currentOwnership = strategy.getAddressRanges();

    // Ring as it will look once every leaving node is gone.
    TokenMetadata afterLeaves = tm.cloneAfterAllLeft();

    // Every range currently held by some leaving node.
    Set<Range> rangesLosingReplica = new HashSet<Range>();
    for (InetAddress departing : leaving)
        rangesLosingReplica.addAll(currentOwnership.get(departing));

    // Whoever replicates such a range after the leaves, but not before, has it pending.
    for (Range affected : rangesLosingReplica)
    {
        Collection<InetAddress> before = strategy.calculateNaturalEndpoints(affected.right, tm);
        Collection<InetAddress> after = strategy.calculateNaturalEndpoints(affected.right, afterLeaves);
        after.removeAll(before);
        pending.putAll(affected, after);
    }

    // Leaves are accounted for. Now place each bootstrapping node into the post-leave ring
    // on its own, record the ranges it would hold, and take it out again.
    for (Map.Entry<Token, InetAddress> joiner : joining.entrySet())
    {
        InetAddress endpoint = joiner.getValue();
        afterLeaves.updateNormalToken(joiner.getKey(), endpoint);
        for (Range gained : strategy.getAddressRanges(afterLeaves).get(endpoint))
            pending.put(gained, endpoint);
        afterLeaves.removeEndpoint(endpoint);
    }

    // Moving nodes get the same treatment as bootstrapping ones: mover.left is the token
    // the node is moving to, mover.right is the node's address.
    for (Pair<Token, InetAddress> mover : tm.getMovingEndpoints())
    {
        InetAddress endpoint = mover.right;
        afterLeaves.updateNormalToken(mover.left, endpoint);
        for (Range gained : strategy.getAddressRanges(afterLeaves).get(endpoint))
            pending.put(gained, endpoint);
        afterLeaves.removeEndpoint(endpoint);
    }

    tm.setPendingRanges(table, pending);

    if (logger_.isDebugEnabled())
        logger_.debug("Pending ranges:\n" + (pendingRanges(pending) ? "<empty>" : tm.printPendingRanges()));
}

/** Returns true when the given pending-range map holds no entries. */
private static boolean pendingRanges(Multimap<Range, InetAddress> pending)
{
    return pending.isEmpty();
}
/**
 * Picks, for each of the given ranges, one live endpoint to fetch the range from.
 *
 * Candidates for a range are sorted by the endpoint snitch relative to the local address, and
 * the first one the failure detector reports alive is chosen. A range with no live
 * candidate does not appear in the result.
 *
 * @param table the table ranges belong to
 * @param ranges the ranges to find sources for
 * @return multimap of source addresses to the ranges to request from each of them
 */
private Multimap<InetAddress, Range> getNewSourceRanges(String table, Set<Range> ranges)
{
    InetAddress localAddress = FBUtilities.getLocalAddress();
    Multimap<Range, InetAddress> replicasByRange = Table.open(table).getReplicationStrategy().getRangeAddresses(tokenMetadata_);
    Multimap<InetAddress, Range> rangesBySource = HashMultimap.create();
    IFailureDetector detector = FailureDetector.instance;

    for (Range wanted : ranges)
    {
        Collection<InetAddress> candidates = replicasByRange.get(wanted);
        IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
        List<InetAddress> byProximity = snitch.getSortedListByProximity(localAddress, candidates);
        assert (!byProximity.contains(localAddress));

        // walk the candidates closest-first and stop at the first live one
        boolean sourceFound = false;
        Iterator<InetAddress> it = byProximity.iterator();
        while (!sourceFound && it.hasNext())
        {
            InetAddress candidate = it.next();
            if (detector.isAlive(candidate))
            {
                rangesBySource.put(candidate, wanted);
                sourceFound = true;
            }
        }
    }
    return rangesBySource;
}
/**
 * Tells a remote node that this node has finished replicating data.
 *
 * A REPLICATION_FINISHED message is sent and a reply awaited for the rpc timeout. On timeout
 * the message is sent again, for as long as the failure detector reports the remote alive.
 *
 * @param local the local address
 * @param remote node to send notification to
 */
private void sendReplicationNotification(InetAddress local, InetAddress remote)
{
    Message notification = new Message(local, StorageService.Verb.REPLICATION_FINISHED, new byte[0], Gossiper.instance.getVersion(remote));
    IFailureDetector detector = FailureDetector.instance;

    boolean acknowledged = false;
    while (!acknowledged && detector.isAlive(remote))
    {
        IAsyncResult reply = MessagingService.instance().sendRR(notification, remote);
        try
        {
            reply.get(DatabaseDescriptor.getRpcTimeout(), TimeUnit.MILLISECONDS);
            acknowledged = true;
        }
        catch (TimeoutException e)
        {
            // no reply in time: go around again and resend
        }
    }
}
/**
 * Called when an endpoint is removed from the ring. Works out, per non-system keyspace,
 * which ranges this node becomes responsible for as a consequence and requests them from
 * live sources. Once the last outstanding request has called back, a replication
 * notification is sent to {@code notifyEndpoint}.
 *
 * This is rather inefficient, but it does not matter much because it is called very seldom.
 *
 * @param endpoint the node that left
 * @param notifyEndpoint the node to notify once all requested ranges have been received
 */
private void restoreReplicaCount(InetAddress endpoint, final InetAddress notifyEndpoint)
{
    final InetAddress localAddress = FBUtilities.getLocalAddress();
    // (source, keyspace) pairs whose stream request has not completed yet
    final Multimap<InetAddress, String> outstanding = HashMultimap.create();
    Multimap<String, Map.Entry<InetAddress, Collection<Range>>> fetchPlan = HashMultimap.create();

    for (String keyspace : DatabaseDescriptor.getNonSystemTables())
    {
        Multimap<Range, InetAddress> changed = getChangedRangesForLeaving(keyspace, endpoint);

        // keep only the ranges that are newly assigned to this node
        Set<Range> gained = new HashSet<Range>();
        for (Map.Entry<Range, InetAddress> change : changed.entries())
        {
            if (change.getValue().equals(localAddress))
                gained.add(change.getKey());
        }

        Multimap<InetAddress, Range> bySource = getNewSourceRanges(keyspace, gained);
        for (Map.Entry<InetAddress, Collection<Range>> perSource : bySource.asMap().entrySet())
        {
            outstanding.put(perSource.getKey(), keyspace);
            fetchPlan.put(keyspace, perSource);
        }
    }

    for (final String keyspace : fetchPlan.keySet())
    {
        for (Map.Entry<InetAddress, Collection<Range>> request : fetchPlan.get(keyspace))
        {
            final InetAddress source = request.getKey();
            Collection<Range> ranges = request.getValue();

            final Runnable onStreamed = new Runnable()
            {
                public void run()
                {
                    synchronized (outstanding)
                    {
                        outstanding.remove(source, keyspace);
                        // the last request to finish sends the notification
                        if (outstanding.isEmpty())
                            sendReplicationNotification(localAddress, notifyEndpoint);
                    }
                }
            };

            if (logger_.isDebugEnabled())
                logger_.debug("Requesting from " + source + " ranges " + StringUtils.join(ranges, ", "));
            StreamIn.requestRanges(source, keyspace, ranges, onStreamed, OperationType.RESTORE_REPLICA_COUNT);
        }
    }
}
// needs to be modified to accept either a table or ARS.
/**
 * Determines which endpoints gain responsibility for which ranges when the given endpoint
 * leaves the ring.
 *
 * For every range the endpoint currently holds, the natural endpoints are computed against
 * the current token metadata and against a copy in which all leaving nodes (and the endpoint
 * itself) are gone. Endpoints present only in the second set are the ones that need the range.
 *
 * @param table    keyspace whose replication strategy is used
 * @param endpoint the endpoint that is leaving or being removed
 * @return multimap of range to the endpoints that newly become replicas for it; a range
 *         with no new replica has no entries
 */
private Multimap<Range, InetAddress> getChangedRangesForLeaving(String table, InetAddress endpoint)
{
    // Resolved once up front: the strategy does not depend on the range being examined.
    AbstractReplicationStrategy strategy = Table.open(table).getReplicationStrategy();

    // First get all ranges the leaving endpoint is responsible for
    Collection<Range> ranges = getRangesForEndpoint(table, endpoint);

    if (logger_.isDebugEnabled())
        logger_.debug("Node " + endpoint + " ranges [" + StringUtils.join(ranges, ", ") + "]");

    Map<Range, List<InetAddress>> currentReplicaEndpoints = new HashMap<Range, List<InetAddress>>();

    // Find (for each range) all nodes that store replicas for these ranges as well
    for (Range range : ranges)
        currentReplicaEndpoints.put(range, strategy.calculateNaturalEndpoints(range.right, tokenMetadata_));

    TokenMetadata temp = tokenMetadata_.cloneAfterAllLeft();

    // endpoint might or might not be 'leaving'. If it was not leaving (that is, removetoken
    // command was used), it is still present in temp and must be removed.
    if (temp.isMember(endpoint))
        temp.removeEndpoint(endpoint);

    Multimap<Range, InetAddress> changedRanges = HashMultimap.create();

    // Go through the ranges and for each range check who will be
    // storing replicas for these ranges when the leaving endpoint
    // is gone. Whoever is present in newReplicaEndpoints list, but
    // not in the currentReplicaEndpoints list, will be needing the
    // range.
    for (Range range : ranges)
    {
        Collection<InetAddress> newReplicaEndpoints = strategy.calculateNaturalEndpoints(range.right, temp);
        newReplicaEndpoints.removeAll(currentReplicaEndpoints.get(range));
        if (logger_.isDebugEnabled())
        {
            if (newReplicaEndpoints.isEmpty())
                logger_.debug("Range " + range + " already in all replicas");
            else
                logger_.debug("Range " + range + " will be responsibility of " + StringUtils.join(newReplicaEndpoints, ", "));
        }
        changedRanges.putAll(range, newReplicaEndpoints);
    }

    return changedRanges;
}
/**
 * Replays every application state of a joining endpoint through
 * {@code onChange}, one state at a time.
 *
 * @param endpoint the endpoint that joined
 * @param epState  the endpoint's state, whose application state map is replayed
 */
public void onJoin(InetAddress endpoint, EndpointState epState)
{
    for (Map.Entry<ApplicationState, VersionedValue> state : epState.getApplicationStateMap().entrySet())
        onChange(endpoint, state.getKey(), state.getValue());
}
/**
 * Triggers hint delivery to an endpoint that has been marked alive.
 * Nothing happens when this node runs in client mode.
 *
 * @param endpoint the endpoint that is alive
 * @param state    the endpoint's state (not used here)
 */
public void onAlive(InetAddress endpoint, EndpointState state)
{
    if (isClientMode)
        return;

    deliverHints(endpoint);
}
/**
 * Drops the endpoint from the token metadata and then recalculates pending ranges.
 *
 * @param endpoint the endpoint being removed
 */
public void onRemove(InetAddress endpoint)
{
    tokenMetadata_.removeEndpoint(endpoint);

    // the ring changed, so pending ranges have to be recomputed
    calculatePendingRanges();
}
/**
 * Asks the messaging service to convict an endpoint that has been marked dead.
 *
 * @param endpoint the endpoint that is dead
 * @param state    the endpoint's state (not used here)
 */
public void onDead(InetAddress endpoint, EndpointState state)
{
    MessagingService.instance().convict(endpoint);
}
/**
 * Raw load value: the live disk space used, summed over every column family store of every
 * table known to the database descriptor.
 *
 * @return the summed live disk space
 */
public double getLoad()
{
    double total = 0;
    for (String name : DatabaseDescriptor.getTables())
    {
        for (ColumnFamilyStore store : Table.open(name).getColumnFamilyStores())
        {
            total += store.getLiveDiskSpaceUsed();
        }
    }
    return total;
}
/**
 * @return the value of {@link #getLoad()} formatted by {@code FileUtils.stringifyFileSize}
 */
public String getLoadString()
{
    double load = getLoad();
    return FileUtils.stringifyFileSize(load);
}
/**
 * Builds a map from host address to formatted load, using the load info held by the
 * storage load balancer plus this node's own load.
 *
 * @return host address mapped to its load as a formatted size string
 */
public Map<String, String> getLoadMap()
{
    Map<String, String> loadByHost = new HashMap<String, String>();
    for (Map.Entry<InetAddress,Double> reported : StorageLoadBalancer.instance.getLoadInfo().entrySet())
    {
        String host = reported.getKey().getHostAddress();
        loadByHost.put(host, FileUtils.stringifyFileSize(reported.getValue()));
    }

    // gossiper doesn't see its own updates, so the local node is filled in by hand
    String localHost = FBUtilities.getLocalAddress().getHostAddress();
    loadByHost.put(localHost, getLoadString());
    return loadByHost;
}
/**
 * Deliver hints to the specified node, e.g. after it has crashed and come back up or has
 * been marked alive again following a network partition. Delegates to the
 * hinted handoff manager.
 *
 * @param endpoint the node to deliver hints to
 */
public final void deliverHints(InetAddress endpoint)
{
    HintedHandOffManager hints = HintedHandOffManager.instance;
    hints.deliverHints(endpoint);
}
/**
 * Deliver hints to the node identified by a host string. Delegates to the
 * hinted handoff manager.
 *
 * @param host the host to deliver hints to
 * @throws UnknownHostException propagated from the hinted handoff manager
 */
public final void deliverHints(String host) throws UnknownHostException
{
    HintedHandOffManager hints = HintedHandOffManager.instance;
    hints.deliverHints(host);
}
/**
 * @return the token saved in the system table; asserted to be non-null
 */
public Token getLocalToken()
{
    Token saved = SystemTable.getSavedToken();
    // should not be called before initServer sets this
    assert saved != null;
    return saved;
}
/* These methods belong to the MBean interface */

/**
 * @return the string form of this node's local token
 */
public String getToken()
{
    Token local = getLocalToken();
    return local.toString();
}
/**
 * @return the release version string reported by {@code FBUtilities}
 */
public String getReleaseVersion()
{
    String version = FBUtilities.getReleaseVersionString();
    return version;
}
/**
 * @return host addresses of the endpoints the token metadata lists as leaving
 */
public List<String> getLeavingNodes()
{
    Iterable<InetAddress> leaving = tokenMetadata_.getLeavingEndpoints();
    return stringify(leaving);
}
/**
 * @return host addresses of the endpoints the token metadata lists as moving
 */
public List<String> getMovingNodes()
{
    List<String> addresses = new ArrayList<String>();

    // each pair is (token, address); only the address is reported
    for (Pair<Token, InetAddress> mover : tokenMetadata_.getMovingEndpoints())
    {
        InetAddress address = mover.right;
        addresses.add(address.getHostAddress());
    }

    return addresses;
}
/**
 * @return host addresses of the endpoints that hold a bootstrap token
 */
public List<String> getJoiningNodes()
{
    Iterable<InetAddress> joining = tokenMetadata_.getBootstrapTokens().values();
    return stringify(joining);
}
/**
 * @return host addresses of the members the gossiper reports as live
 */
public List<String> getLiveNodes()
{
    Iterable<InetAddress> live = Gossiper.instance.getLiveMembers();
    return stringify(live);
}
/**
 * @return host addresses of the members the gossiper reports as unreachable
 */
public List<String> getUnreachableNodes()
{
    Iterable<InetAddress> unreachable = Gossiper.instance.getUnreachableMembers();
    return stringify(unreachable);
}
/**
 * Converts endpoints to their host address strings, keeping iteration order.
 *
 * @param endpoints the endpoints to convert
 * @return a new list holding one host address per endpoint
 */
private List<String> stringify(Iterable<InetAddress> endpoints)
{
    List<String> addresses = new ArrayList<String>();
    Iterator<InetAddress> it = endpoints.iterator();
    while (it.hasNext())
        addresses.add(it.next().getHostAddress());
    return addresses;
}
/**
 * @return the generation number the gossiper holds for the local address
 */
public int getCurrentGenerationNumber()
{
    InetAddress local = FBUtilities.getLocalAddress();
    return Gossiper.instance.getCurrentGenerationNumber(local);
}
/**
 * Forces a cleanup on the given column families of a table, sharing one
 * {@code NodeId.OneShotRenewer} across all of them.
 *
 * @param tableName      the table to clean up; "system" is rejected
 * @param columnFamilies the column families to clean up; empty means all of the table's stores
 * @throws RuntimeException if tableName is "system"
 */
public void forceTableCleanup(String tableName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
    if (tableName.equals("system"))
        throw new RuntimeException("Cleanup of the system table is neither necessary nor wise");

    NodeId.OneShotRenewer renewer = new NodeId.OneShotRenewer();
    Iterable<ColumnFamilyStore> stores = getValidColumnFamilies(tableName, columnFamilies);
    for (ColumnFamilyStore store : stores)
        store.forceCleanup(renewer);
}
/**
 * Scrubs the given column families of a table.
 *
 * @param tableName      the table to scrub
 * @param columnFamilies the column families to scrub; empty means all of the table's stores
 */
public void scrub(String tableName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
    Iterable<ColumnFamilyStore> stores = getValidColumnFamilies(tableName, columnFamilies);
    for (ColumnFamilyStore store : stores)
    {
        store.scrub();
    }
}
/**
 * Forces a major compaction on the given column families of a table.
 *
 * @param tableName      the table to compact
 * @param columnFamilies the column families to compact; empty means all of the table's stores
 */
public void forceTableCompaction(String tableName, String... columnFamilies) throws IOException, ExecutionException, InterruptedException
{
    Iterable<ColumnFamilyStore> stores = getValidColumnFamilies(tableName, columnFamilies);
    for (ColumnFamilyStore store : stores)
        store.forceMajorCompaction();
}
/**
 * Invalidates the key cache of the given column families of a table.
 *
 * @param tableName      the table whose caches are invalidated
 * @param columnFamilies the column families concerned; empty means all of the table's stores
 */
public void invalidateKeyCaches(String tableName, String... columnFamilies) throws IOException
{
    Iterable<ColumnFamilyStore> stores = getValidColumnFamilies(tableName, columnFamilies);
    for (ColumnFamilyStore store : stores)
        store.invalidateKeyCache();
}
/**
 * Invalidates the row cache of the given column families of a table.
 *
 * @param tableName      the table whose caches are invalidated
 * @param columnFamilies the column families concerned; empty means all of the table's stores
 */
public void invalidateRowCaches(String tableName, String... columnFamilies) throws IOException
{
    Iterable<ColumnFamilyStore> stores = getValidColumnFamilies(tableName, columnFamilies);
    for (ColumnFamilyStore store : stores)
        store.invalidateRowCache();
}
/**
 * Takes a snapshot of the given tables. A snapshot name must be specified.
 *
 * All tables are checked for an existing snapshot with the same name before any
 * snapshot is taken.
 *
 * @param tag the tag given to the snapshot; may not be null or empty
 * @param tableNames the name of the tables to snapshot; empty means "all."
 * @throws IOException if the tag is missing, a named table does not exist, or a snapshot
 *         with this tag already exists on one of the tables
 */
public void takeSnapshot(String tag, String... tableNames) throws IOException
{
    if (tag == null || tag.equals(""))
        throw new IOException("You must supply a snapshot name.");

    Iterable<Table> targets;
    if (tableNames.length != 0)
    {
        List<Table> requested = new ArrayList<Table>();
        for (String name : tableNames)
            requested.add(getValidTable(name));
        targets = requested;
    }
    else
    {
        targets = Table.all();
    }

    // First pass: refuse to do anything if the snapshot is already there somewhere
    for (Table target : targets)
    {
        if (target.snapshotExists(tag))
            throw new IOException("Snapshot " + tag + " already exists.");
    }

    // Second pass: actually snapshot
    for (Table target : targets)
    {
        target.snapshot(tag);
    }
}
/**
 * Opens a table after checking that the database descriptor knows it.
 *
 * @param tableName name of the table to open
 * @return the opened table
 * @throws IOException if the descriptor's table list does not contain tableName
 */
private Table getValidTable(String tableName) throws IOException
{
    if (!DatabaseDescriptor.getTables().contains(tableName))
    {
        // space added before "does": the message used to run the table name into the verb
        throw new IOException("Table " + tableName + " does not exist");
    }
    return Table.open(tableName);
}
/**
 * Remove the snapshot with the given name from the given tables.
 * A null tag is treated as the empty string; if no tag is specified all snapshots
 * are removed.
 *
 * @param tag        the snapshot tag, or null
 * @param tableNames the tables to clear; empty means all tables
 * @throws IOException if a named table does not exist
 */
public void clearSnapshot(String tag, String... tableNames) throws IOException
{
    if (tag == null)
        tag = "";

    Iterable<Table> targets;
    if (tableNames.length != 0)
    {
        List<Table> requested = new ArrayList<Table>();
        for (String name : tableNames)
            requested.add(getValidTable(name));
        targets = requested;
    }
    else
    {
        targets = Table.all();
    }

    for (Table target : targets)
    {
        target.clearSnapshot(tag);
    }

    if (logger_.isDebugEnabled())
        logger_.debug("Cleared out snapshot directories");
}
/**
 * Resolves column family names of a table to their stores.
 *
 * @param tableName the table the column families belong to
 * @param cfNames   the column family names; empty selects every store of the table
 * @return the matching stores; names the table does not recognize are logged and left out
 * @throws IOException if the table does not exist
 */
public Iterable<ColumnFamilyStore> getValidColumnFamilies(String tableName, String... cfNames) throws IOException
{
    Table table = getValidTable(tableName);

    // no names given: every store is of interest
    if (cfNames.length == 0)
        return table.getColumnFamilyStores();

    Set<ColumnFamilyStore> selected = new HashSet<ColumnFamilyStore>();
    for (String name : cfNames)
    {
        ColumnFamilyStore store = table.getColumnFamilyStore(name);
        if (store != null)
        {
            selected.add(store);
        }
        else
        {
            // a cf was passed in that the keyspace does not recognize: report it and move on
            logger_.warn(String.format("Invalid column family specified: %s. Proceeding with others.", name));
        }
    }
    return selected;
}
/**
 * Flush all memtables for a table and column families. For each store a binary flush is
 * forced first, followed by a blocking flush.
 *
 * @param tableName      the table to flush
 * @param columnFamilies the column families to flush; empty means all of the table's stores
 * @throws IOException if the table does not exist
 */
public void forceTableFlush(final String tableName, final String... columnFamilies)
throws IOException, ExecutionException, InterruptedException
{
    Iterable<ColumnFamilyStore> stores = getValidColumnFamilies(tableName, columnFamilies);
    for (ColumnFamilyStore store : stores)
    {
        logger_.debug("Forcing binary flush on keyspace " + tableName + ", CF " + store.getColumnFamilyName());
        store.forceFlushBinary();

        logger_.debug("Forcing flush on keyspace " + tableName + ", CF " + store.getColumnFamilyName());
        store.forceBlockingFlush();
    }
}
/**
 * Trigger proactive repair for a table and column families. One repair session is started
 * per local range of the table, and the call blocks until every session has been joined.
 *
 * @param tableName      the table to repair
 * @param columnFamilies the column families to repair; empty means all of the table's stores
 * @throws IOException if the wait on any session was interrupted, or if resolving the
 *         table fails
 */
public void forceTableRepair(final String tableName, final String... columnFamilies) throws IOException
{
    List<AntiEntropyService.RepairSession> sessions = new ArrayList<AntiEntropyService.RepairSession>();
    for (Range range : getLocalRanges(tableName))
    {
        sessions.add(forceTableRepair(range, tableName, columnFamilies));
    }

    boolean failedSession = false;

    // block until all repair sessions have completed
    for (AntiEntropyService.RepairSession sess : sessions)
    {
        try
        {
            sess.join();
        }
        catch (InterruptedException e)
        {
            logger_.error("Repair session " + sess + " failed.", e);
            failedSession = true;
        }
    }

    if (failedSession)
    {
        // Restore the interrupt status that catching InterruptedException cleared. Done
        // only after the loop so that the remaining sessions are still waited for.
        Thread.currentThread().interrupt();
        throw new IOException("Some Repair session(s) failed.");
    }
}
/**
 * Starts a repair session for a single range of a table.
 *
 * @param range          the range to repair
 * @param tableName      the table to repair
 * @param columnFamilies the column families to repair; when empty, the names of all of the
 *                       table's stores are used
 * @return the session, already started
 * @throws IOException if the table does not exist
 */
public AntiEntropyService.RepairSession forceTableRepair(final Range range, final String tableName, final String... columnFamilies) throws IOException
{
    String[] families = columnFamilies;
    if (columnFamilies.length == 0)
    {
        List<String> allNames = new ArrayList<String>();
        for (ColumnFamilyStore store : getValidColumnFamilies(tableName))
            allNames.add(store.getColumnFamilyName());
        families = allNames.toArray(new String[] {});
    }

    AntiEntropyService.RepairSession session = AntiEntropyService.instance.getRepairSession(range, tableName, families);
    session.start();
    return session;
}
/* End of MBean interface methods */
/**
 * Returns the predecessor of an endpoint on the identifier space: the endpoint owning the
 * token that precedes the given endpoint's token.
 *
 * @param ep the endpoint whose predecessor is wanted
 * @return the endpoint mapped to the preceding token
 */
InetAddress getPredecessor(InetAddress ep)
{
    Token own = tokenMetadata_.getToken(ep);
    Token previous = tokenMetadata_.getPredecessor(own);
    return tokenMetadata_.getEndpoint(previous);
}
/**
 * Returns the successor of an endpoint on the identifier space: the endpoint owning the
 * token that follows the given endpoint's token.
 *
 * @param ep the endpoint whose successor is wanted
 * @return the endpoint mapped to the following token
 */
public InetAddress getSuccessor(InetAddress ep)
{
    Token own = tokenMetadata_.getToken(ep);
    Token next = tokenMetadata_.getSuccessor(own);
    return tokenMetadata_.getEndpoint(next);
}
/**
 * Get the primary range for the specified endpoint, looked up through the endpoint's token.
 *
 * @param ep endpoint we are interested in.
 * @return range for the specified endpoint.
 */
public Range getPrimaryRangeForEndpoint(InetAddress ep)
{
    Token own = tokenMetadata_.getToken(ep);
    return tokenMetadata_.getPrimaryRangeFor(own);
}
/**
 * Get all ranges an endpoint is responsible for (by table), as reported by the address
 * ranges of the table's replication strategy.
 *
 * @param table the table whose replication strategy is consulted
 * @param ep endpoint we are interested in.
 * @return ranges for the specified endpoint.
 */
Collection<Range> getRangesForEndpoint(String table, InetAddress ep)
{
    AbstractReplicationStrategy strategy = Table.open(table).getReplicationStrategy();
    return strategy.getAddressRanges().get(ep);
}
/**
 * Get all ranges that span the ring given a list of tokens. Each pair of consecutive
 * tokens forms one range, and a final range wraps from the last token back to the first.
 *
 * @param sortedTokens the ring's tokens, expected in sorted order
 * @return the ranges in the order of the tokens; empty when there are no tokens
 */
public List<Range> getAllRanges(List<Token> sortedTokens)
{
    if (logger_.isDebugEnabled())
        logger_.debug("computing ranges for " + StringUtils.join(sortedTokens, ", "));

    if (sortedTokens.isEmpty())
        return Collections.emptyList();

    List<Range> ring = new ArrayList<Range>();
    int count = sortedTokens.size();
    for (int right = 1; right < count; ++right)
    {
        int left = right - 1;
        ring.add(new Range(sortedTokens.get(left), sortedTokens.get(right)));
    }

    // close the ring: last token wraps around to the first
    Token last = sortedTokens.get(count - 1);
    Token first = sortedTokens.get(0);
    ring.add(new Range(last, first));

    return ring;
}
/**
 * Returns the endpoints that are responsible for storing the specified key, i.e. for
 * replication. The key is wrapped in a ByteBuffer and turned into a token by the partitioner.
 *
 * @param table the table whose replication strategy is consulted
 * @param key   key for which we need to find the endpoints
 * @return the endpoints responsible for this key
 */
public List<InetAddress> getNaturalEndpoints(String table, byte[] key)
{
    Token token = partitioner.getToken(ByteBuffer.wrap(key));
    return getNaturalEndpoints(table, token);
}
/**
 * Returns the endpoints that are responsible for storing the specified key. The key is
 * turned into a token by the partitioner.
 *
 * @param table the table whose replication strategy is consulted
 * @param key   key for which we need to find the endpoints
 * @return the endpoints responsible for this key
 */
public List<InetAddress> getNaturalEndpoints(String table, ByteBuffer key)
{
    Token token = partitioner.getToken(key);
    return getNaturalEndpoints(table, token);
}
/**
 * Returns the endpoints that are responsible for storing the specified token, i.e. for
 * replication, as reported by the table's replication strategy.
 *
 * @param table the table whose replication strategy is consulted
 * @param token token for which we need to find the endpoints
 * @return the endpoints responsible for this token
 */
public List<InetAddress> getNaturalEndpoints(String table, Token token)
{
    AbstractReplicationStrategy strategy = Table.open(table).getReplicationStrategy();
    return strategy.getNaturalEndpoints(token);
}
/**
 * Attempts to return the endpoints that are responsible for storing the specified key and
 * are currently alive. The key is turned into a token by the partitioner.
 *
 * @param table the table whose replication strategy is consulted
 * @param key   key for which we need to find the endpoints
 * @return the live endpoints responsible for this key
 */
public List<InetAddress> getLiveNaturalEndpoints(String table, ByteBuffer key)
{
    Token token = partitioner.getToken(key);
    return getLiveNaturalEndpoints(table, token);
}
/**
 * Returns the natural endpoints of a token that the failure detector reports alive.
 *
 * @param table the table whose replication strategy is consulted
 * @param token token for which we need to find the endpoints
 * @return a new list with the live natural endpoints, in the strategy's order
 */
public List<InetAddress> getLiveNaturalEndpoints(String table, Token token)
{
    List<InetAddress> alive = new ArrayList<InetAddress>();
    List<InetAddress> replicas = Table.open(table).getReplicationStrategy().getNaturalEndpoints(token);

    Iterator<InetAddress> it = replicas.iterator();
    while (it.hasNext())
    {
        InetAddress replica = it.next();
        if (FailureDetector.instance.isAlive(replica))
            alive.add(replica);
    }
    return alive;
}
/**
 * Sets the log4j level of a logger and logs the level that was actually applied.
 *
 * @param classQualifier name of the log4j logger to change
 * @param rawLevel       the level as text; it is converted with {@code Level.toLevel}
 */
public void setLog4jLevel(String classQualifier, String rawLevel)
{
    Level parsed = Level.toLevel(rawLevel);
    org.apache.log4j.Logger.getLogger(classQualifier).setLevel(parsed);

    String report = "set log level to " + parsed + " for classes under '" + classQualifier + "' (if the level doesn't look like '" + rawLevel + "' then log4j couldn't parse '" + rawLevel + "')";
    logger_.info(report);
}
/**
 * Breaks a range into pieces using the sampled keys of a column family.
 *
 * @param table        the table of the column family
 * @param cfName       the column family whose key samples are used
 * @param range        the range to split
 * @param keysPerSplit the rough number of keys wanted per piece
 * @return list of Tokens (_not_ keys!) breaking up the data this node is responsible for
 *         into pieces of roughly keysPerSplit; always starts with range.left and ends
 *         with range.right
 */
public List<Token> getSplits(String table, String cfName, Range range, int keysPerSplit)
{
    List<Token> boundaries = new ArrayList<Token>();
    // the actual Range tokens are used as the outermost brackets to ensure correctness
    boundaries.add(range.left);

    // collect the sampled keys that fall inside the range
    List<DecoratedKey> samples = new ArrayList<DecoratedKey>();
    ColumnFamilyStore store = Table.open(table).getColumnFamilyStore(cfName);
    for (DecoratedKey candidate : store.allKeySamples())
    {
        if (!range.contains(candidate.token))
            continue;
        samples.add(candidate);
    }
    FBUtilities.sortSampledKeys(samples, range);

    int splits = samples.size() * DatabaseDescriptor.getIndexInterval() / keysPerSplit;
    if (samples.size() >= splits)
    {
        int piece = 1;
        while (piece < splits)
        {
            // division stays inside the loop: it is only reached when splits > 1
            int index = piece * (samples.size() / splits);
            boundaries.add(samples.get(index).token);
            piece++;
        }
    }

    boundaries.add(range.right);
    return boundaries;
}
/**
 * Return a token to which, if a node bootstraps, it will get about 1/2 of this node's range.
 *
 * The sampled keys of all column family stores that fall in the local primary range are
 * sorted and the middle one is used. With fewer than three samples the partitioner's
 * midpoint of the range is returned instead.
 *
 * @return the suggested bootstrap token
 */
public Token getBootstrapToken()
{
    Range primary = getLocalPrimaryRange();

    List<DecoratedKey> samples = new ArrayList<DecoratedKey>();
    for (ColumnFamilyStore store : ColumnFamilyStore.all())
    {
        for (DecoratedKey candidate : store.allKeySamples())
        {
            if (!primary.contains(candidate.token))
                continue;
            samples.add(candidate);
        }
    }
    FBUtilities.sortSampledKeys(samples, primary);

    if (samples.size() >= 3)
        return samples.get(samples.size() / 2).token;

    return partitioner.midpoint(primary.left, primary.right);
}
/**
 * Broadcasts the leaving status through gossip, marks the local address as leaving in
 * tokenMetadata_ and recalculates pending ranges.
 */
private void startLeaving()
{
    // announce first, then update our own view of the ring
    Gossiper.instance.addLocalApplicationState(ApplicationState.STATUS, valueFactory.leaving(getLocalToken()));

    InetAddress self = FBUtilities.getLocalAddress();
    tokenMetadata_.addLeavingEndpoint(self);

    calculatePendingRanges();
}
/**
 * Takes this node out of the ring: announces the leave, sleeps RING_DELAY, then streams
 * its data away and finally stops gossip, messaging and the stages. The process itself is
 * left running for the operator to kill.
 *
 * @throws UnsupportedOperationException if the local node is not a ring member, if fewer
 *         than two tokens would remain once all leaves complete, or if the node has pending
 *         ranges for any non-system table
 * @throws InterruptedException if interrupted during the RING_DELAY sleep
 */
public void decommission() throws InterruptedException
{
    if (!tokenMetadata_.isMember(FBUtilities.getLocalAddress()))
        throw new UnsupportedOperationException("local node is not a member of the token ring yet");

    int tokensAfterLeaves = tokenMetadata_.cloneAfterAllLeft().sortedTokens().size();
    if (tokensAfterLeaves < 2)
        throw new UnsupportedOperationException("no other normal nodes in the ring; decommission would be pointless");

    for (String keyspace : DatabaseDescriptor.getNonSystemTables())
    {
        int incoming = tokenMetadata_.getPendingRanges(keyspace, FBUtilities.getLocalAddress()).size();
        if (incoming > 0)
            throw new UnsupportedOperationException("data is currently moving to this node; unable to leave the ring");
    }

    if (logger_.isDebugEnabled())
        logger_.debug("DECOMMISSIONING");

    startLeaving();
    setMode("Leaving: sleeping " + RING_DELAY + " ms for pending range setup", true);
    Thread.sleep(RING_DELAY);

    Runnable shutdownServices = new Runnable()
    {
        public void run()
        {
            Gossiper.instance.stop();
            MessagingService.instance().shutdown();
            StageManager.shutdownNow();
            setMode("Decommissioned", true);
            // killing the process is left to the operator
        }
    };
    unbootstrap(shutdownServices);
}
/**
 * Marks this node as no longer bootstrapped, removes it from the token metadata,
 * recalculates pending ranges, gossips the "left" status and then sleeps RING_DELAY.
 *
 * @throws AssertionError if the sleep is interrupted
 */
private void leaveRing()
{
    SystemTable.setBootstrapped(false);

    InetAddress self = FBUtilities.getLocalAddress();
    tokenMetadata_.removeEndpoint(self);
    calculatePendingRanges();

    Gossiper.instance.addLocalApplicationState(ApplicationState.STATUS, valueFactory.left(getLocalToken()));
    logger_.info("Announcing that I have left the ring for " + RING_DELAY + "ms");

    try
    {
        Thread.sleep(RING_DELAY);
    }
    catch (InterruptedException interruption)
    {
        throw new AssertionError(interruption);
    }
}
/**
 * Streams the ranges this node gives up to their new owners, waits for the transfers to be
 * signalled, leaves the ring and finally runs the given callback.
 *
 * @param onFinish run on the calling thread after the ring has been left
 * @throws RuntimeException wrapping the InterruptedException if the wait is interrupted
 */
private void unbootstrap(final Runnable onFinish)
{
    Map<String, Multimap<Range, InetAddress>> transfersByTable = new HashMap<String, Multimap<Range, InetAddress>>();

    for (final String keyspace : DatabaseDescriptor.getNonSystemTables())
    {
        Multimap<Range, InetAddress> changed = getChangedRangesForLeaving(keyspace, FBUtilities.getLocalAddress());

        if (logger_.isDebugEnabled())
            logger_.debug("Ranges needing transfer are [" + StringUtils.join(changed.keySet(), ",") + "]");

        transfersByTable.put(keyspace, changed);
    }

    setMode("Leaving: streaming data to other nodes", true);

    CountDownLatch transfersDone = streamRanges(transfersByTable);

    // the transfer runnables count the latch down as they complete
    logger_.debug("waiting for stream aks.");
    try
    {
        transfersDone.await();
    }
    catch (InterruptedException interruption)
    {
        throw new RuntimeException(interruption);
    }
    logger_.debug("stream acks all received.");

    leaveRing();
    onFinish.run();
}
/**
 * Moves this node to the token given in string form. The string is parsed with the
 * partitioner's token factory.
 *
 * @param newToken string form of the token to move to
 * @throws IOException on any I/O operation error
 */
public void move(String newToken) throws IOException, InterruptedException
{
    Token target = partitioner.getTokenFactory().fromString(newToken);
    move(target);
}
/**
 * Moves this node to a new token.
 *
 * The "moving" status is gossiped, the ranges to stream away and to fetch are computed per
 * non-system table, and after a RING_DELAY sleep both transfers are started and awaited.
 * Only then is the new token set.
 *
 * @param newToken the token to move to; must not be null and must not already be present
 *                 in the ring
 *
 * @throws IOException if newToken is null or already in the ring, or on any I/O operation error
 * @throws UnsupportedOperationException if this node has pending ranges for any non-system table
 * @throws RuntimeException if interrupted while sleeping or while waiting for the transfers;
 *         the InterruptedException is attached as the cause
 */
private void move(Token newToken) throws IOException
{
    if (newToken == null)
        throw new IOException("Can't move to the undefined (null) token.");

    if (tokenMetadata_.sortedTokens().contains(newToken))
        throw new IOException("target token " + newToken + " is already owned by another node.");

    // address of the current node
    InetAddress localAddress = FBUtilities.getLocalAddress();
    List<String> tablesToProcess = DatabaseDescriptor.getNonSystemTables();

    // checking if data is moving to this node
    for (String table : tablesToProcess)
    {
        if (tokenMetadata_.getPendingRanges(table, localAddress).size() > 0)
            throw new UnsupportedOperationException("data is currently moving to this node; unable to leave the ring");
    }

    // setting 'moving' application state
    Gossiper.instance.addLocalApplicationState(ApplicationState.STATUS, valueFactory.moving(newToken));

    logger_.info(String.format("Moving %s from %s to %s.", localAddress, getLocalToken(), newToken));

    IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();

    Map<String, Multimap<InetAddress, Range>> rangesToFetch = new HashMap<String, Multimap<InetAddress, Range>>();
    Map<String, Multimap<Range, InetAddress>> rangesToStreamByTable = new HashMap<String, Multimap<Range, InetAddress>>();

    TokenMetadata tokenMetaClone = tokenMetadata_.cloneAfterAllSettled();

    // for each of the non system tables calculating new ranges
    // which current node will handle after move to the new token
    for (String table : tablesToProcess)
    {
        // replication strategy of the current keyspace (aka table)
        AbstractReplicationStrategy strategy = Table.open(table).getReplicationStrategy();

        // getting collection of the currently used ranges by this keyspace
        Collection<Range> currentRanges = getRangesForEndpoint(table, localAddress);
        // collection of ranges which this node will serve after move to the new token
        Collection<Range> updatedRanges = strategy.getPendingAddressRanges(tokenMetadata_, newToken, localAddress);

        // ring ranges and endpoints associated with them
        // this used to determine what nodes should we ping about range data
        Multimap<Range, InetAddress> rangeAddresses = strategy.getRangeAddresses(tokenMetaClone);

        // calculated parts of the ranges to request/stream from/to nodes in the ring
        Pair<Set<Range>, Set<Range>> rangesPerTable = calculateStreamAndFetchRanges(currentRanges, updatedRanges);

        // In this loop we are going through all ranges "to fetch" and determining
        // nodes in the ring responsible for data we are interested in
        Multimap<Range, InetAddress> rangesToFetchWithPreferredEndpoints = ArrayListMultimap.create();
        for (Range toFetch : rangesPerTable.right)
        {
            for (Range range : rangeAddresses.keySet())
            {
                if (range.contains(toFetch))
                {
                    List<InetAddress> endpoints = snitch.getSortedListByProximity(localAddress, rangeAddresses.get(range));
                    // storing range and preferred endpoint set
                    rangesToFetchWithPreferredEndpoints.putAll(toFetch, endpoints);
                }
            }
        }

        // calculating endpoints to stream current ranges to if needed
        // in some situations node will handle current ranges as part of the new ranges
        Multimap<Range, InetAddress> rangeWithEndpoints = HashMultimap.create();

        for (Range toStream : rangesPerTable.left)
        {
            List<InetAddress> endpoints = strategy.calculateNaturalEndpoints(toStream.right, tokenMetaClone);
            rangeWithEndpoints.putAll(toStream, endpoints);
        }

        // associating table with range-to-endpoints map
        rangesToStreamByTable.put(table, rangeWithEndpoints);

        Multimap<InetAddress, Range> workMap = BootStrapper.getWorkMap(rangesToFetchWithPreferredEndpoints);
        rangesToFetch.put(table, workMap);

        if (logger_.isDebugEnabled())
            logger_.debug("Table {}: work map {}.", table, workMap);
    }

    if (!rangesToStreamByTable.isEmpty() || !rangesToFetch.isEmpty())
    {
        logger_.info("Sleeping {} ms before start streaming/fetching ranges.", RING_DELAY);

        try
        {
            Thread.sleep(RING_DELAY);
        }
        catch (InterruptedException e)
        {
            // keep the original exception as the cause instead of only its message
            throw new RuntimeException("Sleep interrupted " + e.getMessage(), e);
        }

        setMode("Moving: fetching new ranges and streaming old ranges", true);

        if (logger_.isDebugEnabled())
            logger_.debug("[Move->STREAMING] Work Map: " + rangesToStreamByTable);

        CountDownLatch streamLatch = streamRanges(rangesToStreamByTable);

        if (logger_.isDebugEnabled())
            logger_.debug("[Move->FETCHING] Work Map: " + rangesToFetch);

        CountDownLatch fetchLatch = requestRanges(rangesToFetch);

        try
        {
            streamLatch.await();
            fetchLatch.await();
        }
        catch (InterruptedException e)
        {
            // keep the original exception as the cause instead of only its message
            throw new RuntimeException("Interrupted latch while waiting for stream/fetch ranges to finish: " + e.getMessage(), e);
        }
    }

    setToken(newToken); // setting new token as we have everything settled

    if (logger_.isDebugEnabled())
        logger_.debug("Successfully moved to new token {}", getLocalToken());
}
/**
 * Get the status of a token removal.
 *
 * @return a fixed message when no node is being removed; otherwise a message naming the
 *         token being removed and the nodes still expected to confirm replication
 */
public String getRemovalStatus()
{
    if (removingNode != null)
    {
        return String.format("Removing token (%s). Waiting for replication confirmation from [%s].",
                             tokenMetadata_.getToken(removingNode),
                             StringUtils.join(replicatingNodes, ","));
    }
    return "No token removals in process.";
}
/**
 * Force a remove operation to complete. This may be necessary if a remove operation
 * blocks forever due to node/stream failure. removeToken() must be called
 * first, this is a last resort measure. No further attempt will be made to restore replicas.
 *
 * The set of nodes still expected to confirm replication is logged and then cleared.
 *
 * @throws UnsupportedOperationException if no nodes are awaiting confirmation
 */
public void forceRemoveCompletion()
{
    if (!replicatingNodes.isEmpty())
    {
        // duplicated "for" removed from the message
        logger_.warn("Removal not confirmed for " + StringUtils.join(this.replicatingNodes, ","));
        replicatingNodes.clear();
    }
    else
    {
        throw new UnsupportedOperationException("No tokens to force removal on, call 'removetoken' first");
    }
}
/**
* Remove a node that has died, attempting to restore the replica count.
* If the node is alive, decommission should be attempted. If decommission
* fails, then removeToken should be called. If we fail while trying to
* restore the replica count, finally forceRemoveCompletion should be
* called to forcibly remove the node without regard to replica count.
*
* @param tokenString token for the node
*/
public void removeToken(String tokenString)
{
InetAddress myAddress = FBUtilities.getLocalAddress();
Token localToken = tokenMetadata_.getToken(myAddress);
Token token = partitioner.getTokenFactory().fromString(tokenString);
InetAddress endpoint = tokenMetadata_.getEndpoint(token);
if (endpoint == null)
throw new UnsupportedOperationException("Token not found.");
if (endpoint.equals(myAddress))
throw new UnsupportedOperationException("Cannot remove node's own token");
if (Gossiper.instance.getLiveMembers().contains(endpoint))
throw new UnsupportedOperationException("Node " + endpoint + " is alive and owns this token. Use decommission command to remove it from the ring");
// A leaving endpoint that is dead is already being removed.
if (tokenMetadata_.isLeaving(endpoint))
logger_.warn("Node " + endpoint + " is already being removed, continuing removal anyway");
if (!replicatingNodes.isEmpty())
throw new UnsupportedOperationException("This node is already processing a removal. Wait for it to complete, or use 'removetoken force' if this has failed.");
// Find the endpoints that are going to become responsible for data
for (String table : DatabaseDescriptor.getNonSystemTables())
{
// if the replication factor is 1 the data is lost so we shouldn't wait for confirmation
if (Table.open(table).getReplicationStrategy().getReplicationFactor() == 1)
continue;
// get all ranges that change ownership (that is, a node needs
// to take responsibility for new range)
Multimap<Range, InetAddress> changedRanges = getChangedRangesForLeaving(table, endpoint);
IFailureDetector failureDetector = FailureDetector.instance;
for (InetAddress ep : changedRanges.values())
{
if (failureDetector.isAlive(ep))
replicatingNodes.add(ep);
else
logger_.warn("Endpoint " + ep + " is down and will not receive data for re-replication of " + endpoint);
}
}
removingNode = endpoint;
tokenMetadata_.addLeavingEndpoint(endpoint);
calculatePendingRanges();
// bundle two states together. include this nodes state to keep the status quo,
// but indicate the leaving token so that it can be dealt with.
Gossiper.instance.addLocalApplicationState(ApplicationState.STATUS, valueFactory.removingNonlocal(localToken, token));
// kick off streaming commands
restoreReplicaCount(endpoint, myAddress);
// wait for ReplicationFinishedVerbHandler to signal we're done
while (!replicatingNodes.isEmpty())
{
try
{
Thread.sleep(100);
}
catch (InterruptedException e)
{
throw new AssertionError(e);
}
}
excise(token, endpoint);
// indicate the token has left
Gossiper.instance.addLocalApplicationState(ApplicationState.STATUS, valueFactory.removedNonlocal(localToken, token));
replicatingNodes.clear();
removingNode = null;
}
/**
* Removes the given node from replicatingNodes, the set that removeToken() polls
* until it is empty.
* NOTE(review): presumably invoked by ReplicationFinishedVerbHandler, per the comment
* in removeToken() -- confirm against that handler.
*
* @param node the node to stop waiting for
*/
public void confirmReplication(InetAddress node)
{
// only checked when assertions are enabled: a confirmation is expected only while nodes are awaited
assert !replicatingNodes.isEmpty();
replicatingNodes.remove(node);
}
/**
* @return the current value of the isClientMode field
*/
public boolean isClientMode()
{
return isClientMode;
}
/**
 * Requests a JVM garbage collection when at least one column family store reports
 * unreclaimed space, then sleeps for one second. Does nothing otherwise.
 */
public synchronized void requestGC()
{
    if (!hasUnreclaimedSpace())
        return;

    logger_.info("requesting GC to free disk space");
    System.gc();
    try
    {
        Thread.sleep(1000);
    }
    catch (InterruptedException interrupted)
    {
        throw new AssertionError(interrupted);
    }
}
/**
 * Checks whether any column family store reports unreclaimed space.
 *
 * @return true as soon as one store from {@code ColumnFamilyStore.all()} reports unreclaimed
 *         space, false when none does
 */
private boolean hasUnreclaimedSpace()
{
    boolean found = false;
    for (ColumnFamilyStore store : ColumnFamilyStore.all())
    {
        if (store.hasUnreclaimedSpace())
        {
            found = true;
            break;
        }
    }
    return found;
}
/**
* NOTE(review): presumably the string most recently passed to setMode() -- confirm against setMode.
*
* @return the current value of the operationMode field
*/
public String getOperationMode()
{
return operationMode;
}
/**
 * Reports how far a drain has progressed.
 *
 * @return "Drained x/y ColumnFamilies", where y is {@code totalCFs} and x is the number of
 *         column family stores already flushed, i.e. {@code totalCFs - remainingCFs}
 */
public String getDrainProgress()
{
    // drain() starts remainingCFs at totalCFs and decrements it after each flush, so the
    // drained count is the difference between the two, not remainingCFs itself
    return String.format("Drained %s/%s ColumnFamilies", totalCFs - remainingCFs, totalCFs);
}
/**
* Shuts node off to writes, empties memtables and the commit log.
*
* The steps run strictly in this order: stop gossip, shut down and wait for the
* MessagingService, shut down the mutation stage, flush every column family store of the
* non-system tables, shut down the post-flush executor, shut down the commit log, and wait
* for the DeletionService. Returns early with a warning if the mutation stage is already
* terminated.
*
* @throws IOException declared by the method; NOTE(review): likely from the flush/commit log steps -- confirm
* @throws InterruptedException if interrupted while awaiting an executor's termination
* @throws ExecutionException declared by the method; NOTE(review): likely from forceBlockingFlush -- confirm
*/
public synchronized void drain() throws IOException, InterruptedException, ExecutionException
{
ExecutorService mutationStage = StageManager.getStage(Stage.MUTATION);
// a terminated mutation stage is used as the marker that a drain already ran
if (mutationStage.isTerminated())
{
logger_.warn("Cannot drain node (did it already happen?)");
return;
}
setMode("Starting drain process", true);
Gossiper.instance.stop();
setMode("Draining: shutting down MessageService", false);
MessagingService.instance().shutdown();
setMode("Draining: emptying MessageService pools", false);
MessagingService.instance().waitFor();
setMode("Draining: clearing mutation stage", false);
mutationStage.shutdown();
// NOTE(review): the boolean result of awaitTermination is ignored, so a timeout after
// 3600 seconds is not detected here
mutationStage.awaitTermination(3600, TimeUnit.SECONDS);
// lets flush.
setMode("Draining: flushing column families", false);
List<ColumnFamilyStore> cfses = new ArrayList<ColumnFamilyStore>();
for (String tableName : DatabaseDescriptor.getNonSystemTables())
{
Table table = Table.open(tableName);
cfses.addAll(table.getColumnFamilyStores());
}
// these two counters are what getDrainProgress() reads
totalCFs = remainingCFs = cfses.size();
for (ColumnFamilyStore cfs : cfses)
{
cfs.forceBlockingFlush();
remainingCFs--;
}
ColumnFamilyStore.postFlushExecutor.shutdown();
// NOTE(review): as above, the awaitTermination result (60 second limit) is not checked
ColumnFamilyStore.postFlushExecutor.awaitTermination(60, TimeUnit.SECONDS);
CommitLog.instance.shutdownBlocking();
// want to make sure that any segments deleted as a result of flushing are gone.
DeletionService.waitFor();
setMode("Node is drained", true);
}
/**
 * Installs a different partitioner and rebuilds {@code valueFactory} around it.
 * Never ever do this at home. Used by tests.
 *
 * @param newPartitioner the partitioner to install
 * @return the partitioner that was in place before this call
 */
IPartitioner setPartitionerUnsafe(IPartitioner newPartitioner)
{
    final IPartitioner previous = partitioner;
    partitioner = newPartitioner;
    valueFactory = new VersionedValue.VersionedValueFactory(newPartitioner);
    return previous;
}
/**
 * Replaces the {@code tokenMetadata_} field with the given instance.
 *
 * @param tmd the token metadata to install
 * @return the token metadata that was in place before this call
 */
TokenMetadata setTokenMetadataUnsafe(TokenMetadata tmd)
{
    final TokenMetadata previous = tokenMetadata_;
    tokenMetadata_ = tmd;
    return previous;
}
/**
* Truncates a column family by delegating to StorageProxy.truncateBlocking.
*
* @param keyspace keyspace passed through to StorageProxy.truncateBlocking
* @param columnFamily column family passed through to StorageProxy.truncateBlocking
* @throws UnavailableException declared; NOTE(review): presumably propagated from StorageProxy -- confirm
* @throws TimeoutException declared; NOTE(review): presumably propagated from StorageProxy -- confirm
* @throws IOException declared; NOTE(review): presumably propagated from StorageProxy -- confirm
*/
public void truncate(String keyspace, String columnFamily) throws UnavailableException, TimeoutException, IOException
{
StorageProxy.truncateBlocking(keyspace, columnFamily);
}
/**
 * Submits a write of the key cache and of the row cache for every column family store,
 * then waits for all submitted writes.
 *
 * @throws ExecutionException declared by the method signature
 * @throws InterruptedException declared by the method signature
 */
public void saveCaches() throws ExecutionException, InterruptedException
{
    final List<Future<?>> pendingWrites = new ArrayList<Future<?>>();

    logger_.debug("submitting cache saves");
    for (ColumnFamilyStore store : ColumnFamilyStore.all())
    {
        // key cache first, then row cache, for each store
        pendingWrites.add(store.keyCache.submitWrite());
        pendingWrites.add(store.rowCache.submitWrite());
    }

    FBUtilities.waitOnFutures(pendingWrites);
    logger_.debug("cache saves completed");
}
/**
 * Computes ring ownership from the tokens currently in the token-to-endpoint map.
 *
 * @return the result of {@code partitioner.describeOwnership} applied to those tokens in sorted order
 */
public Map<Token, Float> getOwnership()
{
    List<Token> ringTokens = new ArrayList<Token>();
    ringTokens.addAll(getTokenToEndpointMap().keySet());
    Collections.sort(ringTokens);
    return partitioner.describeOwnership(ringTokens);
}
/**
 * Lists the tables known to {@code DatabaseDescriptor}.
 *
 * @return an unmodifiable list holding a copy of {@code DatabaseDescriptor.getTables()}
 */
public List<String> getKeyspaces()
{
    return Collections.unmodifiableList(new ArrayList<String>(DatabaseDescriptor.getTables()));
}
/**
 * Replaces the endpoint snitch at runtime.
 *
 * A snitch of the given class is constructed; when {@code dynamic} is true the three dynamic
 * settings are stored in {@code DatabaseDescriptor} and the new snitch is wrapped in a
 * {@code DynamicEndpointSnitch}. The result is installed in {@code DatabaseDescriptor} and in
 * the replication strategy of every table, and a previous {@code DynamicEndpointSnitch} has
 * its mbean unregistered.
 *
 * @param epSnitchClassName class name of the snitch to construct
 * @param dynamic whether to wrap the new snitch in a DynamicEndpointSnitch
 * @param dynamicUpdateInterval stored via DatabaseDescriptor.setDynamicUpdateInterval when dynamic
 * @param dynamicResetInterval stored via DatabaseDescriptor.setDynamicResetInterval when dynamic
 * @param dynamicBadnessThreshold stored via DatabaseDescriptor.setDynamicBadnessThreshold when dynamic
 * @throws ConfigurationException declared by the method signature
 */
public void updateSnitch(String epSnitchClassName, Boolean dynamic, Integer dynamicUpdateInterval, Integer dynamicResetInterval, Double dynamicBadnessThreshold) throws ConfigurationException
{
    final IEndpointSnitch previousSnitch = DatabaseDescriptor.getEndpointSnitch();

    // the replacement registers its mbean while it is being constructed
    IEndpointSnitch replacement = FBUtilities.construct(epSnitchClassName, "snitch");
    if (dynamic)
    {
        DatabaseDescriptor.setDynamicUpdateInterval(dynamicUpdateInterval);
        DatabaseDescriptor.setDynamicResetInterval(dynamicResetInterval);
        DatabaseDescriptor.setDynamicBadnessThreshold(dynamicBadnessThreshold);
        replacement = new DynamicEndpointSnitch(replacement);
    }

    // redirect every snitch reference to the replacement
    DatabaseDescriptor.setEndpointSnitch(replacement);
    for (String keyspaceName : DatabaseDescriptor.getTables())
    {
        Table.open(keyspaceName).getReplicationStrategy().snitch = replacement;
    }

    if (previousSnitch instanceof DynamicEndpointSnitch)
    {
        DynamicEndpointSnitch previousDynamic = (DynamicEndpointSnitch) previousSnitch;
        previousDynamic.unregisterMBean();
    }
}
/**
* @return the current value of the efficientCrossDCWrites field
*/
public boolean useEfficientCrossDCWrites()
{
return efficientCrossDCWrites;
}
/**
 * Flushes the two largest memtables by ops and by throughput.
 *
 * Each column family store is measured together with its index stores (as returned by
 * {@code concatWithIndexes()}): "ops" is the summed memtable column count and "throughput"
 * the summed memtable data size. The store with the largest total of each kind is flushed;
 * when both are the same store it is flushed once. Nothing is flushed when no store has a
 * positive ops total.
 */
public void flushLargestMemtables()
{
    ColumnFamilyStore largestByOps = null;
    ColumnFamilyStore largestByThroughput = null;
    // Totals of the current leaders. Candidates are compared against these rather than against
    // the leader's own un-aggregated getters, which measure a different quantity.
    long maxOps = 0;
    long maxThroughput = 0;
    for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
    {
        long ops = 0;
        long throughput = 0;
        for (ColumnFamilyStore subordinate : cfs.concatWithIndexes())
        {
            ops += subordinate.getMemtableColumnsCount();
            throughput += subordinate.getMemtableDataSize();
        }

        // maxima start at zero, so only stores with a positive total can become leader
        if (ops > maxOps)
        {
            logger_.debug(ops + " total ops in " + cfs);
            maxOps = ops;
            largestByOps = cfs;
        }
        if (throughput > maxThroughput)
        {
            logger_.debug(throughput + " total throughput in " + cfs);
            maxThroughput = throughput;
            largestByThroughput = cfs;
        }
    }

    if (largestByOps == null)
    {
        logger_.info("Unable to reduce heap usage since there are no dirty column families");
        return;
    }

    logger_.warn("Flushing " + largestByOps + " to relieve memory pressure");
    largestByOps.forceFlush();

    // largestByThroughput stays null when no store reported a positive data size; skip it then
    if (largestByThroughput != null && largestByThroughput != largestByOps)
    {
        logger_.warn("Flushing " + largestByThroughput + " to relieve memory pressure");
        largestByThroughput.forceFlush();
    }
}
/**
 * Calls {@code reduceCacheSizes()} on every column family store.
 */
public void reduceCacheSizes()
{
    for (ColumnFamilyStore store : ColumnFamilyStore.all())
    {
        store.reduceCacheSizes();
    }
}
/**
 * Seed data to the endpoints that will be responsible for it in the future.
 *
 * One transfer task per (range, endpoint) pair is handed to the STREAM stage. The returned
 * latch holds one count per table; a table's count is released immediately when it has no
 * pairs, otherwise when the callback of its last outstanding pair has run.
 *
 * @param rangesToStreamByTable tables and data ranges with endpoints included for each
 * @return latch to count down
 */
private CountDownLatch streamRanges(final Map<String, Multimap<Range, InetAddress>> rangesToStreamByTable)
{
    final CountDownLatch latch = new CountDownLatch(rangesToStreamByTable.size());

    for (Map.Entry<String, Multimap<Range, InetAddress>> tableEntry : rangesToStreamByTable.entrySet())
    {
        final String table = tableEntry.getKey();
        final Collection<Map.Entry<Range, InetAddress>> transfers = tableEntry.getValue().entries();

        if (transfers.isEmpty())
        {
            // nothing to stream for this table, release its count right away
            latch.countDown();
            continue;
        }

        // pairs whose transfer has not reported completion yet
        final Set<Map.Entry<Range, InetAddress>> pending = new HashSet<Map.Entry<Range, InetAddress>>(transfers);

        for (final Map.Entry<Range, InetAddress> transfer : transfers)
        {
            final Range range = transfer.getKey();
            final InetAddress newEndpoint = transfer.getValue();

            final Runnable onTransferred = new Runnable()
            {
                public void run()
                {
                    synchronized (pending)
                    {
                        pending.remove(transfer);

                        if (pending.isEmpty())
                            latch.countDown();
                    }
                }
            };

            final Runnable streamTask = new Runnable()
            {
                public void run()
                {
                    // TODO each call to transferRanges re-flushes, this is potentially a lot of waste
                    StreamOut.transferRanges(newEndpoint, table, Arrays.asList(range), onTransferred, OperationType.UNBOOTSTRAP);
                }
            };

            StageManager.getStage(Stage.STREAM).execute(streamTask);
        }
    }
    return latch;
}
/**
 * Used to request ranges from endpoints in the ring. This method only issues the requests;
 * callers wait on the returned latch until all data is fetched and ready.
 *
 * @param ranges ranges to fetch as map of the preferred address and range collection
 * @return latch with one count per table; a table's count is released immediately when it has
 *         nothing to fetch, otherwise once the callback of each of its sources has run
 */
private CountDownLatch requestRanges(final Map<String, Multimap<InetAddress, Range>> ranges)
{
    final CountDownLatch latch = new CountDownLatch(ranges.keySet().size());
    for (final String table : ranges.keySet())
    {
        Multimap<InetAddress, Range> endpointWithRanges = ranges.get(table);

        if (endpointWithRanges.isEmpty())
        {
            latch.countDown();
            continue;
        }

        // sources that have not reported completion yet
        final Set<InetAddress> pending = new HashSet<InetAddress>(endpointWithRanges.keySet());

        // Send messages to respective folks to stream data over to me
        for (final InetAddress source: endpointWithRanges.keySet())
        {
            Collection<Range> toFetch = endpointWithRanges.get(source);

            final Runnable callback = new Runnable()
            {
                public void run()
                {
                    // pending is a plain HashSet shared by the callbacks of every source of this
                    // table; guard it (as streamRanges does) so the remove/isEmpty pair is atomic
                    // and only one callback counts the latch down for this table
                    synchronized (pending)
                    {
                        pending.remove(source);

                        if (pending.isEmpty())
                            latch.countDown();
                    }
                }
            };

            if (logger_.isDebugEnabled())
                logger_.debug("Requesting from " + source + " ranges " + StringUtils.join(toFetch, ", "));

            // sending actual request
            StreamIn.requestRanges(source, table, toFetch, callback, OperationType.BOOTSTRAP);
        }
    }
    return latch;
}
/**
* Collection-based convenience overload; see calculateStreamAndFetchRanges(Iterator, Iterator)
* for description. The two collections are walked in their own iteration order.
*
* @param current collection of the ranges by current token
* @param updated collection of the ranges after token is changed
* @return pair of ranges to stream/fetch, as computed by the Iterator-based overload
*/
private Pair<Set<Range>, Set<Range>> calculateStreamAndFetchRanges(Collection<Range> current, Collection<Range> updated)
{
return calculateStreamAndFetchRanges(current.iterator(), updated.iterator());
}
/**
 * Calculate pair of ranges to stream/fetch for given two range collections
 * (current ranges for table and ranges after move to new token).
 *
 * The iterators are consumed in lock step, pairing the i-th current range with the i-th
 * updated range; processing stops as soon as either iterator is exhausted.
 *
 * @param current collection of the ranges by current token
 * @param updated collection of the ranges after token is changed
 * @return pair whose first element is the set of ranges to stream and whose second element is
 *         the set of ranges to fetch
 */
private Pair<Set<Range>, Set<Range>> calculateStreamAndFetchRanges(Iterator<Range> current, Iterator<Range> updated)
{
    final Set<Range> outgoing = new HashSet<Range>();
    final Set<Range> incoming = new HashSet<Range>();

    while (current.hasNext() && updated.hasNext())
    {
        final Range oldRange = current.next();
        final Range newRange = updated.next();

        if (!oldRange.intersects(newRange))
        {
            // disjoint: seed the whole old range and fetch the whole new one
            outgoing.add(oldRange);
            incoming.add(newRange);
            continue;
        }

        // overlapping: only the missing part has to be fetched from the ring
        incoming.addAll(oldRange.differenceToFetch(newRange));

        // an old range fully inside the new one leaves nothing to seed
        if (newRange.contains(oldRange))
            continue;

        // old = (A, B], new = (C, D]
        if (Range.compare(oldRange.left, newRange.left) < 0) // A < C
            outgoing.add(new Range(oldRange.left, newRange.left)); // seed (A, C]

        if (Range.compare(oldRange.right, newRange.right) > 0) // B > D
            outgoing.add(new Range(newRange.right, oldRange.right)); // seed (D, B]
    }

    return new Pair<Set<Range>, Set<Range>>(outgoing, incoming);
}
}