package org.apache.cassandra.db;
import java.io.DataInputStream;
import java.io.IOException;
import java.net.InetAddress;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.TimeUnit;
import javax.management.openmbean.*;
import com.google.common.base.Function;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Iterables;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Sets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.config.KSMetaData;
import org.apache.cassandra.config.Schema;
import org.apache.cassandra.cql3.QueryProcessor;
import org.apache.cassandra.cql3.UntypedResultSet;
import org.apache.cassandra.db.columniterator.IdentityQueryFilter;
import org.apache.cassandra.db.compaction.CompactionHistoryTabularData;
import org.apache.cassandra.db.commitlog.ReplayPosition;
import org.apache.cassandra.db.compaction.LeveledCompactionStrategy;
import org.apache.cassandra.db.composites.Composite;
import org.apache.cassandra.db.filter.QueryFilter;
import org.apache.cassandra.db.marshal.*;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.exceptions.ConfigurationException;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.io.util.DataOutputBuffer;
import org.apache.cassandra.locator.IEndpointSnitch;
import org.apache.cassandra.locator.LocalStrategy;
import org.apache.cassandra.metrics.RestorableMeter;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.service.paxos.Commit;
import org.apache.cassandra.service.paxos.PaxosState;
import org.apache.cassandra.thrift.cassandraConstants;
import org.apache.cassandra.transport.Server;
import org.apache.cassandra.utils.*;
import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternal;
public final class SystemKeyspace
// Logger shared by the static helpers of this class.
private static final Logger logger = LoggerFactory.getLogger(SystemKeyspace.class);
// Name of the keyspace every table below lives in.
public static final String NAME = "system";
// Names of the tables that store schema definitions.
public static final String SCHEMA_KEYSPACES_TABLE = "schema_keyspaces";
public static final String SCHEMA_COLUMNFAMILIES_TABLE = "schema_columnfamilies";
public static final String SCHEMA_COLUMNS_TABLE = "schema_columns";
public static final String SCHEMA_TRIGGERS_TABLE = "schema_triggers";
public static final String SCHEMA_USER_TYPES_TABLE = "schema_usertypes";
public static final String SCHEMA_FUNCTIONS_TABLE = "schema_functions";
// Names of the remaining system tables.
public static final String BUILT_INDEXES_TABLE = "IndexInfo";
public static final String HINTS_TABLE = "hints";
public static final String BATCHLOG_TABLE = "batchlog";
public static final String PAXOS_TABLE = "paxos";
public static final String LOCAL_TABLE = "local";
public static final String PEERS_TABLE = "peers";
public static final String PEER_EVENTS_TABLE = "peer_events";
public static final String RANGE_XFERS_TABLE = "range_xfers";
public static final String COMPACTION_LOG_TABLE = "compactions_in_progress";
public static final String COMPACTION_HISTORY_TABLE = "compaction_history";
public static final String SSTABLE_ACTIVITY_TABLE = "sstable_activity";
// Table names iterated by finishStartup(), serializedSchema() and serializeSchema().
public static final List<String> ALL_SCHEMA_TABLES =
// Seven days expressed in seconds.
private static int WEEK = (int) TimeUnit.DAYS.toSeconds(7);
// schema_keyspaces: one partition per keyspace_name.
public static final CFMetaData SchemaKeyspacesTable =
compile(SCHEMA_KEYSPACES_TABLE, "keyspace definitions",
+ "keyspace_name text,"
+ "durable_writes boolean,"
+ "strategy_class text,"
+ "strategy_options text,"
+ "PRIMARY KEY ((keyspace_name))) "
// schema_columnfamilies: partitioned by keyspace_name, clustered by columnfamily_name.
public static final CFMetaData SchemaColumnFamiliesTable =
compile(SCHEMA_COLUMNFAMILIES_TABLE, "table definitions",
+ "keyspace_name text,"
+ "columnfamily_name text,"
+ "bloom_filter_fp_chance double,"
+ "caching text,"
+ "cf_id uuid," // post-2.1 UUID cfid
+ "comment text,"
+ "compaction_strategy_class text,"
+ "compaction_strategy_options text,"
+ "comparator text,"
+ "compression_parameters text,"
+ "default_time_to_live int,"
+ "default_validator text,"
+ "dropped_columns map<text, bigint>,"
+ "gc_grace_seconds int,"
+ "is_dense boolean,"
+ "key_validator text,"
+ "local_read_repair_chance double,"
+ "max_compaction_threshold int,"
+ "max_index_interval int,"
+ "memtable_flush_period_in_ms int,"
+ "min_compaction_threshold int,"
+ "min_index_interval int,"
+ "read_repair_chance double,"
+ "speculative_retry text,"
+ "subcomparator text,"
+ "type text,"
+ "PRIMARY KEY ((keyspace_name), columnfamily_name))")
// schema_columns: partitioned by keyspace_name, clustered by (columnfamily_name, column_name).
public static final CFMetaData SchemaColumnsTable =
compile(SCHEMA_COLUMNS_TABLE, "column definitions",
+ "keyspace_name text,"
+ "columnfamily_name text,"
+ "column_name text,"
+ "component_index int,"
+ "index_name text,"
+ "index_options text,"
+ "index_type text,"
+ "type text,"
+ "validator text,"
+ "PRIMARY KEY ((keyspace_name), columnfamily_name, column_name))")
// schema_triggers: partitioned by keyspace_name, clustered by (columnfamily_name, trigger_name).
public static final CFMetaData SchemaTriggersTable =
compile(SCHEMA_TRIGGERS_TABLE, "trigger definitions",
+ "keyspace_name text,"
+ "columnfamily_name text,"
+ "trigger_name text,"
+ "trigger_options map<text, text>,"
+ "PRIMARY KEY ((keyspace_name), columnfamily_name, trigger_name))")
// schema_usertypes: partitioned by keyspace_name, clustered by type_name.
public static final CFMetaData SchemaUserTypesTable =
compile(SCHEMA_USER_TYPES_TABLE, "user defined type definitions",
+ "keyspace_name text,"
+ "type_name text,"
+ "field_names list<text>,"
+ "field_types list<text>,"
+ "PRIMARY KEY ((keyspace_name), type_name))")
// schema_functions: partitioned by keyspace_name, clustered by (function_name, signature).
public static final CFMetaData SchemaFunctionsTable =
compile(SCHEMA_FUNCTIONS_TABLE, "user defined function definitions",
+ "keyspace_name text,"
+ "function_name text,"
+ "signature blob,"
+ "argument_names list<text>,"
+ "argument_types list<text>,"
+ "body text,"
+ "deterministic boolean,"
+ "language text,"
+ "return_type text,"
+ "PRIMARY KEY ((keyspace_name), function_name, signature))")
// IndexInfo: partitioned by table_name, clustered by index_name. The table name is quoted in the CQL
// because it is mixed-case.
public static final CFMetaData BuiltIndexesTable =
compile(BUILT_INDEXES_TABLE, "built column indexes",
"CREATE TABLE \"%s\" ("
+ "table_name text,"
+ "index_name text,"
+ "PRIMARY KEY ((table_name), index_name)) "
// hints: partitioned by target_id, clustered by (hint_id, message_version); the compaction strategy
// option "enabled" is set to "false".
public static final CFMetaData HintsTable =
compile(HINTS_TABLE, "hints awaiting delivery",
+ "target_id uuid,"
+ "hint_id timeuuid,"
+ "message_version int,"
+ "mutation blob,"
+ "PRIMARY KEY ((target_id), hint_id, message_version)) "
.compactionStrategyOptions(Collections.singletonMap("enabled", "false"))
// batchlog: one partition per batch id; the compaction strategy option "min_threshold" is set to "2".
public static final CFMetaData BatchlogTable =
compile(BATCHLOG_TABLE, "batches awaiting replay",
+ "id uuid,"
+ "data blob,"
+ "version int,"
+ "written_at timestamp,"
+ "PRIMARY KEY ((id)))")
.compactionStrategyOptions(Collections.singletonMap("min_threshold", "2"))
// paxos: partitioned by row_key, clustered by cf_id.
private static final CFMetaData PaxosTable =
compile(PAXOS_TABLE, "in-progress paxos proposals",
+ "row_key blob,"
+ "cf_id UUID,"
+ "in_progress_ballot timeuuid,"
+ "most_recent_commit blob,"
+ "most_recent_commit_at timeuuid,"
+ "proposal blob,"
+ "proposal_ballot timeuuid,"
+ "PRIMARY KEY ((row_key), cf_id))")
// local: keyed by "key"; the helpers in this class address the row whose key is LOCAL_KEY.
private static final CFMetaData LocalTable =
compile(LOCAL_TABLE, "information about the local node",
+ "key text,"
+ "bootstrapped text,"
+ "cluster_name text,"
+ "cql_version text,"
+ "data_center text,"
+ "gossip_generation int,"
+ "host_id uuid,"
+ "native_protocol_version text,"
+ "partitioner text,"
+ "rack text,"
+ "release_version text,"
+ "schema_version uuid,"
+ "thrift_version text,"
+ "tokens set<varchar>,"
+ "truncated_at map<uuid, blob>,"
+ "PRIMARY KEY ((key)))");
// peers: one partition per peer address.
private static final CFMetaData PeersTable =
compile(PEERS_TABLE, "information about known peers in the cluster",
+ "peer inet,"
+ "data_center text,"
+ "host_id uuid,"
+ "preferred_ip inet,"
+ "rack text,"
+ "release_version text,"
+ "rpc_address inet,"
+ "schema_version uuid,"
+ "tokens set<varchar>,"
+ "PRIMARY KEY ((peer)))");
// peer_events: one partition per peer address; hints_dropped is written by updateHintsDropped().
private static final CFMetaData PeerEventsTable =
compile(PEER_EVENTS_TABLE, "events related to peers",
+ "peer inet,"
+ "hints_dropped map<uuid, int>,"
+ "PRIMARY KEY ((peer)))");
// range_xfers: one partition per token_bytes value.
private static final CFMetaData RangeXfersTable =
compile(RANGE_XFERS_TABLE, "ranges requested for transfer",
+ "token_bytes blob,"
+ "requested_at timestamp,"
+ "PRIMARY KEY ((token_bytes)))");
// compactions_in_progress: one partition per compaction id; written by startCompaction() and
// deleted by finishCompaction().
private static final CFMetaData CompactionLogTable =
compile(COMPACTION_LOG_TABLE, "unfinished compactions",
+ "id uuid,"
+ "columnfamily_name text,"
+ "inputs set<int>,"
+ "keyspace_name text,"
+ "PRIMARY KEY ((id)))");
// compaction_history: one partition per id; written by updateCompactionHistory().
private static final CFMetaData CompactionHistoryTable =
compile(COMPACTION_HISTORY_TABLE, "week-long compaction history",
+ "id uuid,"
+ "bytes_in bigint,"
+ "bytes_out bigint,"
+ "columnfamily_name text,"
+ "compacted_at timestamp,"
+ "keyspace_name text,"
+ "rows_merged map<int, bigint>,"
+ "PRIMARY KEY ((id)))")
// sstable_activity: composite partition key (keyspace_name, columnfamily_name, generation).
private static final CFMetaData SSTableActivityTable =
compile(SSTABLE_ACTIVITY_TABLE, "historic sstable read rates",
+ "keyspace_name text,"
+ "columnfamily_name text,"
+ "generation int,"
+ "rate_120m double,"
+ "rate_15m double,"
+ "PRIMARY KEY ((keyspace_name, columnfamily_name, generation)))");
private static CFMetaData compile(String table, String comment, String cql)
return CFMetaData.compile(String.format(cql, table), NAME).comment(comment);
/**
 * Builds the keyspace metadata of the system keyspace.
 *
 * @return a KSMetaData named {@link #NAME} that uses LocalStrategy with an empty strategy-option map
 */
public static KSMetaData definition()
// tables registered with the keyspace
List<CFMetaData> tables =
return new KSMetaData(NAME, LocalStrategy.class, Collections.<String, String>emptyMap(), true, tables);
// Value of the "key" column used by the system.local statements in this class.
private static final String LOCAL_KEY = "local";
// Cache of truncation records by cfId. saveTruncationRecord() and removeTruncationRecord() reset it to
// null; getTruncationRecord() reloads it on the next read.
private static volatile Map<UUID, Pair<ReplayPosition, Long>> truncationRecords;
// Bootstrap progress stored by name in the "bootstrapped" column of system.local. This file refers to the
// constants NEEDS_BOOTSTRAP, IN_PROGRESS and COMPLETED.
public enum BootstrapState
private static DecoratedKey decorate(ByteBuffer key)
return StorageService.getPartitioner().decorateKey(key);
/**
 * Refreshes the system keyspace's own rows in the schema tables: deletes the rows keyed by the system
 * keyspace name from every table in ALL_SCHEMA_TABLES, then applies the schema mutation built from the
 * in-memory definition.
 */
public static void finishStartup()
// add entries to system schema columnfamilies for the hardcoded system definitions
KSMetaData ksmd = Schema.instance.getKSMetaData(NAME);
// delete old, possibly obsolete entries in schema tables
for (String table : ALL_SCHEMA_TABLES)
executeOnceInternal(String.format("DELETE FROM system.%s WHERE keyspace_name = ?", table), ksmd.name);
// (+1 to timestamp to make sure we don't get shadowed by the tombstones we just added)
ksmd.toSchema(FBUtilities.timestampMicros() + 1).apply();
// Inserts the version, data center, rack and partitioner columns of a row in system.local.
private static void setupVersion()
String req = "INSERT INTO system.%s (key, release_version, cql_version, thrift_version, native_protocol_version, data_center, rack, partitioner) VALUES (?, ?, ?, ?, ?, ?, ?, ?)";
// NOTE(review): presumably the snitch supplies the data_center and rack values; the bound arguments are
// not visible here — confirm.
IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch();
executeOnceInternal(String.format(req, LOCAL_TABLE),
* Write compaction log, except for column families under the system keyspace.
* @param cfs cfs to compact
* @param toCompact sstables to compact
* @return compaction task id or null if cfs is under system keyspace
public static UUID startCompaction(ColumnFamilyStore cfs, Iterable<SSTableReader> toCompact)
// compactions of tables in the system keyspace are not logged
if (NAME.equals(cfs.keyspace.getName()))
return null;
// time-based UUID that identifies this compaction in the log
UUID compactionId = UUIDGen.getTimeUUID();
// lazy view mapping each input sstable to its generation number
Iterable<Integer> generations = Iterables.transform(toCompact, new Function<SSTableReader, Integer>()
public Integer apply(SSTableReader sstable)
return sstable.descriptor.generation;
String req = "INSERT INTO system.%s (id, keyspace_name, columnfamily_name, inputs) VALUES (?, ?, ?, ?)";
// the lazy view is copied into a HashSet before being bound to the "inputs" column
executeInternal(String.format(req, COMPACTION_LOG_TABLE), compactionId, cfs.keyspace.getName(), cfs.name, Sets.newHashSet(generations));
return compactionId;
* Deletes the entry for this compaction from the set of compactions in progress. The compaction does not need
* to complete successfully for this to be called.
* @param taskId what was returned from {@code startCompaction}
public static void finishCompaction(UUID taskId)
// startCompaction() returns null for system-keyspace tables; such ids must not be passed here
assert taskId != null;
// remove the row keyed by this task id from the compaction log
executeInternal(String.format("DELETE FROM system.%s WHERE id = ?", COMPACTION_LOG_TABLE), taskId);
* Returns a Map whose keys are KS.CF pairs and whose values are maps from sstable generation numbers to the
* task ID of the compaction they were participating in.
public static Map<Pair<String, String>, Map<Integer, UUID>> getUnfinishedCompactions()
// read the whole compaction log
String req = "SELECT * FROM system.%s";
UntypedResultSet resultSet = executeInternal(String.format(req, COMPACTION_LOG_TABLE));
Map<Pair<String, String>, Map<Integer, UUID>> unfinishedCompactions = new HashMap<>();
// each row describes one logged compaction
for (UntypedResultSet.Row row : resultSet)
String keyspace = row.getString("keyspace_name");
String columnfamily = row.getString("columnfamily_name");
Set<Integer> inputs = row.getSet("inputs", Int32Type.instance);
UUID taskID = row.getUUID("id");
Pair<String, String> kscf = Pair.create(keyspace, columnfamily);
// reuse the generation map already collected for this keyspace/table pair, if any
Map<Integer, UUID> generationToTaskID = unfinishedCompactions.get(kscf);
if (generationToTaskID == null)
generationToTaskID = new HashMap<>(inputs.size());
// every input generation points at the task it took part in
for (Integer generation : inputs)
generationToTaskID.put(generation, taskID);
unfinishedCompactions.put(kscf, generationToTaskID);
return unfinishedCompactions;
// Looks up the store backing the compaction log table.
// NOTE(review): the operation performed on it is not visible here — confirm.
public static void discardCompactionsInProgress()
ColumnFamilyStore compactionLog = Keyspace.open(NAME).getColumnFamilyStore(COMPACTION_LOG_TABLE);
public static void updateCompactionHistory(String ksname,
String cfname,
long compactedAt,
long bytesIn,
long bytesOut,
Map<Integer, Long> rowsMerged)
// don't write anything when the history table itself is compacted, since that would in turn cause new compactions
if (ksname.equals("system") && cfname.equals(COMPACTION_HISTORY_TABLE))
String req = "INSERT INTO system.%s (id, keyspace_name, columnfamily_name, compacted_at, bytes_in, bytes_out, rows_merged) VALUES (?, ?, ?, ?, ?, ?, ?)";
executeInternal(String.format(req, COMPACTION_HISTORY_TABLE), UUIDGen.getTimeUUID(), ksname, cfname, ByteBufferUtil.bytes(compactedAt), bytesIn, bytesOut, rowsMerged);
public static TabularData getCompactionHistory() throws OpenDataException
UntypedResultSet queryResultSet = executeInternal(String.format("SELECT * from system.%s", COMPACTION_HISTORY_TABLE));
return CompactionHistoryTabularData.from(queryResultSet);
/**
 * Adds a truncation record for the given store to the truncated_at map of system.local.
 *
 * @param cfs         store whose cfId keys the record
 * @param truncatedAt truncation time passed on to truncationAsMapEntry
 * @param position    replay position serialized into the record
 */
public static synchronized void saveTruncationRecord(ColumnFamilyStore cfs, long truncatedAt, ReplayPosition position)
// "truncated_at + ?" merges the single-entry map into the existing map column
String req = "UPDATE system.%s SET truncated_at = truncated_at + ? WHERE key = '%s'";
executeInternal(String.format(req, LOCAL_TABLE, LOCAL_KEY), truncationAsMapEntry(cfs, truncatedAt, position));
// drop the cache so the next getTruncationRecord() re-reads the table
truncationRecords = null;
* This method is used to remove information about truncation time for specified column family
public static synchronized void removeTruncationRecord(UUID cfId)
// deletes only the map entry keyed by cfId, not the whole truncated_at column
String req = "DELETE truncated_at[?] from system.%s WHERE key = '%s'";
executeInternal(String.format(req, LOCAL_TABLE, LOCAL_KEY), cfId);
// drop the cache so the next getTruncationRecord() re-reads the table
truncationRecords = null;
// Builds the single-entry map (cfId -> serialized record) that saveTruncationRecord() binds to truncated_at.
private static Map<UUID, ByteBuffer> truncationAsMapEntry(ColumnFamilyStore cfs, long truncatedAt, ReplayPosition position)
DataOutputBuffer out = new DataOutputBuffer();
// the replay position is serialized into the buffer
ReplayPosition.serializer.serialize(position, out);
// serialization failures surface as unchecked exceptions
catch (IOException e)
throw new RuntimeException(e);
// wrap only the bytes actually written to the buffer
return Collections.singletonMap(cfs.metadata.cfId, ByteBuffer.wrap(out.getData(), 0, out.getLength()));
public static ReplayPosition getTruncatedPosition(UUID cfId)
Pair<ReplayPosition, Long> record = getTruncationRecord(cfId);
return record == null ? null : record.left;
public static long getTruncatedAt(UUID cfId)
Pair<ReplayPosition, Long> record = getTruncationRecord(cfId);
return record == null ? Long.MIN_VALUE : record.right;
// Returns the cached truncation record for cfId (null when the map has no entry for it), loading the
// cache from system.local first if it is currently unset.
private static synchronized Pair<ReplayPosition, Long> getTruncationRecord(UUID cfId)
// the cache is null until first use and after every save/remove
if (truncationRecords == null)
truncationRecords = readTruncationRecords();
return truncationRecords.get(cfId);
// Reads the truncated_at map of the local row and decodes every entry; returns an empty map when the
// row or the column is absent.
private static Map<UUID, Pair<ReplayPosition, Long>> readTruncationRecords()
UntypedResultSet rows = executeInternal(String.format("SELECT truncated_at FROM system.%s WHERE key = '%s'", LOCAL_TABLE, LOCAL_KEY));
Map<UUID, Pair<ReplayPosition, Long>> records = new HashMap<>();
if (!rows.isEmpty() && rows.one().has("truncated_at"))
Map<UUID, ByteBuffer> map = rows.one().getMap("truncated_at", UUIDType.instance, BytesType.instance);
// decode each blob into a (replay position, truncation time) pair
for (Map.Entry<UUID, ByteBuffer> entry : map.entrySet())
records.put(entry.getKey(), truncationRecordFromBlob(entry.getValue()));
return records;
// Decodes one truncated_at blob into a (replay position, truncation time) pair.
private static Pair<ReplayPosition, Long> truncationRecordFromBlob(ByteBuffer bytes)
DataInputStream in = new DataInputStream(ByteBufferUtil.inputStream(bytes));
// the long after the replay position is optional: when no bytes remain, Long.MIN_VALUE is used instead
return Pair.create(ReplayPosition.serializer.deserialize(in), in.available() > 0 ? in.readLong() : Long.MIN_VALUE);
// decoding failures surface as unchecked exceptions
catch (IOException e)
throw new RuntimeException(e);
* Record tokens being used by another node
public static synchronized void updateTokens(InetAddress ep, Collection<Token> tokens)
// NOTE(review): the local broadcast address is special-cased; the guarded statements are not visible here — confirm
if (ep.equals(FBUtilities.getBroadcastAddress()))
// tokens are stored as their string form (see tokensAsSet) in the peer's row
String req = "INSERT INTO system.%s (peer, tokens) VALUES (?, ?)";
executeInternal(String.format(req, PEERS_TABLE), ep, tokensAsSet(tokens));
/**
 * Stores the preferred IP of a peer in system.peers.
 *
 * @param ep           peer address identifying the row
 * @param preferred_ip value written to the preferred_ip column
 */
public static synchronized void updatePreferredIP(InetAddress ep, InetAddress preferred_ip)
String req = "INSERT INTO system.%s (peer, preferred_ip) VALUES (?, ?)";
executeInternal(String.format(req, PEERS_TABLE), ep, preferred_ip);
/**
 * Writes a single column of a peer's row in system.peers.
 *
 * @param ep         peer address identifying the row
 * @param columnName column to write; formatted directly into the statement text, not bound
 * @param value      value bound for that column
 */
public static synchronized void updatePeerInfo(InetAddress ep, String columnName, Object value)
// NOTE(review): the local broadcast address is special-cased; the guarded statements are not visible here — confirm
if (ep.equals(FBUtilities.getBroadcastAddress()))
String req = "INSERT INTO system.%s (peer, %s) VALUES (?, ?)";
executeInternal(String.format(req, PEERS_TABLE, columnName), ep, value);
/**
 * Sets one entry of the hints_dropped map in a peer's system.peer_events row.
 *
 * @param ep         peer address identifying the row
 * @param timePeriod map key of the entry
 * @param value      map value of the entry
 */
public static synchronized void updateHintsDropped(InetAddress ep, UUID timePeriod, int value)
// with 30 day TTL
String req = "UPDATE system.%s USING TTL 2592000 SET hints_dropped[ ? ] = ? WHERE peer = ?";
executeInternal(String.format(req, PEER_EVENTS_TABLE), timePeriod, value, ep);
/**
 * Stores the schema version in the local row of system.local.
 *
 * @param version value written to the schema_version column
 */
public static synchronized void updateSchemaVersion(UUID version)
String req = "INSERT INTO system.%s (key, schema_version) VALUES ('%s', ?)";
executeInternal(String.format(req, LOCAL_TABLE, LOCAL_KEY), version);
// Produces the Set<String> that callers bind to the "tokens" column.
// NOTE(review): the loop body is not visible here; presumably each token is converted with the factory — confirm.
private static Set<String> tokensAsSet(Collection<Token> tokens)
Token.TokenFactory factory = StorageService.getPartitioner().getTokenFactory();
// sized from the number of input tokens
Set<String> s = new HashSet<>(tokens.size());
for (Token tk : tokens)
return s;
// Produces Token objects from the strings stored in a "tokens" column.
// NOTE(review): the loop body is not visible here; presumably each string is parsed with the factory — confirm.
private static Collection<Token> deserializeTokens(Collection<String> tokensStrings)
Token.TokenFactory factory = StorageService.getPartitioner().getTokenFactory();
// sized from the number of input strings
List<Token> tokens = new ArrayList<>(tokensStrings.size());
for (String tk : tokensStrings)
return tokens;
* Remove stored tokens being used by another node
public static synchronized void removeEndpoint(InetAddress ep)
// deletes the peer's entire row from system.peers, not just its tokens column
String req = "DELETE FROM system.%s WHERE peer = ?";
executeInternal(String.format(req, PEERS_TABLE), ep);
* This method is used to update the System Keyspace with the new tokens for this node
public static synchronized void updateTokens(Collection<Token> tokens)
// an empty collection is rejected when assertions are enabled
assert !tokens.isEmpty() : "removeEndpoint should be used instead";
// writes the tokens column of the local row; tokens are stored in string form (see tokensAsSet)
String req = "INSERT INTO system.%s (key, tokens) VALUES ('%s', ?)";
executeInternal(String.format(req, LOCAL_TABLE, LOCAL_KEY), tokensAsSet(tokens));
* Convenience method to update the list of tokens in the local system keyspace.
* @param addTokens tokens to add
* @param rmTokens tokens to remove
* @return the collection of persisted tokens
public static synchronized Collection<Token> updateLocalTokens(Collection<Token> addTokens, Collection<Token> rmTokens)
// start from the tokens currently saved for the local node
// NOTE(review): the statements that apply addTokens/rmTokens are not visible here — confirm
Collection<Token> tokens = getSavedTokens();
return tokens;
// NOTE(review): the guarded statement is not visible here; presumably flushes the named system table — confirm.
public static void forceBlockingFlush(String cfname)
// the guarded statement is skipped when the system property cassandra.unsafesystem is "true"
if (!Boolean.getBoolean("cassandra.unsafesystem"))
* Return a map of IP addresses to the tokens stored for them
public static SetMultimap<InetAddress, Token> loadTokens()
SetMultimap<InetAddress, Token> tokenMap = HashMultimap.create();
// one row per peer in system.peers
for (UntypedResultSet.Row row : executeInternal("SELECT peer, tokens FROM system." + PEERS_TABLE))
InetAddress peer = row.getInetAddress("peer");
// peers without a tokens value contribute no entries
if (row.has("tokens"))
tokenMap.putAll(peer, deserializeTokens(row.getSet("tokens", UTF8Type.instance)));
return tokenMap;
* Return a map of store host_ids to IP addresses
public static Map<InetAddress, UUID> loadHostIds()
Map<InetAddress, UUID> hostIdMap = new HashMap<>();
for (UntypedResultSet.Row row : executeInternal("SELECT peer, host_id FROM system." + PEERS_TABLE))
InetAddress peer = row.getInetAddress("peer");
if (row.has("host_id"))
hostIdMap.put(peer, row.getUUID("host_id"));
return hostIdMap;
* Get preferred IP for given endpoint if it is known. Otherwise this returns given endpoint itself.
* @param ep endpoint address to check
* @return Preferred IP for given endpoint if present, otherwise returns given ep
public static InetAddress getPreferredIP(InetAddress ep)
String req = "SELECT preferred_ip FROM system.%s WHERE peer=?";
UntypedResultSet result = executeInternal(String.format(req, PEERS_TABLE), ep);
if (!result.isEmpty() && result.one().has("preferred_ip"))
return result.one().getInetAddress("preferred_ip");
return ep;
* Return a map of IP addresses containing a map of dc and rack info
public static Map<InetAddress, Map<String,String>> loadDcRackInfo()
Map<InetAddress, Map<String, String>> result = new HashMap<>();
for (UntypedResultSet.Row row : executeInternal("SELECT peer, data_center, rack from system." + PEERS_TABLE))
InetAddress peer = row.getInetAddress("peer");
if (row.has("data_center") && row.has("rack"))
Map<String, String> dcRack = new HashMap<>();
dcRack.put("data_center", row.getString("data_center"));
dcRack.put("rack", row.getString("rack"));
result.put(peer, dcRack);
return result;
* One of three things will happen if you try to read the system keyspace:
* 1. files are present and you can read them: great
* 2. no files are there: great (new node is assumed)
* 3. files are present but you can't read them: bad
* @throws ConfigurationException
public static void checkHealth() throws ConfigurationException
Keyspace keyspace;
keyspace = Keyspace.open(NAME);
catch (AssertionError err)
// this happens when a user switches from OPP to RP.
ConfigurationException ex = new ConfigurationException("Could not read system keyspace!");
throw ex;
ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(LOCAL_TABLE);
String req = "SELECT cluster_name FROM system.%s WHERE key='%s'";
UntypedResultSet result = executeInternal(String.format(req, LOCAL_TABLE, LOCAL_KEY));
if (result.isEmpty() || !result.one().has("cluster_name"))
// this is a brand new node
if (!cfs.getSSTables().isEmpty())
throw new ConfigurationException("Found system keyspace files, but they couldn't be loaded!");
// no system files. this is a new node.
req = "INSERT INTO system.%s (key, cluster_name) VALUES ('%s', ?)";
executeInternal(String.format(req, LOCAL_TABLE, LOCAL_KEY), DatabaseDescriptor.getClusterName());
String savedClusterName = result.one().getString("cluster_name");
if (!DatabaseDescriptor.getClusterName().equals(savedClusterName))
throw new ConfigurationException("Saved cluster name " + savedClusterName + " != configured name " + DatabaseDescriptor.getClusterName());
/**
 * Reads the tokens saved for the local node in system.local.
 *
 * @return the deserialized tokens, or an empty list when the local row or its tokens column is absent
 */
public static Collection<Token> getSavedTokens()
String req = "SELECT tokens FROM system.%s WHERE key='%s'";
UntypedResultSet result = executeInternal(String.format(req, LOCAL_TABLE, LOCAL_KEY));
// no local row, or no tokens value: nothing has been saved
return result.isEmpty() || !result.one().has("tokens")
? Collections.<Token>emptyList()
: deserializeTokens(result.one().getSet("tokens", UTF8Type.instance));
/**
 * Computes the gossip generation of the local node, stores it in system.local and returns it.
 *
 * @return the generation written to the gossip_generation column
 */
public static int incrementAndGetGeneration()
String req = "SELECT gossip_generation FROM system.%s WHERE key='%s'";
UntypedResultSet result = executeInternal(String.format(req, LOCAL_TABLE, LOCAL_KEY));
int generation;
// no generation stored yet
if (result.isEmpty() || !result.one().has("gossip_generation"))
// seconds-since-epoch isn't a foolproof new generation
// (where foolproof is "guaranteed to be larger than the last one seen at this ip address"),
// but it's as close as sanely possible
generation = (int) (System.currentTimeMillis() / 1000);
// Other nodes will ignore gossip messages about a node that have a lower generation than previously seen.
final int storedGeneration = result.one().getInt("gossip_generation") + 1;
final int now = (int) (System.currentTimeMillis() / 1000);
// the stored value plus one is compared with the current time in seconds
if (storedGeneration >= now)
logger.warn("Using stored Gossip Generation {} as it is greater than current system time {}. See CASSANDRA-3654 if you experience problems",
storedGeneration, now);
generation = storedGeneration;
generation = now;
// persist the chosen generation
req = "INSERT INTO system.%s (key, gossip_generation) VALUES ('%s', ?)";
executeInternal(String.format(req, LOCAL_TABLE, LOCAL_KEY), generation);
return generation;
public static BootstrapState getBootstrapState()
String req = "SELECT bootstrapped FROM system.%s WHERE key='%s'";
UntypedResultSet result = executeInternal(String.format(req, LOCAL_TABLE, LOCAL_KEY));
if (result.isEmpty() || !result.one().has("bootstrapped"))
return BootstrapState.NEEDS_BOOTSTRAP;
return BootstrapState.valueOf(result.one().getString("bootstrapped"));
public static boolean bootstrapComplete()
return getBootstrapState() == BootstrapState.COMPLETED;
public static boolean bootstrapInProgress()
return getBootstrapState() == BootstrapState.IN_PROGRESS;
/**
 * Stores the bootstrap state of the local node in system.local.
 *
 * @param state state whose enum name is written to the bootstrapped column
 */
public static void setBootstrapState(BootstrapState state)
String req = "INSERT INTO system.%s (key, bootstrapped) VALUES ('%s', ?)";
// stored by name, which getBootstrapState() turns back into the enum with valueOf
executeInternal(String.format(req, LOCAL_TABLE, LOCAL_KEY), state.name());
/**
 * Checks whether the IndexInfo table holds a live cell named after the index in the partition keyed
 * by the keyspace name.
 *
 * @param keyspaceName keyspace name, used as the partition key
 * @param indexName    index name, used as the cell name
 * @return {@code true} when a column family remains after deleted data is removed
 */
public static boolean isIndexBuilt(String keyspaceName, String indexName)
ColumnFamilyStore cfs = Keyspace.open(NAME).getColumnFamilyStore(BUILT_INDEXES_TABLE);
// names filter selecting the single cell named after the index
QueryFilter filter = QueryFilter.getNamesFilter(decorate(ByteBufferUtil.bytes(keyspaceName)),
FBUtilities.singleton(cfs.getComparator().makeCellName(indexName), cfs.getComparator()),
// removeDeleted is called with Integer.MAX_VALUE as its second argument
return ColumnFamilyStore.removeDeleted(cfs.getColumnFamily(filter), Integer.MAX_VALUE) != null;
/**
 * Marks an index as built by writing an empty-valued cell named after it into the IndexInfo table.
 *
 * @param keyspaceName keyspace name, used as the partition key
 * @param indexName    index name, used as the cell name
 */
public static void setIndexBuilt(String keyspaceName, String indexName)
ColumnFamily cf = ArrayBackedSortedColumns.factory.create(NAME, BUILT_INDEXES_TABLE);
// the cell carries no value; it is timestamped with the current time in microseconds
cf.addColumn(new BufferCell(cf.getComparator().makeCellName(indexName), ByteBufferUtil.EMPTY_BYTE_BUFFER, FBUtilities.timestampMicros()));
new Mutation(NAME, ByteBufferUtil.bytes(keyspaceName), cf).apply();
/**
 * Builds the deletion of the IndexInfo cell named after the index.
 *
 * @param keyspaceName keyspace name, used as the partition key
 * @param indexName    index name, used as the cell name
 */
public static void setIndexRemoved(String keyspaceName, String indexName)
Mutation mutation = new Mutation(NAME, ByteBufferUtil.bytes(keyspaceName));
// NOTE(review): the statement applying the mutation is not visible here — confirm
mutation.delete(BUILT_INDEXES_TABLE, BuiltIndexesTable.comparator.makeCellName(indexName), FBUtilities.timestampMicros());
* Read the host ID from the system keyspace, creating (and storing) one if
* none exists.
public static UUID getLocalHostId()
String req = "SELECT host_id FROM system.%s WHERE key='%s'";
UntypedResultSet result = executeInternal(String.format(req, LOCAL_TABLE, LOCAL_KEY));
// Look up the Host UUID (return it if found)
if (!result.isEmpty() && result.one().has("host_id"))
return result.one().getUUID("host_id");
// ID not found, generate a new one, persist, and then return it.
UUID hostId = UUID.randomUUID();
logger.warn("No host ID found, created {} (Note: This should happen exactly once per node).", hostId);
// setLocalHostId writes the id and returns it unchanged
return setLocalHostId(hostId);
* Sets the local host ID explicitly. Should only be called outside of SystemKeyspace when replacing a node.
public static UUID setLocalHostId(UUID hostId)
// writes the host_id column of the local row
String req = "INSERT INTO system.%s (key, host_id) VALUES ('%s', ?)";
executeInternal(String.format(req, LOCAL_TABLE, LOCAL_KEY), hostId);
// the argument is returned unchanged, which lets getLocalHostId() return this call's result directly
return hostId;
* @param cfName The name of the ColumnFamily responsible for part of the schema (keyspace, ColumnFamily, columns)
* @return CFS responsible to hold low-level serialized schema
public static ColumnFamilyStore schemaCFS(String cfName)
return Keyspace.open(NAME).getColumnFamilyStore(cfName);
// Collects schema rows across the tables in ALL_SCHEMA_TABLES.
// NOTE(review): the loop body is not visible here; presumably it appends serializedSchema(cf) — confirm.
public static List<Row> serializedSchema()
List<Row> schema = new ArrayList<>();
for (String cf : ALL_SCHEMA_TABLES)
return schema;
* @param schemaCfName The name of the ColumnFamily responsible for part of the schema (keyspace, ColumnFamily, columns)
* @return low-level schema representation (each row represents individual Keyspace or ColumnFamily)
public static List<Row> serializedSchema(String schemaCfName)
// the range runs from the minimum token's min key bound to its max key bound
Token minToken = StorageService.getPartitioner().getMinimumToken();
// NOTE(review): the remaining getRangeSlice arguments are not visible here — confirm
return schemaCFS(schemaCfName).getRangeSlice(new Range<RowPosition>(minToken.minKeyBound(), minToken.maxKeyBound()),
new IdentityQueryFilter(),
/**
 * Collects the schema mutations of every table in ALL_SCHEMA_TABLES.
 *
 * @return the mutations gathered across all schema tables, one map value per partition key
 */
public static Collection<Mutation> serializeSchema()
// keyed by partition key so rows from different schema tables with the same key share one map entry
Map<DecoratedKey, Mutation> mutationMap = new HashMap<>();
for (String cf : ALL_SCHEMA_TABLES)
serializeSchema(mutationMap, cf);
return mutationMap.values();
// Walks the rows of one schema table and makes sure mutationMap holds a Mutation for each row's key.
// NOTE(review): the statement adding the row's data to the mutation is not visible here — confirm.
private static void serializeSchema(Map<DecoratedKey, Mutation> mutationMap, String schemaCfName)
for (Row schemaRow : serializedSchema(schemaCfName))
// rows that Schema.ignoredSchemaRow flags are treated specially (the guarded statement is not visible here)
if (Schema.ignoredSchemaRow(schemaRow))
// look up the mutation already registered for this partition key, creating one on first sight
Mutation mutation = mutationMap.get(schemaRow.key);
if (mutation == null)
mutation = new Mutation(NAME, schemaRow.key.getKey());
mutationMap.put(schemaRow.key, mutation);
public static Map<DecoratedKey, ColumnFamily> getSchema(String cfName)
Map<DecoratedKey, ColumnFamily> schema = new HashMap<>();
for (Row schemaEntity : SystemKeyspace.serializedSchema(cfName))
schema.put(schemaEntity.key, schemaEntity.cf);
return schema;
public static Map<DecoratedKey, ColumnFamily> getSchema(String schemaCfName, Set<String> keyspaces)
Map<DecoratedKey, ColumnFamily> schema = new HashMap<>();
for (String keyspace : keyspaces)
Row schemaEntity = readSchemaRow(schemaCfName, keyspace);
if (schemaEntity.cf != null)
schema.put(schemaEntity.key, schemaEntity.cf);
return schema;
public static ByteBuffer getSchemaKSKey(String ksName)
return AsciiType.instance.fromString(ksName);
* Fetches a subset of schema (table data, columns metadata or triggers) for the keyspace.
* @param schemaCfName the schema table to get the data from (schema_keyspaces, schema_columnfamilies, schema_columns or schema_triggers)
* @param ksName the keyspace of the tables we are interested in
* @return a Row containing the schema data of a particular type for the keyspace
public static Row readSchemaRow(String schemaCfName, String ksName)
// the keyspace name, serialized by getSchemaKSKey, is the partition key
DecoratedKey key = StorageService.getPartitioner().decorateKey(getSchemaKSKey(ksName));
ColumnFamilyStore schemaCFS = SystemKeyspace.schemaCFS(schemaCfName);
// identity filter over the whole partition, evaluated at the current wall-clock time
ColumnFamily result = schemaCFS.getColumnFamily(QueryFilter.getIdentityFilter(key, schemaCfName, System.currentTimeMillis()));
// getSchema(String, Set) checks the returned row's cf for null
return new Row(key, result);
* Fetches a subset of schema (table data, columns metadata or triggers) for the keyspace+table pair.
* @param schemaCfName the schema table to get the data from (schema_columnfamilies, schema_columns or schema_triggers)
* @param ksName the keyspace of the table we are interested in
* @param cfName the table we are interested in
* @return a Row containing the schema data of a particular type for the table
public static Row readSchemaRow(String schemaCfName, String ksName, String cfName)
// the keyspace name, serialized by getSchemaKSKey, is the partition key
DecoratedKey key = StorageService.getPartitioner().decorateKey(getSchemaKSKey(ksName));
ColumnFamilyStore schemaCFS = SystemKeyspace.schemaCFS(schemaCfName);
// composite built from the table name
// NOTE(review): presumably bounds the read to that table's cells; the slice arguments are not visible here — confirm
Composite prefix = schemaCFS.getComparator().make(cfName);
ColumnFamily cf = schemaCFS.getColumnFamily(key,
return new Row(key, cf);
/**
 * Loads the Paxos state stored in system.paxos for one partition of one table.
 *
 * @param key      partition key (row_key column)
 * @param metadata table metadata; its cfId selects the row
 * @return the stored state, with empty commits standing in for any part that is not set, or a fresh
 *         PaxosState when no row exists
 */
public static PaxosState loadPaxosState(ByteBuffer key, CFMetaData metadata)
String req = "SELECT * FROM system.%s WHERE row_key = ? AND cf_id = ?";
UntypedResultSet results = executeInternal(String.format(req, PAXOS_TABLE), key, metadata.cfId);
// nothing stored for this key/table pair
if (results.isEmpty())
return new PaxosState(key, metadata);
UntypedResultSet.Row row = results.one();
// a promise carries only a ballot, so it is paired with an empty column family
Commit promised = row.has("in_progress_ballot")
? new Commit(key, row.getUUID("in_progress_ballot"), ArrayBackedSortedColumns.factory.create(metadata))
: Commit.emptyCommit(key, metadata);
// either we have both a recently accepted ballot and update or we have neither
Commit accepted = row.has("proposal")
? new Commit(key, row.getUUID("proposal_ballot"), ColumnFamily.fromBytes(row.getBytes("proposal")))
: Commit.emptyCommit(key, metadata);
// either most_recent_commit and most_recent_commit_at will both be set, or neither
Commit mostRecent = row.has("most_recent_commit")
? new Commit(key, row.getUUID("most_recent_commit_at"), ColumnFamily.fromBytes(row.getBytes("most_recent_commit")))
: Commit.emptyCommit(key, metadata);
return new PaxosState(promised, accepted, mostRecent);
// Writes in_progress_ballot for the row identified by (row_key, cf_id), with an explicit write timestamp and TTL.
// NOTE(review): the bound arguments are not visible here — confirm.
public static void savePaxosPromise(Commit promise)
String req = "UPDATE system.%s USING TIMESTAMP ? AND TTL ? SET in_progress_ballot = ? WHERE row_key = ? AND cf_id = ?";
executeInternal(String.format(req, PAXOS_TABLE),
// Writes proposal_ballot and proposal for the row identified by (row_key, cf_id), with an explicit write
// timestamp and TTL.
// NOTE(review): the bound arguments are not visible here — confirm.
public static void savePaxosProposal(Commit proposal)
executeInternal(String.format("UPDATE system.%s USING TIMESTAMP ? AND TTL ? SET proposal_ballot = ?, proposal = ? WHERE row_key = ? AND cf_id = ?", PAXOS_TABLE),
private static int paxosTtl(CFMetaData metadata)
// keep paxos state around for at least 3h
return Math.max(3 * 3600, metadata.getGcGraceSeconds());
// Stores most_recent_commit_at / most_recent_commit and nulls out the proposal columns of the row
// identified by (row_key, cf_id). NOTE(review): the bound arguments are not visible here — confirm.
public static void savePaxosCommit(Commit commit)
// We always erase the last proposal (with the commit timestamp so as not to erase a more recent proposal in case the commit is old)
// even though that's really just an optimization since SP.beginAndRepairPaxos will exclude accepted proposal older than the mrc.
String cql = "UPDATE system.%s USING TIMESTAMP ? AND TTL ? SET proposal_ballot = null, proposal = null, most_recent_commit_at = ?, most_recent_commit = ? WHERE row_key = ? AND cf_id = ?";
executeInternal(String.format(cql, PAXOS_TABLE),
* Returns a RestorableMeter tracking the average read rate of a particular SSTable, restoring the last-seen rate
* from values in system.sstable_activity if present.
* @param keyspace the keyspace the sstable belongs to
* @param table the table the sstable belongs to
* @param generation the generation number for the sstable
public static RestorableMeter getSSTableReadMeter(String keyspace, String table, int generation)
String cql = "SELECT * FROM system.%s WHERE keyspace_name=? and columnfamily_name=? and generation=?";
UntypedResultSet results = executeInternal(String.format(cql, SSTABLE_ACTIVITY_TABLE), keyspace, table, generation);
if (results.isEmpty())
return new RestorableMeter();
UntypedResultSet.Row row = results.one();
double m15rate = row.getDouble("rate_15m");
double m120rate = row.getDouble("rate_120m");
return new RestorableMeter(m15rate, m120rate);
* Writes the current read rates for a given SSTable to system.sstable_activity
public static void persistSSTableReadMeter(String keyspace, String table, int generation, RestorableMeter meter)
// Store values with a ten-day TTL (864000 seconds) to handle corner cases where cleanup might not occur
String cql = "INSERT INTO system.%s (keyspace_name, columnfamily_name, generation, rate_15m, rate_120m) VALUES (?, ?, ?, ?, ?) USING TTL 864000";
// NOTE(review): the bound arguments are not visible here — confirm
executeInternal(String.format(cql, SSTABLE_ACTIVITY_TABLE),
* Clears persisted read rates from system.sstable_activity for SSTables that have been deleted.
public static void clearSSTableReadMeter(String keyspace, String table, int generation)
// deletes the row identified by the full (keyspace, table, generation) partition key
String cql = "DELETE FROM system.%s WHERE keyspace_name=? AND columnfamily_name=? and generation=?";
executeInternal(String.format(cql, SSTABLE_ACTIVITY_TABLE), keyspace, table, generation);