/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.accumulo.server.master;
import static java.lang.Math.min;
import static org.apache.accumulo.core.util.TabletOperations.createNewTableTabletDirectories;
import java.io.IOException;
import java.math.BigInteger;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.TimerTask;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.accumulo.core.Constants;
import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.Instance;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.client.impl.HdfsZooInstance;
import org.apache.accumulo.core.client.impl.Tables;
import org.apache.accumulo.core.client.impl.thrift.TableOperation;
import org.apache.accumulo.core.client.impl.thrift.TableOperationExceptionType;
import org.apache.accumulo.core.client.impl.thrift.ThriftTableOperationException;
import org.apache.accumulo.core.conf.AccumuloConfiguration;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.KeyExtent;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.data.thrift.TKeyExtent;
import org.apache.accumulo.core.master.thrift.LoggerStatus;
import org.apache.accumulo.core.master.thrift.MasterClientService;
import org.apache.accumulo.core.master.thrift.MasterClientService.Processor;
import org.apache.accumulo.core.master.thrift.MasterGoalState;
import org.apache.accumulo.core.master.thrift.MasterMonitorInfo;
import org.apache.accumulo.core.master.thrift.MasterState;
import org.apache.accumulo.core.master.thrift.TableInfo;
import org.apache.accumulo.core.master.thrift.TabletLoadState;
import org.apache.accumulo.core.master.thrift.TabletServerStatus;
import org.apache.accumulo.core.master.thrift.TabletSplit;
import org.apache.accumulo.core.master.thrift.TimeType;
import org.apache.accumulo.core.security.SystemPermission;
import org.apache.accumulo.core.security.TablePermission;
import org.apache.accumulo.core.security.thrift.AuthInfo;
import org.apache.accumulo.core.security.thrift.SecurityErrorCode;
import org.apache.accumulo.core.security.thrift.ThriftSecurityException;
import org.apache.accumulo.core.util.AddressUtil;
import org.apache.accumulo.core.util.ByteArraySet;
import org.apache.accumulo.core.util.CachedConfiguration;
import org.apache.accumulo.core.util.ColumnFQ;
import org.apache.accumulo.core.util.Daemon;
import org.apache.accumulo.core.util.LoggingRunnable;
import org.apache.accumulo.core.util.UtilWaitThread;
import org.apache.accumulo.core.zookeeper.ZooLock;
import org.apache.accumulo.core.zookeeper.ZooLock.LockLossReason;
import org.apache.accumulo.core.zookeeper.ZooLock.LockWatcher;
import org.apache.accumulo.core.zookeeper.ZooSession;
import org.apache.accumulo.core.zookeeper.ZooUtil;
import org.apache.accumulo.core.zookeeper.ZooUtil.NodeExistsPolicy;
import org.apache.accumulo.core.zookeeper.ZooUtil.NodeMissingPolicy;
import org.apache.accumulo.server.Accumulo;
import org.apache.accumulo.server.client.ClientServiceHandler;
import org.apache.accumulo.server.master.LiveTServerSet.Listener;
import org.apache.accumulo.server.master.LiveTServerSet.TServerConnection;
import org.apache.accumulo.server.master.TabletServerLoggers.NewLoggerWatcher;
import org.apache.accumulo.server.master.balancer.DefaultLoadBalancer;
import org.apache.accumulo.server.master.balancer.LoggerBalancer;
import org.apache.accumulo.server.master.balancer.LoggerUser;
import org.apache.accumulo.server.master.balancer.SimpleLoggerBalancer;
import org.apache.accumulo.server.master.balancer.TServerUsesLoggers;
import org.apache.accumulo.server.master.balancer.TabletBalancer;
import org.apache.accumulo.server.master.state.Assignment;
import org.apache.accumulo.server.master.state.CurrentState;
import org.apache.accumulo.server.master.state.DistributedStoreException;
import org.apache.accumulo.server.master.state.MetaDataStateStore;
import org.apache.accumulo.server.master.state.MetaDataTableScanner;
import org.apache.accumulo.server.master.state.RootTabletStateStore;
import org.apache.accumulo.server.master.state.TServerInstance;
import org.apache.accumulo.server.master.state.TableCounts;
import org.apache.accumulo.server.master.state.TableStats;
import org.apache.accumulo.server.master.state.TabletLocationState;
import org.apache.accumulo.server.master.state.TabletMigration;
import org.apache.accumulo.server.master.state.TabletServerState;
import org.apache.accumulo.server.master.state.TabletState;
import org.apache.accumulo.server.master.state.TabletStateStore;
import org.apache.accumulo.server.master.state.ZooStore;
import org.apache.accumulo.server.master.state.ZooTabletStateStore;
import org.apache.accumulo.server.master.state.tables.TableManager;
import org.apache.accumulo.server.master.state.tables.TableObserver;
import org.apache.accumulo.server.master.state.tables.TableState;
import org.apache.accumulo.server.monitor.Monitor;
import org.apache.accumulo.server.problems.ProblemReports;
import org.apache.accumulo.server.security.Authenticator;
import org.apache.accumulo.server.security.SecurityConstants;
import org.apache.accumulo.server.security.ZKAuthenticator;
import org.apache.accumulo.server.tabletserver.TabletTime;
import org.apache.accumulo.server.tabletserver.log.RemoteLogger;
import org.apache.accumulo.server.util.DefaultMap;
import org.apache.accumulo.server.util.Halt;
import org.apache.accumulo.server.util.MetadataTable;
import org.apache.accumulo.server.util.OfflineMetadataScanner;
import org.apache.accumulo.server.util.SystemPropUtil;
import org.apache.accumulo.server.util.TServerUtils;
import org.apache.accumulo.server.util.TablePropUtil;
import org.apache.accumulo.server.util.time.SimpleTimer;
import org.apache.accumulo.start.classloader.AccumuloClassLoader;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;
import org.apache.thrift.TException;
import org.apache.thrift.server.TServer;
import org.apache.thrift.transport.TTransportException;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.data.Stat;
import cloudtrace.instrument.thrift.TraceWrap;
import cloudtrace.thrift.TInfo;
/**
* The Master is responsible for assigning and balancing tablets and loggers to tablet servers.
*
* The master will also coordinate log recoveries and reports general status.
*/
public class Master implements Listener, NewLoggerWatcher, TableObserver, CurrentState {
// ---- static configuration ----
final private static Logger log = Logger.getLogger(Master.class);
// Base unit for the timing constants below; presumably milliseconds (it is handed to the event wait in waitAround()) — TODO confirm.
final private static int ONE_SECOND = 1000;
// The metadata table id wrapped as a Text.
final private static Text METADATA_TABLE_ID = new Text(Constants.METADATA_TABLE_ID);
final private static long TIME_TO_WAIT_BETWEEN_SCANS = 5000;
final private static int TIME_TO_WAIT_FOR_TSERVERS_TO_STABILIZE = 2 * ONE_SECOND;
final private static int TIME_BETWEEN_MIGRATION_CLEANUPS = 5 * 60 * ONE_SECOND;
final private static int TIME_BETWEEN_DELETE_CHECKS = 10 * ONE_SECOND;
final private static int WAIT_BETWEEN_ERRORS = ONE_SECOND;
final private static int DEFAULT_WAIT_FOR_WATCHER = 10 * ONE_SECOND;
final private static int MAX_TSERVER_WORK_CHUNK = 5000;
final private static int MAX_BAD_STATUS_COUNT = 3;
// ---- collaborators assigned once in the constructor ----
final private Instance instance;
final private String hostname;
final private FileSystem fs;
final private LiveTServerSet tserverSet;
// Each watcher exposes per-table TableCounts that the counting helpers below add up.
final private List<TabletGroupWatcher> watchers = new ArrayList<TabletGroupWatcher>();
final private Authenticator authenticator;
// Every key in this map is reported as UNRESPONSIVE by getMasterStats().
// NOTE(review): the DefaultMap is seeded with a single AtomicInteger instance; confirm whether DefaultMap copies the default per key or hands the
// same counter to every key.
final private Map<TServerInstance,AtomicInteger> badServers = Collections.synchronizedMap(new DefaultMap<TServerInstance,AtomicInteger>(new AtomicInteger()));
// Servers an administrator asked to shut down; filled by shutdownTabletServer() and drained by ShutdownTabletServers.
final private Set<TServerInstance> serversToShutdown = Collections.synchronizedSet(new HashSet<TServerInstance>());
// Pending tablet migrations keyed by extent; an entry is dropped when the tablet is reported as split.
final private SortedMap<KeyExtent,TServerInstance> migrations = Collections.synchronizedSortedMap(new TreeMap<KeyExtent,TServerInstance>());
final private TabletBalancer tabletBalancer;
// Used to wake waiting threads whenever something noteworthy happens (state change, table created, tablet loaded, ...).
final private EventCoordinator nextEvent = new EventCoordinator();
// ---- mutable state, all initially null/INITIAL ----
// Passed along on calls to tablet servers (halt, flush) and to MetadataTable.addTablet.
private ZooLock masterLock = null;
// Stopped by setMasterState() when the state becomes STOP.
private TServer clientService = null;
private TabletServerLoggers loggers = null;
private CoordinateRecoveryTask recovery = null;
// Read and written only through the synchronized getMasterState()/setMasterState() in the code visible here.
private MasterState state = MasterState.INITIAL;
// Latest status per tablet server; starts as an empty unmodifiable map and is read by getMasterStats().
volatile private SortedMap<TServerInstance,TabletServerStatus> tserverStatus = Collections
.unmodifiableSortedMap(new TreeMap<TServerInstance,TabletServerStatus>());
private LoggerBalancer loggerBalancer;
/**
 * Reads the current master state while holding this object's monitor, the same monitor {@code setMasterState} holds when it writes the field.
 *
 * @return the state the master is in right now
 */
private synchronized MasterState getMasterState() {
  final MasterState current = state;
  return current;
}
/**
 * Tells long-running loops whether they should keep going.
 *
 * @return {@code false} once the master state is STOP, {@code true} otherwise
 */
private boolean stillMaster() {
  final MasterState current = getMasterState();
  return !(current == MasterState.STOP);
}
// Shorthand so the transition table below reads as a grid: X = allowed, _ = not allowed.
// NOTE(review): a bare underscore is a reserved keyword from Java 9 onward (Java 8 only warns), so this identifier must be renamed before the file
// can be compiled at a newer source level. It is package-visible, so check for other users before renaming.
static final boolean X = true;
static final boolean _ = false;
// transitionOK[from.ordinal()][to.ordinal()] says whether setMasterState() considers the move from -> to legitimate. Rows are the current state,
// columns the requested state, both in MasterState ordinal order. setMasterState() only logs an error for a disallowed move; it does not block it.
static final boolean transitionOK[][] = {
// INITIAL HAVE_LOCK WAIT SAFE_MODE NORMAL UNLOAD_META UNLOAD_ROOT STOP
/* INITIAL */{X, X, _, _, _, _, _, X},
/* HAVE_LOCK */{_, X, X, _, _, _, _, X},
/* WAIT_FOR_TSERVERS */{_, _, X, X, _, _, _, X},
/* SAFE_MODE */{_, _, _, X, X, X, _, X},
/* NORMAL */{_, _, _, X, X, X, _, X},
/* UNLOAD_METADATA_TABLETS */{_, _, _, X, X, X, X, X},
/* UNLOAD_ROOT_TABLET */{_, _, _, _, _, _, X, X},
/* STOP */{_, _, _, _, _, _, _, X},};
/**
 * Switches the master to {@code newState} and wakes anything waiting on {@code nextEvent}.
 *
 * A transition that {@code transitionOK} does not allow is logged as a programmer error but is still applied. Entering STOP stops the client
 * service (if one has been started); entering SAFE_MODE from a different state runs {@code upgradeSettings()}.
 *
 * @param newState
 *          the state to move to
 */
synchronized private void setMasterState(MasterState newState) {
  if (!transitionOK[state.ordinal()][newState.ordinal()]) {
    log.error("Programmer error: master should not transition from " + state + " to " + newState);
  }
  MasterState oldState = state;
  state = newState;
  nextEvent.somethingInterestingHappened("State changed from %s to %s", oldState, newState);
  if (newState == MasterState.STOP) {
    // clientService starts out null and the transition table allows INITIAL -> STOP and HAVE_LOCK -> STOP, so a stop request can arrive before the
    // service exists. Guard the call instead of failing with a NullPointerException after the state has already been changed.
    if (clientService != null) {
      // This frees the main thread and will cause the master to exit
      clientService.stop();
    } else {
      log.warn("Master state set to STOP before the client service was started; nothing to stop");
    }
  }
  if (oldState != newState && newState == MasterState.SAFE_MODE) {
    upgradeSettings();
  }
}
/**
 * One-time settings upgrade for the metadata table, run when the master enters SAFE_MODE.
 *
 * Nothing happens when the metadata table already has the block cache enabled. Otherwise the metadata table is scanned offline; any write-ahead
 * log entry means the previous shutdown was not clean and the upgrade is refused. On success the locality groups, scan-time visibility and cache
 * properties are written and every table's state node is set to ONLINE. Any failure is logged as fatal and the process exits with status 1.
 */
private void upgradeSettings() {
  final AccumuloConfiguration metaConf = AccumuloConfiguration.getTableConfiguration(instance.getInstanceID(), Constants.METADATA_TABLE_ID);
  if (metaConf.getBoolean(Property.TABLE_BLOCKCACHE_ENABLED)) {
    // already upgraded
    return;
  }
  try {
    // make sure the last shutdown was clean
    final OfflineMetadataScanner offlineScan = new OfflineMetadataScanner();
    boolean sawLogEntry = false;
    for (Entry<Key,Value> cell : offlineScan) {
      if (!cell.getKey().getColumnFamily().equals(Constants.METADATA_LOG_COLUMN_FAMILY)) {
        continue;
      }
      log.error(String.format("Unable to upgrade: extent %s has log entry %s", cell.getKey().getRow(), cell.getValue()));
      sawLogEntry = true;
    }
    if (sawLogEntry) {
      throw new Exception("Upgrade requires a clean shutdown");
    }

    // perform 1.2 -> 1.3 settings
    final String groupPrefix = Property.TABLE_LOCALITY_GROUP_PREFIX.getKey();
    final String tabletFamilies = String.format("%s,%s", Constants.METADATA_TABLET_COLUMN_FAMILY.toString(),
        Constants.METADATA_CURRENT_LOCATION_COLUMN_FAMILY.toString());
    zset(groupPrefix + "tablet", tabletFamilies);

    final String serverFamilies = String.format("%s,%s,%s,%s", Constants.METADATA_DATAFILE_COLUMN_FAMILY.toString(),
        Constants.METADATA_LOG_COLUMN_FAMILY.toString(), Constants.METADATA_SERVER_COLUMN_FAMILY.toString(),
        Constants.METADATA_FUTURE_LOCATION_COLUMN_FAMILY.toString());
    zset(Property.TABLE_LOCALITY_GROUP_PREFIX.getKey() + "server", serverFamilies);

    zset(Property.TABLE_LOCALITY_GROUPS.getKey(), "tablet,server");
    zset(Property.TABLE_DEFAULT_SCANTIME_VISIBILITY.getKey(), "");
    zset(Property.TABLE_INDEXCACHE_ENABLED.getKey(), "true");
    zset(Property.TABLE_BLOCKCACHE_ENABLED.getKey(), "true");

    // mark every known table ONLINE
    for (String id : Tables.getIdToNameMap(instance).keySet()) {
      final String statePath = ZooUtil.getRoot(instance) + Constants.ZTABLES + "/" + id + "/state";
      ZooUtil.putPersistentData(statePath, "ONLINE".getBytes(), NodeExistsPolicy.OVERWRITE);
    }
  } catch (Exception ex) {
    log.fatal("Error performing upgrade", ex);
    System.exit(1);
  }
}
/**
 * Sets one property on the metadata table.
 *
 * The boolean that {@code TablePropUtil.setTableProperty} returns is not inspected here.
 *
 * @param property
 *          the property key
 * @param value
 *          the value to store
 */
private static void zset(String property, String value) throws KeeperException, InterruptedException {
  TablePropUtil.setTableProperty(Constants.METADATA_TABLE_ID, property, value);
}
/**
 * Adds up, over every watcher, the tablets of one table that are hosted or assigned.
 *
 * @param tableId
 *          the table to count
 * @return hosted plus assigned tablets for that table
 */
private int assignedOrHosted(Text tableId) {
  int total = 0;
  for (TabletGroupWatcher groupWatcher : watchers) {
    final TableCounts tally = groupWatcher.getStats(tableId);
    total += tally.hosted() + tally.assigned();
  }
  return total;
}
/**
 * Adds up, over every watcher and every table, the tablets that are assigned or hosted.
 *
 * @return assigned plus hosted tablets across all tables
 */
private int totalAssignedOrHosted() {
  int total = 0;
  for (TabletGroupWatcher groupWatcher : watchers) {
    for (TableCounts tally : groupWatcher.getStats().values()) {
      total += tally.assigned() + tally.hosted();
    }
  }
  return total;
}
/**
 * Counts assigned-or-hosted tablets that do not belong to the metadata table.
 *
 * @return the overall assigned-or-hosted count minus the metadata table's share
 */
private int nonMetaDataTabletsAssignedOrHosted() {
  final int everything = totalAssignedOrHosted();
  final int metadataOnly = assignedOrHosted(new Text(Constants.METADATA_TABLE_ID));
  return everything - metadataOnly;
}
/**
 * Adds up, over every watcher and every table, the tablets that are assigned, unassigned, or assigned to dead servers.
 *
 * @return the number of tablets in any of those three categories
 */
private int notHosted() {
  int total = 0;
  for (TabletGroupWatcher groupWatcher : watchers) {
    for (TableCounts tally : groupWatcher.getStats().values()) {
      total += tally.assigned() + tally.unassigned() + tally.assignedToDeadServers();
    }
  }
  return total;
}
/**
 * The number of unassigned tablets that should be assigned: displayed on the monitor page.
 *
 * What is counted depends on the master state: in NORMAL, the not-yet-hosted tablets of ONLINE tables; in SAFE_MODE, the unassigned tablets of
 * the metadata table; in UNLOAD_METADATA_TABLETS, see the note in that branch. Every other state yields 0.
 *
 * @return the figure to show as "unassigned tablets"
 */
private int displayUnassigned() {
  if (getMasterState().equals(MasterState.NORMAL)) {
    // Count offline tablets for online tables
    int offline = 0;
    for (TabletGroupWatcher groupWatcher : watchers) {
      TableManager manager = TableManager.getInstance();
      for (Entry<Text,TableCounts> perTable : groupWatcher.getStats().entrySet()) {
        final TableState tableState = manager.getTableState(perTable.getKey().toString());
        if (tableState == null || !tableState.equals(TableState.ONLINE)) {
          continue;
        }
        final TableCounts tally = perTable.getValue();
        offline += tally.unassigned() + tally.assignedToDeadServers() + tally.assigned();
      }
    }
    return offline;
  }

  if (getMasterState().equals(MasterState.SAFE_MODE)) {
    // Count offline tablets for the METADATA table
    final Text meta = new Text(Constants.METADATA_TABLE_ID);
    int offline = 0;
    for (TabletGroupWatcher groupWatcher : watchers) {
      offline += groupWatcher.getStats(meta).unassigned();
    }
    return offline;
  }

  if (getMasterState().equals(MasterState.UNLOAD_METADATA_TABLETS)) {
    final Text meta = new Text(Constants.METADATA_TABLE_ID);
    int offline = 0;
    for (TabletGroupWatcher groupWatcher : watchers) {
      offline += groupWatcher.getStats(meta).unassigned();
    }
    // assumes that any !METADATA table online is the root tablet
    // NOTE(review): as written, any positive count is reported as 0, so this branch never returns a positive number — confirm that is intended.
    if (offline > 0) {
      return 0;
    }
    return offline;
  }

  return 0;
}
/**
 * Rejects an operation that names the metadata table.
 *
 * @param tableName
 *          the table the client asked to operate on
 * @param operation
 *          the operation being attempted, recorded in the exception
 * @throws ThriftTableOperationException
 *           (type OTHER) when {@code tableName} is the metadata table's name
 */
private void checkNotMetadataTable(String tableName, TableOperation operation) throws ThriftTableOperationException {
  final boolean isMetadata = tableName.compareTo(Constants.METADATA_TABLE_NAME) == 0;
  if (!isMetadata) {
    return;
  }
  final String why = "Table names cannot be == " + Constants.METADATA_TABLE_NAME;
  log.warn(why);
  throw new ThriftTableOperationException(null, tableName, operation, TableOperationExceptionType.OTHER, why);
}
/**
 * Rejects a table name that does not match {@code Constants.VALID_TABLE_NAME_REGEX}.
 *
 * @param tableName
 *          the proposed table name
 * @param operation
 *          the operation being attempted, recorded in the exception
 * @throws ThriftTableOperationException
 *           (type OTHER) when the name is not acceptable
 */
private void checkTableName(String tableName, TableOperation operation) throws ThriftTableOperationException {
  final boolean acceptable = tableName.matches(Constants.VALID_TABLE_NAME_REGEX);
  if (acceptable) {
    return;
  }
  final String why = "Table names must only contain word characters (letters, digits, and underscores): " + tableName;
  log.warn(why);
  throw new ThriftTableOperationException(null, tableName, operation, TableOperationExceptionType.OTHER, why);
}
/**
 * Turns a failed permission check into a PERMISSION_DENIED security exception.
 *
 * @param credentials
 *          the caller; its user name is placed in the exception
 * @param match
 *          the outcome of the permission check(s)
 * @throws ThriftSecurityException
 *           when {@code match} is false
 */
private void verify(AuthInfo credentials, boolean match) throws ThriftSecurityException {
  if (match) {
    return;
  }
  final AccumuloSecurityException denied = new AccumuloSecurityException(credentials.user, SecurityErrorCode.PERMISSION_DENIED);
  throw denied.asThriftException();
}
/**
 * Asks the authenticator whether the caller holds a system permission.
 *
 * @param credentials
 *          the caller, used both to authenticate and as the user being checked
 * @param permission
 *          the system permission in question
 * @return whether the permission is held
 * @throws ThriftSecurityException
 *           when the authenticator raises a security exception
 */
private boolean check(AuthInfo credentials, SystemPermission permission) throws ThriftSecurityException {
  final boolean granted;
  try {
    granted = authenticator.hasSystemPermission(credentials, credentials.user, permission);
  } catch (AccumuloSecurityException e) {
    throw e.asThriftException();
  }
  return granted;
}
/**
 * Asks the authenticator whether the caller holds a permission on one table.
 *
 * @param credentials
 *          the caller, used both to authenticate and as the user being checked
 * @param tableId
 *          the table the permission applies to
 * @param permission
 *          the table permission in question
 * @return whether the permission is held
 * @throws ThriftSecurityException
 *           when the authenticator raises a security exception
 */
private boolean check(AuthInfo credentials, String tableId, TablePermission permission) throws ThriftSecurityException {
  final boolean granted;
  try {
    granted = authenticator.hasTablePermission(credentials, credentials.user, tableId, permission);
  } catch (AccumuloSecurityException e) {
    throw e.asThriftException();
  }
  return granted;
}
/**
 * Rejects an operation whose target table name is already in use.
 *
 * @param tableName
 *          the name that must be free
 * @param operation
 *          the operation being attempted, recorded in the exception
 * @throws ThriftTableOperationException
 *           (type EXISTS) when the name maps to a table id
 */
private void checkTableDoesNotExist(String tableName, TableOperation operation) throws ThriftTableOperationException {
  final boolean nameTaken = Tables.getNameToIdMap(instance).containsKey(tableName);
  if (nameTaken) {
    throw new ThriftTableOperationException(null, tableName, operation, TableOperationExceptionType.EXISTS, null);
  }
}
/**
 * Pauses the calling thread on {@code nextEvent}, passing {@code ONE_SECOND} as the wait limit.
 */
private void waitAround() {
  final int limit = ONE_SECOND;
  nextEvent.waitForSomethingInterestingToHappen(limit);
}
// @TODO: maybe move this to Property? We do this in TabletServer, Master, TableLoadBalancer, etc.
/**
 * Instantiates the class named by a system configuration property, falling back to a supplied default.
 *
 * The class name is read from the system configuration, loaded through {@code AccumuloClassLoader} as a subtype of {@code base}, and created with
 * its no-argument constructor. If anything in that sequence throws, the failure is logged (with the offending class name and property) and
 * {@code defaultInstance} is returned instead.
 *
 * @param property
 *          the property whose value is a class name
 * @param base
 *          the type the loaded class must be assignable to
 * @param defaultInstance
 *          what to return when the configured class cannot be loaded or instantiated
 * @return a new instance of the configured class, or {@code defaultInstance}
 */
public static <T> T createInstanceFromPropertyName(Property property, Class<T> base, T defaultInstance) {
  String clazzName = AccumuloConfiguration.getSystemConfiguration().get(property);
  T instance = null;
  try {
    Class<? extends T> clazz = AccumuloClassLoader.loadClass(clazzName, base);
    instance = clazz.newInstance();
    log.info("Loaded class : " + clazzName);
  } catch (Exception e) {
    // Name the class and the property so the operator can tell which setting is broken.
    log.warn("Failed to load class " + clazzName + " (property " + property + ")", e);
  }
  if (instance == null) {
    log.info("Using " + defaultInstance.getClass().getName());
    instance = defaultInstance;
  }
  return instance;
}
/**
 * Builds a master: initialises the server environment, resolves the instance, local host name and file system, and creates the authenticator,
 * live tablet server set and the two balancers.
 *
 * @param args
 *          command-line arguments; handed to {@code Accumulo.getLocalAddress} to determine the host name
 * @throws IOException
 *           declared by the constructor; which call raises it is not visible here
 */
public Master(String[] args) throws IOException {
Accumulo.init("master");
log.info("Version " + Constants.VERSION);
instance = HdfsZooInstance.getInstance();
log.info("Instance " + instance.getInstanceID());
hostname = Accumulo.getLocalAddress(args).getHostName();
fs = FileSystem.get(CachedConfiguration.getInstance());
authenticator = ZKAuthenticator.getInstance();
// Registers this (still partially constructed) master with the live tablet server set.
tserverSet = new LiveTServerSet(instance, this);
// Balancer implementations come from configuration; the built-in ones are used when the configured class cannot be loaded.
this.tabletBalancer = createInstanceFromPropertyName(Property.MASTER_TABLET_BALANCER, TabletBalancer.class, new DefaultLoadBalancer());
this.loggerBalancer = createInstanceFromPropertyName(Property.MASTER_LOGGER_BALANCER, LoggerBalancer.class, new SimpleLoggerBalancer());
Accumulo.enableTracing(hostname, "master");
}
private final class ShutdownTabletServers extends TimerTask {
@Override
public void run() {
// iterate over a copy of serversToShutdown to avoid using a synchronized block
HashSet<TServerInstance> copyServersToShutdown = new HashSet<TServerInstance>();
synchronized (serversToShutdown) {
copyServersToShutdown.addAll(serversToShutdown);
}
for (TServerInstance doomed : copyServersToShutdown) {
final String path = ZooUtil.getRoot(instance) + Constants.ZDOOMEDSERVERS + "/" + doomed.hostPort();
try {
TServerConnection server = tserverSet.getConnection(doomed);
if (server != null) {
TabletServerStatus status = server.getTableMap();
if (status.tableMap != null && status.tableMap.size() > 0) {
continue;
}
server.halt(masterLock);
}
// remove from original (not copy)
serversToShutdown.remove(doomed);
tserverSet.remove(doomed);
try {
ZooUtil.recursiveDelete(path, NodeMissingPolicy.SKIP);
} catch (Exception ex) {
log.error("Unable to remove doomed server marker " + doomed + " from zookeeper");
}
nextEvent.somethingInterestingHappened("Tablet server %s has been shutdown", doomed);
} catch (Exception ex) {
log.error("Error cleaning up after shutdown servers", ex);
}
}
}
}
private class MasterClientServiceHandler extends ClientServiceHandler implements MasterClientService.Iface {
// Monitor held by createTable() while it checks the name is free, allocates the next table id and registers the table.
private Object createLock = new Object();
/**
 * Creates a table, optionally pre-split, and brings it online.
 *
 * Steps, in order: permission and name checks; wait until {@code cycledOnce()} reports true; under {@code createLock}, confirm the name is free,
 * allocate a table id from the ZooKeeper tables node and register the table; apply the supplied properties; create tablet directories and
 * metadata entries (one tablet, or one per split point plus the final tablet); grant every table permission to the creator; signal the event and
 * transition the table to ONLINE.
 *
 * NOTE(review): {@code splitPoints} and {@code aggregators} are dereferenced without a null check — confirm callers never pass null.
 *
 * @param info
 *          trace information; not used in this method
 * @param c
 *          the caller's credentials; must hold the CREATE_TABLE system permission
 * @param tableName
 *          name for the new table
 * @param splitPoints
 *          initial split points; duplicates are removed
 * @param aggregators
 *          property key/value pairs set on the new table
 * @param timeType
 *          time type recorded with each tablet
 * @throws ThriftSecurityException
 *           when the caller lacks permission or granting permissions fails
 * @throws ThriftTableOperationException
 *           when the name is invalid or taken, or any creation step fails
 */
@Override
public void createTable(TInfo info, AuthInfo c, String tableName, List<byte[]> splitPoints, Map<String,String> aggregators, TimeType timeType)
throws ThriftSecurityException, ThriftTableOperationException, TException {
verify(c, check(c, SystemPermission.CREATE_TABLE));
checkNotMetadataTable(tableName, TableOperation.CREATE);
checkTableName(tableName, TableOperation.CREATE);
// de-duplicate the requested split points
final ByteArraySet uniq = new ByteArraySet(splitPoints);
splitPoints = uniq.toList();
int newly_created_tablets = 0;
// Don't create new tables until we have read the entire metadata table
// NOTE(review): the loop also ends when this process is no longer master, after which creation still proceeds — confirm that is acceptable.
while (stillMaster() && !cycledOnce()) {
waitAround();
}
String tableId = null;
synchronized (createLock) {
Tables.clearCache(instance);
checkTableDoesNotExist(tableName, TableOperation.CREATE);
// this whole block is basically: tableId += 1
// The value read becomes this table's id; the incremented value is written back with the version that was read.
// NOTE(review): new String(byte[]) and getBytes() use the platform default charset here.
try {
final String ntp = ZooUtil.getRoot(instance) + Constants.ZTABLES;
final Stat stat = new Stat();
tableId = new String(ZooSession.getSession().getData(ntp, false, stat));
BigInteger nextId = new BigInteger(tableId, Character.MAX_RADIX);
nextId = nextId.add(BigInteger.ONE);
final String nextIdString = nextId.toString(Character.MAX_RADIX);
ZooSession.getSession().setData(ntp, nextIdString.getBytes(), stat.getVersion());
} catch (Exception e1) {
log.error("Failed to assign tableId to " + tableName, e1);
throw new ThriftTableOperationException(tableId, tableName, TableOperation.CREATE, TableOperationExceptionType.OTHER, e1.getMessage());
}
// write tableName & tableId to zookeeper
try {
TableManager.getInstance().addTable(tableId, tableName);
Tables.clearCache(instance);
} catch (Exception e2) {
log.error("Failed to create table " + tableName, e2);
throw new ThriftTableOperationException(tableId, tableName, TableOperation.CREATE, TableOperationExceptionType.OTHER, e2.getMessage());
}
}
// Apply the supplied properties using the system credentials rather than the caller's.
for (Entry<String,String> entry : aggregators.entrySet()) {
setTableProperty(null, SecurityConstants.systemCredentials, tableName, entry.getKey(), entry.getValue());
}
log.info(String.format("Creating table %s with tableId %s and %d splitPoints", tableName, tableId, splitPoints.size()));
try {
if (splitPoints.isEmpty()) {
// no splits: a single tablet with no end row and no previous end row
final Text tablet = null;
final String dir = Constants.getTablesDir() + "/" + tableId;
createNewTableTabletDirectories(fs, dir, Collections.singletonList(tablet));
KeyExtent extent = new KeyExtent(new Text(tableId), null, null);
MetadataTable.addTablet(extent, Constants.DEFAULT_TABLET_LOCATION, SecurityConstants.systemCredentials, TabletTime.getTimeID(timeType), masterLock);
newly_created_tablets++;
} else {
Text previous = null;
// the trailing null stands for the last tablet, which has no end row
splitPoints.add(null);
final ArrayList<Text> splitPointsText = new ArrayList<Text>(splitPoints.size());
for (byte[] splitpoint : splitPoints) {
if (splitpoint != null) {
splitPointsText.add(new Text(splitpoint));
} else {
splitPointsText.add(null);
}
}
final String dir = Constants.getTablesDir() + "/" + tableId;
final Map<Text,String> tabletDirs = createNewTableTabletDirectories(fs, dir, splitPointsText);
// one tablet per end row; each tablet's previous end row is the end row of the tablet before it
for (Text splitpointText : splitPointsText) {
String datafiles = null;
if (splitpointText != null) {
datafiles = tabletDirs.get(splitpointText);
} else {
datafiles = Constants.DEFAULT_TABLET_LOCATION;
}
final KeyExtent local_newExtent = new KeyExtent(new Text(tableId), splitpointText, previous);
fs.mkdirs(new Path(Constants.getTablesDir() + "/" + tableId + datafiles));
MetadataTable.addTablet(local_newExtent, datafiles, SecurityConstants.systemCredentials, TabletTime.getTimeID(timeType), masterLock);
previous = splitpointText;
newly_created_tablets++;
}
}
} catch (Exception e) {
log.error(e.getMessage(), e);
throw new ThriftTableOperationException(tableId, tableName, TableOperation.CREATE, TableOperationExceptionType.OTHER, e.getMessage());
}
// give all table permissions to the creator
for (TablePermission permission : TablePermission.values()) {
try {
authenticator.grantTablePermission(SecurityConstants.systemCredentials, c.user, tableId, permission);
} catch (AccumuloSecurityException e) {
log.error(e.getMessage(), e);
throw e.asThriftException();
}
}
nextEvent.somethingInterestingHappened("Created table %s with %d tablets", tableName, newly_created_tablets);
// the table only becomes ONLINE after every step above has succeeded
TableManager.getInstance().transitionTableState(tableId, TableState.ONLINE);
}
/**
 * Marks a table as DELETING and blocks until the table manager no longer reports a state for it (or this process stops being master).
 *
 * The caller needs the DROP_TABLE system permission or the DROP_TABLE permission on the table. The metadata table cannot be deleted.
 */
@Override
public void deleteTable(TInfo info, AuthInfo c, String tableName) throws ThriftSecurityException, ThriftTableOperationException, TException {
  final String tableId = checkTableId(tableName, TableOperation.DELETE);
  checkNotMetadataTable(tableName, TableOperation.DELETE);

  final boolean mayDrop = check(c, SystemPermission.DROP_TABLE) || check(c, tableId, TablePermission.DROP_TABLE);
  verify(c, mayDrop);

  TableManager.getInstance().transitionTableState(tableId, TableState.DELETING);
  nextEvent.somethingInterestingHappened("deleted table %s", tableName);

  // wait for the table's state to disappear
  while (stillMaster()) {
    if (TableManager.getInstance().getTableState(tableId) == null) {
      break;
    }
    waitAround();
  }
}
/**
 * Asks every currently known tablet server to flush the given table.
 *
 * The caller needs WRITE or ALTER_TABLE permission on the table. A failure talking to one server is logged and does not stop the remaining
 * servers from being asked.
 *
 * @param info
 *          trace information; not used in this method
 * @param c
 *          the caller's credentials
 * @param tableName
 *          the table to flush
 */
@Override
public void flushTable(TInfo info, AuthInfo c, String tableName) throws ThriftSecurityException, ThriftTableOperationException, TException {
  final String tableId = checkTableId(tableName, TableOperation.FLUSH);
  verify(c, check(c, tableId, TablePermission.WRITE) || check(c, tableId, TablePermission.ALTER_TABLE));
  // Named tserver (not instance) so the loop variable does not shadow the enclosing Master's instance field.
  for (TServerInstance tserver : tserverSet.getCurrentServers()) {
    try {
      final TServerConnection server = tserverSet.getConnection(tserver);
      if (server != null)
        server.flush(masterLock, tableId);
    } catch (TException ex) {
      // Say which server and table failed, and keep the stack trace.
      log.error("Unable to flush table " + tableName + " (id " + tableId + ") on " + tserver, ex);
    }
  }
}
/**
 * Assembles the snapshot shown by the monitor: loggers, recovery status, per-server status, per-table totals, bad servers, master state and goal
 * state, the unassigned-tablet figure, and the servers being shut down.
 *
 * @param info
 *          trace information; not used in this method
 * @param credentials
 *          the caller's credentials; no permission check is made in this method
 * @return a freshly built {@code MasterMonitorInfo}
 */
@Override
public MasterMonitorInfo getMasterStats(TInfo info, AuthInfo credentials) throws ThriftSecurityException, TException {
  final MasterMonitorInfo result = new MasterMonitorInfo();
  result.loggers = new ArrayList<LoggerStatus>();
  for (String logger : loggers.getLoggersFromZooKeeper().keySet()) {
    result.loggers.add(new LoggerStatus(logger));
  }
  result.recovery = recovery.status();
  result.tServerInfo = new ArrayList<TabletServerStatus>();
  // DefaultMap supplies a TableInfo for tables not seen yet, so the per-table totals can be accumulated in place.
  result.tableMap = new DefaultMap<String,TableInfo>(new TableInfo());
  for (Entry<TServerInstance,TabletServerStatus> serverEntry : tserverStatus.entrySet()) {
    final TabletServerStatus status = serverEntry.getValue();
    result.tServerInfo.add(status);
    // A status may carry no table map (ShutdownTabletServers checks for that too); such a server simply contributes no table totals.
    if (status.tableMap == null) {
      continue;
    }
    for (Entry<String,TableInfo> entry : status.tableMap.entrySet()) {
      String table = entry.getKey();
      TableInfo summary = result.tableMap.get(table);
      Monitor.add(summary, entry.getValue());
    }
  }
  result.badTServers = new HashMap<String,Byte>();
  // iteration over a synchronizedMap view must hold the map's lock
  synchronized (badServers) {
    for (TServerInstance bad : badServers.keySet()) {
      result.badTServers.put(bad.hostPort(), TabletServerState.UNRESPONSIVE.getId());
    }
  }
  result.state = getMasterState();
  result.goalState = getMasterGoalState();
  result.unassignedTablets = Master.this.displayUnassigned();
  result.serversShuttingDown = new HashSet<String>();
  synchronized (serversToShutdown) {
    for (TServerInstance server : serversToShutdown)
      result.serversShuttingDown.add(server.hostPort());
  }
  return result;
}
/**
 * Shared implementation for the online/offline calls: resolves the table id, checks that the caller has the SYSTEM system permission or
 * ALTER_TABLE on the table, transitions the table and signals the event.
 *
 * @param c
 *          the caller's credentials
 * @param tableName
 *          the table to change
 * @param operation
 *          the operation being performed, used when resolving the table id
 * @param state
 *          the table state to move to
 */
private void updateTableState(AuthInfo c, String tableName, TableOperation operation, TableState state) throws ThriftTableOperationException,
    ThriftSecurityException {
  final String tableId = checkTableId(tableName, operation);
  final boolean permitted = check(c, SystemPermission.SYSTEM) || check(c, tableId, TablePermission.ALTER_TABLE);
  verify(c, permitted);
  final TableManager tables = TableManager.getInstance();
  tables.transitionTableState(tableId, state);
  nextEvent.somethingInterestingHappened("Set table state of %s to %s", tableName, state);
}
/**
 * Takes a table offline. A request naming the metadata table is refused.
 */
@Override
public void offlineTable(TInfo info, AuthInfo credentials, String tableName) throws ThriftSecurityException, ThriftTableOperationException, TException {
  final TableOperation op = TableOperation.OFFLINE;
  checkNotMetadataTable(tableName, op);
  updateTableState(credentials, tableName, op, TableState.OFFLINE);
}
/**
 * Brings a table online by delegating to {@code updateTableState}.
 */
@Override
public void onlineTable(TInfo info, AuthInfo credentials, String tableName) throws ThriftSecurityException, ThriftTableOperationException, TException {
  final TableState target = TableState.ONLINE;
  updateTableState(credentials, tableName, TableOperation.ONLINE, target);
}
/**
 * Renames a table by rewriting its name node in ZooKeeper.
 *
 * Neither name may be the metadata table, the new name must be valid and unused, and the caller needs WRITE on the table or the ALTER_TABLE
 * system permission. The name node is written with the version that was read.
 *
 * NOTE(review): the table-level permission accepted here is WRITE, whereas the other alter-style operations in this class accept
 * TablePermission.ALTER_TABLE — confirm which is intended.
 *
 * @param info
 *          trace information; not used in this method
 * @param c
 *          the caller's credentials
 * @param oldTableName
 *          the table's current name
 * @param newTableName
 *          the name to give it
 * @throws ThriftTableOperationException
 *           NOTFOUND when the stored name no longer equals {@code oldTableName}; OTHER for any other failure while renaming
 */
@Override
public void renameTable(TInfo info, AuthInfo c, String oldTableName, String newTableName) throws ThriftSecurityException, ThriftTableOperationException,
    TException {
  String tableId = checkTableId(oldTableName, TableOperation.RENAME);
  checkNotMetadataTable(oldTableName, TableOperation.RENAME);
  checkNotMetadataTable(newTableName, TableOperation.RENAME);
  checkTableName(newTableName, TableOperation.RENAME);
  verify(c, check(c, tableId, TablePermission.WRITE) || check(c, SystemPermission.ALTER_TABLE));
  checkTableDoesNotExist(newTableName, TableOperation.RENAME);
  try {
    final String tap = ZooUtil.getRoot(instance) + Constants.ZTABLES + "/" + tableId + Constants.ZTABLE_NAME;
    final Stat stat = new Stat();
    final String currentName = new String(ZooSession.getSession().getData(tap, false, stat));
    if (!currentName.equals(oldTableName)) {
      throw new ThriftTableOperationException(null, oldTableName, TableOperation.RENAME, TableOperationExceptionType.NOTFOUND,
          "Name changed while processing");
    }
    ZooSession.getSession().setData(tap, newTableName.getBytes(), stat.getVersion());
  } catch (ThriftTableOperationException e) {
    // Already carries the right type and table name; without this clause the generic handler below would rewrap the NOTFOUND as OTHER against
    // the new table name.
    log.warn("Rename failed ", e);
    throw e;
  } catch (Exception e) {
    log.warn("Rename failed ", e);
    throw new ThriftTableOperationException(null, newTableName, TableOperation.RENAME, TableOperationExceptionType.OTHER, e.getMessage());
  }
}
/**
 * Sets or removes one property of a table.
 *
 * The caller needs the ALTER_TABLE system permission or ALTER_TABLE on the table. A null {@code value} removes the property; otherwise the
 * property is set, and a rejected set is treated as an error. Every failure is logged and reported as a table operation exception of type OTHER.
 *
 * @param c
 *          the caller's credentials
 * @param tableName
 *          the table to alter
 * @param property
 *          the property key
 * @param value
 *          the new value, or null to remove the property
 * @param op
 *          the operation recorded in any exception
 */
private void alterTableProperty(AuthInfo c, String tableName, String property, String value, TableOperation op) throws ThriftSecurityException,
    ThriftTableOperationException {
  final String tableId = checkTableId(tableName, op);
  final boolean permitted = check(c, SystemPermission.ALTER_TABLE) || check(c, tableId, TablePermission.ALTER_TABLE);
  verify(c, permitted);
  try {
    if (value != null) {
      final boolean accepted = TablePropUtil.setTableProperty(tableId, property, value);
      if (!accepted) {
        throw new Exception("Invalid table property.");
      }
    } else {
      TablePropUtil.removeTableProperty(tableId, property);
    }
  } catch (Exception e) {
    log.error("Problem altering table property", e);
    throw new ThriftTableOperationException(tableId, tableName, op, TableOperationExceptionType.OTHER, e.getMessage());
  }
}
/**
 * Removes a table property by calling {@code alterTableProperty} with a null value.
 */
@Override
public void removeTableProperty(TInfo info, AuthInfo credentials, String tableName, String property) throws ThriftSecurityException,
    ThriftTableOperationException, TException {
  final String noValue = null;
  alterTableProperty(credentials, tableName, property, noValue, TableOperation.REMOVE_PROPERTY);
}
/**
 * Sets a table property by delegating to {@code alterTableProperty}.
 */
@Override
public void setTableProperty(TInfo info, AuthInfo credentials, String tableName, String property, String value) throws ThriftSecurityException,
    ThriftTableOperationException, TException {
  final TableOperation op = TableOperation.SET_PROPERTY;
  alterTableProperty(credentials, tableName, property, value, op);
}
/**
 * Shuts the master down after checking the caller holds the SYSTEM permission; the work is done by the enclosing master's {@code shutdown}.
 */
@Override
public void shutdown(TInfo info, AuthInfo c, boolean stopTabletServers) throws ThriftSecurityException, TException {
  final boolean isAdmin = check(c, SystemPermission.SYSTEM);
  verify(c, isAdmin);
  Master.this.shutdown(stopTabletServers);
}
/**
 * Requests an orderly shutdown of one tablet server and waits until it is no longer found in the live server set (or this process stops being
 * master).
 *
 * The server is added to {@code serversToShutdown} and a marker holding its session is written under the doomed-servers node in ZooKeeper. If the
 * address does not match a live server, or no connection to it can be obtained, a warning is logged and the call returns.
 *
 * @param info
 *          trace information; not used in this method
 * @param c
 *          the caller's credentials; must hold the SYSTEM permission
 * @param tabletServer
 *          address of the server to shut down
 */
@Override
public void shutdownTabletServer(TInfo info, AuthInfo c, String tabletServer) throws ThriftSecurityException, TException {
  verify(c, check(c, SystemPermission.SYSTEM));
  final InetSocketAddress addr = AddressUtil.parseAddress(tabletServer, Property.TSERV_CLIENTPORT);
  final String hostPort = AddressUtil.toString(addr);
  // find() yields null for an unknown server (the wait loop below depends on that), so do not hand a null instance to getConnection().
  final TServerInstance doomed = tserverSet.find(hostPort);
  final TServerConnection server = doomed == null ? null : tserverSet.getConnection(doomed);
  if (server == null) {
    log.warn("No server found for name " + tabletServer);
    return;
  }
  serversToShutdown.add(doomed);
  final String path = ZooUtil.getRoot(instance) + Constants.ZDOOMEDSERVERS + "/" + doomed.hostPort();
  try {
    ZooUtil.putPersistentData(path, doomed.getSession().getBytes(), NodeExistsPolicy.OVERWRITE);
  } catch (Exception e) {
    // Best effort: the in-memory set still drives the shutdown, but record why the marker could not be written.
    log.error("Unable to remember doomed server " + doomed + " in zookeeper", e);
  }
  nextEvent.somethingInterestingHappened("Tablet Server shutdown requested for %s", tabletServer);
  while (stillMaster()) {
    if (tserverSet.find(hostPort) == null) {
      break;
    }
    waitAround();
  }
}
/**
 * Handles a tablet server's report that a tablet has split.
 *
 * Any pending migration of the old tablet is dropped. If the reporting server is one of the current servers the event is signalled; otherwise a
 * warning is logged.
 */
@Override
public void reportSplitExtent(TInfo info, AuthInfo credentials, String serverName, TabletSplit split) throws TException {
  final KeyExtent parent = new KeyExtent(split.oldTablet);
  final boolean wasMigrating = migrations.remove(parent) != null;
  if (wasMigrating) {
    log.info("Canceled migration of " + split.oldTablet);
  }

  boolean knownServer = false;
  for (TServerInstance tserver : tserverSet.getCurrentServers()) {
    if (serverName.equals(tserver.hostPort())) {
      knownServer = true;
      break;
    }
  }

  if (knownServer) {
    nextEvent.somethingInterestingHappened("%s reported split %s", serverName, split);
  } else {
    log.warn("Got a split from a server we don't recognize: " + serverName);
  }
}
/**
 * Receives a tablet load/unload status report from a tablet server.
 * <p>
 * Successful loads and unloads raise an event on {@code nextEvent}; failures are only logged (at error level, or at trace level for the "not serving"
 * case).
 *
 * @param info
 *          not used by this method
 * @param credentials
 *          not used by this method
 * @param serverName
 *          name of the reporting server, used in log messages
 * @param status
 *          the reported outcome
 * @param ttablet
 *          the tablet the report is about
 */
@Override
public void reportTabletStatus(TInfo info, AuthInfo credentials, String serverName, TabletLoadState status, TKeyExtent ttablet) throws TException {
  final KeyExtent extent = new KeyExtent(ttablet);
  switch (status) {
    // Successful transitions: wake up whoever is waiting on events.
    case LOADED:
      nextEvent.somethingInterestingHappened("tablet %s was loaded", extent);
      break;
    case UNLOADED:
      nextEvent.somethingInterestingHappened("tablet %s was unloaded", extent);
      break;
    // Failures: log only.
    case LOAD_FAILURE:
      log.error(serverName + " reports assignment failed for tablet " + extent);
      break;
    case UNLOAD_ERROR:
      log.error(serverName + " reports unload failed for tablet " + extent);
      break;
    case UNLOAD_FAILURE_NOT_SERVING:
      if (log.isTraceEnabled()) {
        log.trace(serverName + " reports unload failed: not serving tablet, could be a split: " + extent);
      }
      break;
    default:
      break;
  }
}
/**
 * Handles a client request to change the master goal state.
 * <p>
 * After the caller's credentials are run through {@code verify}/{@code check} against {@code SystemPermission.SYSTEM}, the new goal is handed to the
 * enclosing master's {@code setMasterGoalState}.
 *
 * @param info
 *          not used by this method
 * @param c
 *          credentials of the caller
 * @param state
 *          the goal state to record
 */
@Override
public void setMasterGoalState(TInfo info, AuthInfo c, MasterGoalState state) throws ThriftSecurityException, TException {
  final SystemPermission required = SystemPermission.SYSTEM;
  verify(c, check(c, required));
  Master.this.setMasterGoalState(state);
}
/**
 * Handles a client request to remove a system-wide configuration property.
 *
 * @param info
 *          not used by this method
 * @param c
 *          credentials of the caller, checked against {@code SystemPermission.SYSTEM}
 * @param property
 *          name of the property to remove
 * @throws TException
 *           if {@code SystemPropUtil.removeSystemProperty} fails; the original failure is attached as the cause
 */
@Override
public void removeSystemProperty(TInfo info, AuthInfo c, String property) throws ThriftSecurityException, TException {
  verify(c, check(c, SystemPermission.SYSTEM));
  try {
    SystemPropUtil.removeSystemProperty(property);
  } catch (Exception e) {
    log.error("Problem removing config property in zookeeper", e);
    // Keep the original exception as the cause instead of flattening it to its message.
    throw new TException(e.getMessage(), e);
  }
}
/**
 * Handles a client request to set a system-wide configuration property.
 *
 * @param info
 *          not used by this method
 * @param credentials
 *          credentials of the caller, checked against {@code SystemPermission.SYSTEM}
 * @param property
 *          name of the property to set
 * @param value
 *          new value for the property
 * @throws TException
 *           if {@code SystemPropUtil.setSystemProperty} fails; the original failure is attached as the cause
 */
@Override
public void setSystemProperty(TInfo info, AuthInfo credentials, String property, String value) throws ThriftSecurityException, TException {
  verify(credentials, check(credentials, SystemPermission.SYSTEM));
  try {
    SystemPropUtil.setSystemProperty(property, value);
  } catch (Exception e) {
    log.error("Problem setting config property in zookeeper", e);
    // Keep the original exception as the cause instead of flattening it to its message.
    throw new TException(e.getMessage(), e);
  }
}
}
/**
 * Persists the requested master goal state in ZooKeeper, overwriting any existing value.
 * <p>
 * This is best effort: a failure is logged (with its cause) and otherwise ignored.
 *
 * @param state
 *          the goal state whose {@code name()} is stored
 */
private void setMasterGoalState(MasterGoalState state) {
  try {
    ZooUtil.putPersistentData(ZooUtil.getRoot(instance) + Constants.ZMASTER_GOAL_STATE, state.name().getBytes(), NodeExistsPolicy.OVERWRITE);
  } catch (Exception ex) {
    // Include the exception so the reason for the failed write is not lost.
    log.error("Unable to set master goal state in zookeeper", ex);
  }
}
/**
 * Reads the master goal state from ZooKeeper.
 * <p>
 * The read is retried forever: on any failure (including an unparsable value) the error is logged and the method sleeps for one second before trying
 * again, so it only returns once a valid state has been read.
 *
 * @return the goal state currently stored in ZooKeeper
 */
MasterGoalState getMasterGoalState() {
  for (;;) {
    try {
      final String goalPath = ZooUtil.getRoot(instance) + Constants.ZMASTER_GOAL_STATE;
      final byte[] raw = ZooUtil.getData(goalPath, null);
      final String stateName = new String(raw);
      return MasterGoalState.valueOf(stateName);
    } catch (Exception e) {
      log.error("Problem getting real goal state: " + e);
      UtilWaitThread.sleep(1000);
    }
  }
}
/**
 * Moves the master to the {@code STOP} state, optionally draining the tablet servers first.
 *
 * @param stopTabletServers
 *          when true, the goal state is set to {@code CLEAN_STOP} and this method waits (at least one {@code waitAround()} call) until the live server
 *          set is empty before stopping
 */
private void shutdown(boolean stopTabletServers) {
  if (stopTabletServers) {
    setMasterGoalState(MasterGoalState.CLEAN_STOP);
    // Always wait once, then keep waiting while any tablet server remains.
    waitAround();
    while (tserverSet.size() > 0) {
      waitAround();
    }
  }
  setMasterState(MasterState.STOP);
}
/**
 * Background thread that completes table deletion.
 * <p>
 * While this process is still the master it periodically looks at every known table id; for tables whose state is missing or {@code DELETING} it waits
 * until none of the table's tablets are assigned or hosted and then removes everything that belongs to the table.
 */
private class DeleteThread extends Daemon {

  /** Polls for deletable tables, sleeping {@code TIME_BETWEEN_DELETE_CHECKS} between passes. */
  public void run() {
    while (stillMaster()) {
      try {
        for (String tableId : Tables.getIdToNameMap(instance).keySet()) {
          final TableState state = TableManager.getInstance().getTableState(tableId);
          final boolean beingDeleted = state == null || state.equals(TableState.DELETING);
          if (!beingDeleted) {
            continue;
          }
          waitForTabletsToBeOffline(new Text(tableId));
          cleanUp(tableId);
        }
      } catch (Exception ex) {
        log.warn("Error cleaning up tables", ex);
      }
      UtilWaitThread.sleep(TIME_BETWEEN_DELETE_CHECKS);
    }
  }

  /**
   * Blocks until no tablet of the given table is assigned or hosted.
   *
   * @param tableId
   *          id of the table being deleted
   */
  private void waitForTabletsToBeOffline(Text tableId) {
    // First let every watcher finish a scan that started after now, so they have all seen the updated table state.
    final long requestedAt = System.currentTimeMillis();
    while (!allWatchersScannedSince(requestedAt)) {
      waitAround();
    }
    // From here on the watchers will not be assigning tablets for the deleted table; wait for the existing ones to go offline.
    while (anyTabletStillOnline(tableId)) {
      waitAround();
    }
  }

  /**
   * Checks every watcher (no early exit) and reports whether all of them finished a scan at or after the given time.
   */
  private boolean allWatchersScannedSince(long timestamp) {
    boolean caughtUp = true;
    for (TabletGroupWatcher watcher : watchers) {
      if (watcher.stats.lastScanFinished() < timestamp) {
        caughtUp = false;
      }
    }
    return caughtUp;
  }

  /**
   * Scans the table's metadata range and reports whether some tablet is still {@code ASSIGNED} or {@code HOSTED}. The scanner is always closed.
   */
  private boolean anyTabletStillOnline(Text tableId) {
    final Range tableRange = new KeyExtent(new Text(tableId), null, null).toMetadataRange();
    final MetaDataTableScanner scanner = new MetaDataTableScanner(tableRange, null, null);
    try {
      while (scanner.hasNext()) {
        final TabletLocationState locationState = scanner.next();
        final TabletState state = locationState.getState(onlineTabletServers());
        if (state.equals(TabletState.ASSIGNED) || state.equals(TabletState.HOSTED)) {
          log.debug("Still waiting for table to be deleted: " + tableId + " locationState: " + locationState);
          return true;
        }
      }
      return false;
    } finally {
      scanner.close();
    }
  }

  /**
   * Removes every trace of a deleted table. Each step is attempted independently; a failing step is logged and the remaining steps still run.
   *
   * @param tableId
   *          id of the table to clean up
   */
  void cleanUp(String tableId) {
    // stop any migrations in progress
    synchronized (migrations) {
      for (Iterator<KeyExtent> it = migrations.keySet().iterator(); it.hasNext();) {
        if (it.next().getTableId().toString().equals(tableId)) {
          it.remove();
        }
      }
    }

    // remove metadata table entries
    try {
      MetadataTable.deleteTable(tableId, SecurityConstants.systemCredentials, masterLock);
    } catch (Exception e) {
      log.error("error deleting " + tableId + " from metadata table", e);
    }

    // remove any problem reports the table may have
    try {
      ProblemReports.getInstance().deleteProblemReports(tableId);
    } catch (Exception e) {
      log.error("Failed to delete problem reports for table " + tableId, e);
    }

    // remove any permissions associated with this table
    try {
      authenticator.deleteTable(SecurityConstants.systemCredentials, tableId);
    } catch (AccumuloSecurityException e) {
      log.error(e.getMessage(), e);
    }

    // delete the map files
    try {
      fs.delete(new Path(Constants.getTablesDir(), tableId), true);
    } catch (IOException e) {
      log.error("Unable to remove deleted table directory", e);
    }

    // remove table from zookeeper
    try {
      TableManager.getInstance().removeTable(tableId);
      Tables.clearCache(instance);
    } catch (Exception e) {
      log.error("Failed to find table id in zookeeper", e);
    }
  }
}
/**
 * Reports whether every tablet group watcher has completed at least one scan, i.e. whether the tablets have been walked once.
 *
 * @return false as soon as one watcher that has not cycled is found, true otherwise (including when there are no watchers)
 */
private boolean cycledOnce() {
  for (TabletGroupWatcher candidate : watchers) {
    if (candidate.cycledOnce()) {
      continue;
    }
    return false;
  }
  return true;
}
/**
 * The state the master wants a tablet to reach.
 * <p>
 * A nested enum is implicitly static, so the modifier is omitted.
 */
enum TabletGoalState {
  /** The tablet should be served by a tablet server. */
  HOSTED,
  /** The tablet should not be served by any tablet server. */
  UNASSIGNED,
  /** The tablet belongs to a table that is missing or being deleted. */
  DELETED
}
/**
 * Derives a tablet's goal from the master's overall state, ignoring table state and migrations.
 * <ul>
 * <li>{@code SAFE_MODE}: only tablets of the metadata table are hosted.</li>
 * <li>{@code UNLOAD_METADATA_TABLETS}: only the root tablet is hosted.</li>
 * <li>{@code UNLOAD_ROOT_TABLET} and {@code STOP}: nothing is hosted.</li>
 * <li>{@code NORMAL} and any state without an explicit rule: everything is hosted.</li>
 * </ul>
 *
 * @param extent
 *          the tablet in question
 * @return {@code HOSTED} or {@code UNASSIGNED}
 */
TabletGoalState getSystemGoalState(KeyExtent extent) {
  switch (getMasterState()) {
    case SAFE_MODE:
      return extent.getTableId().equals(METADATA_TABLE_ID) ? TabletGoalState.HOSTED : TabletGoalState.UNASSIGNED;
    case UNLOAD_METADATA_TABLETS:
      return extent.equals(Constants.ROOT_TABLET_EXTENT) ? TabletGoalState.HOSTED : TabletGoalState.UNASSIGNED;
    case UNLOAD_ROOT_TABLET:
    case STOP:
      return TabletGoalState.UNASSIGNED;
    case NORMAL:
    default:
      return TabletGoalState.HOSTED;
  }
}
/**
 * Derives a tablet's goal from the state of the table it belongs to.
 *
 * @param extent
 *          the tablet in question
 * @return {@code DELETED} when the table has no state or is {@code DELETING}; {@code UNASSIGNED} when the table is {@code DISABLING},
 *         {@code UNLOADING}, {@code DISABLED}, {@code OFFLINE} or {@code NEW}; {@code HOSTED} for every other table state
 */
TabletGoalState getTableGoalState(KeyExtent extent) {
  final TableManager manager = TableManager.getInstance();
  final TableState tableState = manager.getTableState(extent.getTableId().toString());
  if (tableState == null || tableState == TableState.DELETING) {
    return TabletGoalState.DELETED;
  }
  switch (tableState) {
    case DISABLING:
    case UNLOADING:
    case DISABLED:
    case OFFLINE:
    case NEW:
      return TabletGoalState.UNASSIGNED;
    default:
      return TabletGoalState.HOSTED;
  }
}
/**
 * Combines all the reasons a tablet might need to move into a single goal state.
 * <p>
 * The checks run in this order, and the first one that does not say {@code HOSTED} wins: the system-wide goal, whether the current server is scheduled for
 * shutdown, the table's goal, and finally whether a migration to a different server is pending.
 *
 * @param server
 *          the server currently associated with the tablet, or null if there is none
 * @param extent
 *          the tablet in question
 * @return the goal state for the tablet
 */
TabletGoalState getGoalState(TServerInstance server, KeyExtent extent) {
  // Shutting down?
  final TabletGoalState systemGoal = getSystemGoalState(extent);
  if (systemGoal != TabletGoalState.HOSTED) {
    return systemGoal;
  }
  if (server != null && serversToShutdown.contains(server)) {
    return TabletGoalState.UNASSIGNED;
  }

  // taking table offline?
  final TabletGoalState tableGoal = getTableGoalState(extent);
  if (tableGoal != TabletGoalState.HOSTED) {
    return tableGoal;
  }

  // Maybe this tablet needs to be migrated
  final TServerInstance destination = migrations.get(extent);
  final boolean migratingElsewhere = destination != null && server != null && !destination.equals(server);
  return migratingElsewhere ? TabletGoalState.UNASSIGNED : TabletGoalState.HOSTED;
}
/**
 * Daemon that repeatedly walks one {@code TabletStateStore} and works each tablet in it toward its goal state: assigning tablets that should be hosted,
 * re-sending pending assignments, unassigning tablets that point at dead servers and asking servers to unload tablets that should not be hosted.
 */
private class TabletGroupWatcher extends Daemon {
  // The set of tablets this watcher is responsible for.
  final TabletStateStore store;
  // Per-table tablet state counts gathered during each scan.
  final TableStats stats = new TableStats();

  TabletGroupWatcher(TabletStateStore store) {
    this.store = store;
  }

  /**
   * @return true once {@code stats} reports a finished scan (a last-scan-finished value greater than zero)
   */
  public boolean cycledOnce() {
    return stats.lastScanFinished() > 0;
  }

  /**
   * @return the counts from {@code stats.getLast()}, keyed by table id
   */
  Map<Text,TableCounts> getStats() {
    return stats.getLast();
  }

  /**
   * @param tableId
   *          the table of interest
   * @return the counts from {@code stats.getLast(tableId)}
   */
  TableCounts getStats(Text tableId) {
    return stats.getLast(tableId);
  }

  /**
   * Main loop: while this process is the master, scan the store, collect the work implied by each tablet's state and goal, flush that work to the tablet
   * servers, then wait for the next event or timeout. Any exception aborts the current pass; it is logged and the loop retries after
   * {@code WAIT_BETWEEN_ERRORS}.
   */
  public void run() {
    Thread.currentThread().setName("Watching " + store.name());
    // Counts from the previous pass, used to report only the states whose counts changed.
    int[] oldCounts = new int[TabletState.values().length];
    while (stillMaster()) {
      int totalUnloaded = 0;
      int unloaded = 0;
      try {
        // Get the current status for the current list of tservers
        SortedMap<TServerInstance,TabletServerStatus> currentTServers = new TreeMap<TServerInstance,TabletServerStatus>();
        for (TServerInstance entry : tserverSet.getCurrentServers()) {
          currentTServers.put(entry, tserverStatus.get(entry));
        }
        // Nothing can be assigned without tablet servers; wait and try again.
        if (currentTServers.size() == 0) {
          nextEvent.waitForSomethingInterestingToHappen(TIME_TO_WAIT_BETWEEN_SCANS);
          continue;
        }
        // Don't move tablets to servers that are shutting down
        SortedMap<TServerInstance,TabletServerStatus> destinations = new TreeMap<TServerInstance,TabletServerStatus>(currentTServers);
        destinations.keySet().removeAll(serversToShutdown);
        List<Assignment> assignments = new ArrayList<Assignment>();
        List<Assignment> assigned = new ArrayList<Assignment>();
        List<TabletLocationState> assignedToDeadServers = new ArrayList<TabletLocationState>();
        Map<KeyExtent,TServerInstance> unassigned = new HashMap<KeyExtent,TServerInstance>();
        int[] counts = new int[TabletState.values().length];
        stats.begin();
        // Walk through the tablets in our store, and work tablets
        // towards their goal
        for (TabletLocationState tls : store) {
          if (tls == null) {
            continue;
          }
          // Don't overwhelm the tablet servers with work
          if (unassigned.size() + unloaded > MAX_TSERVER_WORK_CHUNK * currentTServers.size()) {
            flushChanges(destinations, assignments, assigned, assignedToDeadServers, unassigned);
            assignments.clear();
            assigned.clear();
            assignedToDeadServers.clear();
            unassigned.clear();
            unloaded = 0;
            UtilWaitThread.sleep(TIME_TO_WAIT_BETWEEN_SCANS);
          }
          TabletGoalState goal = getGoalState(tls.current, tls.extent);
          TServerInstance server = tls.getServer();
          TabletState state = tls.getState(currentTServers.keySet());
          stats.update(tls.extent.getTableId(), state);
          // Always follow through with assignments
          if (state == TabletState.ASSIGNED) {
            goal = TabletGoalState.HOSTED;
          }
          if (goal == TabletGoalState.HOSTED) {
            // A tablet with write-ahead logs must be recovered first; skip it for this pass while recovery is not done.
            if (state != TabletState.HOSTED && !tls.walogs.isEmpty()) {
              if (!recovery.recover(SecurityConstants.systemCredentials, tls.extent, tls.walogs)) {
                continue;
              }
            }
            switch (state) {
              case HOSTED:
                // The migration has completed once the tablet is hosted on its destination.
                if (server.equals(migrations.get(tls.extent)))
                  migrations.remove(tls.extent);
                break;
              case ASSIGNED_TO_DEAD_SERVER:
                assignedToDeadServers.add(tls);
                break;
              case UNASSIGNED:
                // maybe it's a finishing migration
                TServerInstance dest = migrations.get(tls.extent);
                if (dest != null && destinations.keySet().contains(dest)) {
                  assignments.add(new Assignment(tls.extent, dest));
                } else {
                  unassigned.put(tls.extent, server);
                }
                break;
              case ASSIGNED:
                // Send another reminder
                assigned.add(new Assignment(tls.extent, tls.future));
                break;
            }
          } else {
            // The tablet should not be hosted: ask its server to unload it.
            if (state != TabletState.UNASSIGNED) {
              TServerConnection conn = tserverSet.getConnection(server);
              if (conn != null) {
                conn.unloadTablet(masterLock, tls.extent, goal != TabletGoalState.DELETED);
                unloaded++;
                totalUnloaded++;
              } else {
                log.warn("Could not connect to server " + server);
              }
            }
          }
          counts[state.ordinal()]++;
        }
        stats.end();
        // Report changes
        for (TabletState state : TabletState.values()) {
          int i = state.ordinal();
          if (counts[i] > 0 && counts[i] != oldCounts[i]) {
            nextEvent.somethingInterestingHappened("[%s]: %d tablets are %s", store.name(), counts[i], state.name());
          }
        }
        log.debug(String.format("[%s]: scan time %.2f seconds", store.name(), stats.getScanTime() / 1000.));
        oldCounts = counts;
        if (totalUnloaded > 0) {
          nextEvent.somethingInterestingHappened("[%s]: %d tablets unloaded", store.name(), totalUnloaded);
        }
        flushChanges(destinations, assignments, assigned, assignedToDeadServers, unassigned);
        log.debug(String.format("[%s] sleeping for %.2f seconds", store.name(), TIME_TO_WAIT_BETWEEN_SCANS / 1000.));
        nextEvent.waitForSomethingInterestingToHappen(TIME_TO_WAIT_BETWEEN_SCANS);
      } catch (Exception ex) {
        log.error("Error processing table state for store " + store.name(), ex);
        UtilWaitThread.sleep(WAIT_BETWEEN_ERRORS);
      }
    }
  }

  /**
   * Applies the work collected during a scan.
   *
   * @param currentTServers
   *          servers that may receive new assignments; {@code run()} passes the current servers minus those being shut down
   * @param assignments
   *          assignments already decided during the scan; balancer-chosen assignments and then {@code assigned} are appended to this list
   * @param assigned
   *          assignments that were made earlier and are only re-sent to their servers
   * @param assignedToDeadServers
   *          tablets whose assigned server is gone; they are marked unassigned in the store
   * @param unassigned
   *          tablets nominated for assignment, handed to the tablet balancer
   */
  private void flushChanges(SortedMap<TServerInstance,TabletServerStatus> currentTServers, List<Assignment> assignments, List<Assignment> assigned,
      List<TabletLocationState> assignedToDeadServers, Map<KeyExtent,TServerInstance> unassigned) throws DistributedStoreException, TException {
    if (!assignedToDeadServers.isEmpty()) {
      int maxServersToShow = min(assignedToDeadServers.size(), 100);
      log.debug(assignedToDeadServers.size() + " assigned to dead servers: " + assignedToDeadServers.subList(0, maxServersToShow) + "...");
      store.unassign(assignedToDeadServers);
      // Report how many tablets were actually handed to unassign(); this used to read the size of a list that was never filled.
      nextEvent.somethingInterestingHappened("Marked %d tablets as unassigned because they don't have current servers", assignedToDeadServers.size());
    }
    if (!currentTServers.isEmpty()) {
      Map<KeyExtent,TServerInstance> assignedOut = new HashMap<KeyExtent,TServerInstance>();
      tabletBalancer.getAssignments(Collections.unmodifiableSortedMap(currentTServers), Collections.unmodifiableMap(unassigned), assignedOut);
      for (Entry<KeyExtent,TServerInstance> assignment : assignedOut.entrySet()) {
        // Only accept balancer decisions for tablets that were actually nominated.
        if (unassigned.containsKey(assignment.getKey())) {
          if (assignment.getValue() != null) {
            log.debug(store.name() + " assigning tablet " + assignment);
            assignments.add(new Assignment(assignment.getKey(), assignment.getValue()));
          }
        } else {
          log.warn(store.name() + " load balancer assigning tablet that was not nominated for assignment " + assignment.getKey());
        }
      }
      if (!unassigned.isEmpty() && assignedOut.isEmpty())
        log.warn("Load balancer failed to assign any tablets");
    }
    // Record the new assignments in the store before telling any server about them.
    if (assignments.size() > 0) {
      log.info(String.format("Assigning %d tablets", assignments.size()));
      store.setFutureLocations(assignments);
    }
    // Previously recorded assignments are only re-sent, not written to the store again.
    assignments.addAll(assigned);
    for (Assignment a : assignments) {
      TServerConnection conn = tserverSet.getConnection(a.server);
      if (conn != null) {
        conn.assignTablet(masterLock, a.tablet);
      } else {
        log.warn("Could not connect to server " + a.server);
      }
    }
  }
}
/**
 * Background thread that periodically discards migrations whose tablet no longer exists.
 */
private class MigrationCleanupThread extends Daemon {

  /** Runs a cleanup pass whenever migrations are pending, sleeping {@code TIME_BETWEEN_MIGRATION_CLEANUPS} between checks. */
  public void run() {
    setName("Migration Cleanup Thread");
    while (stillMaster()) {
      final boolean pending = !migrations.isEmpty();
      if (pending) {
        try {
          cleanupMutations();
        } catch (Exception ex) {
          log.error("Error cleaning up migrations", ex);
        }
      }
      UtilWaitThread.sleep(TIME_BETWEEN_MIGRATION_CLEANUPS);
    }
  }

  /**
   * Removes migrations that refer to tablets missing from the metadata table.
   * <p>
   * If a migrating tablet splits, and the tablet dies before sending the master a message, the migration will refer to a non-existing tablet, so it can
   * never complete. This scans the metadata table's prev-row column, notes which migrating extents are still present, and cancels the rest. (Despite the
   * method name, it operates on migrations.)
   */
  private void cleanupMutations() throws AccumuloException, AccumuloSecurityException, TableNotFoundException {
    final Connector conn = instance.getConnector(SecurityConstants.SYSTEM_USERNAME, SecurityConstants.systemCredentials.password);
    final Scanner metadata = conn.createScanner(Constants.METADATA_TABLE_NAME, Constants.NO_AUTHS);
    ColumnFQ.fetch(metadata, Constants.METADATA_PREV_ROW_COLUMN);

    // Migrating extents that still have a metadata entry.
    final Set<KeyExtent> stillPresent = new HashSet<KeyExtent>();
    for (Entry<Key,Value> row : metadata) {
      final KeyExtent extent = new KeyExtent(row.getKey().getRow(), row.getValue());
      if (migrations.containsKey(extent)) {
        stillPresent.add(extent);
      }
    }

    // Snapshot the migrating extents after the scan, then subtract the ones that were seen.
    final Set<KeyExtent> vanished;
    synchronized (migrations) {
      vanished = new HashSet<KeyExtent>(migrations.keySet());
    }
    vanished.removeAll(stillPresent);

    for (KeyExtent extent : vanished) {
      log.info("Canceling migration of " + extent + " to " + migrations.get(extent) + ": tablet no longer exists (probably due to a split)");
      migrations.remove(extent);
    }
  }
}
/**
 * Background thread that steps the master state toward the goal state stored in ZooKeeper and, when conditions allow, rebalances loggers and tablets.
 */
private class StatusThread extends Daemon {

  /**
   * Main loop: take one step toward the goal state, refresh tablet server status (balancing if possible), then wait for the next event. Any throwable is
   * logged and followed by a {@code WAIT_BETWEEN_ERRORS} sleep.
   */
  public void run() {
    setName("Status Thread");
    while (stillMaster()) {
      try {
        switch (getMasterGoalState()) {
          case NORMAL:
            // Leave safe mode once every watcher has completed a scan.
            switch (getMasterState()) {
              case SAFE_MODE:
                if (cycledOnce()) {
                  setMasterState(MasterState.NORMAL);
                }
                break;
              default:
                break;
            }
            break;
          case SAFE_MODE:
            switch (getMasterState()) {
              case NORMAL:
                setMasterState(MasterState.SAFE_MODE);
                break;
              default:
                break;
            }
            break;
          case CLEAN_STOP:
            advanceCleanStop();
            break;
        }
        final long pause = updateStatus();
        nextEvent.waitForSomethingInterestingToHappen(pause);
      } catch (Throwable t) {
        log.error("Error balancing tablets", t);
        UtilWaitThread.sleep(WAIT_BETWEEN_ERRORS);
      }
    }
  }

  /**
   * Takes one step of the clean-stop sequence: NORMAL, then SAFE_MODE, then UNLOAD_METADATA_TABLETS, then UNLOAD_ROOT_TABLET, then STOP. Each step only
   * advances when the tablets of the previous stage are no longer assigned or hosted.
   */
  private void advanceCleanStop() throws Exception {
    switch (getMasterState()) {
      case NORMAL:
        setMasterState(MasterState.SAFE_MODE);
        break;
      case SAFE_MODE: {
        final int userTablets = nonMetaDataTabletsAssignedOrHosted();
        log.debug(String.format("There are %d non-metadata tablets assigned or hosted", userTablets));
        if (userTablets == 0) {
          setMasterState(MasterState.UNLOAD_METADATA_TABLETS);
        }
        break;
      }
      case UNLOAD_METADATA_TABLETS: {
        final int metadataTablets = assignedOrHosted(METADATA_TABLE_ID);
        log.debug(String.format("There are %d metadata tablets assigned or hosted", metadataTablets));
        // Assumes last tablet hosted is the root tablet; it's possible that's not the case (root tablet is offline?)
        if (metadataTablets == 1) {
          setMasterState(MasterState.UNLOAD_ROOT_TABLET);
        }
        break;
      }
      case UNLOAD_ROOT_TABLET: {
        final int remaining = assignedOrHosted(METADATA_TABLE_ID);
        if (remaining > 0) {
          log.debug("The root tablet is still assigned or hosted");
        } else if (remaining == 0) {
          final Set<TServerInstance> currentServers = tserverSet.getCurrentServers();
          log.debug("stopping " + currentServers.size() + " tablet servers");
          for (TServerInstance server : currentServers) {
            try {
              tserverSet.getConnection(server).halt(masterLock);
            } catch (TTransportException e) {
              // its probably down, and we don't care
            } catch (Exception e) {
              log.error("Unable to halt server " + server + ": " + e);
            }
          }
          // Only stop the master once no tablet servers are left.
          if (currentServers.size() == 0) {
            setMasterState(MasterState.STOP);
          }
        }
        break;
      }
      default:
        break;
    }
  }

  /**
   * Refreshes {@code tserverStatus} and balances loggers and tablets unless there are bad servers, unhosted tablets, or a clean stop in progress.
   *
   * @return the time to wait before the next pass: the balancer's answer when balancing ran, otherwise {@code DEFAULT_WAIT_FOR_WATCHER}
   */
  private long updateStatus() throws AccumuloException, AccumuloSecurityException, TableNotFoundException {
    tserverStatus = Collections.synchronizedSortedMap(gatherTableInformation());
    if (badServers.size() > 0) {
      log.debug("not balancing because the balance information is out-of-date");
      return DEFAULT_WAIT_FOR_WATCHER;
    }
    if (notHosted() > 0) {
      log.debug("not balancing because there are unhosted tablets");
      return DEFAULT_WAIT_FOR_WATCHER;
    }
    if (getMasterGoalState() == MasterGoalState.CLEAN_STOP) {
      log.debug("not balancing because the master is attempting to stop cleanly");
      return DEFAULT_WAIT_FOR_WATCHER;
    }
    balanceLoggers();
    return balanceTablets();
  }

  /**
   * Asks the logger balancer which loggers each tablet server should use and tells each reachable server. Failures for individual servers are logged and
   * skipped.
   */
  private void balanceLoggers() {
    final List<LoggerUser> users = new ArrayList<LoggerUser>();
    for (Entry<TServerInstance,TabletServerStatus> status : tserverStatus.entrySet()) {
      users.add(new TServerUsesLoggers(status.getKey(), status.getValue()));
    }
    final List<String> availableLoggers = new ArrayList<String>(loggers.getLoggersFromZooKeeper().values());
    final Map<LoggerUser,List<String>> plan = new HashMap<LoggerUser,List<String>>();
    final int loggersPerServer = AccumuloConfiguration.getSystemConfiguration().getCount(Property.TSERV_LOGGER_COUNT);
    loggerBalancer.balance(users, availableLoggers, plan, loggersPerServer);

    for (Entry<LoggerUser,List<String>> choice : plan.entrySet()) {
      final TServerUsesLoggers tserver = (TServerUsesLoggers) choice.getKey();
      try {
        log.debug("Telling " + tserver.getInstance() + " to use loggers " + choice.getValue());
        final TServerConnection connection = tserverSet.getConnection(tserver.getInstance());
        if (connection != null) {
          connection.useLoggers(new HashSet<String>(choice.getValue()));
        }
      } catch (Exception ex) {
        log.warn("Unable to talk to " + tserver.getInstance(), ex);
      }
    }
  }

  /**
   * Asks the tablet balancer for migrations and records the sane, not-yet-requested ones in {@code migrations}.
   *
   * @return the wait time returned by the tablet balancer
   */
  private long balanceTablets() {
    final List<TabletMigration> requested = new ArrayList<TabletMigration>();
    // Give the balancer a private snapshot of the tablets already migrating.
    final Set<KeyExtent> inFlight;
    synchronized (migrations) {
      inFlight = new HashSet<KeyExtent>(migrations.keySet());
    }
    final long wait = tabletBalancer.balance(Collections.unmodifiableSortedMap(tserverStatus), Collections.unmodifiableSet(inFlight), requested);
    for (TabletMigration m : TabletBalancer.checkMigrationSanity(tserverStatus.keySet(), requested)) {
      if (migrations.containsKey(m.tablet)) {
        log.warn("balancer requested migration more than once, skipping " + m);
        continue;
      }
      migrations.put(m.tablet, m.newServer);
      log.debug("migration " + m);
    }
    if (!requested.isEmpty()) {
      nextEvent.somethingInterestingHappened("Migrating %d more tablets, %d total", requested.size(), migrations.size());
    }
    return wait;
  }
}
/**
 * Collects the status of every current tablet server that is not scheduled for shutdown.
 * <p>
 * A server whose status cannot be fetched gets its counter in {@code badServers} incremented. Once that counter exceeds {@code MAX_BAD_STATUS_COUNT} the
 * server is asked to halt and is removed from both {@code badServers} and the live server set. Finally {@code badServers} is trimmed to the servers that
 * were current at the start of this call.
 *
 * @return the status of each server that answered, keyed by server
 */
private SortedMap<TServerInstance,TabletServerStatus> gatherTableInformation() {
  long start = System.currentTimeMillis();
  SortedMap<TServerInstance,TabletServerStatus> result = new TreeMap<TServerInstance,TabletServerStatus>();
  Set<TServerInstance> currentServers = tserverSet.getCurrentServers();
  for (TServerInstance server : currentServers) {
    if (serversToShutdown.contains(server))
      continue;
    try {
      TabletServerStatus status = tserverSet.getConnection(server).getTableMap();
      result.put(server, status);
    } catch (Exception ex) {
      // Say why the status call failed; without the cause this message is not actionable.
      log.error("unable to get tablet server status " + server + ": " + ex);
      if (badServers.get(server).incrementAndGet() > MAX_BAD_STATUS_COUNT) {
        try {
          tserverSet.getConnection(server).halt(masterLock);
        } catch (Exception e) {
          log.info("error talking to troublesome tablet server ", e);
        }
        badServers.remove(server);
        tserverSet.remove(server);
      }
    }
  }
  // Forget failure counts for servers that are no longer current.
  synchronized (badServers) {
    badServers.keySet().retainAll(currentServers);
  }
  log.debug(String.format("Finished gathering information from %d servers in %.2f seconds", result.size(), (System.currentTimeMillis() - start) / 1000.));
  return result;
}
/**
 * Runs the master: acquires the master lock, starts the background threads and tablet group watchers, serves client requests on the calling thread, and
 * after serving ends gives the background threads a shared one-second deadline to finish.
 */
public void run() throws IOException, InterruptedException, KeeperException {
  final String zroot = ZooUtil.getRoot(instance);
  getMasterLock(zroot + Constants.ZMASTER_LOCK);
  TableManager.getInstance().addObserver(this);

  // Write-ahead log recovery coordination.
  recovery = new CoordinateRecoveryTask(fs);
  final Thread recoveryThread = new Daemon(new LoggingRunnable(log, recovery), "Recovery Status");
  recoveryThread.start();

  loggers = new TabletServerLoggers(this);
  loggers.scanZooKeeperForUpdates();

  // Background housekeeping threads.
  final StatusThread statusThread = new StatusThread();
  statusThread.start();
  final MigrationCleanupThread migrationCleanupThread = new MigrationCleanupThread();
  migrationCleanupThread.start();
  final DeleteThread deleteThread = new DeleteThread();
  deleteThread.start();

  tserverSet.startListeningForTabletServerChanges();
  final int count = waitForTabletServerCountToStabilize();
  log.info("Starting with " + count + " tablet servers");
  recoverServersToShutdownFromZooKeeper();
  SimpleTimer.getInstance().schedule(new ShutdownTabletServers(), 1000, 1000);

  // One watcher per tablet state store; register them all before starting any.
  final TabletStateStore[] stores = {new ZooTabletStateStore(new ZooStore(zroot)), new RootTabletStateStore(this), new MetaDataStateStore(this)};
  for (TabletStateStore store : stores) {
    watchers.add(new TabletGroupWatcher(store));
  }
  for (TabletGroupWatcher watcher : watchers) {
    watcher.start();
  }

  Processor processor = new MasterClientService.Processor(TraceWrap.service(new MasterClientServiceHandler()));
  clientService = TServerUtils.startServer(Property.MASTER_CLIENTPORT, processor, "Master", "Master Client Service Handler", false).server;
  // use the main thread to accept client connections
  clientService.serve();

  // Serving has ended: every join below shares the same one-second deadline.
  final long deadline = System.currentTimeMillis() + 1000;
  statusThread.join(remaining(deadline));
  deleteThread.join(remaining(deadline));
  recovery.stop();
  recoveryThread.join(remaining(deadline));
  // quit, even if the tablet servers somehow jam up and the watchers
  // don't stop
  for (TabletGroupWatcher watcher : watchers) {
    watcher.join(remaining(deadline));
  }
  log.info("exiting");
}
/**
 * Computes how long is left until a deadline, for use as a join timeout.
 *
 * @param deadline
 *          absolute time in milliseconds, on the {@code System.currentTimeMillis()} clock
 * @return the milliseconds left until the deadline, but never less than 1
 */
private long remaining(long deadline) {
  final long left = deadline - System.currentTimeMillis();
  return left < 1 ? 1 : left;
}
/**
 * Reloads the servers that were marked for shutdown from ZooKeeper into {@code serversToShutdown}.
 * <p>
 * Each child of the doomed-servers node is named by a host:port; the node's data is used as the server's session when building the
 * {@code TServerInstance}.
 */
private void recoverServersToShutdownFromZooKeeper() throws KeeperException, InterruptedException {
  final String doomedRoot = ZooUtil.getRoot(instance) + Constants.ZDOOMEDSERVERS;
  for (String hostPort : ZooSession.getSession().getChildren(doomedRoot, false)) {
    final InetSocketAddress address = AddressUtil.parseAddress(hostPort, Property.TSERV_CLIENTPORT);
    final byte[] session = ZooUtil.getData(doomedRoot + "/" + hostPort, null);
    serversToShutdown.add(new TServerInstance(address, new String(session)));
  }
}
/**
 * Waits until the number of tablet servers is non-zero and unchanged between two consecutive checks (or until this process is no longer the master).
 * <p>
 * The master state is {@code WAIT_FOR_TSERVERS} while waiting and {@code SAFE_MODE} on return.
 *
 * @return the last observed number of tablet servers
 */
private int waitForTabletServerCountToStabilize() {
  setMasterState(MasterState.WAIT_FOR_TSERVERS);
  int current = tserverSet.getCurrentServers().size();
  int previous = current;
  for (;;) {
    final boolean stable = current != 0 && current == previous;
    if (!stillMaster() || stable) {
      break;
    }
    previous = current;
    nextEvent.waitForSomethingInterestingToHappen(TIME_TO_WAIT_FOR_TSERVERS_TO_STABILIZE);
    tserverSet.scanServers();
    current = tserverSet.getCurrentServers().size();
  }
  setMasterState(MasterState.SAFE_MODE);
  return current;
}
/**
 * Blocks until this process holds the master lock in ZooKeeper.
 * <p>
 * The lock is retried once a second. If it has not been obtained within {@code Property.INSTANCE_ZK_TIMEOUT}, a message is logged once and the process
 * keeps retrying as a back-up. After the lock is held the master state becomes {@code HAVE_LOCK}, and the method then waits for the goal state to be
 * something other than {@code CLEAN_STOP}. Losing the lock later halts the process.
 *
 * @param zMasterLoc
 *          ZooKeeper path of the master lock
 */
private void getMasterLock(final String zMasterLoc) throws KeeperException, InterruptedException {
  log.info("trying to get master lock");
  final LockWatcher onLockLost = new ZooLock.LockWatcher() {
    public void lostLock(LockLossReason reason) {
      Halt.halt("Master lock in zookeeper lost (reason = " + reason + "), exiting!", -1);
    }
  };
  final long startedAt = System.currentTimeMillis();
  final long waitTime = AccumuloConfiguration.getSystemConfiguration().getTimeInMillis(Property.INSTANCE_ZK_TIMEOUT);
  final String masterClientAddress = hostname + ":" + AccumuloConfiguration.getSystemConfiguration().getPort(Property.MASTER_CLIENTPORT);

  boolean announcedBackup = false;
  for (;;) {
    // Once the initial wait has elapsed, announce (a single time) that we are now a back-up and keep trying.
    if (!announcedBackup && System.currentTimeMillis() - startedAt >= waitTime) {
      log.info("Failed to get master lock, even after waiting for session timeout, becoming back-up server");
      announcedBackup = true;
    }
    masterLock = new ZooLock(zMasterLoc);
    if (masterLock.tryLock(onLockLost, masterClientAddress.getBytes())) {
      break;
    }
    UtilWaitThread.sleep(1000);
  }

  setMasterState(MasterState.HAVE_LOCK);
  // Do not proceed while the stored goal state still asks for a clean stop.
  while (getMasterGoalState() == MasterGoalState.CLEAN_STOP) {
    UtilWaitThread.sleep(1000);
  }
}
/**
 * Program entry point: constructs a {@code Master} from the command-line arguments and runs it.
 *
 * @param args
 *          command-line arguments, passed to the {@code Master} constructor
 */
public static void main(String[] args) throws Exception {
  new Master(args).run();
}
/**
 * Reacts to a newly seen logger by deleting, for each of its closed logs, the {@code <log>.failed} marker in the recovery directory if one exists.
 * <p>
 * This is best effort: any failure is logged with its cause and otherwise ignored.
 *
 * @param address
 *          address of the new logger
 */
@Override
public void newLogger(String address) {
  try {
    RemoteLogger remote = new RemoteLogger(address);
    for (String onDisk : remote.getClosedLogs()) {
      Path path = new Path(Constants.getRecoveryDir(), onDisk + ".failed");
      if (fs.exists(path)) {
        fs.delete(path, true);
      }
    }
  } catch (Exception ex) {
    // Keep the cause in the log; the bare message gave no hint of what went wrong.
    log.warn("Unexpected error clearing failed recovery markers for new logger", ex);
  }
}
/**
 * Reacts to a change in the set of live tablet servers.
 * <p>
 * Added servers are logged. Deleted servers are logged (only while this process is the master) and dropped from {@code badServers}. Any migration whose
 * destination is a deleted server is cancelled. An event is always raised with the new server count.
 *
 * @param current
 *          the live server set after the change
 * @param deleted
 *          servers that went away
 * @param added
 *          servers that appeared
 */
@Override
public void update(LiveTServerSet current, Set<TServerInstance> deleted, Set<TServerInstance> added) {
  if (!added.isEmpty()) {
    log.info("New server: " + added);
  }
  if (!deleted.isEmpty()) {
    if (stillMaster()) {
      log.warn("Lost servers " + deleted);
    }
    badServers.keySet().removeAll(deleted);
  }
  synchronized (migrations) {
    for (Iterator<Entry<KeyExtent,TServerInstance>> it = migrations.entrySet().iterator(); it.hasNext();) {
      final Entry<KeyExtent,TServerInstance> migration = it.next();
      if (!deleted.contains(migration.getValue())) {
        continue;
      }
      log.info("Canceling migration of " + migration.getKey() + " to " + migration.getValue());
      it.remove();
    }
  }
  nextEvent.somethingInterestingHappened("There are now %d tablet servers", current.size());
}
/**
 * Raises an event when a table's state changes.
 *
 * @param tableId
 *          id of the table whose state changed
 * @param state
 *          the new state
 */
@Override
public void stateChanged(String tableId, TableState state) {
  final String message = "Table state in zookeeper changed for %s to %s";
  nextEvent.somethingInterestingHappened(message, tableId, state);
}
// No-op: this implementation takes no action with the supplied table-id-to-state map.
@Override
public void initialize(Map<String,TableState> tableIdToStateMap) {}
// No-op: this implementation takes no action when notified that the session expired.
@Override
public void sessionExpired() {}
/**
 * Lists the ids of the tables whose tablets should currently be online.
 * <p>
 * Outside the {@code NORMAL} master state the result holds at most the metadata table, and not even that while the master state is
 * {@code UNLOAD_METADATA_TABLETS}. In the {@code NORMAL} state it holds every known table whose state is {@code LOADING} or {@code ONLINE}.
 *
 * @return a new set of table ids
 */
@Override
public Set<String> onlineTables() {
  final Set<String> online = new HashSet<String>();
  if (getMasterState() != MasterState.NORMAL) {
    if (getMasterState() != MasterState.UNLOAD_METADATA_TABLETS) {
      online.add(Constants.METADATA_TABLE_ID);
    }
    return online;
  }
  final TableManager manager = TableManager.getInstance();
  for (String tableId : Tables.getIdToNameMap(instance).keySet()) {
    final TableState state = manager.getTableState(tableId);
    // A table without a state (null) matches neither constant and is skipped.
    if (state == TableState.LOADING || state == TableState.ONLINE) {
      online.add(tableId);
    }
  }
  return online;
}
/**
 * @return the tablet servers currently reported by the live server set
 */
@Override
public Set<TServerInstance> onlineTabletServers() {
  final Set<TServerInstance> live = tserverSet.getCurrentServers();
  return live;
}
}