/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.DelayQueue;
import java.util.concurrent.Delayed;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.dfs.DistributedFileSystem;
import org.apache.hadoop.dfs.FSConstants;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.io.BatchUpdate;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.ipc.HbaseRPC;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.InfoServer;
import org.apache.hadoop.hbase.util.Sleeper;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.util.Writables;
import org.apache.hadoop.hbase.io.HbaseMapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.ipc.Server;
/**
* HMaster is the "master server" for a HBase.
* There is only one HMaster for a single HBase deployment.
*/
public class HMaster extends Thread implements HConstants, HMasterInterface,
HMasterRegionInterface {
static final Log LOG = LogFactory.getLog(HMaster.class.getName());
// Shared boxed zero used as the initial "last assignment attempt" timestamp
// for entries put into unassignedRegions (avoids repeated boxing of 0L).
static final Long ZERO_L = Long.valueOf(0L);
/**
 * {@inheritDoc}
 *
 * Returns the version of the requested protocol. This master speaks both
 * the client-facing HMasterInterface and the region-server-facing
 * HMasterRegionInterface protocols.
 */
public long getProtocolVersion(String protocol,
    @SuppressWarnings("unused") long clientVersion)
throws IOException {
  // Guard-clause style: answer for each known protocol, otherwise fail.
  if (protocol.equals(HMasterInterface.class.getName())) {
    return HMasterInterface.versionID;
  }
  if (protocol.equals(HMasterRegionInterface.class.getName())) {
    return HMasterRegionInterface.versionID;
  }
  throw new IOException("Unknown protocol to name node: " + protocol);
}
// We start out with closed flag on. Using AtomicBoolean rather than
// plain boolean because want to pass a reference to supporting threads
// started here in HMaster rather than have them have to know about the
// hosting class
volatile AtomicBoolean closed = new AtomicBoolean(true);
// Set once a clean cluster shutdown has been requested.
volatile boolean shutdownRequested = false;
// Count of quiesced servers hosting META; used during shutdown bookkeeping.
volatile AtomicInteger quiescedMetaServers = new AtomicInteger(0);
// False once the filesystem has been found unavailable; see checkFileSystem().
volatile boolean fsOk = true;
// Root directory of this HBase instance in the filesystem.
final Path rootdir;
final HBaseConfiguration conf;
final FileSystem fs;
final Random rand;
// How long worker threads sleep between checks, in milliseconds.
final int threadWakeFrequency;
final int numRetries;
// Maximum time, in ms, allowed for a region open before we retry assignment.
final long maxRegionOpenTime;
// How many regions to assign a server at a time.
private final int maxAssignInOneGo;
// Operations that cannot run yet (e.g. waiting on ROOT deployment); polled
// only once the root region location is known.
volatile DelayQueue<RegionServerOperation> delayedToDoQueue =
new DelayQueue<RegionServerOperation>();
// Operations ready for the main processing loop.
volatile BlockingQueue<RegionServerOperation> toDoQueue =
new LinkedBlockingQueue<RegionServerOperation>();
// Region server lease period, in milliseconds.
final int leaseTimeout;
private final Leases serverLeases;
// RPC server through which clients and region servers talk to this master.
private final Server server;
private final HServerAddress address;
final HConnection connection;
// How often, in ms, the ROOT and META scanner threads rescan.
final int metaRescanInterval;
// Where the ROOT region is currently deployed; null while unassigned.
volatile AtomicReference<HServerAddress> rootRegionLocation =
new AtomicReference<HServerAddress>(null);
// Serializes splitting of dead region servers' write-ahead logs.
final Lock splitLogLock = new ReentrantLock();
// A Sleeper that sleeps for threadWakeFrequency
protected final Sleeper sleeper;
// Default access so accessible from unit tests. MASTER is name of the webapp
// and the attribute name used stuffing this instance into web context.
InfoServer infoServer;
/** Name of master server */
public static final String MASTER = "master";
/**
* Base HRegion scanner class. Holds utilty common to <code>ROOT</code> and
* <code>META</code> HRegion scanners.
*
* <p>How do we know if all regions are assigned? After the initial scan of
* the <code>ROOT</code> and <code>META</code> regions, all regions known at
* that time will have been or are in the process of being assigned.</p>
*
* <p>When a region is split the region server notifies the master of the
* split and the new regions are assigned. But suppose the master loses the
* split message? We need to periodically rescan the <code>ROOT</code> and
* <code>META</code> regions.
* <ul>
* <li>If we rescan, any regions that are new but not assigned will have
* no server info. Any regions that are not being served by the same
* server will get re-assigned.</li>
*
* <li>Thus a periodic rescan of the root region will find any new
* <code>META</code> regions where we missed the <code>META</code> split
* message or we failed to detect a server death and consequently need to
* assign the region to a new server.</li>
*
* <li>if we keep track of all the known <code>META</code> regions, then
* we can rescan them periodically. If we do this then we can detect any
* regions for which we missed a region split message.</li>
* </ul>
*
* <p>Thus just keeping track of all the <code>META</code> regions permits
* periodic rescanning which will detect unassigned regions (new or
* otherwise) without the need to keep track of every region.</p>
*
* <p>So the <code>ROOT</code> region scanner needs to wake up:
* <ol>
* <li>when the master receives notification that the <code>ROOT</code>
* region has been opened.</li>
* <li>periodically after the first scan</li>
* </ol>
*
* The <code>META</code> scanner needs to wake up:
* <ol>
* <li>when a <code>META</code> region comes on line</li>
* <li>periodically to rescan the online <code>META</code> regions</li>
* </ol>
*
* <p>A <code>META</code> region is not 'online' until it has been scanned
* once.
*/
abstract class BaseScanner extends Chore {
  // True when this scanner is responsible for the ROOT region rather than
  // a META region; controls whether numberOfMetaRegions gets updated.
  protected boolean rootRegion;

  /** @return true if the first scan completed successfully */
  protected abstract boolean initialScan();

  /** Performs the periodic rescan after the initial scan has completed. */
  protected abstract void maintenanceScan();

  /**
   * @param rootRegion true if this scanner scans ROOT, false for META
   * @param period how often, in ms, the chore should run
   * @param stop flag shared with the master; set true to stop this chore
   */
  BaseScanner(final boolean rootRegion, final int period,
      final AtomicBoolean stop) {
    super(period, stop);
    this.rootRegion = rootRegion;
  }

  @Override
  protected boolean initialChore() {
    return initialScan();
  }

  @Override
  protected void chore() {
    maintenanceScan();
  }

  /**
   * Scans one catalog region: checks the assignment of every region row it
   * finds, collects split parents for cleanup, and deletes rows whose
   * HRegionInfo could not be parsed.
   * @param region Region to scan
   * @throws IOException
   */
  protected void scanRegion(final MetaRegion region) throws IOException {
    HRegionInterface regionServer = null;
    long scannerId = -1L;
    LOG.info(Thread.currentThread().getName() + " scanning meta region " +
      region.toString());
    // Array to hold list of split parents found. Scan adds to list. After
    // scan we go check if parents can be removed.
    Map<HRegionInfo, SortedMap<Text, byte[]>> splitParents =
      new HashMap<HRegionInfo, SortedMap<Text, byte[]>>();
    List<Text> emptyRows = new ArrayList<Text>();
    try {
      regionServer = connection.getHRegionConnection(region.getServer());
      scannerId =
        regionServer.openScanner(region.getRegionName(), COLUMN_FAMILY_ARRAY,
          EMPTY_START_ROW, HConstants.LATEST_TIMESTAMP, null);
      int numberOfRegionsFound = 0;
      while (true) {
        HbaseMapWritable values = regionServer.next(scannerId);
        if (values == null || values.size() == 0) {
          // Scanner exhausted.
          break;
        }
        // TODO: Why does this have to be a sorted map?
        RowMap m = toRowMap(values);
        SortedMap<Text, byte[]> results = m.getMap();
        Text row = m.getRow();
        HRegionInfo info = getHRegionInfo(row, results);
        if (info == null) {
          // Row carries no parseable HRegionInfo; remember it so it can be
          // deleted after the scan completes.
          emptyRows.add(row);
          continue;
        }
        String serverName = Writables.bytesToString(results.get(COL_SERVER));
        long startCode = Writables.bytesToLong(results.get(COL_STARTCODE));
        if (LOG.isDebugEnabled()) {
          LOG.debug(Thread.currentThread().getName() + " regioninfo: {" +
            info.toString() + "}, server: " + serverName + ", startCode: " +
            startCode);
        }
        // Note Region has been assigned.
        checkAssigned(info, serverName, startCode);
        if (isSplitParent(info)) {
          splitParents.put(info, results);
        }
        numberOfRegionsFound += 1;
      }
      if (this.rootRegion) {
        // When scanning ROOT, each row found is one META region.
        numberOfMetaRegions.set(numberOfRegionsFound);
      }
    } catch (IOException e) {
      if (e instanceof RemoteException) {
        e = RemoteExceptionHandler.decodeRemoteException((RemoteException) e);
        if (e instanceof UnknownScannerException) {
          // Reset scannerId so we do not try closing a scanner the other side
          // has lost account of: prevents duplicated stack trace out of the
          // below close in the finally.
          scannerId = -1L;
        }
      }
      throw e;
    } finally {
      try {
        if (scannerId != -1L && regionServer != null) {
          regionServer.close(scannerId);
        }
      } catch (IOException e) {
        LOG.error("Closing scanner",
          RemoteExceptionHandler.checkIOException(e));
      }
    }
    // Scan is finished.
    // First clean up any meta region rows which had null HRegionInfo's
    if (emptyRows.size() > 0) {
      LOG.warn("Found " + emptyRows.size() +
        " rows with empty HRegionInfo while scanning meta region " +
        region.getRegionName());
      deleteEmptyMetaRows(regionServer, region.getRegionName(), emptyRows);
    }
    // Take a look at split parents to see if any we can clean up.
    if (splitParents.size() > 0) {
      for (Map.Entry<HRegionInfo, SortedMap<Text, byte[]>> e:
          splitParents.entrySet()) {
        HRegionInfo hri = e.getKey();
        cleanupSplits(region.getRegionName(), regionServer, hri, e.getValue());
      }
    }
    LOG.info(Thread.currentThread().getName() + " scan of meta region " +
      region.toString() + " complete");
  }

  /*
   * @param info Region to check.
   * @return True if this is a split parent.
   */
  private boolean isSplitParent(final HRegionInfo info) {
    if (!info.isSplit()) {
      return false;
    }
    if (!info.isOffline()) {
      // A split parent should always be offline; warn but still treat it
      // as a split parent.
      LOG.warn("Region is split but not offline: " + info.getRegionName());
    }
    return true;
  }

  /*
   * If daughters no longer hold reference to the parents, delete the parent.
   * @param metaRegionName Meta region name.
   * @param srvr HRegionInterface of meta server to talk to
   * @param parent HRegionInfo of split parent
   * @param rowContent Content of <code>parent</code> row in
   * <code>metaRegionName</code>
   * @return True if we removed <code>parent</code> from meta table and from
   * the filesystem.
   * @throws IOException
   */
  private boolean cleanupSplits(final Text metaRegionName,
      final HRegionInterface srvr, final HRegionInfo parent,
      SortedMap<Text, byte[]> rowContent)
  throws IOException {
    boolean result = false;
    boolean hasReferencesA = hasReferences(metaRegionName, srvr,
      parent.getRegionName(), rowContent, COL_SPLITA);
    boolean hasReferencesB = hasReferences(metaRegionName, srvr,
      parent.getRegionName(), rowContent, COL_SPLITB);
    if (!hasReferencesA && !hasReferencesB) {
      // Neither daughter references the parent any longer: remove the
      // parent's files and its catalog row.
      LOG.info("Deleting region " + parent.getRegionName() +
        " because daughter splits no longer hold references");
      HRegion.deleteRegion(fs, rootdir, parent);
      HRegion.removeRegionFromMETA(srvr, metaRegionName,
        parent.getRegionName());
      result = true;
    } else if (LOG.isDebugEnabled()) {
      // If debug, note we checked and current state of daughters.
      LOG.debug("Checked " + parent.getRegionName() +
        " for references: splitA: " + hasReferencesA + ", splitB: "+
        hasReferencesB);
    }
    return result;
  }

  /*
   * Checks if a daughter region -- either splitA or splitB -- still holds
   * references to parent. If not, removes reference to the split from
   * the parent meta region row.
   * @param metaRegionName Name of meta region to look in.
   * @param srvr Where region resides.
   * @param parent Parent region name.
   * @param rowContent Keyed content of the parent row in meta region.
   * @param splitColumn Column name of daughter split to examine
   * @return True if still has references to parent.
   * @throws IOException
   */
  private boolean hasReferences(final Text metaRegionName,
      final HRegionInterface srvr, final Text parent,
      SortedMap<Text, byte[]> rowContent, final Text splitColumn)
  throws IOException {
    boolean result = false;
    HRegionInfo split =
      Writables.getHRegionInfoOrNull(rowContent.get(splitColumn));
    if (split == null) {
      // No daughter recorded in this column: nothing references the parent.
      return result;
    }
    Path tabledir =
      HTableDescriptor.getTableDir(rootdir, split.getTableDesc().getName());
    for (HColumnDescriptor family: split.getTableDesc().families().values()) {
      Path p = HStoreFile.getMapDir(tabledir, split.getEncodedName(),
        family.getFamilyName());
      // Look for reference files. Call listPaths with an anonymous
      // instance of PathFilter.
      FileStatus [] ps = fs.listStatus(p,
        new PathFilter () {
          public boolean accept(Path path) {
            return HStore.isReference(path);
          }
        }
      );
      if (ps != null && ps.length > 0) {
        result = true;
        break;
      }
    }
    if (result) {
      return result;
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug(split.getRegionName().toString()
        +" no longer has references to " + parent.toString());
    }
    // Daughter holds no references; clear the split column from the parent
    // row so we do not re-check it on subsequent scans.
    BatchUpdate b = new BatchUpdate(rand.nextLong());
    long lockid = b.startUpdate(parent);
    b.delete(lockid, splitColumn);
    srvr.batchUpdate(metaRegionName, b);
    return result;
  }

  /**
   * Checks whether the catalog row's assignment for <code>info</code> is
   * still valid; if not, queues the region for (re)assignment. May also
   * split a dead server's log when called during the initial meta scan.
   * @param info region the catalog row describes
   * @param serverName server the row claims is hosting the region; may be ""
   * @param startCode start code the row records for that server
   * @throws IOException if log splitting fails
   */
  protected void checkAssigned(final HRegionInfo info,
      final String serverName, final long startCode)
  throws IOException {
    // Skip region - if ...
    if (info.isOffline() // offline
        || killedRegions.contains(info.getRegionName())) { // queued for offline
      unassignedRegions.remove(info);
      pendingRegions.remove(info);
      return;
    }
    HServerInfo storedInfo = null;
    boolean deadServer = false;
    if (serverName.length() != 0) {
      synchronized (killList) {
        Map<Text, HRegionInfo> regionsToKill = killList.get(serverName);
        if (regionsToKill != null &&
            regionsToKill.containsKey(info.getRegionName())) {
          // Skip if region is on kill list
          if (LOG.isDebugEnabled()) {
            LOG.debug("not assigning region (on kill list): " +
              info.getRegionName());
          }
          return;
        }
      }
      storedInfo = serversToServerInfo.get(serverName);
      deadServer = deadServers.contains(serverName);
    }
    /*
     * If the server is a dead server or its startcode is off -- either null
     * or doesn't match the start code for the address -- then add it to the
     * list of unassigned regions IF not already there (or pending open).
     */
    if (!deadServer && !unassignedRegions.containsKey(info) &&
        !pendingRegions.contains(info.getRegionName())
        && (storedInfo == null || storedInfo.getStartCode() != startCode)) {
      // The current assignment is invalid
      if (LOG.isDebugEnabled()) {
        LOG.debug("Current assignment of " + info.getRegionName() +
          " is not valid: storedInfo: " + storedInfo + ", startCode: " +
          startCode + ", storedInfo.startCode: " +
          ((storedInfo != null)? storedInfo.getStartCode(): -1) +
          ", unassignedRegions: " + unassignedRegions.containsKey(info) +
          ", pendingRegions: " +
          pendingRegions.contains(info.getRegionName()));
      }
      // Recover the region server's log if there is one.
      // This is only done from here if we are restarting and there is stale
      // data in the meta region. Once we are on-line, dead server log
      // recovery is handled by lease expiration and ProcessServerShutdown
      if (!initialMetaScanComplete && serverName.length() != 0) {
        StringBuilder dirName = new StringBuilder("log_");
        dirName.append(serverName.replace(":", "_"));
        Path logDir = new Path(rootdir, dirName.toString());
        try {
          if (fs.exists(logDir)) {
            splitLogLock.lock();
            try {
              HLog.splitLog(rootdir, logDir, fs, conf);
            } finally {
              splitLogLock.unlock();
            }
            // Fix: log only when a split actually happened. Previously this
            // debug line ran even when logDir did not exist, falsely
            // reporting a split.
            if (LOG.isDebugEnabled()) {
              LOG.debug("Split " + logDir.toString());
            }
          }
        } catch (IOException e) {
          LOG.warn("unable to split region server log because: ", e);
          throw e;
        }
      }
      // Now get the region assigned
      unassignedRegions.put(info, ZERO_L);
    }
  }
}
// True once the root scanner has completed at least one successful scan of
// the ROOT region; read by the meta scanner before it starts work.
volatile boolean rootScanned = false;
/** Scanner for the <code>ROOT</code> HRegion. */
class RootScanner extends BaseScanner {
  /** Constructor */
  public RootScanner() {
    super(true, metaRescanInterval, closed);
  }
  // Don't retry if we get an error while scanning. Errors are most often
  // caused by the server going away. Wait until next rescan interval when
  // things should be back to normal
  //
  // Waits until the ROOT region has a known location (or the master is
  // closed), then scans it. Returns true only if the scan succeeded.
  private boolean scanRoot() {
    boolean scanSuccessful = false;
    synchronized (rootRegionLocation) {
      while (!closed.get() && rootRegionLocation.get() == null) {
        // rootRegionLocation will be filled in when we get an 'open region'
        // regionServerReport message from the HRegionServer that has been
        // allocated the ROOT region below.
        try {
          rootRegionLocation.wait();
        } catch (InterruptedException e) {
          // continue
        }
      }
    }
    if (closed.get()) {
      // Master shut down while we were waiting; report failure.
      return scanSuccessful;
    }
    try {
      // Don't interrupt us while we're working
      synchronized(rootScannerLock) {
        scanRegion(new MetaRegion(rootRegionLocation.get(),
          HRegionInfo.rootRegionInfo.getRegionName(), null));
      }
      scanSuccessful = true;
    } catch (IOException e) {
      e = RemoteExceptionHandler.checkIOException(e);
      LOG.warn("Scan ROOT region", e);
      // Make sure the file system is still available
      checkFileSystem();
    } catch (Exception e) {
      // If for some reason we get some other kind of exception,
      // at least log it rather than go out silently.
      LOG.error("Unexpected exception", e);
    }
    return scanSuccessful;
  }
  @Override
  protected boolean initialChore_unused_placeholder() { return false; }
  @Override
  protected boolean initialScan() {
    // Publish success so the meta scanner knows ROOT has been read.
    rootScanned = scanRoot();
    return rootScanned;
  }
  @Override
  protected void maintenanceScan() {
    scanRoot();
  }
}
private final RootScanner rootScannerThread;
// Monitor held for the duration of a ROOT scan so the scan is not
// interrupted mid-flight (see HMaster shutdown handling).
final Integer rootScannerLock = new Integer(0);
/** Describes a meta region and its server */
@SuppressWarnings("unchecked")
public static class MetaRegion implements Comparable {
  private HServerAddress server;
  private Text regionName;
  private Text startKey;

  /**
   * @param server server hosting the region; may not be null
   * @param regionName name of the region; may not be null
   * @param startKey first row the region serves; null is treated as the
   * empty start key
   */
  MetaRegion(HServerAddress server, Text regionName, Text startKey) {
    if (server == null) {
      throw new IllegalArgumentException("server cannot be null");
    }
    this.server = server;
    if (regionName == null) {
      throw new IllegalArgumentException("regionName cannot be null");
    }
    // Defensive copies: callers may mutate the Texts they passed in.
    this.regionName = new Text(regionName);
    this.startKey = new Text();
    if (startKey != null) {
      this.startKey.set(startKey);
    }
  }

  /** {@inheritDoc} */
  @Override
  public String toString() {
    return "{regionname: " + this.regionName.toString() + ", startKey: <" +
      this.startKey.toString() + ">, server: " + this.server.toString() + "}";
  }

  /** @return the regionName */
  public Text getRegionName() {
    return regionName;
  }

  /** @return the server */
  public HServerAddress getServer() {
    return server;
  }

  /** @return the startKey */
  public Text getStartKey() {
    return startKey;
  }

  /** {@inheritDoc} */
  @Override
  public boolean equals(Object o) {
    // Fix: equals must return false for null or a different type instead of
    // letting compareTo throw ClassCastException/NullPointerException
    // (Object.equals contract).
    if (this == o) {
      return true;
    }
    if (!(o instanceof MetaRegion)) {
      return false;
    }
    return this.compareTo(o) == 0;
  }

  /** {@inheritDoc} */
  @Override
  public int hashCode() {
    // Consistent with equals: equal MetaRegions share regionName and
    // startKey, so they hash alike (server intentionally excluded).
    int result = this.regionName.hashCode();
    result ^= this.startKey.hashCode();
    return result;
  }

  // Comparable

  /** {@inheritDoc} */
  public int compareTo(Object o) {
    MetaRegion other = (MetaRegion)o;
    int result = this.regionName.compareTo(other.getRegionName());
    if(result == 0) {
      result = this.startKey.compareTo(other.getStartKey());
      if (result == 0) {
        // Might be on different host?
        result = this.server.compareTo(other.server);
      }
    }
    return result;
  }
}
/** Set by root scanner to indicate the number of meta regions */
volatile AtomicInteger numberOfMetaRegions = new AtomicInteger();
/** Initial work for the meta scanner is queued up here */
volatile BlockingQueue<MetaRegion> metaRegionsToScan =
new LinkedBlockingQueue<MetaRegion>();
/** These are the online meta regions, keyed by their start key. */
volatile SortedMap<Text, MetaRegion> onlineMetaRegions =
Collections.synchronizedSortedMap(new TreeMap<Text, MetaRegion>());
/** Set by meta scanner after initial scan */
volatile boolean initialMetaScanComplete = false;
/**
 * MetaScanner scans the <code>META</code> table.
 *
 * When a <code>META</code> server comes on line, a MetaRegion object is
 * queued up by regionServerReport() and this thread wakes up.
 *
 * It's important to do this work in a separate thread, or else the blocking
 * action would prevent other work from getting done.
 */
class MetaScanner extends BaseScanner {
  // META regions whose scan failed; retried on later passes of initialScan.
  private final List<MetaRegion> metaRegionsToRescan =
    new ArrayList<MetaRegion>();

  /** Constructor */
  public MetaScanner() {
    super(false, metaRescanInterval, closed);
  }

  // Don't retry if we get an error while scanning. Errors are most often
  // caused by the server going away. Wait until next rescan interval when
  // things should be back to normal
  private boolean scanOneMetaRegion(MetaRegion region) {
    boolean scanSuccessful = false;
    // Wait until ROOT has been scanned and located before touching META.
    while (!closed.get() && !rootScanned &&
        rootRegionLocation.get() == null) {
      sleeper.sleep();
    }
    if (closed.get()) {
      return scanSuccessful;
    }
    try {
      // Don't interrupt us while we're working
      synchronized (metaScannerLock) {
        scanRegion(region);
        onlineMetaRegions.put(region.getStartKey(), region);
      }
      scanSuccessful = true;
    } catch (IOException e) {
      e = RemoteExceptionHandler.checkIOException(e);
      LOG.warn("Scan one META region: " + region.toString(), e);
      // The region may have moved (TestRegionServerAbort, etc.). If
      // so, either it won't be in the onlineMetaRegions list or its host
      // address has changed and the containsValue will fail. If not
      // found, best thing to do here is probably return.
      // Fix: pass the MetaRegion itself to containsValue(). The previous
      // code passed region.getStartKey() (a Text) to containsValue() on a
      // map whose values are MetaRegions, so the lookup could never match
      // and this method always returned early, skipping checkFileSystem().
      if (!onlineMetaRegions.containsValue(region)) {
        LOG.debug("Scanned region is no longer in map of online " +
          "regions or its value has changed");
        return scanSuccessful;
      }
      // Make sure the file system is still available
      checkFileSystem();
    } catch (Exception e) {
      // If for some reason we get some other kind of exception,
      // at least log it rather than go out silently.
      LOG.error("Unexpected exception", e);
    }
    return scanSuccessful;
  }

  @Override
  protected boolean initialScan() {
    MetaRegion region = null;
    // Keep going if not closed, metaRegionsToScan has been emptied (or it
    // hasn't gotten anything in it yet) and all meta regions are onlined
    // (root and meta).
    while (!closed.get() &&
        (region == null || metaRegionsToScan.size() > 0) &&
        !metaRegionsScanned()) {
      try {
        region =
          metaRegionsToScan.poll(threadWakeFrequency, TimeUnit.MILLISECONDS);
      } catch (InterruptedException e) {
        // continue
      }
      if (region == null && metaRegionsToRescan.size() != 0) {
        // Nothing newly queued; retry a region that failed earlier.
        region = metaRegionsToRescan.remove(0);
      }
      if (region != null) {
        if (!scanOneMetaRegion(region)) {
          // Scan failed; queue for retry on a later pass.
          metaRegionsToRescan.add(region);
        }
      }
    }
    initialMetaScanComplete = true;
    return true;
  }

  @Override
  protected void maintenanceScan() {
    // Snapshot the online regions so we do not hold the map's monitor
    // while scanning each one.
    ArrayList<MetaRegion> regions = new ArrayList<MetaRegion>();
    synchronized (onlineMetaRegions) {
      regions.addAll(onlineMetaRegions.values());
    }
    for (MetaRegion r: regions) {
      scanOneMetaRegion(r);
    }
    metaRegionsScanned();
  }

  /**
   * Called by the meta scanner when it has completed scanning all meta
   * regions. This wakes up any threads that were waiting for this to happen.
   * @return true if ROOT has been scanned and every META region is online
   */
  private synchronized boolean metaRegionsScanned() {
    if (!rootScanned ||
        numberOfMetaRegions.get() != onlineMetaRegions.size()) {
      return false;
    }
    LOG.info("all meta regions scanned");
    notifyAll();
    return true;
  }

  /**
   * Other threads call this method to wait until all the meta regions have
   * been scanned.
   * @return true if the wait ended because the master is closing
   */
  synchronized boolean waitForMetaRegionsOrClose() {
    while (!closed.get()) {
      if (rootScanned &&
          numberOfMetaRegions.get() == onlineMetaRegions.size()) {
        break;
      }
      try {
        wait(threadWakeFrequency);
      } catch (InterruptedException e) {
        // continue
      }
    }
    return closed.get();
  }
}
final MetaScanner metaScannerThread;
// Monitor held for the duration of a META region scan so the scan is not
// interrupted mid-flight.
final Integer metaScannerLock = new Integer(0);
/** The map of known server names to server info */
volatile Map<String, HServerInfo> serversToServerInfo =
new ConcurrentHashMap<String, HServerInfo>();
/** Set of known dead servers */
volatile Set<String> deadServers =
Collections.synchronizedSet(new HashSet<String>());
/** SortedMap server load -> Set of server names */
volatile SortedMap<HServerLoad, Set<String>> loadToServers =
Collections.synchronizedSortedMap(new TreeMap<HServerLoad, Set<String>>());
/** Map of server names -> server load */
volatile Map<String, HServerLoad> serversToLoad =
new ConcurrentHashMap<String, HServerLoad>();
/**
 * The 'unassignedRegions' table maps from a HRegionInfo to a timestamp that
 * indicates the last time we *tried* to assign the region to a RegionServer.
 * If the timestamp is out of date, then we can try to reassign it.
 *
 * We fill 'unassignedRegions' by scanning ROOT and META tables, learning the
 * set of all known valid regions.
 *
 * <p>Items are removed from this list when a region server reports in that
 * the region has been deployed.
 */
volatile SortedMap<HRegionInfo, Long> unassignedRegions =
Collections.synchronizedSortedMap(new TreeMap<HRegionInfo, Long>());
/**
 * Regions that have been assigned, and the server has reported that it has
 * started serving it, but that we have not yet recorded in the meta table.
 */
volatile Set<Text> pendingRegions =
Collections.synchronizedSet(new HashSet<Text>());
/**
 * The 'killList' is a list of regions that are going to be closed, but not
 * reopened. Keyed by server name; each value maps region name to its info.
 */
volatile Map<String, HashMap<Text, HRegionInfo>> killList =
new ConcurrentHashMap<String, HashMap<Text, HRegionInfo>>();
/** 'killedRegions' contains regions that are in the process of being closed */
volatile Set<Text> killedRegions =
Collections.synchronizedSet(new HashSet<Text>());
/** Set of tables currently in creation. */
private volatile Set<Text> tableInCreation =
Collections.synchronizedSet(new HashSet<Text>());
/** Build the HMaster out of a raw configuration item.
 *
 * Root directory and master address are read from <code>conf</code>;
 * delegates to the main constructor.
 *
 * @param conf - Configuration object
 * @throws IOException
 */
public HMaster(HBaseConfiguration conf) throws IOException {
this(new Path(conf.get(HBASE_DIR)),
new HServerAddress(conf.get(MASTER_ADDRESS, DEFAULT_MASTER_ADDRESS)),
conf);
}
/**
 * Build the HMaster
 * @param rd base directory of this HBase instance. Must be fully
 * qualified so includes filesystem to use.
 * @param address server address and port number
 * @param conf configuration
 *
 * @throws IOException if the root path is invalid, the filesystem is
 * unavailable, or bootstrap of the catalog regions fails
 */
public HMaster(Path rd, HServerAddress address, HBaseConfiguration conf)
throws IOException {
this.conf = conf;
try {
FSUtils.validateRootPath(rd);
} catch (IOException e) {
LOG.fatal("Not starting HMaster because the root directory path '" +
rd.toString() + "' is not valid. Check the setting of the" +
" configuration parameter '" + HBASE_DIR + "'", e);
throw e;
}
this.rootdir = rd;
this.threadWakeFrequency = conf.getInt(THREAD_WAKE_FREQUENCY, 10 * 1000);
// The filesystem hbase wants to use is probably not what is set into
// fs.default.name; its value is probably the default.
this.conf.set("fs.default.name", this.rootdir.toString());
this.fs = FileSystem.get(conf);
if (this.fs instanceof DistributedFileSystem) {
// Make sure dfs is not in safe mode
String message = "Waiting for dfs to exit safe mode...";
while (((DistributedFileSystem) fs).setSafeMode(
FSConstants.SafeModeAction.SAFEMODE_GET)) {
LOG.info(message);
try {
Thread.sleep(this.threadWakeFrequency);
} catch (InterruptedException e) {
//continue
}
}
}
this.conf.set(HConstants.HBASE_DIR, this.rootdir.toString());
this.rand = new Random();
Path rootRegionDir =
HRegion.getRegionDir(rootdir, HRegionInfo.rootRegionInfo);
LOG.info("Root region dir: " + rootRegionDir.toString());
try {
// Make sure the root directory exists!
if(! fs.exists(rootdir)) {
// Fresh install: create the root dir and stamp the hbase version file.
fs.mkdirs(rootdir);
FSUtils.setVersion(fs, rootdir);
} else {
FSUtils.checkVersion(fs, rootdir, true);
}
if (!fs.exists(rootRegionDir)) {
// First startup of this cluster: create ROOT and the first META
// region on the filesystem.
LOG.info("bootstrap: creating ROOT and first META regions");
try {
HRegion root = HRegion.createHRegion(HRegionInfo.rootRegionInfo,
this.rootdir, this.conf);
HRegion meta = HRegion.createHRegion(HRegionInfo.firstMetaRegionInfo,
this.rootdir, this.conf);
// Add first region from the META table to the ROOT region.
HRegion.addRegionToMETA(root, meta);
root.close();
root.getLog().closeAndDelete();
meta.close();
meta.getLog().closeAndDelete();
} catch (IOException e) {
e = RemoteExceptionHandler.checkIOException(e);
LOG.error("bootstrap", e);
throw e;
}
}
} catch (IOException e) {
LOG.fatal("Not starting HMaster because:", e);
throw e;
}
this.numRetries = conf.getInt("hbase.client.retries.number", 2);
this.maxRegionOpenTime =
conf.getLong("hbase.hbasemaster.maxregionopen", 60 * 1000);
this.leaseTimeout = conf.getInt("hbase.master.lease.period", 30 * 1000);
this.serverLeases = new Leases(this.leaseTimeout,
conf.getInt("hbase.master.lease.thread.wakefrequency", 15 * 1000));
this.server = HbaseRPC.getServer(this, address.getBindAddress(),
address.getPort(), conf.getInt("hbase.regionserver.handler.count", 10),
false, conf);
// The rpc-server port can be ephemeral... ensure we have the correct info
this.address = new HServerAddress(server.getListenerAddress());
// NOTE(review): this records the constructor parameter 'address', not the
// corrected this.address above; if the RPC port was ephemeral the
// configuration keeps the stale port -- confirm whether this.address was
// intended here.
conf.set(MASTER_ADDRESS, address.toString());
this.connection = HConnectionManager.getConnection(conf);
this.metaRescanInterval =
conf.getInt("hbase.master.meta.thread.rescanfrequency", 60 * 1000);
// The root region
this.rootScannerThread = new RootScanner();
// Scans the meta table
this.metaScannerThread = new MetaScanner();
// Queue ROOT for assignment before opening for business.
unassignRootRegion();
this.sleeper = new Sleeper(this.threadWakeFrequency, this.closed);
this.maxAssignInOneGo =
this.conf.getInt("hbase.master.regions.percheckin", 10);
// We're almost open for business
this.closed.set(false);
LOG.info("HMaster initialized on " + this.address.toString());
}
/*
 * Unassign the root region.
 * This method would be used in case where root region server had died
 * without reporting in. Currently, we just flounder and never recover. We
 * could 'notice' dead region server in root scanner -- if we failed access
 * multiple times -- but reassigning root is catastrophic.
 *
 */
void unassignRootRegion() {
  // Forget where ROOT was deployed.
  rootRegionLocation.set(null);
  if (shutdownRequested) {
    // Shutting down; do not queue a reassignment.
    return;
  }
  unassignedRegions.put(HRegionInfo.rootRegionInfo, ZERO_L);
}
/**
 * Checks to see if the file system is still accessible.
 * If not, sets closed
 * @return false if file system is not available
 */
protected boolean checkFileSystem() {
  if (!fsOk) {
    // Already known bad; do not probe again.
    return false;
  }
  try {
    FSUtils.checkFileSystemAvailable(fs);
  } catch (IOException e) {
    // Filesystem has gone away: mark it bad and start cluster shutdown.
    LOG.fatal("Shutting down HBase cluster: file system not available", e);
    closed.set(true);
    fsOk = false;
  }
  return fsOk;
}
/** @return HServerAddress of the master server */
public HServerAddress getMasterAddress() {
  return this.address;
}
/**
 * @return Hbase root dir.
 */
public Path getRootDir() {
  return rootdir;
}
/**
 * @return Read-only map of servers to serverinfo.
 */
public Map<String, HServerInfo> getServersToServerInfo() {
  // Wrap so callers cannot mutate the master's bookkeeping.
  return Collections.unmodifiableMap(serversToServerInfo);
}
/**
 * @return Read-only map of servers to load.
 */
public Map<String, HServerLoad> getServersToLoad() {
  // Wrap so callers cannot mutate the master's bookkeeping.
  return Collections.unmodifiableMap(serversToLoad);
}
/**
 * @return Location of the <code>-ROOT-</code> region, or null if the
 * master is shutting down or the location is not currently known.
 */
public HServerAddress getRootRegionLocation() {
  if (shutdownRequested || closed.get()) {
    return null;
  }
  return this.rootRegionLocation.get();
}
/**
 * @return Read-only map of online regions.
 */
public Map<Text, MetaRegion> getOnlineMetaRegions() {
  // Wrap so callers cannot mutate the master's bookkeeping.
  return Collections.unmodifiableSortedMap(onlineMetaRegions);
}
/** Main processing loop */
@Override
public void run() {
final String threadName = "HMaster";
Thread.currentThread().setName(threadName);
// Bring up scanners, lease checker, info server and RPC server first.
startServiceThreads();
/* Main processing loop */
try {
while (!closed.get()) {
RegionServerOperation op = null;
// Once a shutdown was requested and every region server has checked
// out, begin the master's own shutdown sequence.
if (shutdownRequested && serversToServerInfo.size() == 0) {
startShutdown();
break;
}
if (rootRegionLocation.get() != null) {
// We can't process server shutdowns unless the root region is online
op = this.delayedToDoQueue.poll();
}
if (op == null ) {
try {
// Timed poll so this loop periodically re-checks the closed flag.
op = toDoQueue.poll(threadWakeFrequency, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
// continue
}
}
if (op == null || closed.get()) {
continue;
}
try {
if (LOG.isDebugEnabled()) {
LOG.debug("Main processing loop: " + op.toString());
}
if (!op.process()) {
// Operation would have blocked because not all meta regions are
// online. This could cause a deadlock, because this thread is waiting
// for the missing meta region(s) to come back online, but since it
// is waiting, it cannot process the meta region online operation it
// is waiting for. So put this operation back on the queue for now.
if (toDoQueue.size() == 0) {
// The queue is currently empty so wait for a while to see if what
// we need comes in first
sleeper.sleep();
}
try {
if (LOG.isDebugEnabled()) {
LOG.debug("Put " + op.toString() + " back on queue");
}
toDoQueue.put(op);
} catch (InterruptedException e) {
throw new RuntimeException(
"Putting into toDoQueue was interrupted.", e);
}
}
} catch (Exception ex) {
// Unwrap RemoteException so the log shows the real server-side cause.
if (ex instanceof RemoteException) {
try {
ex = RemoteExceptionHandler.decodeRemoteException(
(RemoteException)ex);
} catch (IOException e) {
ex = e;
LOG.warn("main processing loop: " + op.toString(), e);
}
}
// If the file system is gone, exit the loop; otherwise retry the op.
if (!checkFileSystem()) {
break;
}
LOG.warn("Processing pending operations: " + op.toString(), ex);
try {
toDoQueue.put(op);
} catch (InterruptedException e) {
throw new RuntimeException(
"Putting into toDoQueue was interrupted.", e);
} catch (Exception e) {
LOG.error("main processing loop: " + op.toString(), e);
}
}
}
} catch (Throwable t) {
LOG.fatal("Unhandled exception. Starting shutdown.", t);
this.closed.set(true);
}
// The region servers won't all exit until we stop scanning the meta regions
stopScanners();
// Wait for all the remaining region servers to report in.
letRegionServersShutdown();
/*
* Clean up and close up shop
*/
if (this.infoServer != null) {
LOG.info("Stopping infoServer");
try {
this.infoServer.stop();
} catch (InterruptedException ex) {
ex.printStackTrace();
}
}
server.stop(); // Stop server
serverLeases.close(); // Turn off the lease monitor
// Join up with all threads
try {
if (rootScannerThread.isAlive()) {
rootScannerThread.join(); // Wait for the root scanner to finish.
}
} catch (Exception iex) {
LOG.warn("root scanner", iex);
}
try {
if (metaScannerThread.isAlive()) {
metaScannerThread.join(); // Wait for meta scanner to finish.
}
} catch(Exception iex) {
LOG.warn("meta scanner", iex);
}
LOG.info("HMaster main thread exiting");
}
/*
 * Start up all services. If any of these threads gets an unhandled exception
 * then they just die with a logged message. This should be fine because
 * in general, we do not expect the master to get such unhandled exceptions
 * as OOMEs; it should be lightly loaded. See what HRegionServer does if
 * need to install an unexpected exception handler.
 */
private void startServiceThreads() {
  String threadName = Thread.currentThread().getName();
  try {
    // Scanner chores run as daemons so they do not prevent JVM exit.
    Threads.setDaemonThreadRunning(this.rootScannerThread,
      threadName + ".rootScanner");
    Threads.setDaemonThreadRunning(this.metaScannerThread,
      threadName + ".metaScanner");
    // Leases are not the same as Chore threads. Set name differently.
    this.serverLeases.setName(threadName + ".leaseChecker");
    this.serverLeases.start();
    // Put up info server. A negative port disables it.
    int port = this.conf.getInt("hbase.master.info.port", 60010);
    if (port >= 0) {
      String a = this.conf.get("hbase.master.info.bindAddress", "0.0.0.0");
      this.infoServer = new InfoServer(MASTER, a, port, false);
      this.infoServer.setAttribute(MASTER, this);
      this.infoServer.start();
    }
    // Start the RPC server last so everything else is running before we
    // start receiving requests.
    this.server.start();
  } catch (IOException e) {
    // Unwrap RemoteException so the log shows the real cause.
    if (e instanceof RemoteException) {
      try {
        e = RemoteExceptionHandler.decodeRemoteException((RemoteException) e);
      } catch (IOException ex) {
        LOG.warn("thread start", ex);
      }
    }
    // Something happened during startup. Shut things down.
    this.closed.set(true);
    LOG.error("Failed startup", e);
  }
  // Fix: this debug message was emitted twice by two identical
  // back-to-back blocks; log it once.
  if (LOG.isDebugEnabled()) {
    LOG.debug("Started service threads");
  }
}
/*
 * Start shutting down the master: mark closed, stop the scanners, drain
 * both work queues, and wake any threads blocked on them.
 */
private void startShutdown() {
closed.set(true);
stopScanners();
synchronized(toDoQueue) {
toDoQueue.clear(); // Empty the queue
delayedToDoQueue.clear(); // Empty shut down queue
toDoQueue.notifyAll(); // Wake main thread
}
// Wake threads blocked waiting on region server state (e.g. the wait
// loop in letRegionServersShutdown).
synchronized (serversToServerInfo) {
serversToServerInfo.notifyAll();
}
}
/*
 * Stop the root and meta scanners so that the region servers serving meta
 * regions can shut down.
 */
private void stopScanners() {
if (LOG.isDebugEnabled()) {
LOG.debug("telling root scanner to stop");
}
// Interrupt under the scanner's lock; interrupt (rather than a flag)
// also wakes the scanner if it is sleeping between scans.
synchronized(rootScannerLock) {
if (rootScannerThread.isAlive()) {
rootScannerThread.interrupt(); // Wake root scanner
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("telling meta scanner to stop");
}
synchronized(metaScannerLock) {
if (metaScannerThread.isAlive()) {
metaScannerThread.interrupt(); // Wake meta scanner
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("meta and root scanners notified");
}
}
/*
 * Wait on regionservers to report in
 * with {@link #regionServerReport(HServerInfo, HMsg[])} so they get notice
 * the master is going down. Waits until all region servers come back with
 * a MSG_REGIONSERVER_STOP which will cancel their lease or until leases held
 * by remote region servers have expired.
 */
private void letRegionServersShutdown() {
  if (!fsOk) {
    // Forget waiting for the region servers if the file system has gone
    // away. Just exit as quickly as possible.
    return;
  }
  synchronized (serversToServerInfo) {
    // Loop guards against spurious wakeups; re-check the map each pass.
    while (!this.serversToServerInfo.isEmpty()) {
      LOG.info("Waiting on following regionserver(s) to go down (or " +
        "region server lease expiration, whichever happens first): " +
        this.serversToServerInfo.values());
      try {
        // Bounded wait; woken early by notifyAll from report processing.
        serversToServerInfo.wait(threadWakeFrequency);
      } catch (InterruptedException ignored) {
        // Keep waiting; the loop condition decides when we are done.
      }
    }
  }
}
/*
 * HMasterRegionInterface
 */
/** {@inheritDoc} */
@SuppressWarnings("unused")
public HbaseMapWritable regionServerStartup(HServerInfo serverInfo)
throws IOException {
  String serverAddress = serverInfo.getServerAddress().toString().trim();
  LOG.info("received start message from: " + serverAddress);
  // Grant the lease up front unless we are already closing. There might
  // already be one out on this server, especially if it just shut down
  // and came back up near-immediately after.
  if (!closed.get()) {
    long label = getServerLabel(serverAddress);
    this.serverLeases.createLease(label, label, new ServerExpirer(serverAddress));
  }
  // Record the server in our bookkeeping, then hand back the config
  // subset it needs (filesystem, root dir).
  registerRegionServer(serverAddress, serverInfo);
  return createConfigurationSubset();
}
/* Register the newly reporting regionserver with our local data structures
 * that keep up load, server address to server info, etc.
 * @param serverAddress
 * @param serverInfo
 */
private void registerRegionServer(final String serverAddress,
final HServerInfo serverInfo) {
  // Drop any stale load record held for this address.
  HServerLoad oldLoad = serversToLoad.remove(serverAddress);
  if (oldLoad != null) {
    Set<String> sameLoad = loadToServers.get(oldLoad);
    if (sameLoad != null) {
      sameLoad.remove(serverAddress);
      loadToServers.put(oldLoad, sameLoad);
    }
  }
  // If we already knew a server at this address, it restarted: time out
  // the old incarnation right away (and free -ROOT- if it held it).
  HServerInfo previous = serversToServerInfo.remove(serverAddress);
  if (previous != null && !closed.get()) {
    HServerAddress root = this.rootRegionLocation.get();
    if (root != null && root.equals(previous.getServerAddress())) {
      unassignRootRegion();
    }
    this.delayedToDoQueue.put(new ProcessServerShutdown(previous));
  }
  // Record the new server with a fresh, empty load.
  HServerLoad freshLoad = new HServerLoad();
  serverInfo.setLoad(freshLoad);
  this.serversToServerInfo.put(serverAddress, serverInfo);
  this.serversToLoad.put(serverAddress, freshLoad);
  Set<String> peers = loadToServers.get(freshLoad);
  if (peers == null) {
    peers = new HashSet<String>();
  }
  peers.add(serverAddress);
  this.loadToServers.put(freshLoad, peers);
}
/**
 * @return Subset of configuration to pass initializing regionservers: e.g.
 * the filesystem to use and root directory to use.
 */
protected HbaseMapWritable createConfigurationSubset() {
  HbaseMapWritable mw = new HbaseMapWritable();
  addConfig(mw, HConstants.HBASE_DIR);
  return addConfig(mw, "fs.default.name");
}
/** Copies one key's current value out of our configuration into mw. */
private HbaseMapWritable addConfig(final HbaseMapWritable mw, final String key) {
  Text name = new Text(key);
  Text value = new Text(this.conf.get(key));
  mw.put(name, value);
  return mw;
}
/** Leases are keyed by the hash of the server's address string. */
private long getServerLabel(final String s) {
  return (long) s.hashCode();
}
/** {@inheritDoc} */
public HMsg[] regionServerReport(HServerInfo serverInfo, HMsg msgs[])
throws IOException {
String serverName = serverInfo.getServerAddress().toString().trim();
long serverLabel = getServerLabel(serverName);
if (msgs.length > 0) {
if (msgs[0].getMsg() == HMsg.MSG_REPORT_EXITING) {
synchronized (serversToServerInfo) {
try {
// HRegionServer is shutting down. Cancel the server's lease.
// Note that canceling the server's lease takes care of updating
// serversToServerInfo, etc.
if (LOG.isDebugEnabled()) {
LOG.debug("Region server " + serverName +
": MSG_REPORT_EXITING -- cancelling lease");
}
if (cancelLease(serverName, serverLabel)) {
// Only process the exit message if the server still has a lease.
// Otherwise we could end up processing the server exit twice.
LOG.info("Region server " + serverName +
": MSG_REPORT_EXITING -- lease cancelled");
// Get all the regions the server was serving reassigned
// (if we are not shutting down).
if (!closed.get()) {
for (int i = 1; i < msgs.length; i++) {
HRegionInfo info = msgs[i].getRegionInfo();
if (info.isRootRegion()) {
rootRegionLocation.set(null);
} else if (info.isMetaTable()) {
onlineMetaRegions.remove(info.getStartKey());
}
// Regions we deliberately closed stay closed; anything else
// goes back into the unassigned pool for reassignment.
if (!killedRegions.remove(info.getRegionName())) {
this.unassignedRegions.put(info, ZERO_L);
}
}
}
}
// We don't need to return anything to the server because it isn't
// going to do any more work.
return new HMsg[0];
} finally {
// Wake any thread blocked in letRegionServersShutdown().
serversToServerInfo.notifyAll();
}
}
} else if (msgs[0].getMsg() == HMsg.MSG_REPORT_QUIESCED) {
LOG.info("Region server " + serverName + " quiesced");
quiescedMetaServers.incrementAndGet();
}
}
// NOTE(review): this check runs on every report, not only after a
// quiesce message or when shutdown was requested -- confirm intended.
if(quiescedMetaServers.get() >= serversToServerInfo.size()) {
// If the only servers we know about are meta servers, then we can
// proceed with shutdown
LOG.info("All user tables quiesced. Proceeding with shutdown");
startShutdown();
}
if (shutdownRequested && !closed.get()) {
// Tell the server to stop serving any user regions
return new HMsg[]{new HMsg(HMsg.MSG_REGIONSERVER_QUIESCE)};
}
if (closed.get()) {
// Tell server to shut down if we are shutting down. This should
// happen after check of MSG_REPORT_EXITING above, since region server
// will send us one of these messages after it gets MSG_REGIONSERVER_STOP
return new HMsg[]{new HMsg(HMsg.MSG_REGIONSERVER_STOP)};
}
HServerInfo storedInfo = serversToServerInfo.get(serverName);
if (storedInfo == null) {
if (LOG.isDebugEnabled()) {
LOG.debug("received server report from unknown server: " + serverName);
}
// The HBaseMaster may have been restarted.
// Tell the RegionServer to start over and call regionServerStartup()
return new HMsg[]{new HMsg(HMsg.MSG_CALL_SERVER_STARTUP)};
} else if (storedInfo.getStartCode() != serverInfo.getStartCode()) {
// This state is reachable if:
//
// 1) RegionServer A started
// 2) RegionServer B started on the same machine, then
// clobbered A in regionServerStartup.
// 3) RegionServer A returns, expecting to work as usual.
//
// The answer is to ask A to shut down for good.
if (LOG.isDebugEnabled()) {
LOG.debug("region server race condition detected: " + serverName);
}
synchronized (serversToServerInfo) {
cancelLease(serverName, serverLabel);
serversToServerInfo.notifyAll();
}
return new HMsg[]{new HMsg(HMsg.MSG_REGIONSERVER_STOP)};
} else {
// All's well. Renew the server's lease.
// This will always succeed; otherwise, the fetch of serversToServerInfo
// would have failed above.
serverLeases.renewLease(serverLabel, serverLabel);
// Refresh the info object and the load information
serversToServerInfo.put(serverName, serverInfo);
HServerLoad load = serversToLoad.get(serverName);
if (load != null && !load.equals(serverInfo.getLoad())) {
// We have previous information about the load on this server
// and the load on this server has changed
Set<String> servers = loadToServers.get(load);
// Note that servers should never be null because loadToServers
// and serversToLoad are manipulated in pairs
servers.remove(serverName);
loadToServers.put(load, servers);
}
// Set the current load information
load = serverInfo.getLoad();
serversToLoad.put(serverName, load);
Set<String> servers = loadToServers.get(load);
if (servers == null) {
servers = new HashSet<String>();
}
servers.add(serverName);
loadToServers.put(load, servers);
// Next, process messages for this server
return processMsgs(serverInfo, msgs);
}
}
/** Cancel a server's lease and update its load information */
private boolean cancelLease(final String serverName, final long serverLabel) {
  HServerInfo info = serversToServerInfo.remove(serverName);
  if (info == null) {
    // Nothing to do: the lease was already cancelled. This method can be
    // called a couple of times during shutdown.
    return false;
  }
  // If the departing server carried -ROOT-, put the root region back
  // into play for reassignment.
  if (rootRegionLocation.get() != null &&
      info.getServerAddress().equals(rootRegionLocation.get())) {
    unassignRootRegion();
  }
  LOG.info("Cancelling lease for " + serverName);
  serverLeases.cancelLease(serverLabel, serverLabel);
  // Drop the server from the load bookkeeping.
  HServerLoad load = serversToLoad.remove(serverName);
  if (load != null) {
    Set<String> servers = loadToServers.get(load);
    if (servers != null) {
      servers.remove(serverName);
      loadToServers.put(load, servers);
    }
  }
  return true;
}
/**
 * Process all the incoming messages from a server that's contacted us.
 *
 * Note that we never need to update the server's load information because
 * that has already been done in regionServerReport.
 * @param info the reporting region server
 * @param incomingMsgs messages the server sent with its report
 * @return messages (open/close instructions) for the server to carry out
 * @throws IOException if a message carries an unknown type code
 */
private HMsg[] processMsgs(HServerInfo info, HMsg incomingMsgs[])
throws IOException {
ArrayList<HMsg> returnMsgs = new ArrayList<HMsg>();
String serverName = info.getServerAddress().toString();
HashMap<Text, HRegionInfo> regionsToKill = null;
regionsToKill = killList.remove(serverName);
// Get reports on what the RegionServer did.
for (int i = 0; i < incomingMsgs.length; i++) {
if (LOG.isDebugEnabled()) {
LOG.debug("Received " + incomingMsgs[i].toString() + " from " +
serverName);
}
HRegionInfo region = incomingMsgs[i].getRegionInfo();
switch (incomingMsgs[i].getMsg()) {
case HMsg.MSG_REPORT_PROCESS_OPEN:
synchronized (unassignedRegions) {
// Region server is reporting in that its working on region open
// (We can get more than one of these messages if region is replaying
// a bunch of edits and taking a while to open).
// Extend region open time by max region open time.
this.unassignedRegions.put(region,
Long.valueOf(System.currentTimeMillis() + this.maxRegionOpenTime));
}
break;
case HMsg.MSG_REPORT_OPEN:
boolean duplicateAssignment = false;
synchronized (unassignedRegions) {
if (unassignedRegions.remove(region) == null) {
// We were not expecting this open. Decide whether it is a benign
// duplicate report or an open by the wrong server.
if (region.getRegionName().compareTo(
HRegionInfo.rootRegionInfo.getRegionName()) == 0) {
// Root region
HServerAddress rootServer = rootRegionLocation.get();
if (rootServer != null) {
if (rootServer.toString().compareTo(serverName) == 0) {
// A duplicate open report from the correct server
break;
}
// We received an open report on the root region, but it is
// assigned to a different server
duplicateAssignment = true;
}
} else {
// Not root region. If it is not a pending region, then we are
// going to treat it as a duplicate assignment
if (pendingRegions.contains(region.getRegionName())) {
// A duplicate report from the correct server
break;
}
// Although we can't tell for certain if this is a duplicate
// report from the correct server, we are going to treat it
// as such
duplicateAssignment = true;
}
}
if (duplicateAssignment) {
if (LOG.isDebugEnabled()) {
LOG.debug("region server " + info.getServerAddress().toString()
+ " should not have opened region " + region.getRegionName());
}
// This Region should not have been opened.
// Ask the server to shut it down, but don't report it as closed.
// Otherwise the HMaster will think the Region was closed on purpose,
// and then try to reopen it elsewhere; that's not what we want.
returnMsgs.add(
new HMsg(HMsg.MSG_REGION_CLOSE_WITHOUT_REPORT, region));
} else {
LOG.info(info.getServerAddress().toString() + " serving " +
region.getRegionName());
if (region.getRegionName().compareTo(
HRegionInfo.rootRegionInfo.getRegionName()) == 0) {
// Store the Root Region location (in memory)
synchronized (rootRegionLocation) {
this.rootRegionLocation.set(
new HServerAddress(info.getServerAddress()));
// Wake any threads blocked waiting for the root location.
this.rootRegionLocation.notifyAll();
}
} else {
// Note that the table has been assigned and is waiting for the
// meta table to be updated.
pendingRegions.add(region.getRegionName());
// Queue up an update to note the region location.
try {
toDoQueue.put(new ProcessRegionOpen(info, region));
} catch (InterruptedException e) {
throw new RuntimeException(
"Putting into toDoQueue was interrupted.", e);
}
}
}
}
break;
case HMsg.MSG_REPORT_CLOSE:
LOG.info(info.getServerAddress().toString() + " no longer serving " +
region);
if (region.getRegionName().compareTo(
HRegionInfo.rootRegionInfo.getRegionName()) == 0) {
// Root region
if (region.isOffline()) {
// Can't proceed without root region. Shutdown.
LOG.fatal("root region is marked offline");
shutdown();
}
unassignRootRegion();
} else {
boolean reassignRegion = !region.isOffline();
// Regions we explicitly asked to close must not be reassigned.
if (killedRegions.remove(region.getRegionName())) {
reassignRegion = false;
}
if (region.isMetaTable()) {
// Region is part of the meta table. Remove it from onlineMetaRegions
onlineMetaRegions.remove(region.getStartKey());
}
// NOTE: we cannot put the region into unassignedRegions as that
// could create a race with the pending close if it gets
// reassigned before the close is processed.
unassignedRegions.remove(region);
try {
toDoQueue.put(new ProcessRegionClose(region, reassignRegion));
} catch (InterruptedException e) {
throw new RuntimeException(
"Putting into toDoQueue was interrupted.", e);
}
}
break;
case HMsg.MSG_REPORT_SPLIT:
// A split report is followed by two messages carrying the daughter
// regions; the ++i here consumes them so they are not re-processed
// as stand-alone reports by the enclosing loop.
HRegionInfo newRegionA = incomingMsgs[++i].getRegionInfo();
addToUnassignedRegions(newRegionA);
HRegionInfo newRegionB = incomingMsgs[++i].getRegionInfo();
addToUnassignedRegions(newRegionB);
LOG.info("Region " + region.getRegionName() + " split; new regions: " +
newRegionA.getRegionName() + ", " + newRegionB.getRegionName());
if (region.isMetaTable()) {
// A meta region has split.
onlineMetaRegions.remove(region.getStartKey());
numberOfMetaRegions.incrementAndGet();
}
break;
default:
throw new IOException(
"Impossible state during msg processing. Instruction: " +
incomingMsgs[i].getMsg());
}
}
// Process the kill list
if (regionsToKill != null) {
for (HRegionInfo i: regionsToKill.values()) {
returnMsgs.add(new HMsg(HMsg.MSG_REGION_CLOSE, i));
// Remember we asked for the close so the eventual MSG_REPORT_CLOSE
// does not trigger a reassignment.
killedRegions.add(i.getRegionName());
}
}
// Figure out what the RegionServer ought to do, and write back.
assignRegions(info, serverName, returnMsgs);
return returnMsgs.toArray(new HMsg[returnMsgs.size()]);
}
/*
 * @param hri Add to unassigned regions but make sure its not in pending
 * else can end up double-assigning
 * @see HBASE-534
 */
private void addToUnassignedRegions(final HRegionInfo hri) {
  synchronized (this.unassignedRegions) {
    // Skip if already queued for assignment or awaiting its meta update.
    boolean alreadyUnassigned = this.unassignedRegions.containsKey(hri);
    boolean awaitingMetaUpdate = this.pendingRegions.contains(hri.getRegionName());
    if (!alreadyUnassigned && !awaitingMetaUpdate) {
      this.unassignedRegions.put(hri, ZERO_L);
    }
  }
}
/*
 * Assigns regions to region servers attempting to balance the load across
 * all region servers
 *
 * @param info the server currently reporting in
 * @param serverName string form of the reporting server's address
 * @param returnMsgs open instructions are appended here
 */
private void assignRegions(HServerInfo info, String serverName,
ArrayList<HMsg> returnMsgs) {
synchronized (this.unassignedRegions) {
// We need to hold a lock on assign attempts while we figure out what to
// do so that multiple threads do not execute this method in parallel
// resulting in assigning the same region to multiple servers.
long now = System.currentTimeMillis();
Set<HRegionInfo> regionsToAssign = new HashSet<HRegionInfo>();
for (Map.Entry<HRegionInfo, Long> e: this.unassignedRegions.entrySet()) {
HRegionInfo i = e.getKey();
if (numberOfMetaRegions.get() != onlineMetaRegions.size() &&
!i.isMetaRegion()) {
// Can't assign user regions until all meta regions have been assigned
// and are on-line
continue;
}
// Only regions whose previous assignment grant (timestamp + open
// window) has expired are eligible for (re)assignment.
long diff = now - e.getValue().longValue();
if (diff > this.maxRegionOpenTime) {
regionsToAssign.add(e.getKey());
}
}
int nRegionsToAssign = regionsToAssign.size();
if (nRegionsToAssign <= 0) {
// No regions to assign. Return.
return;
}
if (this.serversToServerInfo.size() == 1) {
assignRegionsToOneServer(regionsToAssign, serverName, returnMsgs);
// Finished. Return.
return;
}
// Multiple servers in play.
// We need to allocate regions only to most lightly loaded servers.
HServerLoad thisServersLoad = info.getLoad();
// Reserve a share of the regions for servers lighter-loaded than this
// one; only the remainder may go to this server now.
int nregions = regionsPerServer(nRegionsToAssign, thisServersLoad);
nRegionsToAssign -= nregions;
if (nRegionsToAssign > 0) {
// We still have more regions to assign. See how many we can assign
// before this server becomes more heavily loaded than the next
// most heavily loaded server.
SortedMap<HServerLoad, Set<String>> heavyServers =
new TreeMap<HServerLoad, Set<String>>();
synchronized (this.loadToServers) {
heavyServers.putAll(this.loadToServers.tailMap(thisServersLoad));
}
int nservers = 0;
HServerLoad heavierLoad = null;
for (Map.Entry<HServerLoad, Set<String>> e : heavyServers.entrySet()) {
Set<String> servers = e.getValue();
nservers += servers.size();
if (e.getKey().compareTo(thisServersLoad) == 0) {
// This is the load factor of the server we are considering
nservers -= 1;
continue;
}
// If we get here, we are at the first load entry that is a
// heavier load than the server we are considering
heavierLoad = e.getKey();
break;
}
nregions = 0;
if (heavierLoad != null) {
// There is a more heavily loaded server.
// Count how many regions this server can take on before its load
// would exceed that next-heavier load.
for (HServerLoad load =
new HServerLoad(thisServersLoad.getNumberOfRequests(),
thisServersLoad.getNumberOfRegions());
load.compareTo(heavierLoad) <= 0 && nregions < nRegionsToAssign;
load.setNumberOfRegions(load.getNumberOfRegions() + 1), nregions++) {
// continue;
}
}
if (nregions < nRegionsToAssign) {
// There are some more heavily loaded servers
// but we can't assign all the regions to this server.
if (nservers > 0) {
// There are other servers that can share the load.
// Split regions that need assignment across the servers.
nregions = (int) Math.ceil((1.0 * nRegionsToAssign)
/ (1.0 * nservers));
} else {
// No other servers with same load.
// Split regions over all available servers
nregions = (int) Math.ceil((1.0 * nRegionsToAssign)
/ (1.0 * serversToServerInfo.size()));
}
} else {
// Assign all regions to this server
nregions = nRegionsToAssign;
}
// Cap the number of opens handed out in a single report cycle.
if (nregions > this.maxAssignInOneGo) {
nregions = this.maxAssignInOneGo;
}
now = System.currentTimeMillis();
for (HRegionInfo regionInfo: regionsToAssign) {
LOG.info("assigning region " + regionInfo.getRegionName() +
" to server " + serverName);
// Re-stamp so this region is not handed out again until the open
// window expires.
this.unassignedRegions.put(regionInfo, Long.valueOf(now));
returnMsgs.add(new HMsg(HMsg.MSG_REGION_OPEN, regionInfo));
if (--nregions <= 0) {
break;
}
}
}
}
}
/*
 * @param nRegionsToAssign total regions currently needing assignment
 * @param thisServersLoad load of the server we are considering
 * @return How many regions we can assign to more lightly loaded servers
 */
private int regionsPerServer(final int nRegionsToAssign,
final HServerLoad thisServersLoad) {
SortedMap<HServerLoad, Set<String>> lightServers =
new TreeMap<HServerLoad, Set<String>>();
// Snapshot of all load buckets strictly lighter than this server's.
synchronized (this.loadToServers) {
lightServers.putAll(this.loadToServers.headMap(thisServersLoad));
}
int nRegions = 0;
for (Map.Entry<HServerLoad, Set<String>> e : lightServers.entrySet()) {
// Count how many additional regions would bring a server at this
// lighter load up to (just past) this server's load.
HServerLoad lightLoad = new HServerLoad(e.getKey().getNumberOfRequests(),
e.getKey().getNumberOfRegions());
do {
lightLoad.setNumberOfRegions(lightLoad.getNumberOfRegions() + 1);
nRegions += 1;
} while (lightLoad.compareTo(thisServersLoad) <= 0
&& nRegions < nRegionsToAssign);
// NOTE(review): this multiplies the RUNNING total (not just this
// bucket's count) by this bucket's server count; with more than one
// lighter-load bucket that looks like it over-counts -- confirm
// whether per-bucket accumulation was intended.
nRegions *= e.getValue().size();
if (nRegions >= nRegionsToAssign) {
break;
}
}
return nRegions;
}
/*
 * Assign all to the only server. An unlikely case but still possible.
 * @param regionsToAssign
 * @param serverName
 * @param returnMsgs
 */
private void assignRegionsToOneServer(final Set<HRegionInfo> regionsToAssign,
final String serverName, final ArrayList<HMsg> returnMsgs) {
  final long now = System.currentTimeMillis();
  int assigned = 0;
  for (HRegionInfo regionInfo : regionsToAssign) {
    LOG.info("assigning region " + regionInfo.getRegionName() +
      " to the only server " + serverName);
    // Stamp the grant time so the region is not re-offered until the
    // open window expires, then tell the server to open it.
    this.unassignedRegions.put(regionInfo, Long.valueOf(now));
    returnMsgs.add(new HMsg(HMsg.MSG_REGION_OPEN, regionInfo));
    if (assigned++ >= this.maxAssignInOneGo) {
      // Cap how many opens we hand out in one report cycle.
      break;
    }
  }
}
/*
 * Some internal classes to manage msg-passing and region server operations
 */
private abstract class RegionServerOperation implements Delayed {
  // Time (ms since epoch) at which this operation's delay expires.
  private long expire;

  protected RegionServerOperation() {
    // Set the future time at which we expect to be released from the
    // DelayQueue we're inserted in on lease expiration.
    this.expire = System.currentTimeMillis() + leaseTimeout / 2;
  }

  /** {@inheritDoc} */
  public long getDelay(TimeUnit unit) {
    return unit.convert(this.expire - System.currentTimeMillis(),
      TimeUnit.MILLISECONDS);
  }

  /** {@inheritDoc} */
  public int compareTo(Delayed o) {
    // Fix: compare the long difference by sign instead of narrowing it
    // with Long.intValue(), which truncates to 32 bits and can flip the
    // sign for delay differences larger than Integer.MAX_VALUE ms.
    long diff = getDelay(TimeUnit.MILLISECONDS) -
      o.getDelay(TimeUnit.MILLISECONDS);
    return diff < 0 ? -1 : diff > 0 ? 1 : 0;
  }

  /** Reset the expiration and put this operation back on the delay queue. */
  protected void requeue() {
    this.expire = System.currentTimeMillis() + leaseTimeout / 2;
    delayedToDoQueue.put(this);
  }

  /**
   * @return true if the root region is currently assigned; otherwise
   * requeues this operation on the delay queue and returns false.
   */
  protected boolean rootAvailable() {
    boolean available = true;
    if (rootRegionLocation.get() == null) {
      available = false;
      requeue();
    }
    return available;
  }

  /**
   * @return true if all meta regions are online; otherwise requeues this
   * operation on the delay queue and returns false.
   */
  protected boolean metaTableAvailable() {
    boolean available = true;
    if (LOG.isDebugEnabled()) {
      LOG.debug("numberOfMetaRegions: " + numberOfMetaRegions.get() +
        ", onlineMetaRegions.size(): " + onlineMetaRegions.size());
    }
    if (numberOfMetaRegions.get() != onlineMetaRegions.size()) {
      // We can't proceed because not all of the meta regions are online.
      // We can't block either because that would prevent the meta region
      // online message from being processed. In order to prevent spinning
      // in the run queue, put this request on the delay queue to give
      // other threads the opportunity to get the meta regions on-line.
      if (LOG.isDebugEnabled()) {
        LOG.debug("Requeuing because not all meta regions are online");
      }
      available = false;
      requeue();
    }
    return available;
  }

  /** Do the work; return false if the caller should retry via toDoQueue. */
  protected abstract boolean process() throws IOException;
}
/**
 * Instantiated when a server's lease has expired, meaning it has crashed.
 * The region server's log file needs to be split up for each region it was
 * serving, and the regions need to get reassigned.
 */
private class ProcessServerShutdown extends RegionServerOperation {
private HServerAddress deadServer;
private String deadServerName;
private Path oldLogDir;
private boolean logSplit;
private boolean rootRescanned;
/* One root/meta row that must be updated on behalf of the dead server. */
private class ToDoEntry {
boolean regionOffline;
Text row;
HRegionInfo info;
ToDoEntry(Text row, HRegionInfo info) {
this.regionOffline = false;
this.row = row;
this.info = info;
}
}
/**
 * @param serverInfo info for the server whose lease expired
 */
public ProcessServerShutdown(HServerInfo serverInfo) {
super();
this.deadServer = serverInfo.getServerAddress();
this.deadServerName = this.deadServer.toString();
this.logSplit = false;
this.rootRescanned = false;
// Dead server's log directory name: log_<bindAddress>_<startCode>_<port>.
StringBuilder dirName = new StringBuilder("log_");
dirName.append(deadServer.getBindAddress());
dirName.append("_");
dirName.append(serverInfo.getStartCode());
dirName.append("_");
dirName.append(deadServer.getPort());
this.oldLogDir = new Path(rootdir, dirName.toString());
}
/** {@inheritDoc} */
@Override
public String toString() {
return "ProcessServerShutdown of " + this.deadServer.toString();
}
/* Finds regions that the dead region server was serving
 */
private void scanMetaRegion(HRegionInterface server, long scannerId,
Text regionName)
throws IOException {
List<ToDoEntry> toDoList = new ArrayList<ToDoEntry>();
Set<HRegionInfo> regions = new HashSet<HRegionInfo>();
List<Text> emptyRows = new ArrayList<Text>();
try {
while (true) {
HbaseMapWritable values = null;
try {
values = server.next(scannerId);
} catch (IOException e) {
LOG.error("Shutdown scanning of meta region",
RemoteExceptionHandler.checkIOException(e));
break;
}
// Null or empty result means the scanner is exhausted.
if (values == null || values.size() == 0) {
break;
}
// TODO: Why does this have to be a sorted map?
RowMap rm = toRowMap(values);
Text row = rm.getRow();
SortedMap<Text, byte[]> map = rm.getMap();
// Check server name. If null, be conservative and treat as though
// region had been on shutdown server (could be null because we
// missed edits in hlog because hdfs does not do write-append).
String serverName;
try {
serverName = Writables.bytesToString(map.get(COL_SERVER));
} catch (UnsupportedEncodingException e) {
LOG.error("Server name", e);
break;
}
if (serverName.length() > 0 &&
deadServerName.compareTo(serverName) != 0) {
// Region was hosted on some other, live server; skip it.
continue;
}
// Bingo! Found it.
HRegionInfo info = getHRegionInfo(row, map);
if (info == null) {
// Row has no parseable HRegionInfo; collect for cleanup below.
emptyRows.add(row);
continue;
}
LOG.info(info.getRegionName() + " was on shutdown server <" +
serverName + "> (or server is null -- " + server.toString() +
" --). Marking unassigned in meta and clearing pendingRegions");
if (info.isMetaTable()) {
if (LOG.isDebugEnabled()) {
LOG.debug("removing meta region " + info.getRegionName() +
" from online meta regions");
}
onlineMetaRegions.remove(info.getStartKey());
}
ToDoEntry todo = new ToDoEntry(row, info);
toDoList.add(todo);
synchronized (killList) {
if (killList.containsKey(deadServerName)) {
HashMap<Text, HRegionInfo> regionsToKill =
new HashMap<Text, HRegionInfo>();
regionsToKill.putAll(killList.get(deadServerName));
if (regionsToKill.containsKey(info.getRegionName())) {
// Region was slated to be closed anyway: drop it from the kill
// list and mark it offline rather than reassigning it.
regionsToKill.remove(info.getRegionName());
killList.put(deadServerName, regionsToKill);
unassignedRegions.remove(info);
// Mark region offline
todo.regionOffline = true;
}
} else {
// Get region reassigned
regions.add(info);
}
// If it was pending, remove.
pendingRegions.remove(info.getRegionName());
}
}
} finally {
// Always close the scanner, even on scan failure.
if(scannerId != -1L) {
try {
server.close(scannerId);
} catch (IOException e) {
LOG.error("Closing scanner",
RemoteExceptionHandler.checkIOException(e));
}
}
}
// Scan complete. Remove any rows which had empty HRegionInfo
if (emptyRows.size() > 0) {
LOG.warn("Found " + emptyRows.size() +
" rows with empty HRegionInfo while scanning meta region " +
regionName);
deleteEmptyMetaRows(server, regionName, emptyRows);
}
// Update server in root/meta entries
for (ToDoEntry e: toDoList) {
if (e.regionOffline) {
HRegion.offlineRegionInMETA(server, regionName, e.info);
}
}
// Get regions reassigned
for (HRegionInfo info: regions) {
unassignedRegions.put(info, ZERO_L);
}
}
/**
 * Drives shutdown processing: split the dead server's log, rescan the
 * root region and then every online meta region for regions it served,
 * and queue those regions for reassignment.
 * @return false only when the log-split lock is busy (caller retries);
 * true when done or when this operation requeued itself internally.
 */
@Override
protected boolean process() throws IOException {
LOG.info("process shutdown of server " + deadServer + ": logSplit: " +
this.logSplit + ", rootRescanned: " + this.rootRescanned +
", numberOfMetaRegions: " + numberOfMetaRegions.get() +
", onlineMetaRegions.size(): " + onlineMetaRegions.size());
if (!logSplit) {
// Process the old log file
if (fs.exists(oldLogDir)) {
if (!splitLogLock.tryLock()) {
// Another shutdown is splitting logs right now; retry later.
return false;
}
try {
HLog.splitLog(rootdir, oldLogDir, fs, conf);
} finally {
splitLogLock.unlock();
}
}
logSplit = true;
}
if (!rootAvailable()) {
// Return true so that worker does not put this request back on the
// toDoQueue.
// rootAvailable() has already put it on the delayedToDoQueue
return true;
}
if (!rootRescanned) {
// Scan the ROOT region
HRegionInterface server = null;
long scannerId = -1L;
for (int tries = 0; tries < numRetries; tries ++) {
if (closed.get()) {
return true;
}
server = connection.getHRegionConnection(rootRegionLocation.get());
scannerId = -1L;
try {
if (LOG.isDebugEnabled()) {
LOG.debug("process server shutdown scanning root region on " +
rootRegionLocation.get().getBindAddress());
}
scannerId =
server.openScanner(HRegionInfo.rootRegionInfo.getRegionName(),
COLUMN_FAMILY_ARRAY, EMPTY_START_ROW,
HConstants.LATEST_TIMESTAMP, null);
scanMetaRegion(server, scannerId,
HRegionInfo.rootRegionInfo.getRegionName());
break;
} catch (IOException e) {
// Out of retries: propagate the (possibly unwrapped) failure.
if (tries == numRetries - 1) {
throw RemoteExceptionHandler.checkIOException(e);
}
}
sleeper.sleep();
}
if (LOG.isDebugEnabled()) {
LOG.debug("process server shutdown scanning root region on " +
rootRegionLocation.get().getBindAddress() + " finished " +
Thread.currentThread().getName());
}
rootRescanned = true;
}
if (!metaTableAvailable()) {
// We can't proceed because not all meta regions are online.
// metaAvailable() has put this request on the delayedToDoQueue
// Return true so that worker does not put this on the toDoQueue
return true;
}
for (int tries = 0; tries < numRetries; tries++) {
try {
if (closed.get()) {
return true;
}
// Snapshot the online meta regions so we do not hold their lock
// while doing remote scans.
List<MetaRegion> regions = new ArrayList<MetaRegion>();
synchronized (onlineMetaRegions) {
regions.addAll(onlineMetaRegions.values());
}
for (MetaRegion r: regions) {
HRegionInterface server = null;
long scannerId = -1L;
if (LOG.isDebugEnabled()) {
LOG.debug("process server shutdown scanning " +
r.getRegionName() + " on " + r.getServer() + " " +
Thread.currentThread().getName() + " attempt " + tries);
}
server = connection.getHRegionConnection(r.getServer());
scannerId =
server.openScanner(r.getRegionName(), COLUMN_FAMILY_ARRAY,
EMPTY_START_ROW, HConstants.LATEST_TIMESTAMP, null);
scanMetaRegion(server, scannerId, r.getRegionName());
if (LOG.isDebugEnabled()) {
LOG.debug("process server shutdown finished scanning " +
r.getRegionName() + " on " + r.getServer() + " " +
Thread.currentThread().getName());
}
}
// All scans succeeded: this server's shutdown is fully processed.
killList.remove(deadServerName);
deadServers.remove(deadServerName);
break;
} catch (IOException e) {
if (tries == numRetries - 1) {
throw RemoteExceptionHandler.checkIOException(e);
}
}
sleeper.sleep();
}
return true;
}
}
/**
 * Abstract class that performs common operations for
 * {@link ProcessRegionClose} and {@link ProcessRegionOpen}: deciding
 * whether the relevant meta region is available and locating the server
 * hosting the meta row for the region whose status changed.
 */
private abstract class ProcessRegionStatusChange
extends RegionServerOperation {
// True when the changed region is itself a META region; its row then
// lives in the ROOT region rather than in a META region.
protected final boolean isMetaTable;
protected final HRegionInfo regionInfo;
// META region holding this region's row; resolved lazily in
// getMetaServer() and cached for retries.
private MetaRegion metaRegion;
protected Text metaRegionName;
/**
 * @param regionInfo the region whose status changed
 */
public ProcessRegionStatusChange(HRegionInfo regionInfo) {
super();
this.regionInfo = regionInfo;
this.isMetaTable = regionInfo.isMetaTable();
this.metaRegion = null;
this.metaRegionName = null;
}
/**
 * @return true if the meta (or root) region holding this region's row
 * is available. When false, the operation has been placed on the
 * delayedToDoQueue (directly via requeue(), or by rootAvailable()).
 */
protected boolean metaRegionAvailable() {
boolean available = true;
if (isMetaTable) {
// This operation is for the meta table
if (!rootAvailable()) {
// But we can't proceed unless the root region is available
available = false;
}
} else {
if (!rootScanned || !metaTableAvailable()) {
// The root region has not been scanned or the meta table is not
// available so we can't proceed.
// Put the operation on the delayedToDoQueue
requeue();
available = false;
}
}
return available;
}
/**
 * Resolves (and caches) the meta region holding this region's row and
 * returns a connection to the server hosting it. For META regions the
 * row is in ROOT; otherwise the containing META region is looked up in
 * onlineMetaRegions by exact key, or failing that, by the greatest key
 * preceding the region name (headMap(...).lastKey()).
 * @return connection to the server serving the relevant region
 * @throws IOException
 */
protected HRegionInterface getMetaServer() throws IOException {
if (this.isMetaTable) {
this.metaRegionName = HRegionInfo.rootRegionInfo.getRegionName();
} else {
if (this.metaRegion == null) {
synchronized (onlineMetaRegions) {
// Three cases: a single META region, an exact key match, or the
// closest preceding entry in the sorted map.
metaRegion = onlineMetaRegions.size() == 1 ?
onlineMetaRegions.get(onlineMetaRegions.firstKey()) :
onlineMetaRegions.containsKey(regionInfo.getRegionName()) ?
onlineMetaRegions.get(regionInfo.getRegionName()) :
onlineMetaRegions.get(onlineMetaRegions.headMap(
regionInfo.getRegionName()).lastKey());
}
this.metaRegionName = metaRegion.getRegionName();
}
}
HServerAddress server = null;
if (isMetaTable) {
server = rootRegionLocation.get();
} else {
server = metaRegion.getServer();
}
return connection.getHRegionConnection(server);
}
}
/**
 * ProcessRegionClose is instantiated when a region server reports that it
 * has closed a region.
 */
private class ProcessRegionClose extends ProcessRegionStatusChange {
// True: queue the region for reassignment after processing.
// False: mark the region offline in its meta row.
private boolean reassignRegion;
/**
 * @param regionInfo region that was closed
 * @param reassignRegion true if the region should be reassigned
 */
public ProcessRegionClose(HRegionInfo regionInfo, boolean reassignRegion) {
super(regionInfo);
this.reassignRegion = reassignRegion;
}
/** {@inheritDoc} */
@Override
public String toString() {
return "ProcessRegionClose of " + this.regionInfo.getRegionName() +
", " + this.reassignRegion;
}
/**
 * Marks the region offline in META (unless it is to be reassigned),
 * retrying up to numRetries times, then optionally queues it for
 * reassignment. Always returns true so the worker does not put the
 * operation back on the toDoQueue.
 */
@Override
protected boolean process() throws IOException {
for (int tries = 0; tries < numRetries; tries++) {
if (closed.get()) {
// Master is shutting down; give up.
return true;
}
LOG.info("region closed: " + regionInfo.getRegionName());
// Mark the Region as unavailable in the appropriate meta table
if (!metaRegionAvailable()) {
// We can't proceed unless the meta region we are going to update
// is online. metaRegionAvailable() has put this operation on the
// delayedToDoQueue, so return true so the operation is not put
// back on the toDoQueue
return true;
}
try {
if (!this.reassignRegion) {
HRegion.offlineRegionInMETA(getMetaServer(), metaRegionName,
regionInfo);
}
break;
} catch (IOException e) {
// Out of retries: surface the (possibly remote) exception.
if (tries == numRetries - 1) {
throw RemoteExceptionHandler.checkIOException(e);
}
}
sleeper.sleep();
}
if (reassignRegion) {
LOG.info("reassign region: " + regionInfo.getRegionName());
// ZERO_L is the value used at the other unassignedRegions.put call
// sites when queueing a region for assignment.
unassignedRegions.put(regionInfo, ZERO_L);
}
return true;
}
}
/**
 * ProcessRegionOpen is instantiated when a region server reports that it is
 * serving a region. This applies to all meta and user regions except the
 * root region which is handled specially.
 */
private class ProcessRegionOpen extends ProcessRegionStatusChange {
private final HServerAddress serverAddress;
// Serialized start code of the server now serving the region; written
// into the region's COL_STARTCODE cell.
private final byte [] startCode;
/**
 * @param info server now serving the region
 * @param regionInfo region that was opened
 * @throws IOException
 */
public ProcessRegionOpen(HServerInfo info, HRegionInfo regionInfo)
throws IOException {
super(regionInfo);
this.serverAddress = info.getServerAddress();
this.startCode = Writables.longToBytes(info.getStartCode());
}
/** {@inheritDoc} */
@Override
public String toString() {
return "PendingOpenOperation from " + serverAddress.toString();
}
/**
 * Records the region's server and start code in the appropriate meta
 * (or root) row, registers newly opened META regions for scanning or
 * as online, and removes the region from the pending list. Retries up
 * to numRetries times; always returns true so the worker does not put
 * the operation back on the toDoQueue.
 */
@Override
protected boolean process() throws IOException {
for (int tries = 0; tries < numRetries; tries++) {
if (closed.get()) {
return true;
}
LOG.info(regionInfo.toString() + " open on " +
this.serverAddress.toString());
if (!metaRegionAvailable()) {
// We can't proceed unless the meta region we are going to update
// is online. metaRegionAvailable() has put this operation on the
// delayedToDoQueue, so return true so the operation is not put
// back on the toDoQueue
return true;
}
// Register the newly-available Region's location.
HRegionInterface server = getMetaServer();
LOG.info("updating row " + regionInfo.getRegionName() + " in table " +
metaRegionName + " with startcode " +
Writables.bytesToLong(this.startCode) + " and server "+
serverAddress.toString());
try {
BatchUpdate b = new BatchUpdate(rand.nextLong());
long lockid = b.startUpdate(regionInfo.getRegionName());
b.put(lockid, COL_SERVER,
Writables.stringToBytes(serverAddress.toString()));
b.put(lockid, COL_STARTCODE, startCode);
server.batchUpdate(metaRegionName, b);
if (isMetaTable) {
// It's a meta region.
MetaRegion m = new MetaRegion(this.serverAddress,
this.regionInfo.getRegionName(), this.regionInfo.getStartKey());
if (!initialMetaScanComplete) {
// Put it on the queue to be scanned for the first time.
try {
LOG.debug("Adding " + m.toString() + " to regions to scan");
metaRegionsToScan.put(m);
} catch (InterruptedException e) {
// Convert to unchecked; process() only declares IOException.
throw new RuntimeException(
"Putting into metaRegionsToScan was interrupted.", e);
}
} else {
// Add it to the online meta regions
LOG.debug("Adding to onlineMetaRegions: " + m.toString());
onlineMetaRegions.put(this.regionInfo.getStartKey(), m);
}
}
// If updated successfully, remove from pending list.
pendingRegions.remove(regionInfo.getRegionName());
break;
} catch (IOException e) {
if (tries == numRetries - 1) {
throw RemoteExceptionHandler.checkIOException(e);
}
}
sleeper.sleep();
}
return true;
}
}
/*
* HMasterInterface
*/
/** {@inheritDoc} */
public boolean isMasterRunning() {
  // The master is considered running for as long as it has not closed.
  final boolean isClosed = closed.get();
  return !isClosed;
}
/** {@inheritDoc} */
public void shutdown() {
  // Log first, then raise the flag that drives cluster quiescing.
  LOG.info("Cluster shutdown requested. Starting to quiesce servers");
  shutdownRequested = true;
}
/**
 * {@inheritDoc}
 * Creates the table's initial region and queues it for assignment.
 * Throws MasterNotRunningException when the master is (or starts)
 * shutting down, instead of silently returning as the code previously
 * did — callers must not be told a table was created when it was not.
 */
public void createTable(HTableDescriptor desc)
throws IOException {
  if (!isMasterRunning()) {
    throw new MasterNotRunningException();
  }
  // A new table starts with a single region spanning the whole key space.
  HRegionInfo newRegion = new HRegionInfo(desc, null, null);
  for (int tries = 0; tries < numRetries; tries++) {
    try {
      // We can not access meta regions if they have not already been
      // assigned and scanned. waitForMetaRegionsOrClose() returning true
      // means we are shutting down; fail loudly (consistent with
      // TableOperation's constructor) rather than pretending success.
      if (this.metaScannerThread.waitForMetaRegionsOrClose()) {
        throw new MasterNotRunningException();
      }
      createTable(newRegion);
      LOG.info("created table " + desc.getName());
      break;
    } catch (IOException e) {
      // The shutdown signal is deliberate; never retry it.
      if (e instanceof MasterNotRunningException) {
        throw e;
      }
      if (tries == numRetries - 1) {
        throw RemoteExceptionHandler.checkIOException(e);
      }
    }
    sleeper.sleep();
  }
}
/*
 * Creates the table's first region and its META row.
 * 1. Verifies no region for the table already exists (else throws
 *    TableExistsException).
 * 2. Creates the HRegion on disk, inserts its HRegionInfo into META,
 *    closes the region (flushing it), and queues it for assignment.
 * Concurrent creations of the same table are rejected via the
 * tableInCreation set.
 * @param newRegion first (whole-key-space) region of the new table
 * @throws IOException
 */
private void createTable(final HRegionInfo newRegion) throws IOException {
  Text tableName = newRegion.getTableDesc().getName();
  // Set.add returns false when the name is already present. Using the
  // return value instead of the previous contains()-then-add() sequence
  // removes the check-then-act race the old TODO pointed out (fully safe
  // only if tableInCreation is a synchronized/concurrent set — TODO
  // confirm its declaration).
  if (!tableInCreation.add(tableName)) {
    throw new TableExistsException("Table " + tableName + " in process "
    + "of being created");
  }
  try {
    // 1. Check to see if table already exists. Get meta region where
    // table would sit should it exist. Open scanner on it. If a region
    // for the table we want to create already exists, then table already
    // created. Throw already-exists exception.
    MetaRegion m = null;
    synchronized (onlineMetaRegions) {
      // NOTE(review): the exact-match probe uses the region name while
      // the headMap probe uses the table name — confirm this asymmetry
      // is intended (compare ProcessRegionStatusChange.getMetaServer()).
      m = (onlineMetaRegions.size() == 1 ?
        onlineMetaRegions.get(onlineMetaRegions.firstKey()) :
        (onlineMetaRegions.containsKey(newRegion.getRegionName()) ?
          onlineMetaRegions.get(newRegion.getRegionName()) :
          onlineMetaRegions.get(onlineMetaRegions.headMap(
            newRegion.getTableDesc().getName()).lastKey())));
    }
    Text metaRegionName = m.getRegionName();
    HRegionInterface server = connection.getHRegionConnection(m.getServer());
    long scannerid = server.openScanner(metaRegionName, COL_REGIONINFO_ARRAY,
      tableName, HConstants.LATEST_TIMESTAMP, null);
    try {
      HbaseMapWritable data = server.next(scannerid);
      // Test data and that the row for the data is for our table. If table
      // does not exist, scanner will return row after where our table would
      // be inserted if it exists so look for exact match on table name.
      if (data != null && data.size() > 0) {
        for (Writable k: data.keySet()) {
          if (HRegionInfo.getTableNameFromRegionName(
              ((HStoreKey) k).getRow()).equals(tableName)) {
            // Then a region for this table already exists. Ergo table exists.
            throw new TableExistsException(tableName.toString());
          }
        }
      }
    } finally {
      server.close(scannerid);
    }
    // 2. Create the HRegion
    HRegion region =
      HRegion.createHRegion(newRegion, this.rootdir, this.conf);
    // 3. Insert into meta
    HRegionInfo info = region.getRegionInfo();
    Text regionName = region.getRegionName();
    BatchUpdate b = new BatchUpdate(rand.nextLong());
    long lockid = b.startUpdate(regionName);
    b.put(lockid, COL_REGIONINFO, Writables.getBytes(info));
    server.batchUpdate(metaRegionName, b);
    // 4. Close the new region to flush it to disk. Close its log file too.
    region.close();
    region.getLog().closeAndDelete();
    // 5. Get it assigned to a server
    this.unassignedRegions.put(info, ZERO_L);
  } finally {
    tableInCreation.remove(newRegion.getTableDesc().getName());
  }
}
/** {@inheritDoc} */
public void deleteTable(Text tableName) throws IOException {
  // Delegate to the TableOperation subclass that removes the table's
  // META rows and on-disk regions.
  TableDelete operation = new TableDelete(tableName);
  operation.process();
  LOG.info("deleted table: " + tableName);
}
/** {@inheritDoc} */
public void addColumn(Text tableName, HColumnDescriptor column)
throws IOException {
  // Delegate to the AddColumn table operation.
  AddColumn operation = new AddColumn(tableName, column);
  operation.process();
}
/** {@inheritDoc} */
public void modifyColumn(Text tableName, Text columnName,
HColumnDescriptor descriptor)
throws IOException {
  // Delegate to the ModifyColumn table operation.
  ModifyColumn operation =
    new ModifyColumn(tableName, columnName, descriptor);
  operation.process();
}
/** {@inheritDoc} */
public void deleteColumn(Text tableName, Text columnName) throws IOException {
  // Reduce the supplied column name to its family via
  // HStoreKey.extractFamily before delegating.
  Text family = HStoreKey.extractFamily(columnName);
  DeleteColumn operation = new DeleteColumn(tableName, family);
  operation.process();
}
/** {@inheritDoc} */
public void enableTable(Text tableName) throws IOException {
  // Enabling is a table state change applied by ChangeTableState.
  ChangeTableState operation = new ChangeTableState(tableName, true);
  operation.process();
}
/** {@inheritDoc} */
public void disableTable(Text tableName) throws IOException {
  // Disabling is a table state change applied by ChangeTableState.
  ChangeTableState operation = new ChangeTableState(tableName, false);
  operation.process();
}
/** {@inheritDoc} */
public HServerAddress findRootRegion() {
  // Simply report whatever address is currently recorded for the root
  // region holder.
  HServerAddress rootServer = rootRegionLocation.get();
  return rootServer;
}
/*
 * Helper classes for HMasterInterface
 */
/**
 * Template for operations (enable/disable/delete/column changes) that
 * scan the META rows belonging to a table and then mutate them.
 * Subclasses supply processScanItem() (per-row check/collection) and
 * postProcessMeta() (the actual mutation after each region's scan).
 */
private abstract class TableOperation {
// META regions that may contain rows for this.tableName.
private Set<MetaRegion> metaRegions;
protected Text tableName;
// Regions of the table that no live server is currently serving;
// populated during the scan, cleared after each postProcessMeta().
protected Set<HRegionInfo> unservedRegions;
protected TableOperation(Text tableName) throws IOException {
if (!isMasterRunning()) {
throw new MasterNotRunningException();
}
this.metaRegions = new HashSet<MetaRegion>();
this.tableName = tableName;
this.unservedRegions = new HashSet<HRegionInfo>();
// We can not access any meta region if they have not already been
// assigned and scanned.
if (metaScannerThread.waitForMetaRegionsOrClose()) {
throw new MasterNotRunningException(); // We're shutting down. Forget it.
}
// Find the first META region that could hold rows for tableName:
// the only region, an exact key match, or the closest preceding key.
Text firstMetaRegion = null;
synchronized (onlineMetaRegions) {
if (onlineMetaRegions.size() == 1) {
firstMetaRegion = onlineMetaRegions.firstKey();
} else if (onlineMetaRegions.containsKey(tableName)) {
firstMetaRegion = tableName;
} else {
firstMetaRegion = onlineMetaRegions.headMap(tableName).lastKey();
}
this.metaRegions.addAll(onlineMetaRegions.tailMap(
firstMetaRegion).values());
}
}
/**
 * Scans each candidate META region for rows of this table, invoking the
 * subclass hooks, with retries on IOException. Throws
 * TableNotFoundException when no row for the table exists; rethrows
 * "definitive" exceptions immediately without retrying.
 */
void process() throws IOException {
for (int tries = 0; tries < numRetries; tries++) {
boolean tableExists = false;
try {
synchronized(metaScannerLock) { // Prevent meta scanner from running
for (MetaRegion m: metaRegions) {
// Get a connection to a meta server
HRegionInterface server =
connection.getHRegionConnection(m.getServer());
// Open a scanner on the meta region
long scannerId =
server.openScanner(m.getRegionName(), COLUMN_FAMILY_ARRAY,
tableName, HConstants.LATEST_TIMESTAMP, null);
List<Text> emptyRows = new ArrayList<Text>();
try {
while (true) {
HbaseMapWritable values = server.next(scannerId);
if(values == null || values.size() == 0) {
break;
}
RowMap rm = toRowMap(values);
SortedMap<Text, byte[]> map = rm.getMap();
Text row = rm.getRow();
HRegionInfo info = getHRegionInfo(row, map);
if (info == null) {
// Row lacks a region-info cell; remember it for cleanup.
emptyRows.add(row);
LOG.error(COL_REGIONINFO + " not found on " + row);
continue;
}
String serverName = Writables.bytesToString(map.get(COL_SERVER));
long startCode = Writables.bytesToLong(map.get(COL_STARTCODE));
if (info.getTableDesc().getName().compareTo(tableName) > 0) {
break; // Beyond any more entries for this table
}
tableExists = true;
if (!isBeingServed(serverName, startCode)) {
unservedRegions.add(info);
}
processScanItem(serverName, startCode, info);
} // while(true)
} finally {
if (scannerId != -1L) {
try {
server.close(scannerId);
} catch (IOException e) {
e = RemoteExceptionHandler.checkIOException(e);
LOG.error("closing scanner", e);
}
}
scannerId = -1L;
}
// Get rid of any rows that have a null HRegionInfo
if (emptyRows.size() > 0) {
LOG.warn("Found " + emptyRows.size() +
" rows with empty HRegionInfo while scanning meta region " +
m.getRegionName());
deleteEmptyMetaRows(server, m.getRegionName(), emptyRows);
}
if (!tableExists) {
throw new TableNotFoundException(tableName + " does not exist");
}
postProcessMeta(m, server);
unservedRegions.clear();
} // for(MetaRegion m:)
} // synchronized(metaScannerLock)
} catch (IOException e) {
// These represent definitive outcomes, not transient failures:
// rethrow without retrying.
if (e instanceof TableNotFoundException ||
e instanceof TableNotDisabledException ||
e instanceof InvalidColumnNameException) {
throw e;
}
if (tries == numRetries - 1) {
// No retries left
checkFileSystem();
throw RemoteExceptionHandler.checkIOException(e);
}
sleeper.sleep();
continue;
}
break;
} // for(tries...)
}
/**
 * @return true when serverName/startCode identify a currently-known
 * live server; a matching start code means the same server incarnation.
 */
protected boolean isBeingServed(String serverName, long startCode) {
boolean result = false;
if (serverName != null && serverName.length() > 0 && startCode != -1L) {
HServerInfo s = serversToServerInfo.get(serverName);
result = s != null && s.getStartCode() == startCode;
}
return result;
}
/** @return true when the region has not been marked offline. */
protected boolean isEnabled(HRegionInfo info) {
return !info.isOffline();
}
/** Called once per scanned META row belonging to the table. */
protected abstract void processScanItem(String serverName, long startCode,
HRegionInfo info) throws IOException;
/** Called after each META region scan to apply the operation. */
protected abstract void postProcessMeta(MetaRegion m,
HRegionInterface server) throws IOException;
}
/** Instantiated to enable or disable a table */
private class ChangeTableState extends TableOperation {
// Desired state: true = bring the table online, false = take it offline.
private boolean online;
// Regions of the table currently being served, grouped by server name.
protected Map<String, HashSet<HRegionInfo>> servedRegions =
new HashMap<String, HashSet<HRegionInfo>>();
// Lock id of the BatchUpdate being built; set in postProcessMeta and
// read by updateRegionInfo.
protected long lockid;
ChangeTableState(Text tableName, boolean onLine) throws IOException {
super(tableName);
this.online = onLine;
}
/** Collects currently-served regions into servedRegions, by server. */
@Override
protected void processScanItem(String serverName, long startCode,
HRegionInfo info) {
if (isBeingServed(serverName, startCode)) {
HashSet<HRegionInfo> regions = servedRegions.get(serverName);
if (regions == null) {
regions = new HashSet<HRegionInfo>();
}
regions.add(info);
servedRegions.put(serverName, regions);
}
}
/**
 * For unserved regions: toggles the offline flag in META, clears the
 * server/startcode cells, and adds to / removes from the unassigned
 * list. For served regions, when disabling: merges them into the
 * per-server kill lists so their servers close them.
 */
@Override
protected void postProcessMeta(MetaRegion m, HRegionInterface server)
throws IOException {
// Process regions not being served
if (LOG.isDebugEnabled()) {
LOG.debug("processing unserved regions");
}
for (HRegionInfo i: unservedRegions) {
if (i.isOffline() && i.isSplit()) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping region " + i.toString() +
" because it is offline because it has been split");
}
continue;
}
// Update meta table
if (LOG.isDebugEnabled()) {
LOG.debug("updating columns in row: " + i.getRegionName());
}
BatchUpdate b = new BatchUpdate(rand.nextLong());
lockid = b.startUpdate(i.getRegionName());
updateRegionInfo(b, i);
b.delete(lockid, COL_SERVER);
b.delete(lockid, COL_STARTCODE);
server.batchUpdate(m.getRegionName(), b);
if (LOG.isDebugEnabled()) {
LOG.debug("updated columns in row: " + i.getRegionName());
}
if (online) { // Bring offline regions on-line
killedRegions.remove(i.getRegionName());
synchronized (unassignedRegions) {
if (!unassignedRegions.containsKey(i)) {
unassignedRegions.put(i, ZERO_L);
}
}
} else { // Prevent region from getting assigned.
unassignedRegions.remove(i);
}
}
// Process regions currently being served
if (LOG.isDebugEnabled()) {
LOG.debug("processing regions currently being served");
}
for (Map.Entry<String, HashSet<HRegionInfo>> e: servedRegions.entrySet()) {
String serverName = e.getKey();
if (online) {
LOG.debug("Already online");
continue; // Already being served
}
// Cause regions being served to be taken off-line and disabled
HashMap<Text, HRegionInfo> localKillList =
new HashMap<Text, HRegionInfo>();
for (HRegionInfo i: e.getValue()) {
if (LOG.isDebugEnabled()) {
LOG.debug("adding region " + i.getRegionName() +
" to local kill list");
}
localKillList.put(i.getRegionName(), i);
}
synchronized (killList) {
// NOTE(review): this local 'killedRegions' shadows the
// 'killedRegions' field used in the online branch above —
// consider renaming for clarity.
HashMap<Text, HRegionInfo> killedRegions = killList.get(serverName);
if (killedRegions != null) {
localKillList.putAll(killedRegions);
}
if (localKillList.size() > 0) {
if (LOG.isDebugEnabled()) {
LOG.debug("inserted local kill list into kill list for server " +
serverName);
}
killList.put(serverName, localKillList);
}
}
}
servedRegions.clear();
}
/** Flips the offline flag to match the requested state and stores it. */
protected void updateRegionInfo(final BatchUpdate b, final HRegionInfo i)
throws IOException {
i.setOffline(!online);
b.put(lockid, COL_REGIONINFO, Writables.getBytes(i));
}
}
/**
* Instantiated to delete a table. Table must be disabled first
*/
private class TableDelete extends TableOperation {
TableDelete(Text tableName) throws IOException {
super(tableName);
}
@Override
protected void processScanItem(
@SuppressWarnings("unused") String serverName,
@SuppressWarnings("unused") long startCode,
final HRegionInfo info) throws IOException {
if (isEnabled(info)) {
throw new TableNotDisabledException("You must first disable table <" +
tableName.toString() + "> before you can drop it");
}
}
@Override
protected void postProcessMeta(MetaRegion m, HRegionInterface server)
throws IOException {
for (HRegionInfo i: unservedRegions) {
// Update meta table
if (LOG.isDebugEnabled()) {
LOG.debug("updating columns in row: " + i.getRegionName());
}
BatchUpdate b = new BatchUpdate(rand.nextLong());
long lockid = b.startUpdate(i.getRegionName());
updateRegionInfo(lockid, b, i);
server.batchUpdate(m.getRegionName(), b);
if (LOG.isDebugEnabled()) {
LOG.debug("updated columns in row: " + i.getRegionName());
}
// Delete the region
try {
HRegion.deleteRegion(fs, rootdir, i);
} catch (IOException e) {
LOG.error("failed to delete region " + i.getRegionName(),
RemoteExceptionHandler.checkIOException(e));
}
}
}
private void updateRegionInfo(long lockid, BatchUpdate b,
@SuppressWarnings("unused") HRegionInfo info) {
for (int i = 0; i < ALL_META_COLUMNS.length; i++) {
// Be sure to clean all cells
b.delete(lockid, ALL_META_COLUMNS[i]);
}
}
}
/** Common base for operations that alter a table's column schema. */
private abstract class ColumnOperation extends TableOperation {
  protected ColumnOperation(Text tableName) throws IOException {
    super(tableName);
  }

  @Override
  protected void processScanItem(
      @SuppressWarnings("unused") String serverName,
      @SuppressWarnings("unused") long startCode,
      final HRegionInfo info) throws IOException {
    // Schema changes are only permitted on disabled tables.
    if (isEnabled(info)) {
      throw new TableNotDisabledException(tableName.toString());
    }
  }

  // Rewrites the region's COL_REGIONINFO cell with its (already
  // modified) descriptor.
  protected void updateRegionInfo(HRegionInterface server, Text regionName,
      HRegionInfo i) throws IOException {
    BatchUpdate batch = new BatchUpdate(rand.nextLong());
    long lockId = batch.startUpdate(i.getRegionName());
    batch.put(lockId, COL_REGIONINFO, Writables.getBytes(i));
    server.batchUpdate(regionName, batch);
    if (LOG.isDebugEnabled()) {
      LOG.debug("updated columns in row: " + i.getRegionName());
    }
  }
}
/** Instantiated to remove a column family from a table */
private class DeleteColumn extends ColumnOperation {
  private Text columnName;

  DeleteColumn(Text tableName, Text columnName) throws IOException {
    super(tableName);
    this.columnName = columnName;
  }

  @Override
  protected void postProcessMeta(MetaRegion m, HRegionInterface server)
  throws IOException {
    Path tableDir = new Path(rootdir, tableName.toString());
    for (HRegionInfo regionInfo: unservedRegions) {
      // Drop the family from the schema and persist the new descriptor.
      regionInfo.getTableDesc().families().remove(columnName);
      updateRegionInfo(server, m.getRegionName(), regionInfo);
      // Delete the directories used by the column
      String encodedName = regionInfo.getEncodedName();
      fs.delete(HStoreFile.getMapDir(tableDir, encodedName, columnName));
      fs.delete(HStoreFile.getInfoDir(tableDir, encodedName, columnName));
    }
  }
}
/** Instantiated to add a column family to a table */
private class AddColumn extends ColumnOperation {
  private HColumnDescriptor newColumn;

  AddColumn(Text tableName, HColumnDescriptor newColumn) throws IOException {
    super(tableName);
    this.newColumn = newColumn;
  }

  @Override
  protected void postProcessMeta(MetaRegion m, HRegionInterface server)
  throws IOException {
    // All we need to do to add a column is add it to the table descriptor.
    // When the region is brought on-line, it will find the column missing
    // and create it.
    for (HRegionInfo regionInfo: unservedRegions) {
      regionInfo.getTableDesc().addFamily(newColumn);
      updateRegionInfo(server, m.getRegionName(), regionInfo);
    }
  }
}
/** Instantiated to modify an existing column family on a table */
private class ModifyColumn extends ColumnOperation {
  private HColumnDescriptor descriptor;
  private Text columnName;

  ModifyColumn(Text tableName, Text columnName, HColumnDescriptor _descriptor)
  throws IOException {
    super(tableName);
    this.descriptor = _descriptor;
    this.columnName = columnName;
  }

  @Override
  protected void postProcessMeta(MetaRegion m, HRegionInterface server)
  throws IOException {
    for (HRegionInfo regionInfo: unservedRegions) {
      // Look the family up in the region's schema; a missing (or
      // null-mapped) entry is an error.
      Map<Text, HColumnDescriptor> families =
        regionInfo.getTableDesc().families();
      if (families.get(columnName) == null) {
        throw new InvalidColumnNameException("Column family '" + columnName +
          "' doesn't exist, so cannot be modified.");
      }
      // Replace the existing descriptor with the new version and persist.
      families.put(columnName, descriptor);
      updateRegionInfo(server, m.getRegionName(), regionInfo);
    }
  }
}
/*
 * Managing leases
 */
/** Instantiated to monitor the health of a region server */
private class ServerExpirer implements LeaseListener {
@SuppressWarnings("hiding")
private String server;
ServerExpirer(String server) {
this.server = server;
}
/**
 * Called when the region server's lease times out: removes the server
 * from the live-server and load maps, unassigns the root region if
 * this server was serving it, records it as dead, and queues a
 * ProcessServerShutdown operation on the delayedToDoQueue.
 */
public void leaseExpired() {
LOG.info(server + " lease expired");
// Remove the server from the known servers list and update load info
HServerInfo info = serversToServerInfo.remove(server);
if (info != null) {
HServerAddress root = rootRegionLocation.get();
if (root != null && root.equals(info.getServerAddress())) {
unassignRootRegion();
}
String serverName = info.getServerAddress().toString();
HServerLoad load = serversToLoad.remove(serverName);
if (load != null) {
// Also drop the server from the load-ordered index.
Set<String> servers = loadToServers.get(load);
if (servers != null) {
servers.remove(serverName);
loadToServers.put(load, servers);
}
}
deadServers.add(server);
}
// Wake any threads waiting on serversToServerInfo.
synchronized (serversToServerInfo) {
serversToServerInfo.notifyAll();
}
// NOTE: If the server was serving the root region, we cannot reassign it
// here because the new server will start serving the root region before
// the ProcessServerShutdown operation has a chance to split the log file.
if (info != null) {
delayedToDoQueue.put(new ProcessServerShutdown(info));
}
}
}
/**
 * @return Return configuration being used by this server.
 */
public HBaseConfiguration getConfiguration() {
  // Expose the live configuration object; callers get the same instance
  // the master itself uses.
  return conf;
}
/*
 * Data structure used to return results out of the toRowMap method.
 */
private static class RowMap {
  // Row key shared by every cell in the map.
  private final Text rowKey;
  // Column name -> cell value for that row.
  private final SortedMap<Text, byte[]> columns;

  /**
   * Constructor
   *
   * @param r the row
   * @param m the map of column names to values
   */
  RowMap(final Text r, final SortedMap<Text, byte[]> m) {
    rowKey = r;
    columns = m;
  }

  /** @return the row */
  Text getRow() {
    return rowKey;
  }

  /** @return the column value map */
  SortedMap<Text, byte[]> getMap() {
    return columns;
  }
}
/*
 * Convert an HbaseMapWritable to a Map keyed by column.
 * Utility method used scanning meta regions
 * @param mw The MapWritable to convert. Cannot be null.
 * @return Returns a SortedMap currently. TODO: This looks like it could
 * be a plain Map.
 */
protected RowMap toRowMap(final HbaseMapWritable mw) {
  if (mw == null) {
    throw new IllegalArgumentException("Passed MapWritable cannot be null");
  }
  SortedMap<Text, byte[]> columns = new TreeMap<Text, byte[]>();
  Text firstRow = null;
  for (Map.Entry<Writable, Writable> entry: mw.entrySet()) {
    HStoreKey storeKey = (HStoreKey) entry.getKey();
    Text currentRow = storeKey.getRow();
    if (firstRow == null) {
      firstRow = currentRow;
    } else if (!firstRow.equals(currentRow)) {
      // Every cell in one scanner result is expected to share a row key;
      // log (but tolerate) any mismatch.
      LOG.error("Multiple rows in same scanner result set. firstRow=" +
        firstRow + ", currentRow=" + currentRow);
    }
    columns.put(storeKey.getColumn(),
      ((ImmutableBytesWritable) entry.getValue()).get());
  }
  return new RowMap(firstRow, columns);
}
/*
 * Get HRegionInfo from passed META map of row values.
 * Returns null if none found (and logs fact that expected COL_REGIONINFO
 * was missing). Utility method used by scanners of META tables.
 * @param row name of the row
 * @param map Map to do lookup in.
 * @return Null or found HRegionInfo.
 * @throws IOException
 */
protected HRegionInfo getHRegionInfo(final Text row,
  final Map<Text, byte[]> map)
throws IOException {
  byte [] serialized = map.get(COL_REGIONINFO);
  if (serialized != null) {
    // Deserialize the stored HRegionInfo from its cell bytes.
    return (HRegionInfo)Writables.getWritable(serialized, new HRegionInfo());
  }
  LOG.warn(COL_REGIONINFO.toString() + " is empty for row: " + row +
    "; has keys: " + map.keySet().toString());
  return null;
}
/*
 * When we find rows in a meta region that has an empty HRegionInfo, we
 * clean them up here.
 *
 * @param server connection to server serving meta region
 * @param metaRegionName name of the meta region we scanned
 * @param emptyRows the row keys that had empty HRegionInfos
 */
protected void deleteEmptyMetaRows(HRegionInterface server,
    Text metaRegionName,
    List<Text> emptyRows) {
  for (Text regionName: emptyRows) {
    removeEmptyMetaRow(server, metaRegionName, regionName);
  }
}

// Removes one empty row, logging success or failure; failures never
// abort processing of the remaining rows.
private void removeEmptyMetaRow(HRegionInterface server,
    Text metaRegionName, Text regionName) {
  try {
    HRegion.removeRegionFromMETA(server, metaRegionName, regionName);
    LOG.warn("Removed region: " + regionName + " from meta region: " +
      metaRegionName + " because HRegionInfo was empty");
  } catch (IOException e) {
    LOG.error("deleting region: " + regionName + " from meta region: " +
      metaRegionName, e);
  }
}
/*
 * Main program
 */
// Prints usage to stderr and exits. Fix: the usage line previously
// printed "org.apache.hbase.HMaster", which does not match this class's
// actual package (org.apache.hadoop.hbase), so the printed command
// would not work if copy-pasted.
private static void printUsageAndExit() {
  System.err.println("Usage: java org.apache.hadoop.hbase.HMaster " +
    "[--bind=hostname:port] start|stop");
  System.exit(0);
}
/**
 * Parses command-line arguments and starts or stops a master (or a
 * LocalHBaseCluster when the configuration says to run locally).
 * Recognized arguments: --bind=hostname:port, start, stop; anything
 * else prints usage and exits.
 * @param args command line arguments
 * @param masterClass concrete HMaster subclass to instantiate; must
 * have a public constructor taking an HBaseConfiguration
 */
protected static void doMain(String [] args,
Class<? extends HMaster> masterClass) {
if (args.length < 1) {
printUsageAndExit();
}
HBaseConfiguration conf = new HBaseConfiguration();
// Process command-line args. TODO: Better cmd-line processing
// (but hopefully something not as painful as cli options).
final String addressArgKey = "--bind=";
for (String cmd: args) {
if (cmd.startsWith(addressArgKey)) {
conf.set(MASTER_ADDRESS, cmd.substring(addressArgKey.length()));
continue;
}
if (cmd.equals("start")) {
try {
// If 'local', defer to LocalHBaseCluster instance.
if (LocalHBaseCluster.isLocal(conf)) {
(new LocalHBaseCluster(conf)).startup();
} else {
// Instantiate the requested subclass reflectively so subclasses
// can reuse this driver.
Constructor<? extends HMaster> c =
masterClass.getConstructor(HBaseConfiguration.class);
HMaster master = c.newInstance(conf);
master.start();
}
} catch (Throwable t) {
LOG.error( "Can not start master", t);
System.exit(-1);
}
break;
}
if (cmd.equals("stop")) {
try {
// NOTE(review): LocalHBaseCluster.doLocal(conf)'s return value is
// ignored here — confirm whether the returned configuration should
// be used for the HBaseAdmin below.
if (LocalHBaseCluster.isLocal(conf)) {
LocalHBaseCluster.doLocal(conf);
}
HBaseAdmin adm = new HBaseAdmin(conf);
adm.shutdown();
} catch (Throwable t) {
LOG.error( "Can not stop master", t);
System.exit(-1);
}
break;
}
// Print out usage if we get to here.
printUsageAndExit();
}
}
/**
 * Main program
 * @param args
 */
public static void main(String [] args) {
  // Delegate to the shared driver, instantiating HMaster itself.
  HMaster.doMain(args, HMaster.class);
}
}