Skip to content

Commit 6f213e9

Browse files
committed
HBASE-24075: Fix a race between master shutdown and metrics (re)init
JMXCacheBuster resets the metrics state at various points in time. These resets can race with a master shutdown. While the master is tearing down, metrics initialization can touch a lot of unsafe state, for example invalidated FS objects. To avoid this, this patch makes getMetrics() a no-op when the master is either stopped or in the process of shutting down. Additionally, getClusterId() is made a no-op when the server is shutting down. Writing a test that simulates this race is a bit tricky, but with the patch I no longer see the long stack traces from the JIRA locally. Signed-off-by: Michael Stack <[email protected]>
1 parent e5a8435 commit 6f213e9

File tree

6 files changed

+28
-9
lines changed

6 files changed

+28
-9
lines changed

hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@
3030
@InterfaceAudience.Private
3131
public interface MetricsMasterWrapper {
3232

33+
/**
34+
* Returns if the master is currently running and is not attempting to shutdown.
35+
*/
36+
boolean isRunning();
37+
3338
/**
3439
* Get ServerName
3540
*/

hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,9 @@ public void getMetrics(MetricsCollector metricsCollector, boolean all) {
8383
MetricsRecordBuilder metricsRecordBuilder = metricsCollector.addRecord(metricsName);
8484

8585
// masterWrapper can be null because this function is called inside of init.
86-
if (masterWrapper != null) {
86+
// If the master is already stopped or has initiated a shutdown, no point in registering the
87+
// metrics again.
88+
if (masterWrapper != null && masterWrapper.isRunning()) {
8789

8890
// Pair<online region number, offline region number>
8991
PairOfSameType<Integer> regionNumberPair = masterWrapper.getRegionCounts();

hbase-server/src/main/java/org/apache/hadoop/hbase/master/CachedClusterId.java

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import org.apache.hadoop.fs.FileSystem;
2626
import org.apache.hadoop.fs.Path;
2727
import org.apache.hadoop.hbase.ClusterId;
28+
import org.apache.hadoop.hbase.Server;
2829
import org.apache.hadoop.hbase.util.FSUtils;
2930
import org.apache.yetus.audience.InterfaceAudience;
3031
import org.slf4j.Logger;
@@ -46,8 +47,8 @@ public class CachedClusterId {
4647
public static final Logger LOG = LoggerFactory.getLogger(CachedClusterId.class);
4748
private static final int MAX_FETCH_TIMEOUT_MS = 10000;
4849

49-
private Path rootDir;
50-
private FileSystem fs;
50+
private final Path rootDir;
51+
private final FileSystem fs;
5152

5253
// When true, indicates that a FileSystem fetch of ClusterID is in progress. This is used to
5354
// avoid multiple fetches from FS and let only one thread fetch the information.
@@ -58,12 +59,15 @@ public class CachedClusterId {
5859
// Immutable once set and read multiple times.
5960
private ClusterId clusterId;
6061

62+
private final Server server;
63+
6164
// cache stats for testing.
6265
private AtomicInteger cacheMisses = new AtomicInteger(0);
6366

64-
public CachedClusterId(Configuration conf) throws IOException {
65-
rootDir = FSUtils.getRootDir(conf);
66-
fs = rootDir.getFileSystem(conf);
67+
public CachedClusterId(Server server, Configuration conf) throws IOException {
68+
this.rootDir = FSUtils.getRootDir(conf);
69+
this.fs = rootDir.getFileSystem(conf);
70+
this.server = server;
6771
}
6872

6973
/**
@@ -130,9 +134,12 @@ private void waitForFetchToFinish() throws InterruptedException {
130134
* trying get from a clean cache.
131135
*
132136
* @return ClusterId by reading from FileSystem or null in any error case or cluster ID does
133-
* not exist on the file system.
137+
* not exist on the file system or if the server initiated a tear down.
134138
*/
135139
public String getFromCacheOrFetch() {
140+
if (server.isStopping() || server.isStopped()) {
141+
return null;
142+
}
136143
String id = getClusterId();
137144
if (id != null) {
138145
return id;

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,7 @@ public HMaster(final Configuration conf) throws IOException {
586586
this.metaRegionLocationCache = null;
587587
this.activeMasterManager = null;
588588
}
589-
cachedClusterId = new CachedClusterId(conf);
589+
cachedClusterId = new CachedClusterId(this, conf);
590590
} catch (Throwable t) {
591591
// Make sure we log the exception. HMaster is often started via reflection and the
592592
// cause of failed startup is lost.

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,10 @@ public int getNumDeadRegionServers() {
134134
return serverManager.getDeadServers().size();
135135
}
136136

137+
@Override public boolean isRunning() {
138+
return !(master.isStopped() || master.isStopping());
139+
}
140+
137141
@Override
138142
public String getServerName() {
139143
ServerName serverName = master.getServerName();

hbase-server/src/test/java/org/apache/hadoop/hbase/TestCachedClusterId.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ public void testClusterIdMatch() {
7676
@Test
7777
public void testMultiThreadedGetClusterId() throws Exception {
7878
Configuration conf = TEST_UTIL.getConfiguration();
79-
CachedClusterId cachedClusterId = new CachedClusterId(conf);
79+
CachedClusterId cachedClusterId = new CachedClusterId(TEST_UTIL.getHBaseCluster().getMaster(),
80+
conf);
8081
TestContext context = new TestContext(conf);
8182
int numThreads = 16;
8283
for (int i = 0; i < numThreads; i++) {

0 commit comments

Comments
 (0)