Skip to content

Commit 42f1ec0

Browse files
committed
HBASE-25032 Wait for region server to become online before adding it to online servers in Master
1 parent 202b17f commit 42f1ec0

File tree

7 files changed

+32
-34
lines changed

7 files changed

+32
-34
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,8 @@ public boolean unregisterListener(final ServerListener listener) {
204204
}
205205

206206
/**
207-
* Let the server manager know a new regionserver has come online
207+
* Let the server manager know a regionserver is requesting configurations.
208+
* Regionserver will not be added here, but in its first report.
208209
* @param request the startup request
209210
* @param versionNumber the version number of the new regionserver
210211
* @param version the version of the new regionserver, could contain strings like "SNAPSHOT"
@@ -227,10 +228,6 @@ ServerName regionServerStartup(RegionServerStartupRequest request, int versionNu
227228
ServerName sn = ServerName.valueOf(hostname, request.getPort(), request.getServerStartCode());
228229
checkClockSkew(sn, request.getServerCurrentTime());
229230
checkIsDead(sn, "STARTUP");
230-
if (!checkAndRecordNewServer(sn, ServerMetricsBuilder.of(sn, versionNumber, version))) {
231-
LOG.warn(
232-
"THIS SHOULD NOT HAPPEN, RegionServerStartup" + " could not record the server: " + sn);
233-
}
234231
return sn;
235232
}
236233

@@ -281,7 +278,6 @@ public void regionServerReport(ServerName sn,
281278
if (null == this.onlineServers.replace(sn, sl)) {
282279
// Already have this host+port combo and its just different start code?
283280
// Just let the server in. Presume master joining a running cluster.
284-
// recordNewServer is what happens at the end of reportServerStartup.
285281
// The only thing we are skipping is passing back to the regionserver
286282
// the ServerName to use. Here we presume a master has already done
287283
// that so we'll press on with whatever it gave us for ServerName.

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,8 +1014,8 @@ public void run() {
10141014
// node was created, in case any coprocessors want to use ZooKeeper
10151015
this.rsHost = new RegionServerCoprocessorHost(this, this.conf);
10161016

1017-
// Try and register with the Master; tell it we are here. Break if server is stopped or
1018-
// the clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and
1017+
// Get configurations from the Master. Break if server is stopped or
1018+
// the clusterup flag is down or hdfs went wacky. Then start up all Services.
10191019
// start up all Services. Use RetryCounter to get backoff in case Master is struggling to
10201020
// come up.
10211021
LOG.debug("About to register with Master.");
@@ -1050,7 +1050,7 @@ public void run() {
10501050
}
10511051
}
10521052

1053-
// We registered with the Master. Go into run mode.
1053+
// Run mode.
10541054
long lastMsg = System.currentTimeMillis();
10551055
long oldRequestCount = -1;
10561056
// The main run loop.
@@ -1084,7 +1084,14 @@ public void run() {
10841084
}
10851085
long now = System.currentTimeMillis();
10861086
if ((now - lastMsg) >= msgInterval) {
1087-
tryRegionServerReport(lastMsg, now);
1087+
// Register with the Master now that our setup is complete.
1088+
if (tryRegionServerReport(lastMsg, now) && !online.get()) {
1089+
// Wake up anyone waiting for this server to online
1090+
synchronized (online) {
1091+
online.set(true);
1092+
online.notifyAll();
1093+
}
1094+
}
10881095
lastMsg = System.currentTimeMillis();
10891096
}
10901097
if (!isStopped() && !isAborted()) {
@@ -1253,12 +1260,12 @@ private long getWriteRequestCount() {
12531260
}
12541261

12551262
@InterfaceAudience.Private
1256-
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
1263+
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
12571264
throws IOException {
12581265
RegionServerStatusService.BlockingInterface rss = rssStub;
12591266
if (rss == null) {
12601267
// the current server could be stopping.
1261-
return;
1268+
return false;
12621269
}
12631270
ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime, reportEndTime);
12641271
try {
@@ -1278,7 +1285,9 @@ protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
12781285
// Couldn't connect to the master, get location from zk and reconnect
12791286
// Method blocks until new master is found or we are stopped
12801287
createRegionServerStatusStub(true);
1288+
return false;
12811289
}
1290+
return true;
12821291
}
12831292

12841293
/**
@@ -1653,11 +1662,6 @@ protected void handleReportForDutyResponse(final RegionServerStartupResponse c)
16531662
", sessionid=0x" +
16541663
Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId()));
16551664

1656-
// Wake up anyone waiting for this server to online
1657-
synchronized (online) {
1658-
online.set(true);
1659-
online.notifyAll();
1660-
}
16611665
} catch (Throwable e) {
16621666
stop("Failed initialization");
16631667
throw convertThrowableToIOE(cleanup(e, "Failed init"),
@@ -2836,10 +2840,10 @@ private boolean keepLooping() {
28362840
}
28372841

28382842
/*
2839-
* Let the master know we're here Run initialization using parameters passed
2843+
* Run initialization using parameters passed us by the master.
28402844
* us by the master.
28412845
* @return A Map of key/value configurations we got from the Master else
2842-
* null if we failed to register.
2846+
* null if we failed during report.
28432847
* @throws IOException
28442848
*/
28452849
private RegionServerStartupResponse reportForDuty() throws IOException {

hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -446,8 +446,9 @@ public JVMClusterUtil.RegionServerThread startRegionServerAndWait(long timeout)
446446
ServerName rsServerName = t.getRegionServer().getServerName();
447447

448448
long start = System.currentTimeMillis();
449-
ClusterMetrics clusterStatus = getClusterMetrics();
449+
ClusterMetrics clusterStatus;
450450
while ((System.currentTimeMillis() - start) < timeout) {
451+
clusterStatus = getClusterMetrics();
451452
if (clusterStatus != null && clusterStatus.getLiveServerMetrics().containsKey(rsServerName)) {
452453
return t;
453454
}

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,9 @@ public MyMaster(Configuration conf) throws IOException, KeeperException, Interru
5858
}
5959

6060
@Override
61-
protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
61+
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) {
6262
// do nothing
63+
return true;
6364
}
6465
}
6566

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -61,23 +61,13 @@ public static class MyMaster extends HMaster {
6161
public MyMaster(Configuration conf) throws IOException, KeeperException, InterruptedException {
6262
super(conf);
6363
}
64-
65-
@Override
66-
protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
67-
// do nothing
68-
}
6964
}
7065

7166
public static class MyRegionServer extends MiniHBaseCluster.MiniHBaseClusterRegionServer {
7267

7368
public MyRegionServer(Configuration conf) throws IOException, InterruptedException {
7469
super(conf);
7570
}
76-
77-
@Override
78-
protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
79-
// do nothing
80-
}
8171
}
8272

8373
@BeforeClass
@@ -108,11 +98,14 @@ public void testClusterRequests() throws Exception {
10898
request.setServer(ProtobufUtil.toServerName(serverName));
10999
long expectedRequestNumber = 10000;
110100

111-
MetricsMasterSource masterSource = master.getMasterMetrics().getMetricsSource();
112101
ClusterStatusProtos.ServerLoad sl = ClusterStatusProtos.ServerLoad.newBuilder()
113102
.setTotalNumberOfRequests(expectedRequestNumber).build();
114103
request.setLoad(sl);
115104

105+
MetricsMasterSource masterSource = master.getMasterMetrics().getMetricsSource();
106+
// Init master source again to reset cluster requests counter
107+
masterSource.init();
108+
116109
master.getMasterRpcServices().regionServerReport(null, request.build());
117110
metricsHelper.assertCounter("cluster_requests", expectedRequestNumber, masterSource);
118111

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,14 @@ public IgnoreYouAreDeadRS(Configuration conf) throws IOException, InterruptedExc
8484
}
8585

8686
@Override
87-
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
87+
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
8888
throws IOException {
8989
try {
9090
super.tryRegionServerReport(reportStartTime, reportEndTime);
9191
} catch (YouAreDeadException e) {
9292
// ignore, do not abort
9393
}
94+
return true;
9495
}
9596
}
9697

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,13 +85,15 @@ public MyRegionServer(Configuration conf) throws IOException {
8585
}
8686

8787
@Override
88-
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
88+
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
8989
throws IOException {
9090
try {
9191
super.tryRegionServerReport(reportStartTime, reportEndTime);
9292
} catch (YouAreDeadException e) {
9393
LOG.info("Caught YouAreDeadException, ignore", e);
94+
return false;
9495
}
96+
return true;
9597
}
9698

9799
@Override

0 commit comments

Comments
 (0)