Skip to content

Commit a628295

Browse files
committed
HBASE-25032 Wait for region server to become online before adding it to online servers in Master
1 parent f6bb4bb commit a628295

File tree

7 files changed

+33
-37
lines changed

7 files changed

+33
-37
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,8 @@ public boolean unregisterListener(final ServerListener listener) {
204204
}
205205

206206
/**
207-
* Let the server manager know a new regionserver has come online
207+
* Let the server manager know a regionserver is requesting configurations.
208+
* Regionserver will not be added here, but in its first report.
208209
* @param request the startup request
209210
* @param versionNumber the version number of the new regionserver
210211
* @param version the version of the new regionserver, could contain strings like "SNAPSHOT"
@@ -227,10 +228,6 @@ ServerName regionServerStartup(RegionServerStartupRequest request, int versionNu
227228
ServerName sn = ServerName.valueOf(hostname, request.getPort(), request.getServerStartCode());
228229
checkClockSkew(sn, request.getServerCurrentTime());
229230
checkIsDead(sn, "STARTUP");
230-
if (!checkAndRecordNewServer(sn, ServerMetricsBuilder.of(sn, versionNumber, version))) {
231-
LOG.warn(
232-
"THIS SHOULD NOT HAPPEN, RegionServerStartup" + " could not record the server: " + sn);
233-
}
234231
return sn;
235232
}
236233

@@ -281,7 +278,6 @@ public void regionServerReport(ServerName sn,
281278
if (null == this.onlineServers.replace(sn, sl)) {
282279
// Already have this host+port combo and its just different start code?
283280
// Just let the server in. Presume master joining a running cluster.
284-
// recordNewServer is what happens at the end of reportServerStartup.
285281
// The only thing we are skipping is passing back to the regionserver
286282
// the ServerName to use. Here we presume a master has already done
287283
// that so we'll press on with whatever it gave us for ServerName.

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,10 +1014,9 @@ public void run() {
10141014
// node was created, in case any coprocessors want to use ZooKeeper
10151015
this.rsHost = new RegionServerCoprocessorHost(this, this.conf);
10161016

1017-
// Try and register with the Master; tell it we are here. Break if server is stopped or
1018-
// the clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and
1019-
// start up all Services. Use RetryCounter to get backoff in case Master is struggling to
1020-
// come up.
1017+
// Get configurations from the Master. Break if server is stopped or
1018+
// the clusterup flag is down or hdfs went wacky. Then start up all Services.
1019+
// Use RetryCounter to get backoff in case Master is struggling to come up.
10211020
LOG.debug("About to register with Master.");
10221021
RetryCounterFactory rcf =
10231022
new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(), 1000 * 60 * 5);
@@ -1050,7 +1049,7 @@ public void run() {
10501049
}
10511050
}
10521051

1053-
// We registered with the Master. Go into run mode.
1052+
// Run mode.
10541053
long lastMsg = System.currentTimeMillis();
10551054
long oldRequestCount = -1;
10561055
// The main run loop.
@@ -1084,7 +1083,14 @@ public void run() {
10841083
}
10851084
long now = System.currentTimeMillis();
10861085
if ((now - lastMsg) >= msgInterval) {
1087-
tryRegionServerReport(lastMsg, now);
1086+
// Register with the Master now that our setup is complete.
1087+
if (tryRegionServerReport(lastMsg, now) && !online.get()) {
1088+
// Wake up anyone waiting for this server to online
1089+
synchronized (online) {
1090+
online.set(true);
1091+
online.notifyAll();
1092+
}
1093+
}
10881094
lastMsg = System.currentTimeMillis();
10891095
}
10901096
if (!isStopped() && !isAborted()) {
@@ -1253,12 +1259,12 @@ private long getWriteRequestCount() {
12531259
}
12541260

12551261
@InterfaceAudience.Private
1256-
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
1262+
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
12571263
throws IOException {
12581264
RegionServerStatusService.BlockingInterface rss = rssStub;
12591265
if (rss == null) {
12601266
// the current server could be stopping.
1261-
return;
1267+
return false;
12621268
}
12631269
ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime, reportEndTime);
12641270
try {
@@ -1278,7 +1284,9 @@ protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
12781284
// Couldn't connect to the master, get location from zk and reconnect
12791285
// Method blocks until new master is found or we are stopped
12801286
createRegionServerStatusStub(true);
1287+
return false;
12811288
}
1289+
return true;
12821290
}
12831291

12841292
/**
@@ -1653,11 +1661,6 @@ protected void handleReportForDutyResponse(final RegionServerStartupResponse c)
16531661
", sessionid=0x" +
16541662
Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId()));
16551663

1656-
// Wake up anyone waiting for this server to online
1657-
synchronized (online) {
1658-
online.set(true);
1659-
online.notifyAll();
1660-
}
16611664
} catch (Throwable e) {
16621665
stop("Failed initialization");
16631666
throw convertThrowableToIOE(cleanup(e, "Failed init"),
@@ -2836,10 +2839,9 @@ private boolean keepLooping() {
28362839
}
28372840

28382841
/*
2839-
* Let the master know we're here Run initialization using parameters passed
2840-
* us by the master.
2842+
* Run initialization using parameters passed us by the master.
28412843
* @return A Map of key/value configurations we got from the Master else
2842-
* null if we failed to register.
2844+
* null if we failed during report.
28432845
* @throws IOException
28442846
*/
28452847
private RegionServerStartupResponse reportForDuty() throws IOException {

hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -446,8 +446,9 @@ public JVMClusterUtil.RegionServerThread startRegionServerAndWait(long timeout)
446446
ServerName rsServerName = t.getRegionServer().getServerName();
447447

448448
long start = System.currentTimeMillis();
449-
ClusterMetrics clusterStatus = getClusterMetrics();
449+
ClusterMetrics clusterStatus;
450450
while ((System.currentTimeMillis() - start) < timeout) {
451+
clusterStatus = getClusterMetrics();
451452
if (clusterStatus != null && clusterStatus.getLiveServerMetrics().containsKey(rsServerName)) {
452453
return t;
453454
}

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,9 @@ public MyMaster(Configuration conf) throws IOException, KeeperException, Interru
5858
}
5959

6060
@Override
61-
protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
61+
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) {
6262
// do nothing
63+
return true;
6364
}
6465
}
6566

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -61,23 +61,13 @@ public static class MyMaster extends HMaster {
6161
public MyMaster(Configuration conf) throws IOException, KeeperException, InterruptedException {
6262
super(conf);
6363
}
64-
65-
@Override
66-
protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
67-
// do nothing
68-
}
6964
}
7065

7166
public static class MyRegionServer extends MiniHBaseCluster.MiniHBaseClusterRegionServer {
7267

7368
public MyRegionServer(Configuration conf) throws IOException, InterruptedException {
7469
super(conf);
7570
}
76-
77-
@Override
78-
protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
79-
// do nothing
80-
}
8171
}
8272

8373
@BeforeClass
@@ -108,11 +98,14 @@ public void testClusterRequests() throws Exception {
10898
request.setServer(ProtobufUtil.toServerName(serverName));
10999
long expectedRequestNumber = 10000;
110100

111-
MetricsMasterSource masterSource = master.getMasterMetrics().getMetricsSource();
112101
ClusterStatusProtos.ServerLoad sl = ClusterStatusProtos.ServerLoad.newBuilder()
113102
.setTotalNumberOfRequests(expectedRequestNumber).build();
114103
request.setLoad(sl);
115104

105+
MetricsMasterSource masterSource = master.getMasterMetrics().getMetricsSource();
106+
// Init master source again to reset cluster requests counter
107+
masterSource.init();
108+
116109
master.getMasterRpcServices().regionServerReport(null, request.build());
117110
metricsHelper.assertCounter("cluster_requests", expectedRequestNumber, masterSource);
118111

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,14 @@ public IgnoreYouAreDeadRS(Configuration conf) throws IOException, InterruptedExc
8484
}
8585

8686
@Override
87-
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
87+
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
8888
throws IOException {
8989
try {
9090
super.tryRegionServerReport(reportStartTime, reportEndTime);
9191
} catch (YouAreDeadException e) {
9292
// ignore, do not abort
9393
}
94+
return true;
9495
}
9596
}
9697

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,13 +85,15 @@ public MyRegionServer(Configuration conf) throws IOException {
8585
}
8686

8787
@Override
88-
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
88+
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
8989
throws IOException {
9090
try {
9191
super.tryRegionServerReport(reportStartTime, reportEndTime);
9292
} catch (YouAreDeadException e) {
9393
LOG.info("Caught YouAreDeadException, ignore", e);
94+
return false;
9495
}
96+
return true;
9597
}
9698

9799
@Override

0 commit comments

Comments
 (0)