Skip to content

Commit 6beed16

Browse files
committed
HBASE-28180 Review the usage of RegionStates.getOrCreateServer
1 parent fa4c896 commit 6beed16

File tree

10 files changed

+174
-88
lines changed

10 files changed

+174
-88
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ public void upgrade(Set<ServerName> deadServersFromPE, Set<ServerName> liveServe
135135
.forEach(s -> LOG.error("{} has no matching ServerCrashProcedure", s));
136136
// create ServerNode for all possible live servers from wal directory and master local region
137137
liveServersBeforeRestart
138-
.forEach(sn -> server.getAssignmentManager().getRegionStates().getOrCreateServer(sn));
138+
.forEach(sn -> server.getAssignmentManager().getRegionStates().createServer(sn));
139139
ServerManager serverManager = server.getServerManager();
140140
synchronized (this) {
141141
Set<ServerName> liveServers = regionServers;

hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,7 @@ public ServerName findServerWithSameHostnamePortWithLock(final ServerName server
426426
void recordNewServerWithLock(final ServerName serverName, final ServerMetrics sl) {
427427
LOG.info("Registering regionserver=" + serverName);
428428
this.onlineServers.put(serverName, sl);
429+
master.getAssignmentManager().getRegionStates().createServer(serverName);
429430
}
430431

431432
public ConcurrentNavigableMap<byte[], Long> getFlushedSequenceIdByRegion() {
@@ -603,6 +604,10 @@ synchronized long expireServer(final ServerName serverName, boolean force) {
603604
}
604605
LOG.info("Processing expiration of " + serverName + " on " + this.master.getServerName());
605606
long pid = master.getAssignmentManager().submitServerCrash(serverName, true, force);
607+
if (pid == Procedure.NO_PROC_ID) {
608+
// skip later processing as we failed to submit SCP
609+
return Procedure.NO_PROC_ID;
610+
}
606611
storage.expired(serverName);
607612
// Tell our listeners that a server was removed
608613
if (!this.listeners.isEmpty()) {

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java

Lines changed: 98 additions & 56 deletions
Large diffs are not rendered by default.

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,7 @@ private boolean include(final RegionStateNode node, final boolean offline) {
405405
// ============================================================================================
406406

407407
private void setServerState(ServerName serverName, ServerState state) {
408-
ServerStateNode serverNode = getOrCreateServer(serverName);
408+
ServerStateNode serverNode = getServerNode(serverName);
409409
synchronized (serverNode) {
410410
serverNode.setState(state);
411411
}
@@ -746,18 +746,16 @@ public List<RegionState> getRegionFailedOpen() {
746746
// ==========================================================================
747747

748748
/**
749-
* Be judicious calling this method. Do it on server register ONLY otherwise you could mess up
750-
* online server accounting. TOOD: Review usage and convert to {@link #getServerNode(ServerName)}
751-
* where we can.
749+
* Create the ServerStateNode when registering a new region server
752750
*/
753-
public ServerStateNode getOrCreateServer(final ServerName serverName) {
754-
return serverMap.computeIfAbsent(serverName, key -> new ServerStateNode(key));
751+
public void createServer(ServerName serverName) {
752+
serverMap.computeIfAbsent(serverName, key -> new ServerStateNode(key));
755753
}
756754

757755
/**
758756
* Called by SCP at end of successful processing.
759757
*/
760-
public void removeServer(final ServerName serverName) {
758+
public void removeServer(ServerName serverName) {
761759
serverMap.remove(serverName);
762760
}
763761

@@ -776,17 +774,24 @@ public double getAverageLoad() {
776774
return numServers == 0 ? 0.0 : (double) totalLoad / (double) numServers;
777775
}
778776

779-
public ServerStateNode addRegionToServer(final RegionStateNode regionNode) {
780-
ServerStateNode serverNode = getOrCreateServer(regionNode.getRegionLocation());
777+
public void addRegionToServer(final RegionStateNode regionNode) {
778+
ServerStateNode serverNode = getServerNode(regionNode.getRegionLocation());
781779
serverNode.addRegion(regionNode);
782-
return serverNode;
783780
}
784781

785-
public ServerStateNode removeRegionFromServer(final ServerName serverName,
782+
public void removeRegionFromServer(final ServerName serverName,
786783
final RegionStateNode regionNode) {
787-
ServerStateNode serverNode = getOrCreateServer(serverName);
788-
serverNode.removeRegion(regionNode);
789-
return serverNode;
784+
ServerStateNode serverNode = getServerNode(serverName);
785+
// here the server node could be null. For example, if there is already a TRSP for a region and
786+
// at the same time, the target server is crashed and there is a SCP. The SCP will interrupt the
787+
// TRSP and the TRSP will first set the region as abnormally closed and remove it from the
788+
// server node. But here, this TRSP is not a child procedure of the SCP, so it is possible that
789+
// the SCP finishes, thus removes the server node for this region server, before the TRSP wakes
790+
// up and enter here to remove the region node from the server node, then we will get a null
791+
// server node here.
792+
if (serverNode != null) {
793+
serverNode.removeRegion(regionNode);
794+
}
790795
}
791796

792797
// ==========================================================================

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/ServerStateNode.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
public class ServerStateNode implements Comparable<ServerStateNode> {
3636
private final Set<RegionStateNode> regions;
3737
private final ServerName serverName;
38+
// the lock here is for fencing SCP and TRSP, so not all operations need to hold this lock
3839
private final ReadWriteLock lock = new ReentrantReadWriteLock();
3940
private volatile ServerState state = ServerState.ONLINE;
4041

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestBalancer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ public void testAssignmentsForBalancer() throws Exception {
7878
AssignmentManager assignmentManager = master.getAssignmentManager();
7979
RegionStates regionStates = assignmentManager.getRegionStates();
8080
ServerName sn1 = ServerName.parseServerName("asf903.gq1.ygridcore.net,52690,1517835491385");
81-
regionStates.getOrCreateServer(sn1);
81+
regionStates.createServer(sn1);
8282

8383
TableStateManager tableStateManager = master.getTableStateManager();
8484
ServerManager serverManager = master.getServerManager();

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClockSkewDetection.java

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,16 @@
1818
package org.apache.hadoop.hbase.master;
1919

2020
import static org.junit.Assert.fail;
21+
import static org.mockito.Mockito.mock;
22+
import static org.mockito.Mockito.when;
2123

2224
import java.net.InetAddress;
2325
import org.apache.hadoop.conf.Configuration;
2426
import org.apache.hadoop.hbase.ClockOutOfSyncException;
2527
import org.apache.hadoop.hbase.HBaseClassTestRule;
2628
import org.apache.hadoop.hbase.HBaseConfiguration;
29+
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
30+
import org.apache.hadoop.hbase.master.assignment.RegionStates;
2731
import org.apache.hadoop.hbase.testclassification.MasterTests;
2832
import org.apache.hadoop.hbase.testclassification.SmallTests;
2933
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
@@ -44,11 +48,28 @@ public class TestClockSkewDetection {
4448

4549
private static final Logger LOG = LoggerFactory.getLogger(TestClockSkewDetection.class);
4650

51+
private static final class DummyMasterServices extends MockNoopMasterServices {
52+
53+
private final AssignmentManager am;
54+
55+
public DummyMasterServices(Configuration conf) {
56+
super(conf);
57+
am = mock(AssignmentManager.class);
58+
RegionStates rss = mock(RegionStates.class);
59+
when(am.getRegionStates()).thenReturn(rss);
60+
}
61+
62+
@Override
63+
public AssignmentManager getAssignmentManager() {
64+
return am;
65+
}
66+
}
67+
4768
@Test
4869
public void testClockSkewDetection() throws Exception {
4970
final Configuration conf = HBaseConfiguration.create();
5071
ServerManager sm =
51-
new ServerManager(new MockNoopMasterServices(conf), new DummyRegionServerList());
72+
new ServerManager(new DummyMasterServices(conf), new DummyRegionServerList());
5273

5374
LOG.debug("regionServerStartup 1");
5475
InetAddress ia1 = InetAddress.getLocalHost();

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClusterRestartFailover.java

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
*/
1818
package org.apache.hadoop.hbase.master;
1919

20-
import static org.junit.Assert.assertFalse;
20+
import static org.junit.Assert.assertEquals;
2121
import static org.junit.Assert.assertNotNull;
2222
import static org.junit.Assert.assertNull;
2323
import static org.junit.Assert.assertTrue;
@@ -33,6 +33,7 @@
3333
import org.apache.hadoop.hbase.ServerName;
3434
import org.apache.hadoop.hbase.StartTestingClusterOption;
3535
import org.apache.hadoop.hbase.TableName;
36+
import org.apache.hadoop.hbase.Waiter.ExplainingPredicate;
3637
import org.apache.hadoop.hbase.client.RegionInfo;
3738
import org.apache.hadoop.hbase.client.Table;
3839
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
@@ -105,36 +106,44 @@ public void test() throws Exception {
105106
UTIL.waitFor(60000, () -> UTIL.getHBaseCluster().getMaster().isInitialized());
106107
LOG.info("Started cluster master, waiting for {}", SERVER_FOR_TEST);
107108
UTIL.waitFor(60000, () -> getServerStateNode(SERVER_FOR_TEST) != null);
108-
serverNode = getServerStateNode(SERVER_FOR_TEST);
109-
assertFalse("serverNode should not be ONLINE during SCP processing",
110-
serverNode.isInState(ServerState.ONLINE));
109+
UTIL.waitFor(30000, new ExplainingPredicate<Exception>() {
110+
111+
@Override
112+
public boolean evaluate() throws Exception {
113+
return !getServerStateNode(SERVER_FOR_TEST).isInState(ServerState.ONLINE);
114+
}
115+
116+
@Override
117+
public String explainFailure() throws Exception {
118+
return "serverNode should not be ONLINE during SCP processing";
119+
}
120+
});
111121
Optional<Procedure<?>> procedure = UTIL.getHBaseCluster().getMaster().getProcedures().stream()
112122
.filter(p -> (p instanceof ServerCrashProcedure)
113123
&& ((ServerCrashProcedure) p).getServerName().equals(SERVER_FOR_TEST))
114124
.findAny();
115125
assertTrue("Should have one SCP for " + SERVER_FOR_TEST, procedure.isPresent());
116-
assertTrue("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
117-
UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST)
118-
== Procedure.NO_PROC_ID);
126+
assertEquals("Submit the SCP for the same serverName " + SERVER_FOR_TEST + " which should fail",
127+
Procedure.NO_PROC_ID,
128+
UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));
119129

120130
// Wait the SCP to finish
121131
LOG.info("Waiting on latch");
122132
SCP_LATCH.countDown();
123133
UTIL.waitFor(60000, () -> procedure.get().isFinished());
134+
assertNull("serverNode should be deleted after SCP finished",
135+
getServerStateNode(SERVER_FOR_TEST));
124136

125-
assertFalse(
137+
assertEquals(
126138
"Even when the SCP is finished, the duplicate SCP should not be scheduled for "
127139
+ SERVER_FOR_TEST,
128-
UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST)
129-
== Procedure.NO_PROC_ID);
130-
serverNode = UTIL.getHBaseCluster().getMaster().getAssignmentManager().getRegionStates()
131-
.getServerNode(SERVER_FOR_TEST);
132-
assertNull("serverNode should be deleted after SCP finished", serverNode);
140+
Procedure.NO_PROC_ID,
141+
UTIL.getHBaseCluster().getMaster().getServerManager().expireServer(SERVER_FOR_TEST));
133142

134143
MetricsMasterSource masterSource =
135144
UTIL.getHBaseCluster().getMaster().getMasterMetrics().getMetricsSource();
136145
metricsHelper.assertCounter(MetricsMasterSource.SERVER_CRASH_METRIC_PREFIX + "SubmittedCount",
137-
4, masterSource);
146+
3, masterSource);
138147
}
139148

140149
private void setupCluster() throws Exception {

hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureTestingUtility.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,9 @@ public Void call() throws Exception {
109109
am.setupRIT(procExec.getActiveProceduresNoCopy().stream().filter(p -> !p.isSuccess())
110110
.filter(p -> p instanceof TransitRegionStateProcedure)
111111
.map(p -> (TransitRegionStateProcedure) p).collect(Collectors.toList()));
112+
// create server state node, to simulate master start up
113+
env.getMasterServices().getServerManager().getOnlineServersList()
114+
.forEach(am.getRegionStates()::createServer);
112115
return null;
113116
}
114117
},

hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestServerRemoteProcedure.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ public void setUp() throws Exception {
8888
master.start(2, rsDispatcher);
8989
am = master.getAssignmentManager();
9090
master.getServerManager().getOnlineServersList().stream()
91-
.forEach(serverName -> am.getRegionStates().getOrCreateServer(serverName));
91+
.forEach(serverName -> am.getRegionStates().createServer(serverName));
9292
}
9393

9494
@After

0 commit comments

Comments
 (0)