Skip to content

Commit e07d1fe

Browse files
hiping-techlvhaiping.lhp
andauthored
HBASE-28113 Modify the way of acquiring the RegionStateNode lock in checkOnlineRegionsReport to tryLock (#5442)
* To prevent threads from being blocked by the lock of RegionStateNode, modify the way of acquiring the RegionStateNode lock in checkOnlineRegionsReport to tryLock. Co-authored-by: lvhaiping.lhp <[email protected]> Signed-off-by: Duo Zhang <[email protected]>
1 parent d3c0342 commit e07d1fe

File tree

2 files changed

+35
-21
lines changed

2 files changed

+35
-21
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1398,29 +1398,39 @@ private void checkOnlineRegionsReport(ServerStateNode serverNode, Set<byte[]> re
13981398
continue;
13991399
}
14001400
final long lag = 1000;
1401-
regionNode.lock();
1402-
try {
1403-
long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
1404-
if (regionNode.isInState(State.OPENING, State.OPEN)) {
1405-
// This is possible as a region server has just closed a region but the region server
1406-
// report is generated before the closing, but arrive after the closing. Make sure there
1407-
// is some elapsed time so less false alarms.
1408-
if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) {
1409-
LOG.warn("Reporting {} server does not match {} (time since last "
1410-
+ "update={}ms); closing...", serverName, regionNode, diff);
1411-
closeRegionSilently(serverNode.getServerName(), regionName);
1412-
}
1413-
} else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
1414-
// So, we can get report that a region is CLOSED or SPLIT because a heartbeat
1415-
// came in at about same time as a region transition. Make sure there is some
1416-
// elapsed time so less false alarms.
1417-
if (diff > lag) {
1418-
LOG.warn("Reporting {} state does not match {} (time since last update={}ms)",
1419-
serverName, regionNode, diff);
1401+
// This is just a fallback check designed to identify unexpected data inconsistencies, so we
1402+
// use tryLock to attempt to acquire the lock, and if the lock cannot be acquired, we skip the
1403+
// check. This will not cause any additional problems and also prevents the regionServerReport
1404+
// call from being stuck for too long which may cause deadlock on region assignment.
1405+
if (regionNode.tryLock()) {
1406+
try {
1407+
long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
1408+
if (regionNode.isInState(State.OPENING, State.OPEN)) {
1409+
// This is possible as a region server has just closed a region but the region server
1410+
// report is generated before the closing, but arrive after the closing. Make sure
1411+
// there
1412+
// is some elapsed time so less false alarms.
1413+
if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) {
1414+
LOG.warn("Reporting {} server does not match {} (time since last "
1415+
+ "update={}ms); closing...", serverName, regionNode, diff);
1416+
closeRegionSilently(serverNode.getServerName(), regionName);
1417+
}
1418+
} else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
1419+
// So, we can get report that a region is CLOSED or SPLIT because a heartbeat
1420+
// came in at about same time as a region transition. Make sure there is some
1421+
// elapsed time so less false alarms.
1422+
if (diff > lag) {
1423+
LOG.warn("Reporting {} state does not match {} (time since last update={}ms)",
1424+
serverName, regionNode, diff);
1425+
}
14201426
}
1427+
} finally {
1428+
regionNode.unlock();
14211429
}
1422-
} finally {
1423-
regionNode.unlock();
1430+
} else {
1431+
LOG.warn(
1432+
"Unable to acquire lock for regionNode {}. It is likely that another thread is currently holding the lock. To avoid deadlock, skip execution for now.",
1433+
regionNode);
14241434
}
14251435
}
14261436
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,10 @@ public void lock() {
323323
lock.lock();
324324
}
325325

326+
public boolean tryLock() {
327+
return lock.tryLock();
328+
}
329+
326330
public void unlock() {
327331
lock.unlock();
328332
}

0 commit comments

Comments
 (0)