Skip to content

Commit 34afb6f

Browse files
hiping-techlvhaiping.lhp
authored andcommitted
HBASE-28113 Modify the way of acquiring the RegionStateNode lock in checkOnlineRegionsReport to tryLock (#5442)
* To prevent threads from being blocked by the lock of RegionStateNode, modify the way of acquiring the RegionStateNode lock in checkOnlineRegionsReport to tryLock. Co-authored-by: lvhaiping.lhp <[email protected]> Signed-off-by: Duo Zhang <[email protected]> (cherry picked from commit e07d1fe)
1 parent 18a38ae commit 34afb6f

File tree

2 files changed

+35
-21
lines changed

2 files changed

+35
-21
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1391,29 +1391,39 @@ private void checkOnlineRegionsReport(ServerStateNode serverNode, Set<byte[]> re
13911391
continue;
13921392
}
13931393
final long lag = 1000;
1394-
regionNode.lock();
1395-
try {
1396-
long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
1397-
if (regionNode.isInState(State.OPENING, State.OPEN)) {
1398-
// This is possible as a region server has just closed a region but the region server
1399-
// report is generated before the closing, but arrive after the closing. Make sure there
1400-
// is some elapsed time so less false alarms.
1401-
if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) {
1402-
LOG.warn("Reporting {} server does not match {} (time since last "
1403-
+ "update={}ms); closing...", serverName, regionNode, diff);
1404-
closeRegionSilently(serverNode.getServerName(), regionName);
1405-
}
1406-
} else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
1407-
// So, we can get report that a region is CLOSED or SPLIT because a heartbeat
1408-
// came in at about same time as a region transition. Make sure there is some
1409-
// elapsed time so less false alarms.
1410-
if (diff > lag) {
1411-
LOG.warn("Reporting {} state does not match {} (time since last update={}ms)",
1412-
serverName, regionNode, diff);
1394+
// This is just a fallback check designed to identify unexpected data inconsistencies, so we
1395+
// use tryLock to attempt to acquire the lock, and if the lock cannot be acquired, we skip the
1396+
// check. This will not cause any additional problems and also prevents the regionServerReport
1397+
// call from being stuck for too long which may cause deadlock on region assignment.
1398+
if (regionNode.tryLock()) {
1399+
try {
1400+
long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
1401+
if (regionNode.isInState(State.OPENING, State.OPEN)) {
1402+
// This is possible as a region server has just closed a region but the region server
1403+
// report is generated before the closing, but arrive after the closing. Make sure
1404+
// there
1405+
// is some elapsed time so less false alarms.
1406+
if (!regionNode.getRegionLocation().equals(serverName) && diff > lag) {
1407+
LOG.warn("Reporting {} server does not match {} (time since last "
1408+
+ "update={}ms); closing...", serverName, regionNode, diff);
1409+
closeRegionSilently(serverNode.getServerName(), regionName);
1410+
}
1411+
} else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
1412+
// So, we can get report that a region is CLOSED or SPLIT because a heartbeat
1413+
// came in at about same time as a region transition. Make sure there is some
1414+
// elapsed time so less false alarms.
1415+
if (diff > lag) {
1416+
LOG.warn("Reporting {} state does not match {} (time since last update={}ms)",
1417+
serverName, regionNode, diff);
1418+
}
14131419
}
1420+
} finally {
1421+
regionNode.unlock();
14141422
}
1415-
} finally {
1416-
regionNode.unlock();
1423+
} else {
1424+
LOG.warn(
1425+
"Unable to acquire lock for regionNode {}. It is likely that another thread is currently holding the lock. To avoid deadlock, skip execution for now.",
1426+
regionNode);
14171427
}
14181428
}
14191429
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,10 @@ public void lock() {
319319
lock.lock();
320320
}
321321

322+
public boolean tryLock() {
323+
return lock.tryLock();
324+
}
325+
322326
public void unlock() {
323327
lock.unlock();
324328
}

0 commit comments

Comments
 (0)