2222import java .util .Collection ;
2323import java .util .Collections ;
2424import java .util .HashMap ;
25+ import java .util .HashSet ;
2526import java .util .List ;
2627import java .util .Map ;
2728import java .util .Set ;
5354import org .apache .hadoop .hbase .master .RegionPlan ;
5455import org .apache .hadoop .hbase .master .RegionState ;
5556import org .apache .hadoop .hbase .master .RegionState .State ;
57+ import org .apache .hadoop .hbase .master .ServerManager ;
5658import org .apache .hadoop .hbase .master .TableStateManager ;
5759import org .apache .hadoop .hbase .master .balancer .FavoredStochasticBalancer ;
5860import org .apache .hadoop .hbase .master .procedure .MasterProcedureEnv ;
@@ -127,6 +129,10 @@ public class AssignmentManager {
127129 "hbase.assignment.rit.chore.interval.msec" ;
128130 private static final int DEFAULT_RIT_CHORE_INTERVAL_MSEC = 60 * 1000 ;
129131
132+ public static final String DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC_CONF_KEY =
133+ "hbase.assignment.dead.region.metric.chore.interval.msec" ;
134+ private static final int DEFAULT_DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC = 120 * 1000 ;
135+
130136 public static final String ASSIGN_MAX_ATTEMPTS =
131137 "hbase.assignment.maximum.attempts" ;
132138 private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer .MAX_VALUE ;
@@ -145,6 +151,7 @@ public class AssignmentManager {
145151
146152 private final MetricsAssignmentManager metrics ;
147153 private final RegionInTransitionChore ritChore ;
154+ private final DeadServerMetricRegionChore deadMetricChore ;
148155 private final MasterServices master ;
149156
150157 private final AtomicBoolean running = new AtomicBoolean (false );
@@ -190,6 +197,14 @@ public AssignmentManager(final MasterServices master) {
190197 int ritChoreInterval = conf .getInt (RIT_CHORE_INTERVAL_MSEC_CONF_KEY ,
191198 DEFAULT_RIT_CHORE_INTERVAL_MSEC );
192199 this .ritChore = new RegionInTransitionChore (ritChoreInterval );
200+
201+ int deadRegionChoreInterval = conf .getInt (DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC_CONF_KEY ,
202+ DEFAULT_DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC );
203+ if (deadRegionChoreInterval > 0 ) {
204+ this .deadMetricChore = new DeadServerMetricRegionChore (deadRegionChoreInterval );
205+ } else {
206+ this .deadMetricChore = null ;
207+ }
193208 }
194209
195210 public void start () throws IOException , KeeperException {
@@ -271,6 +286,9 @@ public void stop() {
271286 // Remove the RIT chore
272287 if (hasProcExecutor ) {
273288 master .getMasterProcedureExecutor ().removeChore (this .ritChore );
289+ if (this .deadMetricChore != null ) {
290+ master .getMasterProcedureExecutor ().removeChore (this .deadMetricChore );
291+ }
274292 }
275293
276294 // Stop the Assignment Thread
@@ -1130,6 +1148,69 @@ protected void periodicExecute(final MasterProcedureEnv env) {
11301148 }
11311149 }
11321150
1151+ private static class DeadServerMetricRegionChore
1152+ extends ProcedureInMemoryChore <MasterProcedureEnv > {
1153+ public DeadServerMetricRegionChore (final int timeoutMsec ) {
1154+ super (timeoutMsec );
1155+ }
1156+
1157+ @ Override
1158+ protected void periodicExecute (final MasterProcedureEnv env ) {
1159+ final ServerManager sm = env .getMasterServices ().getServerManager ();
1160+ final AssignmentManager am = env .getAssignmentManager ();
1161+ // To minimize inconsistencies we are not going to snapshot live servers in advance in case
1162+ // new servers are added; OTOH we don't want to add heavy sync for a consistent view since
1163+ // this is for metrics. Instead, we're going to check each regions as we go; to avoid making
1164+ // too many checks, we maintain a local lists of server, limiting us to false negatives. If
1165+ // we miss some recently-dead server, we'll just see it next time.
1166+ Set <ServerName > recentlyLiveServers = new HashSet <>();
1167+ int deadRegions = 0 , unknownRegions = 0 ;
1168+ for (RegionStateNode rsn : am .getRegionStates ().getRegionStateNodes ()) {
1169+ if (rsn .getState () != State .OPEN ) {
1170+ continue ; // Opportunistic check, should quickly skip RITs, offline tables, etc.
1171+ }
1172+ ServerName sn ;
1173+ State state ;
1174+ rsn .lock ();
1175+ try {
1176+ sn = rsn .getRegionLocation ();
1177+ state = rsn .getState ();
1178+ } finally {
1179+ rsn .unlock ();
1180+ }
1181+ if (state != State .OPEN ) {
1182+ continue ; // Mostly skipping RITs that are already being take care of.
1183+ }
1184+ if (sn == null ) {
1185+ ++unknownRegions ; // Opened on null?
1186+ continue ;
1187+ }
1188+ if (recentlyLiveServers .contains (sn )) {
1189+ continue ;
1190+ }
1191+ ServerManager .ServerLiveState sls = sm .isServerKnownAndOnline (sn );
1192+ switch (sls ) {
1193+ case LIVE :
1194+ recentlyLiveServers .add (sn );
1195+ break ;
1196+ case DEAD :
1197+ ++deadRegions ;
1198+ break ;
1199+ case UNKNOWN :
1200+ ++unknownRegions ;
1201+ break ;
1202+ default : throw new AssertionError ("Unexpected " + sls );
1203+ }
1204+ }
1205+ if (deadRegions > 0 || unknownRegions > 0 ) {
1206+ LOG .info ("Found {} OPEN regions on dead servers and {} OPEN regions on unknown servers" ,
1207+ deadRegions , unknownRegions );
1208+ }
1209+
1210+ am .updateDeadServerRegionMetrics (deadRegions , unknownRegions );
1211+ }
1212+ }
1213+
11331214 public RegionInTransitionStat computeRegionInTransitionStat () {
11341215 final RegionInTransitionStat rit = new RegionInTransitionStat (getConfiguration ());
11351216 rit .update (this );
@@ -1236,6 +1317,11 @@ private void updateRegionsInTransitionMetrics(final RegionInTransitionStat ritSt
12361317 metrics .updateRITCountOverThreshold (ritStat .getTotalRITsOverThreshold ());
12371318 }
12381319
1320+ private void updateDeadServerRegionMetrics (int deadRegions , int unknownRegions ) {
1321+ metrics .updateDeadServerOpenRegions (deadRegions );
1322+ metrics .updateUnknownServerOpenRegions (unknownRegions );
1323+ }
1324+
12391325 private void handleRegionOverStuckWarningThreshold (final RegionInfo regionInfo ) {
12401326 final RegionStateNode regionNode = regionStates .getRegionStateNode (regionInfo );
12411327 //if (regionNode.isStuck()) {
@@ -1261,8 +1347,9 @@ public void joinCluster() throws IOException {
12611347 }
12621348 LOG .info ("Number of RegionServers={}" , master .getServerManager ().countOfRegionServers ());
12631349
1264- // Start the RIT chore
1350+ // Start the chores
12651351 master .getMasterProcedureExecutor ().addChore (this .ritChore );
1352+ master .getMasterProcedureExecutor ().addChore (this .deadMetricChore );
12661353
12671354 long costMs = TimeUnit .NANOSECONDS .toMillis (System .nanoTime () - startTime );
12681355 LOG .info ("Joined the cluster in {}" , StringUtils .humanTimeDiff (costMs ));
0 commit comments