Skip to content

Commit ada772a

Browse files
committed
HBASE-22408 add dead and unknown server open regions metric to AM
Signed-off-by: Duo Zhang <[email protected]>
1 parent 2f9d995 commit ada772a

File tree

6 files changed

+137
-5
lines changed

6 files changed

+137
-5
lines changed

hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSource.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ public interface MetricsAssignmentManagerSource extends BaseSource {
5050
String RIT_COUNT_OVER_THRESHOLD_NAME = "ritCountOverThreshold";
5151
String RIT_OLDEST_AGE_NAME = "ritOldestAge";
5252
String RIT_DURATION_NAME = "ritDuration";
53+
String DEAD_SERVER_OPEN_REGIONS = "deadServerOpenRegions";
54+
String UNKNOWN_SERVER_OPEN_REGIONS = "unknownServerOpenRegions";
5355

5456
String RIT_COUNT_DESC = "Current number of Regions In Transition (Gauge).";
5557
String RIT_COUNT_OVER_THRESHOLD_DESC =
@@ -93,6 +95,10 @@ public interface MetricsAssignmentManagerSource extends BaseSource {
9395

9496
void updateRitDuration(long duration);
9597

98+
/**
 * Update the value reported under {@link #DEAD_SERVER_OPEN_REGIONS}.
 * @param deadRegions the new value to report
 */
void updateDeadServerOpenRegions(int deadRegions);
99+
100+
/**
 * Update the value reported under {@link #UNKNOWN_SERVER_OPEN_REGIONS}.
 * @param unknownRegions the new value to report
 */
void updateUnknownServerOpenRegions(int unknownRegions);
101+
96102
/**
97103
* TODO: Remove. This may not be needed now as assign and unassign counts are tracked separately
98104
* Increment the count of operations (assign/unassign).

hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManagerSourceImpl.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ public class MetricsAssignmentManagerSourceImpl
3434
private MutableGaugeLong ritCountOverThresholdGauge;
3535
private MutableGaugeLong ritOldestAgeGauge;
3636
private MetricHistogram ritDurationHisto;
37+
private MutableGaugeLong deadServerOpenRegions;
38+
private MutableGaugeLong unknownServerOpenRegions;
3739

3840
private MutableFastCounter operationCounter;
3941

@@ -63,6 +65,8 @@ public void init() {
6365
ritOldestAgeGauge = metricsRegistry.newGauge(RIT_OLDEST_AGE_NAME, RIT_OLDEST_AGE_DESC, 0L);
6466
ritDurationHisto = metricsRegistry.newTimeHistogram(RIT_DURATION_NAME, RIT_DURATION_DESC);
6567
operationCounter = metricsRegistry.getCounter(OPERATION_COUNT_NAME, 0L);
68+
deadServerOpenRegions = metricsRegistry.newGauge(DEAD_SERVER_OPEN_REGIONS, "", 0);
69+
unknownServerOpenRegions = metricsRegistry.newGauge(UNKNOWN_SERVER_OPEN_REGIONS, "", 0);
6670

6771
/**
6872
* NOTE: Please refer to HBASE-9774 and HBASE-14282. Based on these two issues, HBase is
@@ -104,6 +108,16 @@ public void updateRitDuration(long duration) {
104108
ritDurationHisto.add(duration);
105109
}
106110

111+
@Override
112+
public void updateDeadServerOpenRegions(int deadRegions) {
113+
deadServerOpenRegions.set(deadRegions);
114+
}
115+
116+
@Override
117+
public void updateUnknownServerOpenRegions(int unknownRegions) {
118+
unknownServerOpenRegions.set(unknownRegions);
119+
}
120+
107121
@Override
108122
public OperationMetrics getAssignMetrics() {
109123
return assignMetrics;

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsAssignmentManager.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,14 @@ public void incrementOperationCounter() {
9696
assignmentManagerSource.incrementOperationCounter();
9797
}
9898

99+
public void updateDeadServerOpenRegions(int deadRegions) {
100+
assignmentManagerSource.updateDeadServerOpenRegions(deadRegions);
101+
}
102+
103+
public void updateUnknownServerOpenRegions(int unknownRegions) {
104+
assignmentManagerSource.updateUnknownServerOpenRegions(unknownRegions);
105+
}
106+
99107
/**
100108
* @return Set of common metrics for assign procedure
101109
*/

hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -620,8 +620,9 @@ public synchronized boolean expireServer(final ServerName serverName) {
620620
}
621621
}
622622

623+
// Note: this is currently invoked from RPC, not just tests. Locking in this class needs cleanup.
623624
@VisibleForTesting
624-
public void moveFromOnlineToDeadServers(final ServerName sn) {
625+
public synchronized void moveFromOnlineToDeadServers(final ServerName sn) {
625626
synchronized (onlineServers) {
626627
if (!this.onlineServers.containsKey(sn)) {
627628
LOG.trace("Expiration of {} but server not online", sn);
@@ -907,6 +908,20 @@ public boolean isServerOnline(ServerName serverName) {
907908
return serverName != null && onlineServers.containsKey(serverName);
908909
}
909910

911+
/** Classification of a server as returned by {@code isServerKnownAndOnline}. */
public enum ServerLiveState {
  /** The server is present in the online servers map. */
  LIVE,
  /** The server is not online and the dead server tracker reports it as dead. */
  DEAD,
  /** The server is neither online nor reported as dead. */
  UNKNOWN
}
916+
917+
/**
918+
* @return whether the server is online, dead, or unknown.
919+
*/
920+
public synchronized ServerLiveState isServerKnownAndOnline(ServerName serverName) {
921+
return onlineServers.containsKey(serverName) ? ServerLiveState.LIVE
922+
: (deadservers.isDeadServer(serverName) ? ServerLiveState.DEAD : ServerLiveState.UNKNOWN);
923+
}
924+
910925
/**
911926
* Check if a server is known to be dead. A server can be online,
912927
* or known to be dead, or unknown to this manager (i.e, not online,

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import java.util.Collection;
2323
import java.util.Collections;
2424
import java.util.HashMap;
25+
import java.util.HashSet;
2526
import java.util.List;
2627
import java.util.Map;
2728
import java.util.Set;
@@ -53,6 +54,7 @@
5354
import org.apache.hadoop.hbase.master.RegionPlan;
5455
import org.apache.hadoop.hbase.master.RegionState;
5556
import org.apache.hadoop.hbase.master.RegionState.State;
57+
import org.apache.hadoop.hbase.master.ServerManager;
5658
import org.apache.hadoop.hbase.master.TableStateManager;
5759
import org.apache.hadoop.hbase.master.balancer.FavoredStochasticBalancer;
5860
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
@@ -127,6 +129,10 @@ public class AssignmentManager {
127129
"hbase.assignment.rit.chore.interval.msec";
128130
private static final int DEFAULT_RIT_CHORE_INTERVAL_MSEC = 60 * 1000;
129131

132+
public static final String DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC_CONF_KEY =
133+
"hbase.assignment.dead.region.metric.chore.interval.msec";
134+
private static final int DEFAULT_DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC = 120 * 1000;
135+
130136
public static final String ASSIGN_MAX_ATTEMPTS =
131137
"hbase.assignment.maximum.attempts";
132138
private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE;
@@ -145,6 +151,7 @@ public class AssignmentManager {
145151

146152
private final MetricsAssignmentManager metrics;
147153
private final RegionInTransitionChore ritChore;
154+
private final DeadServerMetricRegionChore deadMetricChore;
148155
private final MasterServices master;
149156

150157
private final AtomicBoolean running = new AtomicBoolean(false);
@@ -190,6 +197,14 @@ public AssignmentManager(final MasterServices master) {
190197
int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY,
191198
DEFAULT_RIT_CHORE_INTERVAL_MSEC);
192199
this.ritChore = new RegionInTransitionChore(ritChoreInterval);
200+
201+
int deadRegionChoreInterval = conf.getInt(DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC_CONF_KEY,
202+
DEFAULT_DEAD_REGION_METRIC_CHORE_INTERVAL_MSEC);
203+
if (deadRegionChoreInterval > 0) {
204+
this.deadMetricChore = new DeadServerMetricRegionChore(deadRegionChoreInterval);
205+
} else {
206+
this.deadMetricChore = null;
207+
}
193208
}
194209

195210
public void start() throws IOException, KeeperException {
@@ -271,6 +286,9 @@ public void stop() {
271286
// Remove the RIT chore
272287
if (hasProcExecutor) {
273288
master.getMasterProcedureExecutor().removeChore(this.ritChore);
289+
if (this.deadMetricChore != null) {
290+
master.getMasterProcedureExecutor().removeChore(this.deadMetricChore);
291+
}
274292
}
275293

276294
// Stop the Assignment Thread
@@ -1130,6 +1148,69 @@ protected void periodicExecute(final MasterProcedureEnv env) {
11301148
}
11311149
}
11321150

1151+
private static class DeadServerMetricRegionChore
1152+
extends ProcedureInMemoryChore<MasterProcedureEnv> {
1153+
public DeadServerMetricRegionChore(final int timeoutMsec) {
1154+
super(timeoutMsec);
1155+
}
1156+
1157+
@Override
1158+
protected void periodicExecute(final MasterProcedureEnv env) {
1159+
final ServerManager sm = env.getMasterServices().getServerManager();
1160+
final AssignmentManager am = env.getAssignmentManager();
1161+
// To minimize inconsistencies we are not going to snapshot live servers in advance in case
1162+
// new servers are added; OTOH we don't want to add heavy sync for a consistent view since
1163+
// this is for metrics. Instead, we're going to check each regions as we go; to avoid making
1164+
// too many checks, we maintain a local lists of server, limiting us to false negatives. If
1165+
// we miss some recently-dead server, we'll just see it next time.
1166+
Set<ServerName> recentlyLiveServers = new HashSet<>();
1167+
int deadRegions = 0, unknownRegions = 0;
1168+
for (RegionStateNode rsn : am.getRegionStates().getRegionStateNodes()) {
1169+
if (rsn.getState() != State.OPEN) {
1170+
continue; // Opportunistic check, should quickly skip RITs, offline tables, etc.
1171+
}
1172+
ServerName sn;
1173+
State state;
1174+
rsn.lock();
1175+
try {
1176+
sn = rsn.getRegionLocation();
1177+
state = rsn.getState();
1178+
} finally {
1179+
rsn.unlock();
1180+
}
1181+
if (state != State.OPEN) {
1182+
continue; // Mostly skipping RITs that are already being take care of.
1183+
}
1184+
if (sn == null) {
1185+
++unknownRegions; // Opened on null?
1186+
continue;
1187+
}
1188+
if (recentlyLiveServers.contains(sn)) {
1189+
continue;
1190+
}
1191+
ServerManager.ServerLiveState sls = sm.isServerKnownAndOnline(sn);
1192+
switch (sls) {
1193+
case LIVE:
1194+
recentlyLiveServers.add(sn);
1195+
break;
1196+
case DEAD:
1197+
++deadRegions;
1198+
break;
1199+
case UNKNOWN:
1200+
++unknownRegions;
1201+
break;
1202+
default: throw new AssertionError("Unexpected " + sls);
1203+
}
1204+
}
1205+
if (deadRegions > 0 || unknownRegions > 0) {
1206+
LOG.info("Found {} OPEN regions on dead servers and {} OPEN regions on unknown servers",
1207+
deadRegions, unknownRegions);
1208+
}
1209+
1210+
am.updateDeadServerRegionMetrics(deadRegions, unknownRegions);
1211+
}
1212+
}
1213+
11331214
public RegionInTransitionStat computeRegionInTransitionStat() {
11341215
final RegionInTransitionStat rit = new RegionInTransitionStat(getConfiguration());
11351216
rit.update(this);
@@ -1236,6 +1317,11 @@ private void updateRegionsInTransitionMetrics(final RegionInTransitionStat ritSt
12361317
metrics.updateRITCountOverThreshold(ritStat.getTotalRITsOverThreshold());
12371318
}
12381319

1320+
private void updateDeadServerRegionMetrics(int deadRegions, int unknownRegions) {
1321+
metrics.updateDeadServerOpenRegions(deadRegions);
1322+
metrics.updateUnknownServerOpenRegions(unknownRegions);
1323+
}
1324+
12391325
private void handleRegionOverStuckWarningThreshold(final RegionInfo regionInfo) {
12401326
final RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo);
12411327
//if (regionNode.isStuck()) {
@@ -1261,8 +1347,9 @@ public void joinCluster() throws IOException {
12611347
}
12621348
LOG.info("Number of RegionServers={}", master.getServerManager().countOfRegionServers());
12631349

1264-
// Start the RIT chore
1350+
// Start the chores
12651351
master.getMasterProcedureExecutor().addChore(this.ritChore);
1352+
master.getMasterProcedureExecutor().addChore(this.deadMetricChore);
12661353

12671354
long costMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTime);
12681355
LOG.info("Joined the cluster in {}", StringUtils.humanTimeDiff(costMs));

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,12 +178,14 @@ ArrayList<RegionInfo> getTableRegionsInfo(final TableName tableName) {
178178
return regions;
179179
}
180180

181-
Collection<RegionStateNode> getRegionStateNodes() {
182-
return regionsMap.values();
181+
/** @return A view of region state nodes for all the regions. */
182+
public Collection<RegionStateNode> getRegionStateNodes() {
183+
return Collections.unmodifiableCollection(regionsMap.values());
183184
}
184185

186+
/** @return A snapshot of region state nodes for all the regions. */
185187
public ArrayList<RegionState> getRegionStates() {
186-
final ArrayList<RegionState> regions = new ArrayList<RegionState>(regionsMap.size());
188+
final ArrayList<RegionState> regions = new ArrayList<>(regionsMap.size());
187189
for (RegionStateNode node: regionsMap.values()) {
188190
regions.add(node.toRegionState());
189191
}

0 commit comments

Comments
 (0)