From ec004949b079d265a764df0c0e1009290e08214b Mon Sep 17 00:00:00 2001 From: Sergey Shelukhin Date: Mon, 13 May 2019 16:54:47 -0700 Subject: [PATCH] HBASE-22410 add the notion of the expected # of servers for non-fixed server sets; report an alternative dead server metric 01 --- .../hbase/master/MetricsMasterSource.java | 3 +++ .../hbase/master/MetricsMasterWrapper.java | 6 +++++ .../hbase/master/MetricsMasterSourceImpl.java | 2 ++ .../master/MetricsMasterWrapperImpl.java | 9 ++++++++ .../hadoop/hbase/master/ServerManager.java | 22 +++++++++++++++++++ 5 files changed, 42 insertions(+) diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSource.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSource.java index f947a3acb9c6..22beb6b3b92b 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSource.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSource.java @@ -55,6 +55,7 @@ public interface MetricsMasterSource extends BaseSource { String AVERAGE_LOAD_NAME = "averageLoad"; String LIVE_REGION_SERVERS_NAME = "liveRegionServers"; String DEAD_REGION_SERVERS_NAME = "deadRegionServers"; + String SERVERS_BELOW_EXPECTED_COUNT_NAME = "serversBelowExpectedCount"; String NUM_REGION_SERVERS_NAME = "numRegionServers"; String NUM_DEAD_REGION_SERVERS_NAME = "numDeadRegionServers"; String ZOOKEEPER_QUORUM_NAME = "zookeeperQuorum"; @@ -74,6 +75,8 @@ public interface MetricsMasterSource extends BaseSource { String AVERAGE_LOAD_DESC = "AverageLoad"; String LIVE_REGION_SERVERS_DESC = "Names of live RegionServers"; String NUMBER_OF_REGION_SERVERS_DESC = "Number of RegionServers"; + String SERVERS_BELOW_EXPECTED_COUNT_DESC = "Number of region servers missing to reach" + + " the expected value specified in master configuration"; String DEAD_REGION_SERVERS_DESC = "Names of dead RegionServers"; String NUMBER_OF_DEAD_REGION_SERVERS_DESC = 
"Number of dead RegionServers"; String ZOOKEEPER_QUORUM_DESC = "ZooKeeper Quorum"; diff --git a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java index 0c5ab59e6efd..1dbed5b0389f 100644 --- a/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java +++ b/hbase-hadoop-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapper.java @@ -149,4 +149,10 @@ public interface MetricsMasterWrapper { * @return pair of count for online regions and offline regions */ PairOfSameType getRegionCounts(); + + /** + * @return Number of region servers missing to reach + * the expected value specified in master configuration + */ + int getNumServersBelowExpected(); } diff --git a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java index fc49a40aadf5..bab07d5264ca 100644 --- a/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java +++ b/hbase-hadoop2-compat/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterSourceImpl.java @@ -106,6 +106,8 @@ public void getMetrics(MetricsCollector metricsCollector, boolean all) { regionNumberPair.getFirst()) .addGauge(Interns.info(OFFLINE_REGION_COUNT_NAME, OFFLINE_REGION_COUNT_DESC), regionNumberPair.getSecond()) + .addGauge(Interns.info(SERVERS_BELOW_EXPECTED_COUNT_NAME, + SERVERS_BELOW_EXPECTED_COUNT_DESC), masterWrapper.getNumServersBelowExpected()) .tag(Interns.info(LIVE_REGION_SERVERS_NAME, LIVE_REGION_SERVERS_DESC), masterWrapper.getRegionServers()) .addGauge(Interns.info(NUM_REGION_SERVERS_NAME, diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java 
index 0f30ceb321d3..521fae11cdcc 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MetricsMasterWrapperImpl.java @@ -134,6 +134,15 @@ public int getNumDeadRegionServers() { return serverManager.getDeadServers().size(); } + @Override + public int getNumServersBelowExpected() { + ServerManager serverManager = this.master.getServerManager(); + if (serverManager == null) { + return 0; + } + return serverManager.getNumServersBelowExpected(); + } + @Override public String getServerName() { ServerName serverName = master.getServerName(); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 88edb79d7944..165dc98ede22 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -131,6 +131,18 @@ public class ServerManager { public static final int FLUSHEDSEQUENCEID_FLUSHER_INTERVAL_DEFAULT = 3 * 60 * 60 * 1000; // 3 hours + + /** + * The count of region servers master expects to be present; for use in "cloud" use-cases. When + * running on a fixed set of machines, currently the dead server accounting replaces a dead + * server when a new live one comes up on the same host+port; however, this may never happen if + * HBase is running on something like YARN with many more machines than RS. Instead, one can tell + * master how many region servers to expect in total, so it could report an alternative metric. + * -1 (the default) disables this feature. 
+ */ + public static final String REGIONSERVERS_EXPECTED_COUNT = "hbase.master.expected.regionservers"; + public static final int REGIONSERVERS_EXPECTED_COUNT_DEFAULT = -1; + private static final Logger LOG = LoggerFactory.getLogger(ServerManager.class); // Set if we are to shutdown the cluster. @@ -147,6 +159,8 @@ public class ServerManager { /** File on hdfs to store last flushed sequence id of regions */ private static final String LAST_FLUSHED_SEQ_ID_FILE = ".lastflushedseqids"; private FlushedSequenceIdFlusher flushedSeqIdFlusher; + /** Expected RS count, for metrics; negative means no expectations. */ + private final int expectedRsCount; /** @@ -193,6 +207,7 @@ public ServerManager(final MasterServices master) { this.rpcControllerFactory = this.connection == null? null: connection.getRpcControllerFactory(); persistFlushedSequenceId = c.getBoolean(PERSIST_FLUSHEDSEQUENCEID, PERSIST_FLUSHEDSEQUENCEID_DEFAULT); + expectedRsCount = c.getInt(REGIONSERVERS_EXPECTED_COUNT, REGIONSERVERS_EXPECTED_COUNT_DEFAULT); } /** @@ -1171,6 +1186,13 @@ public void removeDeletedRegionFromLoadedFlushedSequenceIds() { } } + public int getNumServersBelowExpected() { + if (expectedRsCount <= 0) { + return 0; + } + return Math.max(0, expectedRsCount - this.onlineServers.size()); + } + private class FlushedSequenceIdFlusher extends ScheduledChore { public FlushedSequenceIdFlusher(String name, int p) {