diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java index 9f06db7e95cf..1b10775721fd 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/ConnectionImplementation.java @@ -963,6 +963,7 @@ private RegionLocations locateRegionInMeta(TableName tableName, byte[] row, bool // Query the meta region long pauseBase = this.pause; takeUserRegionLock(); + final long lockStartTime = EnvironmentEdgeManager.currentTime(); try { // We don't need to check if useCache is enabled or not. Even if useCache is false // we already cleared the cache for this row before acquiring userRegion lock so if this @@ -1063,6 +1064,10 @@ rpcControllerFactory, getMetaLookupPool(), metaReplicaCallTimeoutScanInMicroSeco !(e instanceof RegionOfflineException || e instanceof NoServerForRegionException); } finally { userRegionLock.unlock(); + // update duration of the lock being held + if (metrics != null) { + metrics.updateUserRegionLockHeld(EnvironmentEdgeManager.currentTime() - lockStartTime); + } } try { Thread.sleep(ConnectionUtils.getPauseTime(pauseBase, tries)); @@ -1076,9 +1081,19 @@ rpcControllerFactory, getMetaLookupPool(), metaReplicaCallTimeoutScanInMicroSeco void takeUserRegionLock() throws IOException { try { long waitTime = connectionConfig.getMetaOperationTimeout(); + if (metrics != null) { + metrics.updateUserRegionLockQueue(userRegionLock.getQueueLength()); + } + final long waitStartTime = EnvironmentEdgeManager.currentTime(); if (!userRegionLock.tryLock(waitTime, TimeUnit.MILLISECONDS)) { + if (metrics != null) { + metrics.incrUserRegionLockTimeout(); + } throw new LockTimeoutException("Failed to get user region lock in" + waitTime + " ms. " + " for accessing meta region server."); + } else if (metrics != null) { + // successfully grabbed the lock, start timer of holding the lock + metrics.updateUserRegionLockWaiting(EnvironmentEdgeManager.currentTime() - waitStartTime); } } catch (InterruptedException ie) { LOG.error("Interrupted while waiting for a lock", ie); diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java index b4792fdd3489..16a13df1e68e 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/client/MetricsConnection.java @@ -289,6 +289,10 @@ public Counter newMetric(Class clazz, String name, String scope) { protected final Counter hedgedReadWin; protected final Histogram concurrentCallsPerServerHist; protected final Histogram numActionsPerServerHist; + protected final Counter userRegionLockTimeoutCount; + protected final Timer userRegionLockWaitingTimer; + protected final Timer userRegionLockHeldTimer; + protected final Histogram userRegionLockQueueHist; // dynamic metrics @@ -348,6 +352,14 @@ protected Ratio getRatio() { registry.histogram(name(MetricsConnection.class, "concurrentCallsPerServer", scope)); this.numActionsPerServerHist = registry.histogram(name(MetricsConnection.class, "numActionsPerServer", scope)); + this.userRegionLockTimeoutCount = + registry.counter(name(this.getClass(), "userRegionLockTimeoutCount", scope)); + this.userRegionLockWaitingTimer = + registry.timer(name(this.getClass(), "userRegionLockWaitingDuration", scope)); + this.userRegionLockHeldTimer = + registry.timer(name(this.getClass(), "userRegionLockHeldDuration", scope)); + this.userRegionLockQueueHist = + registry.histogram(name(MetricsConnection.class, "userRegionLockQueueLength", scope)); this.reporter = JmxReporter.forRegistry(this.registry).build(); this.reporter.start(); @@ -425,6 +437,24 @@ public void incrDelayRunnersAndUpdateDelayInterval(long interval) { this.runnerStats.updateDelayInterval(interval); } + /** incr */ + public void incrUserRegionLockTimeout() { + userRegionLockTimeoutCount.inc(); + } + + /** update */ + public void updateUserRegionLockWaiting(long duration) { + userRegionLockWaitingTimer.update(duration, TimeUnit.MILLISECONDS); + } + + public void updateUserRegionLockHeld(long duration) { + userRegionLockHeldTimer.update(duration, TimeUnit.MILLISECONDS); + } + + public void updateUserRegionLockQueue(int count) { + userRegionLockQueueHist.update(count); + } + /** * Get a metric for {@code key} from {@code map}, or create it with {@code factory}. */ diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java index cc03fff3207b..3e48cb239376 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaCache.java @@ -563,6 +563,7 @@ public void testUserRegionLockThrowsException() throws IOException, InterruptedE conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 0); conf.setLong(HConstants.HBASE_CLIENT_META_OPERATION_TIMEOUT, 2000); conf.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 2000); + conf.setBoolean(MetricsConnection.CLIENT_SIDE_METRICS_ENABLED_KEY, true); try (ConnectionImplementation conn = (ConnectionImplementation) ConnectionFactory.createConnection(conf)) { @@ -587,6 +588,28 @@ public void testUserRegionLockThrowsException() throws IOException, InterruptedE assertTrue(client1.getException() instanceof LockTimeoutException ^ client2.getException() instanceof LockTimeoutException); + + // obtain the client metrics + MetricsConnection metrics = conn.getConnectionMetrics(); + long queueCount = metrics.userRegionLockQueueHist.getCount(); + assertEquals("Queue of userRegionLock should be updated twice. queueCount: " + queueCount, + queueCount, 2); + + long timeoutCount = metrics.userRegionLockTimeoutCount.getCount(); + assertEquals("Timeout of userRegionLock should happen once. timeoutCount: " + timeoutCount, + timeoutCount, 1); + + long waitingTimerCount = metrics.userRegionLockWaitingTimer.getCount(); + assertEquals("userRegionLock should be grabbed successfully once. waitingTimerCount: " + + waitingTimerCount, waitingTimerCount, 1); + + long heldTimerCount = metrics.userRegionLockHeldTimer.getCount(); + assertEquals( + "userRegionLock should be held successfully once. heldTimerCount: " + heldTimerCount, + heldTimerCount, 1); + double heldTime = metrics.userRegionLockHeldTimer.getSnapshot().getMax(); + assertTrue("Max held time should be greater than 2 seconds. heldTime: " + heldTime, + heldTime >= 2E9); } }