diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/HdfsClientConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/HdfsClientConfigKeys.java
index e8b540286c280..da00a745c8229 100755
--- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/HdfsClientConfigKeys.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/client/HdfsClientConfigKeys.java
@@ -250,6 +250,12 @@ public interface HdfsClientConfigKeys {
   String DFS_LEASE_HARDLIMIT_KEY = "dfs.namenode.lease-hard-limit-sec";
   long DFS_LEASE_HARDLIMIT_DEFAULT = 20 * 60;
 
+  String DFS_ROUTER_RPC_RETRY_INTERVAL_KEY =
+      "dfs.router.rpc.retry.interval.seconds";
+  int DFS_ROUTER_RPC_RETRY_INTERVAL_DEFAULT = 10;
+  String DFS_ROUTER_RPC_RETRY_COUNT_KEY = "dfs.router.rpc.retry.count";
+  int DFS_ROUTER_RPC_RETRY_COUNT_DEFAULT = 3;
+
   /**
    * These are deprecated config keys to client code.
    */
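The two new keys above give the Router a configurable retry interval (10 seconds by default) and retry budget (3 by default) for the unavailable-nameservice path. As a sketch of how operator-facing defaults might look, the hdfs-rbf-default.xml entries below are illustrative; the description wording is an assumption, not part of this patch:

```xml
<!-- Illustrative entries for hdfs-rbf-default.xml; the descriptions are
     assumed wording, not taken from the patch. -->
<property>
  <name>dfs.router.rpc.retry.interval.seconds</name>
  <value>10</value>
  <description>
    Base interval, in seconds, between Router -> NameNode retries when a
    nameservice is unavailable. The wait before retry N is N-1 times this
    interval, so the first retry is immediate.
  </description>
</property>

<property>
  <name>dfs.router.rpc.retry.count</name>
  <value>3</value>
  <description>
    Maximum number of retries before the Router gives up on an unavailable
    nameservice and throws NoNamenodesAvailableException.
  </description>
</property>
```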
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java
index dae4b9356436c..eebf0b3dce3a2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java
@@ -112,6 +112,10 @@ public class RouterRpcClient {
   private final ThreadPoolExecutor executorService;
   /** Retry policy for router -> NN communication. */
   private final RetryPolicy retryPolicy;
+  /** Retry interval for router -> NN calls when a cluster is unavailable. */
+  private final long retryTimeInterval;
+  /** Max retries for router -> NN calls when a cluster is unavailable. */
+  private final int maxRetryCount;
   /** Optional perf monitor. */
   private final RouterRpcMonitor rpcMonitor;
 
@@ -172,6 +176,13 @@ public RouterRpcClient(Configuration conf, Router router,
     this.retryPolicy = RetryPolicies.failoverOnNetworkException(
         RetryPolicies.TRY_ONCE_THEN_FAIL, maxFailoverAttempts, maxRetryAttempts,
         failoverSleepBaseMillis, failoverSleepMaxMillis);
+    this.retryTimeInterval = conf.getTimeDuration(
+        HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_INTERVAL_KEY,
+        HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_INTERVAL_DEFAULT,
+        TimeUnit.SECONDS);
+    this.maxRetryCount = conf.getInt(
+        HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_COUNT_KEY,
+        HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_COUNT_DEFAULT);
   }
 
   /**
@@ -357,7 +368,7 @@ private RetryDecision shouldRetry(final IOException ioe, final int retryCount,
     // check for the case of cluster unavailable state
     if (isClusterUnAvailable(nsId)) {
-      // we allow to retry once if cluster is unavailable
-      if (retryCount == 0) {
+      // Retry up to maxRetryCount times when the cluster is unavailable.
+      if (retryCount < maxRetryCount) {
         return RetryDecision.RETRY;
       } else {
         throw new NoNamenodesAvailableException(nsId, ioe);
@@ -557,6 +568,19 @@ private Object invoke(String nsId, int retryCount, final Method method,
         }
 
         // retry
+        if (retryCount > 0 && retryCount < maxRetryCount) {
+          try {
+            // Back off before the next attempt; the wait grows linearly
+            // with the number of retries already made.
+            TimeUnit.SECONDS.sleep(retryTimeInterval * retryCount);
+          } catch (InterruptedException ex) {
+            Thread.currentThread().interrupt();
+            LOG.warn("Router RPC retry sleep interrupted.", ex);
+          }
+        }
+        LOG.info("Router RPC retry: maxRetryCount={}, retryCount={},"
+            + " method={}, params={}", maxRetryCount, retryCount, method,
+            params);
         return invoke(nsId, ++retryCount, method, obj, params);
       } else if (decision == RetryDecision.FAILOVER_AND_RETRY) {
         // failover, invoker looks for standby exceptions for failover.
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRPCClientRetries.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRPCClientRetries.java
index cea5212965cec..730987a1d4e4f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRPCClientRetries.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRPCClientRetries.java
@@ -139,7 +139,7 @@ public void testRetryWhenAllNameServiceDown() throws Exception {
-    // Verify the retry times, it should only retry one time.
+    // Verify the retry times; with the default settings it retries 3 times.
     FederationRPCMetrics rpcMetrics = routerContext.getRouter()
         .getRpcServer().getRPCMetrics();
-    assertEquals(1, rpcMetrics.getProxyOpRetries());
+    assertEquals(3, rpcMetrics.getProxyOpRetries());
   }
 
   @Test
@@ -158,7 +158,7 @@ public void testRetryWhenOneNameServiceDown() throws Exception {
-    // Verify the retry times, it will retry one time for ns0.
+    // Verify the retry times; it retries 3 times for ns0.
     FederationRPCMetrics rpcMetrics = routerContext.getRouter()
         .getRpcServer().getRPCMetrics();
-    assertEquals(1, rpcMetrics.getProxyOpRetries());
+    assertEquals(3, rpcMetrics.getProxyOpRetries());
   }
 
   /**
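To make the resulting backoff schedule concrete, here is a small self-contained sketch (RouterRetryBackoffDemo is a hypothetical class, not part of the patch) that reproduces the sleep arithmetic from the patched invoke() under the default settings:

```java
// Hypothetical demo class; reproduces the backoff arithmetic of the
// patched invoke() with the default configuration values.
public class RouterRetryBackoffDemo {
  public static void main(String[] args) {
    long retryTimeInterval = 10; // dfs.router.rpc.retry.interval.seconds
    int maxRetryCount = 3;       // dfs.router.rpc.retry.count

    for (int retryCount = 0; retryCount < maxRetryCount; retryCount++) {
      // The wait grows linearly with the number of retries already made.
      long sleepSeconds = retryTimeInterval * retryCount;
      System.out.println("retry #" + (retryCount + 1) + " after "
          + sleepSeconds + "s");
    }
    // Prints: retry #1 after 0s, retry #2 after 10s, retry #3 after 20s,
    // i.e. 30s of total backoff before NoNamenodesAvailableException.
  }
}
```

Under the previous behavior the Router retried exactly once, immediately, which is why the tests asserted one proxy-op retry; with the default budget of three, they now assert three.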