-
Couldn't load subscription status.
- Fork 9.1k
HDFS-15419. RBF: Router should retry communicating with NN when the cluster is unavailable, using a configurable time interval #2082
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: trunk
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -250,6 +250,11 @@ public interface HdfsClientConfigKeys { | |
| String DFS_LEASE_HARDLIMIT_KEY = "dfs.namenode.lease-hard-limit-sec"; | ||
| long DFS_LEASE_HARDLIMIT_DEFAULT = 20 * 60; | ||
|
|
||
| String DFS_ROUTER_RPC_RETRY_INTERVAL_KEY = "dfs.router.rpc.retry.interval.seconds"; | ||
| int DFS_ROUTER_RPC_RETRY_INTERVAL_DEFAULT = 10; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Make it TimeUnit.SECONDS.toXXXX(10) |
||
| String DFS_ROUTER_RPC_RETRY_COUNT_KEY = "dfs.router.rpc.retry.count"; | ||
| int DFS_ROUTER_RPC_RETRY_COUNT_DEFAULT = 3; | ||
|
|
||
| /** | ||
| * These are deprecated config keys to client code. | ||
| */ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -112,6 +112,10 @@ public class RouterRpcClient { | |
| private final ThreadPoolExecutor executorService; | ||
| /** Retry policy for router -> NN communication. */ | ||
| private final RetryPolicy retryPolicy; | ||
| /** Retry time interval for router -> NN communication when cluster is unavailable. */ | ||
| private long retryTimeInterval; | ||
| /** Maximum number of retries for router -> NN communication when cluster is unavailable. */ | ||
| private int maxRetryCount; | ||
| /** Optional perf monitor. */ | ||
| private final RouterRpcMonitor rpcMonitor; | ||
|
|
||
|
|
@@ -172,6 +176,13 @@ public RouterRpcClient(Configuration conf, Router router, | |
| this.retryPolicy = RetryPolicies.failoverOnNetworkException( | ||
| RetryPolicies.TRY_ONCE_THEN_FAIL, maxFailoverAttempts, maxRetryAttempts, | ||
| failoverSleepBaseMillis, failoverSleepMaxMillis); | ||
| this.retryTimeInterval = conf.getTimeDuration( | ||
| HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_INTERVAL_KEY, | ||
| HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_INTERVAL_DEFAULT, | ||
| TimeUnit.SECONDS); | ||
| this.maxRetryCount = conf.getInt( | ||
| HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_COUNT_KEY, | ||
| HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_COUNT_DEFAULT); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -357,7 +368,7 @@ private RetryDecision shouldRetry(final IOException ioe, final int retryCount, | |
| // check for the case of cluster unavailable state | ||
| if (isClusterUnAvailable(nsId)) { | ||
| // we allow to retry once if cluster is unavailable | ||
| if (retryCount == 0) { | ||
|
||
| if (retryCount < maxRetryCount) { | ||
| return RetryDecision.RETRY; | ||
| } else { | ||
| throw new NoNamenodesAvailableException(nsId, ioe); | ||
|
|
@@ -557,6 +568,19 @@ private Object invoke(String nsId, int retryCount, final Method method, | |
| } | ||
|
|
||
| // retry | ||
| try { | ||
|
||
| for (int i = 0; i < maxRetryCount; i++) { | ||
| if (retryCount == i) { | ||
| TimeUnit.SECONDS.sleep(retryTimeInterval * i); | ||
| break; | ||
| } | ||
| } | ||
| } catch (InterruptedException ex) { | ||
| LOG.warn("Router rpc retry sleep encounter exception."); | ||
| } finally { | ||
| LOG.info("Router rpc retry, maxRetryCount={}, retryCount={}, method={}, params={} ", | ||
| maxRetryCount, retryCount, method, params); | ||
| } | ||
| return invoke(nsId, ++retryCount, method, obj, params); | ||
| } else if (decision == RetryDecision.FAILOVER_AND_RETRY) { | ||
| // failover, invoker looks for standby exceptions for failover. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Technically, this is not seconds but a time duration.