Binary file added hadoop-common-project/hadoop-common/.debug.log.crc
Binary file not shown.
@@ -250,6 +250,11 @@ public interface HdfsClientConfigKeys {
String DFS_LEASE_HARDLIMIT_KEY = "dfs.namenode.lease-hard-limit-sec";
long DFS_LEASE_HARDLIMIT_DEFAULT = 20 * 60;

String DFS_ROUTER_RPC_RETRY_INTERVAL_KEY = "dfs.router.rpc.retry.interval.seconds";
Member: Technically, it is not seconds but a time duration.

int DFS_ROUTER_RPC_RETRY_INTERVAL_DEFAULT = 10;
Member: Make it TimeUnit.SECONDS.toXXXX(10)

String DFS_ROUTER_RPC_RETRY_COUNT_KEY = "dfs.router.rpc.retry.count";
int DFS_ROUTER_RPC_RETRY_COUNT_DEFAULT = 3;

/**
* These are deprecated config keys to client code.
*/
@@ -112,6 +112,10 @@ public class RouterRpcClient {
private final ThreadPoolExecutor executorService;
/** Retry policy for router -> NN communication. */
private final RetryPolicy retryPolicy;
/** Retry time interval for router -> NN communication when cluster is unavailable. */
private long retryTimeInterval;
/** Maximum number of retries for router -> NN communication when cluster is unavailable. */
private int maxRetryCount;
/** Optional perf monitor. */
private final RouterRpcMonitor rpcMonitor;

@@ -172,6 +176,13 @@ public RouterRpcClient(Configuration conf, Router router,
this.retryPolicy = RetryPolicies.failoverOnNetworkException(
RetryPolicies.TRY_ONCE_THEN_FAIL, maxFailoverAttempts, maxRetryAttempts,
failoverSleepBaseMillis, failoverSleepMaxMillis);
this.retryTimeInterval = conf.getTimeDuration(
HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_INTERVAL_KEY,
HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_INTERVAL_DEFAULT,
TimeUnit.SECONDS);
this.maxRetryCount = conf.getInt(
HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_COUNT_KEY,
HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_COUNT_DEFAULT);
}

/**
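For reference, a small sketch (assuming the HdfsClientConfigKeys additions from this patch are on the classpath; the class name is illustrative) of the values the two new fields fall back to when nothing is configured, read the same way the constructor above reads them:

import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.client.HdfsClientConfigKeys;

public class RouterRetryDefaultsSketch {
  public static void main(String[] args) {
    // Nothing set explicitly, so both lookups return the defaults.
    Configuration conf = new Configuration(false);
    long retryTimeInterval = conf.getTimeDuration(
        HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_INTERVAL_KEY,
        HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_INTERVAL_DEFAULT,
        TimeUnit.SECONDS);
    int maxRetryCount = conf.getInt(
        HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_COUNT_KEY,
        HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_COUNT_DEFAULT);
    // With this patch's defaults: 10 (seconds) and 3 retries.
    System.out.println(retryTimeInterval + " " + maxRetryCount);
  }
}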
@@ -357,7 +368,7 @@ private RetryDecision shouldRetry(final IOException ioe, final int retryCount,
// check for the case of cluster unavailable state
if (isClusterUnAvailable(nsId)) {
// we allow to retry once if cluster is unavailable
if (retryCount == 0) {
Member: This logic starts with this counter at a value and then decreases; it makes sense to keep it as such.

Author: In fact, not all clients are that smart; they may not be configured appropriately. When those clients run into this problem, they complain that the router does not retry more.

if (retryCount < maxRetryCount) {
return RetryDecision.RETRY;
} else {
throw new NoNamenodesAvailableException(nsId, ioe);
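A minimal sketch of the decision this hunk now implements, using stand-in types rather than the real RetryDecision and NoNamenodesAvailableException from the surrounding class:

class UnavailableRetrySketch {
  enum Decision { RETRY, FAIL }

  // Old behaviour: retry only when retryCount == 0 (a single retry).
  // New behaviour: retry until maxRetryCount attempts have been made,
  // then surface the failure.
  static Decision decide(int retryCount, int maxRetryCount) {
    return retryCount < maxRetryCount ? Decision.RETRY : Decision.FAIL;
  }

  public static void main(String[] args) {
    // With the default maxRetryCount = 3: counts 0, 1, 2 retry; 3 fails.
    for (int retryCount = 0; retryCount <= 3; retryCount++) {
      System.out.println(retryCount + " -> " + decide(retryCount, 3));
    }
  }
}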
@@ -557,6 +568,19 @@ private Object invoke(String nsId, int retryCount, final Method method,
}

// retry
try {
Member: The idea before was for us to surface the unavailability to the client and then have the client retry with the regular client logic.

Author: In fact, not all clients are that smart; they may not be configured appropriately. When those clients run into this problem, they complain that the router does not retry more.

Member: Sure, but we should change the logic to match the old style, or at least not have the two styles together.

for (int i = 0; i < maxRetryCount; i++) {
if (retryCount == i) {
TimeUnit.SECONDS.sleep(retryTimeInterval * i);
break;
}
}
} catch (InterruptedException ex) {
// Preserve the interrupt status for callers.
Thread.currentThread().interrupt();
LOG.warn("Router rpc retry sleep encountered an exception.", ex);
} finally {
LOG.info("Router rpc retry, maxRetryCount={}, retryCount={}, method={}, params={} ",
maxRetryCount, retryCount, method, params);
}
return invoke(nsId, ++retryCount, method, obj, params);
} else if (decision == RetryDecision.FAILOVER_AND_RETRY) {
// failover, invoker looks for standby exceptions for failover.
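For clarity, a sketch (not part of the patch) of what the sleep loop above reduces to: the retryCount-th retry sleeps retryTimeInterval * retryCount seconds, i.e. a linear backoff, and nothing sleeps once retryCount reaches maxRetryCount:

import java.util.concurrent.TimeUnit;

class RetryBackoffSketch {
  // Equivalent effect of the proposed for-loop: linear backoff per retry.
  static void sleepBeforeRetry(long retryTimeIntervalSec, int retryCount,
      int maxRetryCount) throws InterruptedException {
    if (retryCount < maxRetryCount) {
      TimeUnit.SECONDS.sleep(retryTimeIntervalSec * retryCount);
    }
  }
  // With the defaults (10s interval, 3 retries) the added wait is
  // 0s + 10s + 20s = 30s before NoNamenodesAvailableException is thrown.
}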
@@ -139,7 +139,7 @@ public void testRetryWhenAllNameServiceDown() throws Exception {
// Verify the retry times, it should only retry one time.
FederationRPCMetrics rpcMetrics = routerContext.getRouter()
.getRpcServer().getRPCMetrics();
assertEquals(1, rpcMetrics.getProxyOpRetries());
assertEquals(3, rpcMetrics.getProxyOpRetries());
}

@Test
@@ -158,7 +158,7 @@ public void testRetryWhenOneNameServiceDown() throws Exception {
// Verify the retry times, it will retry one time for ns0.
FederationRPCMetrics rpcMetrics = routerContext.getRouter()
.getRpcServer().getRPCMetrics();
assertEquals(1, rpcMetrics.getProxyOpRetries());
assertEquals(3, rpcMetrics.getProxyOpRetries());
}
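Since both expectations above now hard-code 3 to match DFS_ROUTER_RPC_RETRY_COUNT_DEFAULT, one option (a sketch, not what the patch does) is to assert against the constant so the tests follow the default if it ever changes:

// Sketch only: tie the expectation to the default instead of the literal 3.
assertEquals(HdfsClientConfigKeys.DFS_ROUTER_RPC_RETRY_COUNT_DEFAULT,
    rpcMetrics.getProxyOpRetries());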

/**