From 5dc0a24ae22b6cbcfd219c0a2f9fdf0015baa9ab Mon Sep 17 00:00:00 2001 From: Stephen Wu Date: Wed, 22 Jun 2022 21:13:56 -0700 Subject: [PATCH 1/4] HADOOP-18310 Add option and make 400 bad request retryable --- .../java/org/apache/hadoop/fs/s3a/Constants.java | 5 +++++ .../org/apache/hadoop/fs/s3a/S3ARetryPolicy.java | 6 +++++- .../org/apache/hadoop/fs/s3a/TestInvoker.java | 15 ++++++++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java index 764a6adaca27d..f59b752163782 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java @@ -1203,4 +1203,9 @@ private Constants() { * Default maximum read size in bytes during vectored reads : {@value}. */ public static final int DEFAULT_AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE = 1253376; //1M + + public static final String FAIL_ON_AWS_BAD_REQUEST = "fs.s3a.retry.failOnAwsBadRequest"; + + public static final boolean DEFAULT_FAIL_ON_AWS_BAD_REQUEST = true; + } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ARetryPolicy.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ARetryPolicy.java index 30427f7672a43..c04fdf49982dd 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ARetryPolicy.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ARetryPolicy.java @@ -214,7 +214,11 @@ protected Map, RetryPolicy> createExceptionMap() { // policy on a 400/bad request still ambiguous. // Treated as an immediate failure - policyMap.put(AWSBadRequestException.class, fail); + RetryPolicy awsBadRequestExceptionRetryPolicy = + configuration.getBoolean(FAIL_ON_AWS_BAD_REQUEST, DEFAULT_FAIL_ON_AWS_BAD_REQUEST) ? + fail : + retryIdempotentCalls; + policyMap.put(AWSBadRequestException.class, awsBadRequestExceptionRetryPolicy); // Status 500 error code is also treated as a connectivity problem policyMap.put(AWSStatus500Exception.class, connectivityFailure); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestInvoker.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestInvoker.java index 35199f4092790..68f30a5da2920 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestInvoker.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestInvoker.java @@ -311,12 +311,25 @@ public void testRetryAWSConnectivity() throws Throwable { */ @Test(expected = AWSBadRequestException.class) public void testRetryBadRequestNotIdempotent() throws Throwable { - invoker.retry("test", null, false, + + invoker.retry("test", null, true, () -> { throw BAD_REQUEST; }); } + @Test + public void testRetryBadRequestIdempotent() throws Throwable { + Configuration conf = new Configuration(FAST_RETRY_CONF); + conf.setBoolean(FAIL_ON_AWS_BAD_REQUEST, false); + S3ARetryPolicy retryPolicy = new S3ARetryPolicy(conf); + + IOException ex = translateException("GET", "/", BAD_REQUEST); + assertRetryAction("Expected retry on aws bad request", + retryPolicy, RetryPolicy.RetryAction.RETRY, + ex, 1, true); + } + @Test public void testConnectionRetryPolicyIdempotent() throws Throwable { assertRetryAction("Expected retry on connection timeout", From 7d4c4f1184d2cd4a69f7e8df1c70b2971ccbcbe0 Mon Sep 17 00:00:00 2001 From: Stephen Wu Date: Tue, 21 Jun 2022 15:27:06 -0700 Subject: [PATCH 2/4] fix checkstyle --- .../main/java/org/apache/hadoop/fs/s3a/S3ARetryPolicy.java | 5 ++--- .../src/test/java/org/apache/hadoop/fs/s3a/TestInvoker.java | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ARetryPolicy.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ARetryPolicy.java index c04fdf49982dd..0d0ee47b58cfb 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ARetryPolicy.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ARetryPolicy.java @@ -215,9 +215,8 @@ protected Map, RetryPolicy> createExceptionMap() { // policy on a 400/bad request still ambiguous. // Treated as an immediate failure RetryPolicy awsBadRequestExceptionRetryPolicy = - configuration.getBoolean(FAIL_ON_AWS_BAD_REQUEST, DEFAULT_FAIL_ON_AWS_BAD_REQUEST) ? - fail : - retryIdempotentCalls; + configuration.getBoolean(FAIL_ON_AWS_BAD_REQUEST, DEFAULT_FAIL_ON_AWS_BAD_REQUEST) ? + fail : retryIdempotentCalls; policyMap.put(AWSBadRequestException.class, awsBadRequestExceptionRetryPolicy); // Status 500 error code is also treated as a connectivity problem diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestInvoker.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestInvoker.java index 68f30a5da2920..1e5e8a02e0ecc 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestInvoker.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestInvoker.java @@ -326,8 +326,8 @@ public void testRetryBadRequestIdempotent() throws Throwable { IOException ex = translateException("GET", "/", BAD_REQUEST); assertRetryAction("Expected retry on aws bad request", - retryPolicy, RetryPolicy.RetryAction.RETRY, - ex, 1, true); + retryPolicy, RetryPolicy.RetryAction.RETRY, + ex, 1, true); } @Test From f57025b0b24962cb67bfee97eb7b1c08d95e8599 Mon Sep 17 00:00:00 2001 From: Stephen Wu Date: Wed, 22 Jun 2022 21:09:12 -0700 Subject: [PATCH 3/4] address comments --- .../main/java/org/apache/hadoop/fs/s3a/Constants.java | 11 ++++++++++- .../src/site/markdown/tools/hadoop-aws/index.md | 7 +++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java index f59b752163782..d1d0b48115d33 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java @@ -1204,8 +1204,17 @@ private Constants() { */ public static final int DEFAULT_AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE = 1253376; //1M - public static final String FAIL_ON_AWS_BAD_REQUEST = "fs.s3a.retry.failOnAwsBadRequest"; + /** + * Flag for immediate failure when observing a {@link AWSBadRequestException}. + * If it's disabled and set to false, the failure is treated as retryable. + * Value {@value}. + */ + public static final String FAIL_ON_AWS_BAD_REQUEST = "fs.s3a.fail.on.aws.bad.request"; + /** + * Default value for immediate failure when observing a + * {@link AWSBadRequestException}: {@value}. + */ public static final boolean DEFAULT_FAIL_ON_AWS_BAD_REQUEST = true; } diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md index 7c0a49f8fbeda..55dbf2e52c653 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md @@ -1118,12 +1118,15 @@ from them. * Connection timeout: `ConnectTimeoutException`. Timeout before setting up a connection to the S3 endpoint (or proxy). -* HTTP response status code 400, "Bad Request" +* HTTP response status code 400, "Bad Request" aka `AWSBadRequestException` The status code 400, Bad Request usually means that the request is unrecoverable; it's the generic "No" response. Very rarely it does recover, which is why it is in this category, rather than that -of unrecoverable failures. +of unrecoverable failures. The default behavior fails immediately +without retry. If your system is failure sensitive, you can +configure `fs.s3a.fail.on.aws.bad.request` to `false` and allow +to retry when observing a Bad Request with status code 400. These failures will be retried with an exponential sleep interval set in `fs.s3a.retry.interval`, up to the limit set in `fs.s3a.retry.limit`. From 60df0f6f3637b114bb3fc7d43c74dc2208268310 Mon Sep 17 00:00:00 2001 From: Stephen Wu Date: Thu, 23 Jun 2022 01:16:27 -0700 Subject: [PATCH 4/4] fix blanks eol --- .../hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md index 55dbf2e52c653..c78f6b4f2fea7 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md @@ -1125,8 +1125,8 @@ is unrecoverable; it's the generic "No" response. Very rarely it does recover, which is why it is in this category, rather than that of unrecoverable failures. The default behavior fails immediately without retry. If your system is failure sensitive, you can -configure `fs.s3a.fail.on.aws.bad.request` to `false` and allow -to retry when observing a Bad Request with status code 400. +configure `fs.s3a.fail.on.aws.bad.request` to `false` and allow +to retry when observing a Bad Request with status code 400. These failures will be retried with an exponential sleep interval set in `fs.s3a.retry.interval`, up to the limit set in `fs.s3a.retry.limit`.