@@ -1203,4 +1203,18 @@ private Constants() {
* Default maximum read size in bytes during vectored reads : {@value}.
*/
public static final int DEFAULT_AWS_S3_VECTOR_READS_MAX_MERGED_READ_SIZE = 1253376; //1M

/**
* Flag for immediate failure when observing a {@link AWSBadRequestException}.
* If set to false, the failure is treated as retryable.
* Value {@value}.
*/
public static final String FAIL_ON_AWS_BAD_REQUEST = "fs.s3a.fail.on.aws.bad.request";
@steveloughran (Contributor) commented on Jun 24, 2022:
I now think `fs.s3a.retry.on.400.response.enabled` would be better, with the default flipped. The docs would say "experimental".

and assuming we do have a custom policy, adjacent

fs.s3a.retry.on.400.response.delay  // delay between attempts, default "10s"
fs.s3a.retry.on.400.response.attempts // number of attempts, default 6
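For illustration only, here is a minimal sketch of how those proposed keys might be read and turned into a fixed-interval retry policy using Hadoop's `RetryPolicies`. The key names, defaults, and the class itself are hypothetical and not part of this patch:

```java
// Sketch only, not part of this patch: the keys, defaults and the
// fixed-sleep policy follow the suggestion in the comment above.
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy;

final class BadRequestRetrySketch {

  // Hypothetical keys proposed in the review comment.
  static final String RETRY_ON_400_ENABLED =
      "fs.s3a.retry.on.400.response.enabled";
  static final String RETRY_ON_400_DELAY =
      "fs.s3a.retry.on.400.response.delay";
  static final String RETRY_ON_400_ATTEMPTS =
      "fs.s3a.retry.on.400.response.attempts";

  private BadRequestRetrySketch() {
  }

  /** Build the policy for AWSBadRequestException from configuration. */
  static RetryPolicy badRequestPolicy(Configuration conf) {
    if (!conf.getBoolean(RETRY_ON_400_ENABLED, false)) {
      // Default: keep the current behaviour of failing immediately.
      return RetryPolicies.TRY_ONCE_THEN_FAIL;
    }
    // Fixed delay between attempts, default 10s; no exponential growth.
    long delayMs = conf.getTimeDuration(RETRY_ON_400_DELAY,
        10_000L, TimeUnit.MILLISECONDS);
    int attempts = conf.getInt(RETRY_ON_400_ATTEMPTS, 6);
    return RetryPolicies.retryUpToMaximumCountWithFixedSleep(
        attempts, delayMs, TimeUnit.MILLISECONDS);
  }
}
```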


/**
* Default value for immediate failure when observing a
* {@link AWSBadRequestException}: {@value}.
*/
public static final boolean DEFAULT_FAIL_ON_AWS_BAD_REQUEST = true;

}
@@ -214,7 +214,10 @@ protected Map<Class<? extends Exception>, RetryPolicy> createExceptionMap() {

// policy on a 400/bad request still ambiguous.
// Treated as an immediate failure
policyMap.put(AWSBadRequestException.class, fail);
RetryPolicy awsBadRequestExceptionRetryPolicy =
Contributor comment:

Should the normal retry policy (which is expected to handle network errors) be applied here, or something else?

Contributor (author) reply:

Correct me if I'm wrong, but before our change the AWSBadRequestException response in fact comes back with an HTTP 400 error code. That is different from the other network failures to which the fail/RetryPolicies.TRY_ONCE_THEN_FAIL policy has been applied.

configuration.getBoolean(FAIL_ON_AWS_BAD_REQUEST, DEFAULT_FAIL_ON_AWS_BAD_REQUEST) ?
fail : retryIdempotentCalls;
Contributor comment:

  1. Should retry on all calls, rather than just idempotent ones, as long as we are confident that the request is never executed before the failure.
  2. I don't believe the normal exponential backoff strategy is the right one, as the initial delays are very short-lived (500ms); if you are hoping that credential providers will fetch new credentials, an initial delay of a few seconds would seem better. I wouldn't even bother with exponential growth here, just say 6 times at 10 seconds.

I think we would also want to log at warn that this is happening, assuming this is rare.
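As a rough sketch of that suggestion (retry on all calls, a fixed 10-second interval for 6 attempts, and a warn-level log on each retry), one could wrap the fixed-sleep policy from `org.apache.hadoop.io.retry.RetryPolicies`. The wrapper class and factory method below are hypothetical, not part of this patch:

```java
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Sketch only: logs at WARN whenever the wrapped policy decides to retry. */
final class WarnOnRetryPolicy implements RetryPolicy {

  private static final Logger LOG =
      LoggerFactory.getLogger(WarnOnRetryPolicy.class);

  private final RetryPolicy inner;

  WarnOnRetryPolicy(RetryPolicy inner) {
    this.inner = inner;
  }

  /** The "6 times at 10 seconds" policy from the comment above. */
  static RetryPolicy sixAttemptsTenSecondsApart() {
    return new WarnOnRetryPolicy(
        RetryPolicies.retryUpToMaximumCountWithFixedSleep(
            6, 10, TimeUnit.SECONDS));
  }

  @Override
  public RetryAction shouldRetry(Exception e, int retries, int failovers,
      boolean idempotent) throws Exception {
    RetryAction action = inner.shouldRetry(e, retries, failovers, idempotent);
    if (action.action == RetryAction.RetryDecision.RETRY) {
      // Warn on each retry, on the assumption that 400 retries are rare.
      LOG.warn("Retrying after 400/Bad Request (attempt {}): {}",
          retries + 1, e.toString());
    }
    return action;
  }
}
```

The policy map entry would then become `policyMap.put(AWSBadRequestException.class, WarnOnRetryPolicy.sixAttemptsTenSecondsApart());`, on the assumption that all calls, not only idempotent ones, go through it.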

policyMap.put(AWSBadRequestException.class, awsBadRequestExceptionRetryPolicy);

// Status 500 error code is also treated as a connectivity problem
policyMap.put(AWSStatus500Exception.class, connectivityFailure);
@@ -1118,12 +1118,15 @@ from them.

* Connection timeout: `ConnectTimeoutException`. Timeout before
setting up a connection to the S3 endpoint (or proxy).
* HTTP response status code 400, "Bad Request"
* HTTP response status code 400, "Bad Request" aka `AWSBadRequestException`

The status code 400, Bad Request usually means that the request
is unrecoverable; it's the generic "No" response. Very rarely it
does recover, which is why it is in this category, rather than that
of unrecoverable failures.
of unrecoverable failures. The default behavior is to fail immediately
without retrying. If your workload is sensitive to such failures, you can
set `fs.s3a.fail.on.aws.bad.request` to `false` to allow retries when
a Bad Request with status code 400 is observed.

These failures will be retried with an exponential sleep interval set in
`fs.s3a.retry.interval`, up to the limit set in `fs.s3a.retry.limit`.
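As an illustrative sketch only (the bucket name is a placeholder, and the property can just as well be set in `core-site.xml`), a client that prefers retries over an immediate failure might set the flag programmatically before creating the filesystem:

```java
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class RetryOn400Example {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Treat 400/Bad Request responses as retryable instead of failing fast.
    conf.setBoolean("fs.s3a.fail.on.aws.bad.request", false);
    try (FileSystem fs = FileSystem.get(new URI("s3a://example-bucket/"), conf)) {
      fs.listStatus(new Path("/"));
    }
  }
}
```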
@@ -311,12 +311,25 @@ public void testRetryAWSConnectivity() throws Throwable {
*/
@Test(expected = AWSBadRequestException.class)
public void testRetryBadRequestNotIdempotent() throws Throwable {
invoker.retry("test", null, false,

invoker.retry("test", null, true,
() -> {
throw BAD_REQUEST;
});
}

@Test
public void testRetryBadRequestIdempotent() throws Throwable {
Contributor comment:
test looks ok.

Configuration conf = new Configuration(FAST_RETRY_CONF);
conf.setBoolean(FAIL_ON_AWS_BAD_REQUEST, false);
S3ARetryPolicy retryPolicy = new S3ARetryPolicy(conf);

IOException ex = translateException("GET", "/", BAD_REQUEST);
assertRetryAction("Expected retry on aws bad request",
retryPolicy, RetryPolicy.RetryAction.RETRY,
ex, 1, true);
}

@Test
public void testConnectionRetryPolicyIdempotent() throws Throwable {
assertRetryAction("Expected retry on connection timeout",