From 12d7d26b668bb33868250d93b9ccc625d05b14df Mon Sep 17 00:00:00 2001 From: Sahil Takiar Date: Sat, 28 Sep 2019 00:55:03 +0530 Subject: [PATCH 01/40] HDFS-14564: Add libhdfs APIs for readFully; add readFully to ByteBufferPositionedReadable (#963) Contributed by Sahil Takiar. Reviewed-by: Siyao Meng --- .../hadoop/crypto/CryptoInputStream.java | 35 +- .../fs/ByteBufferPositionedReadable.java | 90 ++ .../apache/hadoop/fs/FSDataInputStream.java | 34 +- .../hadoop/crypto/CryptoStreamsTestBase.java | 115 ++- .../hadoop/crypto/TestCryptoStreams.java | 57 +- .../crypto/TestCryptoStreamsForLocalFS.java | 15 + .../crypto/TestCryptoStreamsNormal.java | 10 + .../apache/hadoop/hdfs/DFSInputStream.java | 30 +- .../native/libhdfs-tests/test_libhdfs_ops.c | 296 +++++- .../src/main/native/libhdfs/hdfs.c | 841 ++++++++++++------ .../main/native/libhdfs/include/hdfs/hdfs.h | 21 +- .../main/native/libhdfspp/tests/hdfs_shim.c | 6 + .../libhdfspp/tests/libhdfs_wrapper_defines.h | 1 + .../libhdfspp/tests/libhdfs_wrapper_undefs.h | 1 + .../tests/libhdfspp_wrapper_defines.h | 1 + .../hadoop/hdfs/TestByteBufferPread.java | 290 ++++++ 16 files changed, 1471 insertions(+), 372 deletions(-) create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ByteBufferPositionedReadable.java create mode 100644 hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestByteBufferPread.java diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java index a2273bf83343b..b2ee0c184a490 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java @@ -33,6 +33,7 @@ import com.google.common.base.Preconditions; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.ByteBufferPositionedReadable; import org.apache.hadoop.fs.ByteBufferReadable; import org.apache.hadoop.fs.CanSetDropBehind; import org.apache.hadoop.fs.CanSetReadahead; @@ -328,20 +329,40 @@ public int read(long position, byte[] buffer, int offset, int length) throws IOException { checkStream(); try { - final int n = ((PositionedReadable) in).read(position, buffer, offset, + final int n = ((PositionedReadable) in).read(position, buffer, offset, length); if (n > 0) { // This operation does not change the current offset of the file decrypt(position, buffer, offset, n); } - + return n; } catch (ClassCastException e) { throw new UnsupportedOperationException("This stream does not support " + "positioned read."); } } - + + /** + * Positioned readFully using {@link ByteBuffer}s. This method is thread-safe. + */ + @Override + public void readFully(long position, final ByteBuffer buf) + throws IOException { + checkStream(); + if (!(in instanceof ByteBufferPositionedReadable)) { + throw new UnsupportedOperationException(in.getClass().getCanonicalName() + + " does not support positioned reads with byte buffers."); + } + int bufPos = buf.position(); + ((ByteBufferPositionedReadable) in).readFully(position, buf); + final int n = buf.position() - bufPos; + if (n > 0) { + // This operation does not change the current offset of the file + decrypt(position, buf, n, bufPos); + } + } + /** * Decrypt length bytes in buffer starting at offset. 
Output is also put * into buffer starting at offset. It is thread-safe. @@ -375,7 +396,7 @@ private void decrypt(long position, byte[] buffer, int offset, int length) returnDecryptor(decryptor); } } - + /** Positioned read fully. It is thread-safe */ @Override public void readFully(long position, byte[] buffer, int offset, int length) @@ -407,7 +428,7 @@ public void seek(long pos) throws IOException { checkStream(); try { /* - * If data of target pos in the underlying stream has already been read + * If data of target pos in the underlying stream has already been read * and decrypted in outBuffer, we just need to re-position outBuffer. */ if (pos <= streamOffset && pos >= (streamOffset - outBuffer.remaining())) { @@ -523,7 +544,7 @@ public int read(ByteBuffer buf) throws IOException { * Output is also buf and same start position. * buf.position() and buf.limit() should be unchanged after decryption. */ - private void decrypt(ByteBuffer buf, int n, int start) + private void decrypt(ByteBuffer buf, int n, int start) throws IOException { final int pos = buf.position(); final int limit = buf.limit(); @@ -605,7 +626,7 @@ public ByteBuffer read(ByteBufferPool bufferPool, int maxLength, } return buffer; } catch (ClassCastException e) { - throw new UnsupportedOperationException("This stream does not support " + + throw new UnsupportedOperationException("This stream does not support " + "enhanced byte buffer access."); } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ByteBufferPositionedReadable.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ByteBufferPositionedReadable.java new file mode 100644 index 0000000000000..f8282d88c46c3 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ByteBufferPositionedReadable.java @@ -0,0 +1,90 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.fs; + +import java.io.EOFException; +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Implementers of this interface provide a positioned read API that writes to a + * {@link ByteBuffer} rather than a {@code byte[]}. + * + * @see PositionedReadable + * @see ByteBufferReadable + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public interface ByteBufferPositionedReadable { + /** + * Reads up to {@code buf.remaining()} bytes into buf from a given position + * in the file and returns the number of bytes read. 
Callers should use
+   * {@code buf.limit(...)} to control the size of the desired read and
+   * {@code buf.position(...)} to control the offset into the buffer the data
+   * should be written to.
+   * <p>
+   * After a successful call, {@code buf.position()} will be advanced by the
+   * number of bytes read and {@code buf.limit()} will be unchanged.
+   * <p>
+   * In the case of an exception, the state of the buffer (the contents of the
+   * buffer, the {@code buf.position()}, the {@code buf.limit()}, etc.) is
+   * undefined, and callers should be prepared to recover from this
+   * eventuality.
+   * <p>
+   * Callers should use {@link StreamCapabilities#hasCapability(String)} with
+   * {@link StreamCapabilities#PREADBYTEBUFFER} to check if the underlying
+   * stream supports this interface; otherwise they might get an
+   * {@link UnsupportedOperationException}.
+   * <p>
+   * Implementations should treat 0-length requests as legitimate, and must
+   * not signal an error upon their receipt.
+   * <p>
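+   * For example, a caller might combine the capability check with a
+   * positioned read, as in the following sketch (illustrative only; it
+   * assumes {@code in} is an {@code FSDataInputStream} whose wrapped stream
+   * implements this interface):
+   * <pre>{@code
+   *   ByteBuffer buf = ByteBuffer.allocate(1024);
+   *   if (in.hasCapability(StreamCapabilities.PREADBYTEBUFFER)) {
+   *     // Reads up to 1024 bytes starting at file offset 0 without moving
+   *     // the stream's current position.
+   *     int bytesRead = in.read(0L, buf);
+   *   }
+   * }</pre>
+   * <p>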
+   * This does not change the current offset of a file, and is thread-safe.
+   *
+   * @param position position within file
+   * @param buf the ByteBuffer to receive the results of the read operation.
+   * @return the number of bytes read, possibly zero, or -1 if the end of the
+   *         stream has been reached
+   * @throws IOException if there is some error performing the read
+   */
+  int read(long position, ByteBuffer buf) throws IOException;
+
+  /**
+   * Reads {@code buf.remaining()} bytes into buf from a given position in
+   * the file, or until the end of the data is reached before the read
+   * operation completes. Callers should use {@code buf.limit(...)} to
+   * control the size of the desired read and {@code buf.position(...)} to
+   * control the offset into the buffer the data should be written to.
+   * <p>
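+   * A minimal usage sketch (assuming {@code in} is an
+   * {@code FSDataInputStream} backed by a stream that implements this
+   * interface):
+   * <pre>{@code
+   *   ByteBuffer buf = ByteBuffer.allocate(100);
+   *   // Either fills all 100 remaining bytes of buf or throws EOFException.
+   *   in.readFully(0L, buf);
+   * }</pre>
+   * <p>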
+ * This operation provides similar semantics to + * {@link #read(long, ByteBuffer)}, the difference is that this method is + * guaranteed to read data until the {@link ByteBuffer} is full, or until + * the end of the data stream is reached. + * + * @param position position within file + * @param buf the ByteBuffer to receive the results of the read operation. + * @throws IOException if there is some error performing the read + * @throws EOFException the end of the data was reached before + * the read operation completed + * @see #read(long, ByteBuffer) + */ + void readFully(long position, ByteBuffer buf) throws IOException; +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java index 08d71f16c0783..3b5fd7c370cef 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java @@ -38,7 +38,8 @@ public class FSDataInputStream extends DataInputStream implements Seekable, PositionedReadable, ByteBufferReadable, HasFileDescriptor, CanSetDropBehind, CanSetReadahead, - HasEnhancedByteBufferAccess, CanUnbuffer, StreamCapabilities { + HasEnhancedByteBufferAccess, CanUnbuffer, StreamCapabilities, + ByteBufferPositionedReadable { /** * Map ByteBuffers that we have handed out to readers to ByteBufferPool * objects @@ -50,8 +51,8 @@ public class FSDataInputStream extends DataInputStream public FSDataInputStream(InputStream in) { super(in); if( !(in instanceof Seekable) || !(in instanceof PositionedReadable) ) { - throw new IllegalArgumentException( - "In is not an instance of Seekable or PositionedReadable"); + throw new IllegalArgumentException(in.getClass().getCanonicalName() + + " is not an instance of Seekable or PositionedReadable"); } } @@ -147,7 +148,8 @@ public int read(ByteBuffer buf) throws IOException { return ((ByteBufferReadable)in).read(buf); } - throw new UnsupportedOperationException("Byte-buffer read unsupported by input stream"); + throw new UnsupportedOperationException("Byte-buffer read unsupported " + + "by " + in.getClass().getCanonicalName()); } @Override @@ -167,9 +169,8 @@ public void setReadahead(Long readahead) try { ((CanSetReadahead)in).setReadahead(readahead); } catch (ClassCastException e) { - throw new UnsupportedOperationException( - "this stream does not support setting the readahead " + - "caching strategy."); + throw new UnsupportedOperationException(in.getClass().getCanonicalName() + + " does not support setting the readahead caching strategy."); } } @@ -246,4 +247,23 @@ public boolean hasCapability(String capability) { public String toString() { return super.toString() + ": " + in; } + + @Override + public int read(long position, ByteBuffer buf) throws IOException { + if (in instanceof ByteBufferPositionedReadable) { + return ((ByteBufferPositionedReadable) in).read(position, buf); + } + throw new UnsupportedOperationException("Byte-buffer pread unsupported " + + "by " + in.getClass().getCanonicalName()); + } + + @Override + public void readFully(long position, ByteBuffer buf) throws IOException { + if (in instanceof ByteBufferPositionedReadable) { + ((ByteBufferPositionedReadable) in).readFully(position, buf); + } else { + throw new UnsupportedOperationException("Byte-buffer pread " + + "unsupported by " + in.getClass().getCanonicalName()); + } + } } diff --git 
a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/CryptoStreamsTestBase.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/CryptoStreamsTestBase.java index a0eb105833809..7e5fe7071610e 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/CryptoStreamsTestBase.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/CryptoStreamsTestBase.java @@ -316,42 +316,41 @@ private void positionedReadCheck(InputStream in, int pos) throws Exception { Assert.assertArrayEquals(readData, expectedData); } - /** Test read fully */ + /** Test read fully. */ @Test(timeout=120000) public void testReadFully() throws Exception { OutputStream out = getOutputStream(defaultBufferSize); writeData(out); - InputStream in = getInputStream(defaultBufferSize); - final int len1 = dataLen / 4; - // Read len1 bytes - byte[] readData = new byte[len1]; - readAll(in, readData, 0, len1); - byte[] expectedData = new byte[len1]; - System.arraycopy(data, 0, expectedData, 0, len1); - Assert.assertArrayEquals(readData, expectedData); - - // Pos: 1/3 dataLen - readFullyCheck(in, dataLen / 3); - - // Read len1 bytes - readData = new byte[len1]; - readAll(in, readData, 0, len1); - expectedData = new byte[len1]; - System.arraycopy(data, len1, expectedData, 0, len1); - Assert.assertArrayEquals(readData, expectedData); - - // Pos: 1/2 dataLen - readFullyCheck(in, dataLen / 2); - - // Read len1 bytes - readData = new byte[len1]; - readAll(in, readData, 0, len1); - expectedData = new byte[len1]; - System.arraycopy(data, 2 * len1, expectedData, 0, len1); - Assert.assertArrayEquals(readData, expectedData); - - in.close(); + try (InputStream in = getInputStream(defaultBufferSize)) { + final int len1 = dataLen / 4; + // Read len1 bytes + byte[] readData = new byte[len1]; + readAll(in, readData, 0, len1); + byte[] expectedData = new byte[len1]; + System.arraycopy(data, 0, expectedData, 0, len1); + Assert.assertArrayEquals(readData, expectedData); + + // Pos: 1/3 dataLen + readFullyCheck(in, dataLen / 3); + + // Read len1 bytes + readData = new byte[len1]; + readAll(in, readData, 0, len1); + expectedData = new byte[len1]; + System.arraycopy(data, len1, expectedData, 0, len1); + Assert.assertArrayEquals(readData, expectedData); + + // Pos: 1/2 dataLen + readFullyCheck(in, dataLen / 2); + + // Read len1 bytes + readData = new byte[len1]; + readAll(in, readData, 0, len1); + expectedData = new byte[len1]; + System.arraycopy(data, 2 * len1, expectedData, 0, len1); + Assert.assertArrayEquals(readData, expectedData); + } } private void readFullyCheck(InputStream in, int pos) throws Exception { @@ -369,6 +368,60 @@ private void readFullyCheck(InputStream in, int pos) throws Exception { } catch (EOFException e) { } } + + /** Test byte byffer read fully. 
*/ + @Test(timeout=120000) + public void testByteBufferReadFully() throws Exception { + OutputStream out = getOutputStream(defaultBufferSize); + writeData(out); + + try (InputStream in = getInputStream(defaultBufferSize)) { + final int len1 = dataLen / 4; + // Read len1 bytes + byte[] readData = new byte[len1]; + readAll(in, readData, 0, len1); + byte[] expectedData = new byte[len1]; + System.arraycopy(data, 0, expectedData, 0, len1); + Assert.assertArrayEquals(readData, expectedData); + + // Pos: 1/3 dataLen + byteBufferReadFullyCheck(in, dataLen / 3); + + // Read len1 bytes + readData = new byte[len1]; + readAll(in, readData, 0, len1); + expectedData = new byte[len1]; + System.arraycopy(data, len1, expectedData, 0, len1); + Assert.assertArrayEquals(readData, expectedData); + + // Pos: 1/2 dataLen + byteBufferReadFullyCheck(in, dataLen / 2); + + // Read len1 bytes + readData = new byte[len1]; + readAll(in, readData, 0, len1); + expectedData = new byte[len1]; + System.arraycopy(data, 2 * len1, expectedData, 0, len1); + Assert.assertArrayEquals(readData, expectedData); + } + } + + private void byteBufferReadFullyCheck(InputStream in, int pos) + throws Exception { + ByteBuffer result = ByteBuffer.allocate(dataLen - pos); + ((ByteBufferPositionedReadable) in).readFully(pos, result); + + byte[] expectedData = new byte[dataLen - pos]; + System.arraycopy(data, pos, expectedData, 0, dataLen - pos); + Assert.assertArrayEquals(result.array(), expectedData); + + result = ByteBuffer.allocate(dataLen); // Exceeds maximum length + try { + ((ByteBufferPositionedReadable) in).readFully(pos, result); + Assert.fail("Read fully exceeds maximum length should fail."); + } catch (EOFException e) { + } + } /** Test seek to different position. */ @Test(timeout=120000) diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreams.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreams.java index cd7391a02c38f..514c54080a0a6 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreams.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreams.java @@ -26,6 +26,7 @@ import java.util.EnumSet; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ByteBufferPositionedReadable; import org.apache.hadoop.fs.ByteBufferReadable; import org.apache.hadoop.fs.CanSetDropBehind; import org.apache.hadoop.fs.CanSetReadahead; @@ -180,7 +181,7 @@ static class FakeInputStream extends InputStream implements Seekable, PositionedReadable, ByteBufferReadable, HasFileDescriptor, CanSetDropBehind, CanSetReadahead, HasEnhancedByteBufferAccess, CanUnbuffer, - StreamCapabilities { + StreamCapabilities, ByteBufferPositionedReadable { private final byte[] oneByteBuf = new byte[1]; private int pos = 0; private final byte[] data; @@ -303,6 +304,56 @@ public int read(long position, byte[] b, int off, int len) return -1; } + @Override + public int read(long position, ByteBuffer buf) throws IOException { + if (buf == null) { + throw new NullPointerException(); + } else if (!buf.hasRemaining()) { + return 0; + } + + if (position > length) { + throw new IOException("Cannot read after EOF."); + } + if (position < 0) { + throw new IOException("Cannot read to negative offset."); + } + + checkStream(); + + if (position < length) { + int n = (int) Math.min(buf.remaining(), length - position); + buf.put(data, (int) position, n); + return n; + } + + return -1; + 
} + + @Override + public void readFully(long position, ByteBuffer buf) throws IOException { + if (buf == null) { + throw new NullPointerException(); + } else if (!buf.hasRemaining()) { + return; + } + + if (position > length) { + throw new IOException("Cannot read after EOF."); + } + if (position < 0) { + throw new IOException("Cannot read to negative offset."); + } + + checkStream(); + + if (position + buf.remaining() > length) { + throw new EOFException("Reach the end of stream."); + } + + buf.put(data, (int) position, buf.remaining()); + } + @Override public void readFully(long position, byte[] b, int off, int len) throws IOException { @@ -439,7 +490,9 @@ public void testHasCapability() throws Exception { new String[] { StreamCapabilities.DROPBEHIND, StreamCapabilities.READAHEAD, - StreamCapabilities.UNBUFFER + StreamCapabilities.UNBUFFER, + StreamCapabilities.READBYTEBUFFER, + StreamCapabilities.PREADBYTEBUFFER }, new String[] { StreamCapabilities.HFLUSH, diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreamsForLocalFS.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreamsForLocalFS.java index bb3fd7a68d722..8453889b53a5a 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreamsForLocalFS.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreamsForLocalFS.java @@ -90,11 +90,26 @@ protected InputStream getInputStream(int bufferSize, byte[] key, byte[] iv) @Override @Test(timeout=10000) public void testByteBufferRead() throws Exception {} + + @Ignore("Wrapped stream doesn't support ByteBufferPositionedReadable") + @Override + @Test(timeout=10000) + public void testPositionedReadWithByteBuffer() throws IOException {} + + @Ignore("Wrapped stream doesn't support ByteBufferPositionedReadable") + @Override + @Test(timeout=10000) + public void testByteBufferReadFully() throws Exception {} @Ignore("ChecksumFSOutputSummer doesn't support Syncable") @Override @Test(timeout=10000) public void testSyncable() throws IOException {} + + @Ignore("Wrapped stream doesn't support ByteBufferPositionedReadable") + @Override + @Test(timeout=10000) + public void testByteBufferPread() throws IOException {} @Ignore("ChecksumFSInputChecker doesn't support ByteBuffer read") @Override diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreamsNormal.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreamsNormal.java index 7e300777a37a1..df7dc72cf886a 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreamsNormal.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreamsNormal.java @@ -91,6 +91,16 @@ public void testSyncable() throws IOException {} @Test(timeout=10000) public void testPositionedRead() throws IOException {} + @Ignore("Wrapped stream doesn't support ByteBufferPositionedReadable") + @Override + @Test(timeout=10000) + public void testPositionedReadWithByteBuffer() throws IOException {} + + @Ignore("Wrapped stream doesn't support ByteBufferPositionedReadable") + @Override + @Test(timeout=10000) + public void testByteBufferReadFully() throws Exception {} + @Ignore("Wrapped stream doesn't support ReadFully") @Override @Test(timeout=10000) diff --git 
a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java index a4bf4542d04d4..6a7a400121973 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java @@ -46,12 +46,14 @@ import org.apache.commons.io.IOUtils; import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.fs.ByteBufferPositionedReadable; import org.apache.hadoop.fs.ByteBufferReadable; import org.apache.hadoop.fs.ByteBufferUtil; import org.apache.hadoop.fs.CanSetDropBehind; import org.apache.hadoop.fs.CanSetReadahead; import org.apache.hadoop.fs.CanUnbuffer; import org.apache.hadoop.fs.ChecksumException; +import org.apache.hadoop.fs.FSExceptionMessages; import org.apache.hadoop.fs.FSInputStream; import org.apache.hadoop.fs.FileEncryptionInfo; import org.apache.hadoop.fs.HasEnhancedByteBufferAccess; @@ -100,7 +102,8 @@ @InterfaceAudience.Private public class DFSInputStream extends FSInputStream implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead, - HasEnhancedByteBufferAccess, CanUnbuffer, StreamCapabilities { + HasEnhancedByteBufferAccess, CanUnbuffer, StreamCapabilities, + ByteBufferPositionedReadable { @VisibleForTesting public static boolean tcpReadsDisabledForTesting = false; private long hedgedReadOpsLoopNumForTesting = 0; @@ -318,8 +321,7 @@ private long fetchLocatedBlocksAndGetLastBlockLength(boolean refresh) } if (locatedBlocks != null) { - Iterator oldIter = - locatedBlocks.getLocatedBlocks().iterator(); + Iterator oldIter = locatedBlocks.getLocatedBlocks().iterator(); Iterator newIter = newInfo.getLocatedBlocks().iterator(); while (oldIter.hasNext() && newIter.hasNext()) { if (!oldIter.next().getBlock().equals(newIter.next().getBlock())) { @@ -642,7 +644,6 @@ private synchronized DatanodeInfo blockSeekTo(long target) // // Compute desired block // - LocatedBlock targetBlock = getBlockAt(target); // update current position @@ -1653,6 +1654,27 @@ public void reset() throws IOException { throw new IOException("Mark/reset not supported"); } + @Override + public int read(long position, final ByteBuffer buf) throws IOException { + if (!buf.hasRemaining()) { + return 0; + } + return pread(position, buf); + } + + @Override + public void readFully(long position, final ByteBuffer buf) + throws IOException { + int nread = 0; + while (buf.hasRemaining()) { + int nbytes = read(position + nread, buf); + if (nbytes < 0) { + throw new EOFException(FSExceptionMessages.EOF_IN_READ_FULLY); + } + nread += nbytes; + } + } + /** Utility class to encapsulate data node info and its address. */ static final class DNAddrPair { final DatanodeInfo info; diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_ops.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_ops.c index d69aa37794848..1e92e21ee9692 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_ops.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_ops.c @@ -16,8 +16,10 @@ * limitations under the License. 
*/ -#include "hdfs/hdfs.h" -#include "hdfs_test.h" +#include "expect.h" +#include "hdfs/hdfs.h" +#include "hdfs_test.h" +#include "native_mini_dfs.h" #include "platform.h" #include @@ -59,7 +61,18 @@ void permission_disp(short permissions, char *rtr) { strncpy(rtr, perm, 3); rtr+=3; } -} +} + +/** + * Shutdown and free the given mini cluster, and then exit with the provided exit_code. This method is meant to be + * called with a non-zero exit code, which is why we ignore the return status of calling MiniDFSCluster#shutdown since + * the process is going to fail anyway. + */ +void shutdown_and_exit(struct NativeMiniDfsCluster* cl, int exit_code) { + nmdShutdown(cl); + nmdFree(cl); + exit(exit_code); +} int main(int argc, char **argv) { const char *writePath = "/tmp/testfile.txt"; @@ -75,9 +88,9 @@ int main(int argc, char **argv) { const char *userPath = "/tmp/usertestfile.txt"; char buffer[32], buffer2[256], rdbuffer[32]; - tSize num_written_bytes, num_read_bytes; + tSize num_written_bytes, num_read_bytes, num_pread_bytes; hdfsFS fs, lfs; - hdfsFile writeFile, readFile, localFile, appendFile, userFile; + hdfsFile writeFile, readFile, preadFile, localFile, appendFile, userFile; tOffset currentPos, seekPos; int exists, totalResult, result, numEntries, i, j; const char *resp; @@ -88,16 +101,47 @@ int main(int argc, char **argv) { short newPerm = 0666; tTime newMtime, newAtime; - fs = hdfsConnectNewInstance("default", 0); + // Create and start the mini cluster + struct NativeMiniDfsCluster* cl; + struct NativeMiniDfsConf conf = { + 1, /* doFormat */ + }; + + cl = nmdCreate(&conf); + EXPECT_NONNULL(cl); + EXPECT_ZERO(nmdWaitClusterUp(cl)); + tPort port; + port = (tPort) nmdGetNameNodePort(cl); + + // Create a hdfs connection to the mini cluster + struct hdfsBuilder *bld; + bld = hdfsNewBuilder(); + EXPECT_NONNULL(bld); + + hdfsBuilderSetForceNewInstance(bld); + hdfsBuilderSetNameNode(bld, "localhost"); + hdfsBuilderSetNameNodePort(bld, port); + // The HDFS append tests require setting this property otherwise the tests fail with: + // + // IOException: Failed to replace a bad datanode on the existing pipeline due to no more good datanodes being + // available to try. The current failed datanode replacement policy is DEFAULT, and a client may configure this + // via 'dfs.client.block.write.replace-datanode-on-failure.policy' in its configuration. + // + // It seems that when operating against a mini DFS cluster, some HDFS append tests require setting this property + // (for example, see TestFileAppend#testMultipleAppends) + hdfsBuilderConfSetStr(bld, "dfs.client.block.write.replace-datanode-on-failure.enable", "false"); + + fs = hdfsBuilderConnect(bld); + if(!fs) { fprintf(stderr, "Oops! Failed to connect to hdfs!\n"); - exit(-1); + shutdown_and_exit(cl, -1); } lfs = hdfsConnectNewInstance(NULL, 0); if(!lfs) { fprintf(stderr, "Oops! 
Failed to connect to 'local' hdfs!\n"); - exit(-1); + shutdown_and_exit(cl, -1); } { @@ -106,7 +150,7 @@ int main(int argc, char **argv) { writeFile = hdfsOpenFile(fs, writePath, O_WRONLY|O_CREAT, 0, 0, 0); if(!writeFile) { fprintf(stderr, "Failed to open %s for writing!\n", writePath); - exit(-1); + shutdown_and_exit(cl, -1); } fprintf(stderr, "Opened %s for writing successfully...\n", writePath); num_written_bytes = @@ -115,7 +159,7 @@ int main(int argc, char **argv) { if (num_written_bytes != strlen(fileContents) + 1) { fprintf(stderr, "Failed to write correct number of bytes - expected %d, got %d\n", (int)(strlen(fileContents) + 1), (int)num_written_bytes); - exit(-1); + shutdown_and_exit(cl, -1); } fprintf(stderr, "Wrote %d bytes\n", num_written_bytes); @@ -124,19 +168,19 @@ int main(int argc, char **argv) { fprintf(stderr, "Failed to get current file position correctly! Got %" PRId64 "!\n", currentPos); - exit(-1); + shutdown_and_exit(cl, -1); } fprintf(stderr, "Current position: %" PRId64 "\n", currentPos); if (hdfsFlush(fs, writeFile)) { fprintf(stderr, "Failed to 'flush' %s\n", writePath); - exit(-1); + shutdown_and_exit(cl, -1); } fprintf(stderr, "Flushed %s successfully!\n", writePath); if (hdfsHFlush(fs, writeFile)) { fprintf(stderr, "Failed to 'hflush' %s\n", writePath); - exit(-1); + shutdown_and_exit(cl, -1); } fprintf(stderr, "HFlushed %s successfully!\n", writePath); @@ -150,20 +194,20 @@ int main(int argc, char **argv) { if (exists) { fprintf(stderr, "Failed to validate existence of %s\n", readPath); - exit(-1); + shutdown_and_exit(cl, -1); } readFile = hdfsOpenFile(fs, readPath, O_RDONLY, 0, 0, 0); if (!readFile) { fprintf(stderr, "Failed to open %s for reading!\n", readPath); - exit(-1); + shutdown_and_exit(cl, -1); } if (!hdfsFileIsOpenForRead(readFile)) { fprintf(stderr, "hdfsFileIsOpenForRead: we just opened a file " "with O_RDONLY, and it did not show up as 'open for " "read'\n"); - exit(-1); + shutdown_and_exit(cl, -1); } fprintf(stderr, "hdfsAvailable: %d\n", hdfsAvailable(fs, readFile)); @@ -171,7 +215,7 @@ int main(int argc, char **argv) { seekPos = 1; if(hdfsSeek(fs, readFile, seekPos)) { fprintf(stderr, "Failed to seek %s for reading!\n", readPath); - exit(-1); + shutdown_and_exit(cl, -1); } currentPos = -1; @@ -179,14 +223,14 @@ int main(int argc, char **argv) { fprintf(stderr, "Failed to get current file position correctly! Got %" PRId64 "!\n", currentPos); - exit(-1); + shutdown_and_exit(cl, -1); } fprintf(stderr, "Current position: %" PRId64 "\n", currentPos); if (!hdfsFileUsesDirectRead(readFile)) { fprintf(stderr, "Direct read support incorrectly not detected " "for HDFS filesystem\n"); - exit(-1); + shutdown_and_exit(cl, -1); } fprintf(stderr, "Direct read support detected for HDFS\n"); @@ -194,7 +238,7 @@ int main(int argc, char **argv) { // Test the direct read path if(hdfsSeek(fs, readFile, 0)) { fprintf(stderr, "Failed to seek %s for reading!\n", readPath); - exit(-1); + shutdown_and_exit(cl, -1); } memset(buffer, 0, sizeof(buffer)); num_read_bytes = hdfsRead(fs, readFile, (void*)buffer, @@ -202,30 +246,41 @@ int main(int argc, char **argv) { if (strncmp(fileContents, buffer, strlen(fileContents)) != 0) { fprintf(stderr, "Failed to read (direct). 
Expected %s but got %s (%d bytes)\n", fileContents, buffer, num_read_bytes); - exit(-1); + shutdown_and_exit(cl, -1); } fprintf(stderr, "Read (direct) following %d bytes:\n%s\n", num_read_bytes, buffer); + memset(buffer, 0, strlen(fileContents + 1)); if (hdfsSeek(fs, readFile, 0L)) { fprintf(stderr, "Failed to seek to file start!\n"); - exit(-1); + shutdown_and_exit(cl, -1); } // Disable the direct read path so that we really go through the slow // read path hdfsFileDisableDirectRead(readFile); - num_read_bytes = hdfsRead(fs, readFile, (void*)buffer, - sizeof(buffer)); - fprintf(stderr, "Read following %d bytes:\n%s\n", - num_read_bytes, buffer); + if (hdfsFileUsesDirectRead(readFile)) { + fprintf(stderr, "Disabled direct reads, but it is still enabled"); + shutdown_and_exit(cl, -1); + } - memset(buffer, 0, strlen(fileContents + 1)); + if (!hdfsFileUsesDirectPread(readFile)) { + fprintf(stderr, "Disabled direct reads, but direct preads was " + "disabled as well"); + shutdown_and_exit(cl, -1); + } - num_read_bytes = hdfsPread(fs, readFile, 0, (void*)buffer, + num_read_bytes = hdfsRead(fs, readFile, (void*)buffer, sizeof(buffer)); - fprintf(stderr, "Read following %d bytes:\n%s\n", + if (strncmp(fileContents, buffer, strlen(fileContents)) != 0) { + fprintf(stderr, "Failed to read. Expected %s but got %s (%d bytes)\n", + fileContents, buffer, num_read_bytes); + shutdown_and_exit(cl, -1); + } + fprintf(stderr, "Read following %d bytes:\n%s\n", num_read_bytes, buffer); + memset(buffer, 0, strlen(fileContents + 1)); hdfsCloseFile(fs, readFile); @@ -233,7 +288,7 @@ int main(int argc, char **argv) { localFile = hdfsOpenFile(lfs, writePath, O_WRONLY|O_CREAT, 0, 0, 0); if(!localFile) { fprintf(stderr, "Failed to open %s for writing!\n", writePath); - exit(-1); + shutdown_and_exit(cl, -1); } num_written_bytes = hdfsWrite(lfs, localFile, (void*)fileContents, @@ -245,7 +300,155 @@ int main(int argc, char **argv) { if (hdfsFileUsesDirectRead(localFile)) { fprintf(stderr, "Direct read support incorrectly detected for local " "filesystem\n"); - exit(-1); + shutdown_and_exit(cl, -1); + } + + hdfsCloseFile(lfs, localFile); + } + + { + // Pread tests + + exists = hdfsExists(fs, readPath); + + if (exists) { + fprintf(stderr, "Failed to validate existence of %s\n", readPath); + shutdown_and_exit(cl, -1); + } + + preadFile = hdfsOpenFile(fs, readPath, O_RDONLY, 0, 0, 0); + if (!preadFile) { + fprintf(stderr, "Failed to open %s for reading!\n", readPath); + shutdown_and_exit(cl, -1); + } + + if (!hdfsFileIsOpenForRead(preadFile)) { + fprintf(stderr, "hdfsFileIsOpenForRead: we just opened a file " + "with O_RDONLY, and it did not show up as 'open for " + "read'\n"); + shutdown_and_exit(cl, -1); + } + + fprintf(stderr, "hdfsAvailable: %d\n", hdfsAvailable(fs, preadFile)); + + num_pread_bytes = hdfsPread(fs, preadFile, 0, (void*)buffer, sizeof(buffer)); + if (strncmp(fileContents, buffer, strlen(fileContents)) != 0) { + fprintf(stderr, "Failed to pread (direct). 
Expected %s but got %s (%d bytes)\n", + fileContents, buffer, num_read_bytes); + shutdown_and_exit(cl, -1); + } + fprintf(stderr, "Pread (direct) following %d bytes:\n%s\n", + num_pread_bytes, buffer); + memset(buffer, 0, strlen(fileContents + 1)); + if (hdfsTell(fs, preadFile) != 0) { + fprintf(stderr, "Pread changed position of file\n"); + shutdown_and_exit(cl, -1); + } + + // Test pread midway through the file rather than at the beginning + const char *fileContentsChunk = "World!"; + num_pread_bytes = hdfsPread(fs, preadFile, 7, (void*)buffer, sizeof(buffer)); + if (strncmp(fileContentsChunk, buffer, strlen(fileContentsChunk)) != 0) { + fprintf(stderr, "Failed to pread (direct). Expected %s but got %s (%d bytes)\n", + fileContentsChunk, buffer, num_read_bytes); + shutdown_and_exit(cl, -1); + } + fprintf(stderr, "Pread (direct) following %d bytes:\n%s\n", num_pread_bytes, buffer); + memset(buffer, 0, strlen(fileContents + 1)); + if (hdfsTell(fs, preadFile) != 0) { + fprintf(stderr, "Pread changed position of file\n"); + shutdown_and_exit(cl, -1); + } + + // hdfsPreadFully (direct) test + if (hdfsPreadFully(fs, preadFile, 0, (void*)buffer, + (tSize)(strlen(fileContents) + 1))) { + fprintf(stderr, "Failed to preadFully (direct)."); + shutdown_and_exit(cl, -1); + } + if (strncmp(fileContents, buffer, strlen(fileContents)) != 0) { + fprintf(stderr, "Failed to preadFully (direct). Expected %s but " + "got %s\n", fileContents, buffer); + shutdown_and_exit(cl, -1); + } + fprintf(stderr, "PreadFully (direct) following %d bytes:\n%s\n", + num_pread_bytes, buffer); + memset(buffer, 0, strlen(fileContents + 1)); + if (hdfsTell(fs, preadFile) != 0) { + fprintf(stderr, "PreadFully changed position of file\n"); + shutdown_and_exit(cl, -1); + } + + // Disable the direct pread path so that we really go through the slow + // read path + hdfsFileDisableDirectPread(preadFile); + + if (hdfsFileUsesDirectPread(preadFile)) { + fprintf(stderr, "Disabled direct preads, but it is still enabled"); + shutdown_and_exit(cl, -1); + } + + if (!hdfsFileUsesDirectRead(preadFile)) { + fprintf(stderr, "Disabled direct preads, but direct read was " + "disabled as well"); + shutdown_and_exit(cl, -1); + } + + num_pread_bytes = hdfsPread(fs, preadFile, 0, (void*)buffer, sizeof(buffer)); + if (strncmp(fileContents, buffer, strlen(fileContents)) != 0) { + fprintf(stderr, "Failed to pread. Expected %s but got %s (%d bytes)\n", + fileContents, buffer, num_pread_bytes); + shutdown_and_exit(cl, -1); + } + fprintf(stderr, "Pread following %d bytes:\n%s\n", num_pread_bytes, buffer); + memset(buffer, 0, strlen(fileContents + 1)); + if (hdfsTell(fs, preadFile) != 0) { + fprintf(stderr, "Pread changed position of file\n"); + shutdown_and_exit(cl, -1); + } + + // Test pread midway through the file rather than at the beginning + num_pread_bytes = hdfsPread(fs, preadFile, 7, (void*)buffer, sizeof(buffer)); + if (strncmp(fileContentsChunk, buffer, strlen(fileContentsChunk)) != 0) { + fprintf(stderr, "Failed to pread. 
Expected %s but got %s (%d bytes)\n", + fileContentsChunk, buffer, num_read_bytes); + shutdown_and_exit(cl, -1); + } + fprintf(stderr, "Pread following %d bytes:\n%s\n", num_pread_bytes, buffer); + memset(buffer, 0, strlen(fileContents + 1)); + if (hdfsTell(fs, preadFile) != 0) { + fprintf(stderr, "Pread changed position of file\n"); + shutdown_and_exit(cl, -1); + } + + // hdfsPreadFully test + if (hdfsPreadFully(fs, preadFile, 0, (void*)buffer, + (tSize)(strlen(fileContents) + 1))) { + fprintf(stderr, "Failed to preadFully."); + shutdown_and_exit(cl, -1); + } + if (strncmp(fileContents, buffer, strlen(fileContents)) != 0) { + fprintf(stderr, "Failed to preadFully. Expected %s but got %s\n", + fileContents, buffer); + shutdown_and_exit(cl, -1); + } + fprintf(stderr, "PreadFully following %d bytes:\n%s\n", + num_pread_bytes, buffer); + memset(buffer, 0, strlen(fileContents + 1)); + if (hdfsTell(fs, preadFile) != 0) { + fprintf(stderr, "PreadFully changed position of file\n"); + shutdown_and_exit(cl, -1); + } + + hdfsCloseFile(fs, preadFile); + + // Test correct behaviour for unsupported filesystems + localFile = hdfsOpenFile(lfs, writePath, O_RDONLY, 0, 0, 0); + + if (hdfsFileUsesDirectPread(localFile)) { + fprintf(stderr, "Direct pread support incorrectly detected for local " + "filesystem\n"); + shutdown_and_exit(cl, -1); } hdfsCloseFile(lfs, localFile); @@ -425,7 +628,7 @@ int main(int argc, char **argv) { appendFile = hdfsOpenFile(fs, appendPath, O_WRONLY, 0, 0, 0); if(!appendFile) { fprintf(stderr, "Failed to open %s for writing!\n", appendPath); - exit(-1); + shutdown_and_exit(cl, -1); } fprintf(stderr, "Opened %s for writing successfully...\n", appendPath); @@ -435,10 +638,10 @@ int main(int argc, char **argv) { fprintf(stderr, "Wrote %d bytes\n", num_written_bytes); if (hdfsFlush(fs, appendFile)) { - fprintf(stderr, "Failed to 'flush' %s\n", appendPath); - exit(-1); + fprintf(stderr, "Failed to 'flush' %s\n", appendPath); + shutdown_and_exit(cl, -1); } - fprintf(stderr, "Flushed %s successfully!\n", appendPath); + fprintf(stderr, "Flushed %s successfully!\n", appendPath); hdfsCloseFile(fs, appendFile); @@ -446,7 +649,7 @@ int main(int argc, char **argv) { appendFile = hdfsOpenFile(fs, appendPath, O_WRONLY|O_APPEND, 0, 0, 0); if(!appendFile) { fprintf(stderr, "Failed to open %s for writing!\n", appendPath); - exit(-1); + shutdown_and_exit(cl, -1); } fprintf(stderr, "Opened %s for writing successfully...\n", appendPath); @@ -456,10 +659,10 @@ int main(int argc, char **argv) { fprintf(stderr, "Wrote %d bytes\n", num_written_bytes); if (hdfsFlush(fs, appendFile)) { - fprintf(stderr, "Failed to 'flush' %s\n", appendPath); - exit(-1); + fprintf(stderr, "Failed to 'flush' %s\n", appendPath); + shutdown_and_exit(cl, -1); } - fprintf(stderr, "Flushed %s successfully!\n", appendPath); + fprintf(stderr, "Flushed %s successfully!\n", appendPath); hdfsCloseFile(fs, appendFile); @@ -472,11 +675,11 @@ int main(int argc, char **argv) { readFile = hdfsOpenFile(fs, appendPath, O_RDONLY, 0, 0, 0); if (!readFile) { fprintf(stderr, "Failed to open %s for reading!\n", appendPath); - exit(-1); + shutdown_and_exit(cl, -1); } num_read_bytes = hdfsRead(fs, readFile, (void*)rdbuffer, sizeof(rdbuffer)); - fprintf(stderr, "Read following %d bytes:\n%s\n", + fprintf(stderr, "Read following %d bytes:\n%s\n", num_read_bytes, rdbuffer); fprintf(stderr, "read == Hello, World %s\n", ((result = (strcmp(rdbuffer, "Hello, World"))) == 0 ? "Success!" 
: "Failed!")); @@ -496,16 +699,16 @@ int main(int argc, char **argv) { // the actual fs user capabilities. Thus just create a file and read // the owner is correct. - fs = hdfsConnectAsUserNewInstance("default", 0, tuser); + fs = hdfsConnectAsUserNewInstance("localhost", port, tuser); if(!fs) { fprintf(stderr, "Oops! Failed to connect to hdfs as user %s!\n",tuser); - exit(-1); + shutdown_and_exit(cl, -1); } userFile = hdfsOpenFile(fs, userPath, O_WRONLY|O_CREAT, 0, 0, 0); if(!userFile) { fprintf(stderr, "Failed to open %s for writing!\n", userPath); - exit(-1); + shutdown_and_exit(cl, -1); } fprintf(stderr, "Opened %s for writing successfully...\n", userPath); @@ -515,7 +718,7 @@ int main(int argc, char **argv) { if (hdfsFlush(fs, userFile)) { fprintf(stderr, "Failed to 'flush' %s\n", userPath); - exit(-1); + shutdown_and_exit(cl, -1); } fprintf(stderr, "Flushed %s successfully!\n", userPath); @@ -528,6 +731,9 @@ int main(int argc, char **argv) { totalResult += (hdfsDisconnect(fs) != 0); + EXPECT_ZERO(nmdShutdown(cl)); + nmdFree(cl); + if (totalResult != 0) { return -1; } else { diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c index 2d1b7e2fcc2cb..c25d354be0f42 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c @@ -18,6 +18,7 @@ #include "exception.h" #include "hdfs/hdfs.h" +#include "jclasses.h" #include "jni_helper.h" #include "platform.h" @@ -26,23 +27,6 @@ #include #include -/* Some frequently used Java paths */ -#define HADOOP_CONF "org/apache/hadoop/conf/Configuration" -#define HADOOP_PATH "org/apache/hadoop/fs/Path" -#define HADOOP_LOCALFS "org/apache/hadoop/fs/LocalFileSystem" -#define HADOOP_FS "org/apache/hadoop/fs/FileSystem" -#define HADOOP_FSSTATUS "org/apache/hadoop/fs/FsStatus" -#define HADOOP_BLK_LOC "org/apache/hadoop/fs/BlockLocation" -#define HADOOP_DFS "org/apache/hadoop/hdfs/DistributedFileSystem" -#define HADOOP_ISTRM "org/apache/hadoop/fs/FSDataInputStream" -#define HADOOP_OSTRM "org/apache/hadoop/fs/FSDataOutputStream" -#define HADOOP_STAT "org/apache/hadoop/fs/FileStatus" -#define HADOOP_FSPERM "org/apache/hadoop/fs/permission/FsPermission" -#define JAVA_NET_ISA "java/net/InetSocketAddress" -#define JAVA_NET_URI "java/net/URI" -#define JAVA_STRING "java/lang/String" -#define READ_OPTION "org/apache/hadoop/fs/ReadOption" - #define JAVA_VOID "V" /* Macros for constructing method signatures */ @@ -56,8 +40,26 @@ // Bit fields for hdfsFile_internal flags #define HDFS_FILE_SUPPORTS_DIRECT_READ (1<<0) +#define HDFS_FILE_SUPPORTS_DIRECT_PREAD (1<<1) +/** + * Reads bytes using the read(ByteBuffer) API. By using Java + * DirectByteBuffers we can avoid copying the bytes onto the Java heap. + * Instead the data will be directly copied from kernel space to the C heap. + */ tSize readDirect(hdfsFS fs, hdfsFile f, void* buffer, tSize length); + +/** + * Reads bytes using the read(long, ByteBuffer) API. By using Java + * DirectByteBuffers we can avoid copying the bytes onto the Java heap. + * Instead the data will be directly copied from kernel space to the C heap. 
+ */ +tSize preadDirect(hdfsFS fs, hdfsFile file, tOffset position, void* buffer, + tSize length); + +int preadFullyDirect(hdfsFS fs, hdfsFile file, tOffset position, void* buffer, + tSize length); + static void hdfsFreeFileInfoEntry(hdfsFileInfo *hdfsFileInfo); /** @@ -109,9 +111,8 @@ int hdfsGetHedgedReadMetrics(hdfsFS fs, struct hdfsHedgedReadMetrics **metrics) } jthr = invokeMethod(env, &jVal, INSTANCE, jFS, - HADOOP_DFS, - "getHedgedReadMetrics", - "()Lorg/apache/hadoop/hdfs/DFSHedgedReadMetrics;"); + JC_DISTRIBUTED_FILE_SYSTEM, "getHedgedReadMetrics", + "()Lorg/apache/hadoop/hdfs/DFSHedgedReadMetrics;"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetHedgedReadMetrics: getHedgedReadMetrics failed"); @@ -126,8 +127,7 @@ int hdfsGetHedgedReadMetrics(hdfsFS fs, struct hdfsHedgedReadMetrics **metrics) } jthr = invokeMethod(env, &jVal, INSTANCE, hedgedReadMetrics, - "org/apache/hadoop/hdfs/DFSHedgedReadMetrics", - "getHedgedReadOps", "()J"); + JC_DFS_HEDGED_READ_METRICS, "getHedgedReadOps", "()J"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetHedgedReadStatistics: getHedgedReadOps failed"); @@ -136,8 +136,7 @@ int hdfsGetHedgedReadMetrics(hdfsFS fs, struct hdfsHedgedReadMetrics **metrics) m->hedgedReadOps = jVal.j; jthr = invokeMethod(env, &jVal, INSTANCE, hedgedReadMetrics, - "org/apache/hadoop/hdfs/DFSHedgedReadMetrics", - "getHedgedReadWins", "()J"); + JC_DFS_HEDGED_READ_METRICS, "getHedgedReadWins", "()J"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetHedgedReadStatistics: getHedgedReadWins failed"); @@ -146,8 +145,7 @@ int hdfsGetHedgedReadMetrics(hdfsFS fs, struct hdfsHedgedReadMetrics **metrics) m->hedgedReadOpsWin = jVal.j; jthr = invokeMethod(env, &jVal, INSTANCE, hedgedReadMetrics, - "org/apache/hadoop/hdfs/DFSHedgedReadMetrics", - "getHedgedReadOpsInCurThread", "()J"); + JC_DFS_HEDGED_READ_METRICS, "getHedgedReadOpsInCurThread", "()J"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetHedgedReadStatistics: getHedgedReadOpsInCurThread failed"); @@ -192,10 +190,9 @@ int hdfsFileGetReadStatistics(hdfsFile file, ret = EINVAL; goto done; } - jthr = invokeMethod(env, &jVal, INSTANCE, file->file, - "org/apache/hadoop/hdfs/client/HdfsDataInputStream", - "getReadStatistics", - "()Lorg/apache/hadoop/hdfs/ReadStatistics;"); + jthr = invokeMethod(env, &jVal, INSTANCE, file->file, + JC_HDFS_DATA_INPUT_STREAM, "getReadStatistics", + "()Lorg/apache/hadoop/hdfs/ReadStatistics;"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFileGetReadStatistics: getReadStatistics failed"); @@ -208,8 +205,7 @@ int hdfsFileGetReadStatistics(hdfsFile file, goto done; } jthr = invokeMethod(env, &jVal, INSTANCE, readStats, - "org/apache/hadoop/hdfs/ReadStatistics", - "getTotalBytesRead", "()J"); + JC_READ_STATISTICS, "getTotalBytesRead", "()J"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFileGetReadStatistics: getTotalBytesRead failed"); @@ -218,8 +214,7 @@ int hdfsFileGetReadStatistics(hdfsFile file, s->totalBytesRead = jVal.j; jthr = invokeMethod(env, &jVal, INSTANCE, readStats, - "org/apache/hadoop/hdfs/ReadStatistics", - "getTotalLocalBytesRead", "()J"); + JC_READ_STATISTICS, "getTotalLocalBytesRead", "()J"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFileGetReadStatistics: getTotalLocalBytesRead failed"); @@ -228,8 +223,8 @@ int hdfsFileGetReadStatistics(hdfsFile file, s->totalLocalBytesRead = jVal.j; jthr = 
invokeMethod(env, &jVal, INSTANCE, readStats, - "org/apache/hadoop/hdfs/ReadStatistics", - "getTotalShortCircuitBytesRead", "()J"); + JC_READ_STATISTICS, "getTotalShortCircuitBytesRead", + "()J"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFileGetReadStatistics: getTotalShortCircuitBytesRead failed"); @@ -237,8 +232,8 @@ int hdfsFileGetReadStatistics(hdfsFile file, } s->totalShortCircuitBytesRead = jVal.j; jthr = invokeMethod(env, &jVal, INSTANCE, readStats, - "org/apache/hadoop/hdfs/ReadStatistics", - "getTotalZeroCopyBytesRead", "()J"); + JC_READ_STATISTICS, "getTotalZeroCopyBytesRead", + "()J"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFileGetReadStatistics: getTotalZeroCopyBytesRead failed"); @@ -280,8 +275,8 @@ int hdfsFileClearReadStatistics(hdfsFile file) goto done; } jthr = invokeMethod(env, NULL, INSTANCE, file->file, - "org/apache/hadoop/hdfs/client/HdfsDataInputStream", - "clearReadStatistics", "()V"); + JC_HDFS_DATA_INPUT_STREAM, "clearReadStatistics", + "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFileClearReadStatistics: clearReadStatistics failed"); @@ -308,7 +303,7 @@ int hdfsFileIsOpenForWrite(hdfsFile file) int hdfsFileUsesDirectRead(hdfsFile file) { - return !!(file->flags & HDFS_FILE_SUPPORTS_DIRECT_READ); + return (file->flags & HDFS_FILE_SUPPORTS_DIRECT_READ) != 0; } void hdfsFileDisableDirectRead(hdfsFile file) @@ -316,6 +311,17 @@ void hdfsFileDisableDirectRead(hdfsFile file) file->flags &= ~HDFS_FILE_SUPPORTS_DIRECT_READ; } +int hdfsFileUsesDirectPread(hdfsFile file) +{ + return (file->flags & HDFS_FILE_SUPPORTS_DIRECT_PREAD) != 0; +} + +void hdfsFileDisableDirectPread(hdfsFile file) +{ + file->flags &= ~HDFS_FILE_SUPPORTS_DIRECT_PREAD; +} + + int hdfsDisableDomainSocketSecurity(void) { jthrowable jthr; @@ -324,8 +330,7 @@ int hdfsDisableDomainSocketSecurity(void) errno = EINTERNAL; return -1; } - jthr = invokeMethod(env, NULL, STATIC, NULL, - "org/apache/hadoop/net/unix/DomainSocket", + jthr = invokeMethod(env, NULL, STATIC, NULL, JC_DOMAIN_SOCKET, "disableBindPathValidation", "()V"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -363,8 +368,8 @@ static jthrowable constructNewObjectOfPath(JNIEnv *env, const char *path, if (jthr) return jthr; //Construct the org.apache.hadoop.fs.Path object - jthr = constructNewObjectOfClass(env, &jPath, "org/apache/hadoop/fs/Path", - "(Ljava/lang/String;)V", jPathString); + jthr = constructNewObjectOfCachedClass(env, &jPath, JC_PATH, + "(Ljava/lang/String;)V", jPathString); destroyLocalReference(env, jPathString); if (jthr) return jthr; @@ -383,8 +388,8 @@ static jthrowable hadoopConfGetStr(JNIEnv *env, jobject jConfiguration, if (jthr) goto done; jthr = invokeMethod(env, &jVal, INSTANCE, jConfiguration, - HADOOP_CONF, "get", JMETHOD1(JPARAM(JAVA_STRING), - JPARAM(JAVA_STRING)), jkey); + JC_CONFIGURATION, "get", JMETHOD1(JPARAM(JAVA_STRING), + JPARAM(JAVA_STRING)), jkey); if (jthr) goto done; jRet = jVal.l; @@ -407,7 +412,8 @@ int hdfsConfGetStr(const char *key, char **val) ret = EINTERNAL; goto done; } - jthr = constructNewObjectOfClass(env, &jConfiguration, HADOOP_CONF, "()V"); + jthr = constructNewObjectOfCachedClass(env, &jConfiguration, + JC_CONFIGURATION, "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsConfGetStr(%s): new Configuration", key); @@ -443,8 +449,8 @@ static jthrowable hadoopConfGetInt(JNIEnv *env, jobject jConfiguration, if (jthr) return jthr; jthr = 
invokeMethod(env, &jVal, INSTANCE, jConfiguration, - HADOOP_CONF, "getInt", JMETHOD2(JPARAM(JAVA_STRING), "I", "I"), - jkey, (jint)(*val)); + JC_CONFIGURATION, "getInt", + JMETHOD2(JPARAM(JAVA_STRING), "I", "I"), jkey, (jint)(*val)); destroyLocalReference(env, jkey); if (jthr) return jthr; @@ -464,7 +470,8 @@ int hdfsConfGetInt(const char *key, int32_t *val) ret = EINTERNAL; goto done; } - jthr = constructNewObjectOfClass(env, &jConfiguration, HADOOP_CONF, "()V"); + jthr = constructNewObjectOfCachedClass(env, &jConfiguration, + JC_CONFIGURATION, "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsConfGetInt(%s): new Configuration", key); @@ -697,7 +704,8 @@ hdfsFS hdfsBuilderConnect(struct hdfsBuilder *bld) } // jConfiguration = new Configuration(); - jthr = constructNewObjectOfClass(env, &jConfiguration, HADOOP_CONF, "()V"); + jthr = constructNewObjectOfCachedClass(env, &jConfiguration, + JC_CONFIGURATION, "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", hdfsBuilderToStr(bld, buf, sizeof(buf))); @@ -719,9 +727,10 @@ hdfsFS hdfsBuilderConnect(struct hdfsBuilder *bld) // Get a local filesystem. if (bld->forceNewInstance) { // fs = FileSytem#newInstanceLocal(conf); - jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS, - "newInstanceLocal", JMETHOD1(JPARAM(HADOOP_CONF), - JPARAM(HADOOP_LOCALFS)), jConfiguration); + jthr = invokeMethod(env, &jVal, STATIC, NULL, + JC_FILE_SYSTEM, "newInstanceLocal", + JMETHOD1(JPARAM(HADOOP_CONF), JPARAM(HADOOP_LOCALFS)), + jConfiguration); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", @@ -731,10 +740,10 @@ hdfsFS hdfsBuilderConnect(struct hdfsBuilder *bld) jFS = jVal.l; } else { // fs = FileSytem#getLocal(conf); - jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS, "getLocal", - JMETHOD1(JPARAM(HADOOP_CONF), - JPARAM(HADOOP_LOCALFS)), - jConfiguration); + jthr = invokeMethod(env, &jVal, STATIC, NULL, + JC_FILE_SYSTEM, "getLocal", + JMETHOD1(JPARAM(HADOOP_CONF), JPARAM(HADOOP_LOCALFS)), + jConfiguration); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", @@ -746,10 +755,10 @@ hdfsFS hdfsBuilderConnect(struct hdfsBuilder *bld) } else { if (!strcmp(bld->nn, "default")) { // jURI = FileSystem.getDefaultUri(conf) - jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS, - "getDefaultUri", - "(Lorg/apache/hadoop/conf/Configuration;)Ljava/net/URI;", - jConfiguration); + jthr = invokeMethod(env, &jVal, STATIC, NULL, + JC_FILE_SYSTEM, "getDefaultUri", + "(Lorg/apache/hadoop/conf/Configuration;)Ljava/net/URI;", + jConfiguration); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", @@ -769,9 +778,9 @@ hdfsFS hdfsBuilderConnect(struct hdfsBuilder *bld) hdfsBuilderToStr(bld, buf, sizeof(buf))); goto done; } - jthr = invokeMethod(env, &jVal, STATIC, NULL, JAVA_NET_URI, - "create", "(Ljava/lang/String;)Ljava/net/URI;", - jURIString); + jthr = invokeMethod(env, &jVal, STATIC, NULL, + JC_URI, "create", + "(Ljava/lang/String;)Ljava/net/URI;", jURIString); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", @@ -799,11 +808,11 @@ hdfsFS hdfsBuilderConnect(struct hdfsBuilder *bld) goto done; } if (bld->forceNewInstance) { - jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS, - "newInstance", JMETHOD3(JPARAM(JAVA_NET_URI), - JPARAM(HADOOP_CONF), JPARAM(JAVA_STRING), - JPARAM(HADOOP_FS)), - jURI, jConfiguration, jUserString); 
+ jthr = invokeMethod(env, &jVal, STATIC, NULL, + JC_FILE_SYSTEM, "newInstance", + JMETHOD3(JPARAM(JAVA_NET_URI), JPARAM(HADOOP_CONF), + JPARAM(JAVA_STRING), JPARAM(HADOOP_FS)), jURI, + jConfiguration, jUserString); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", @@ -812,10 +821,11 @@ hdfsFS hdfsBuilderConnect(struct hdfsBuilder *bld) } jFS = jVal.l; } else { - jthr = invokeMethod(env, &jVal, STATIC, NULL, HADOOP_FS, "get", + jthr = invokeMethod(env, &jVal, STATIC, NULL, + JC_FILE_SYSTEM, "get", JMETHOD3(JPARAM(JAVA_NET_URI), JPARAM(HADOOP_CONF), - JPARAM(JAVA_STRING), JPARAM(HADOOP_FS)), - jURI, jConfiguration, jUserString); + JPARAM(JAVA_STRING), JPARAM(HADOOP_FS)), jURI, + jConfiguration, jUserString); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsBuilderConnect(%s)", @@ -877,8 +887,8 @@ int hdfsDisconnect(hdfsFS fs) return -1; } - jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS, - "close", "()V"); + jthr = invokeMethod(env, NULL, INSTANCE, jFS, JC_FILE_SYSTEM, + "close", "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsDisconnect: FileSystem#close"); @@ -909,8 +919,9 @@ static jthrowable getDefaultBlockSize(JNIEnv *env, jobject jFS, jthrowable jthr; jvalue jVal; - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, - "getDefaultBlockSize", JMETHOD1(JPARAM(HADOOP_PATH), "J"), jPath); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "getDefaultBlockSize", JMETHOD1(JPARAM(HADOOP_PATH), + "J"), jPath); if (jthr) return jthr; *out = jVal.j; @@ -945,14 +956,9 @@ struct hdfsStreamBuilder { struct hdfsStreamBuilder *hdfsStreamBuilderAlloc(hdfsFS fs, const char *path, int flags) { - size_t path_len = strlen(path); + int path_len = strlen(path); struct hdfsStreamBuilder *bld; - // Check for overflow in path_len - if (path_len > SIZE_MAX - sizeof(struct hdfsStreamBuilder)) { - errno = EOVERFLOW; - return NULL; - } // sizeof(hdfsStreamBuilder->path) includes one byte for the string // terminator bld = malloc(sizeof(struct hdfsStreamBuilder) + path_len); @@ -1008,6 +1014,62 @@ int hdfsStreamBuilderSetDefaultBlockSize(struct hdfsStreamBuilder *bld, return 0; } +/** + * Delegates to FsDataInputStream#hasCapability(String). Used to check if a + * given input stream supports certain methods, such as + * ByteBufferReadable#read(ByteBuffer). 
+ * + * @param jFile the FsDataInputStream to call hasCapability on + * @param capability the name of the capability to query; for a full list of + * possible values see StreamCapabilities + * + * @return true if the given jFile has the given capability, false otherwise + * + * @see org.apache.hadoop.fs.StreamCapabilities + */ +static int hdfsHasStreamCapability(jobject jFile, + const char *capability) { + int ret = 0; + jthrowable jthr = NULL; + jvalue jVal; + jstring jCapabilityString = NULL; + + /* Get the JNIEnv* corresponding to current thread */ + JNIEnv* env = getJNIEnv(); + if (env == NULL) { + errno = EINTERNAL; + return 0; + } + + jthr = newJavaStr(env, capability, &jCapabilityString); + if (jthr) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsHasStreamCapability(%s): newJavaStr", capability); + goto done; + } + jthr = invokeMethod(env, &jVal, INSTANCE, jFile, + JC_FS_DATA_INPUT_STREAM, "hasCapability", "(Ljava/lang/String;)Z", + jCapabilityString); + if (jthr) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsHasStreamCapability(%s): FSDataInputStream#hasCapability", + capability); + goto done; + } + +done: + destroyLocalReference(env, jthr); + destroyLocalReference(env, jCapabilityString); + if (ret) { + errno = ret; + return 0; + } + if (jVal.z == JNI_TRUE) { + return 1; + } + return 0; +} + static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, int32_t bufferSize, int16_t replication, int64_t blockSize) { @@ -1057,14 +1119,14 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, } if (accmode == O_RDONLY) { - method = "open"; - signature = JMETHOD2(JPARAM(HADOOP_PATH), "I", JPARAM(HADOOP_ISTRM)); + method = "open"; + signature = JMETHOD2(JPARAM(HADOOP_PATH), "I", JPARAM(HADOOP_FSDISTRM)); } else if (flags & O_APPEND) { - method = "append"; - signature = JMETHOD1(JPARAM(HADOOP_PATH), JPARAM(HADOOP_OSTRM)); + method = "append"; + signature = JMETHOD1(JPARAM(HADOOP_PATH), JPARAM(HADOOP_FSDOSTRM)); } else { - method = "create"; - signature = JMETHOD2(JPARAM(HADOOP_PATH), "ZISJ", JPARAM(HADOOP_OSTRM)); + method = "create"; + signature = JMETHOD2(JPARAM(HADOOP_PATH), "ZISJ", JPARAM(HADOOP_FSDOSTRM)); } /* Create an object of org.apache.hadoop.fs.Path */ @@ -1076,8 +1138,8 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, } /* Get the Configuration object from the FileSystem object */ - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, - "getConf", JMETHOD1("", JPARAM(HADOOP_CONF))); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "getConf", JMETHOD1("", JPARAM(HADOOP_CONF))); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsOpenFile(%s): FileSystem#getConf", path); @@ -1097,9 +1159,9 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, } if (!bufferSize) { - jthr = invokeMethod(env, &jVal, INSTANCE, jConfiguration, - HADOOP_CONF, "getInt", "(Ljava/lang/String;I)I", - jStrBufferSize, 4096); + jthr = invokeMethod(env, &jVal, INSTANCE, jConfiguration, + JC_CONFIGURATION, "getInt", + "(Ljava/lang/String;I)I", jStrBufferSize, 4096); if (jthr) { ret = printExceptionAndFree(env, jthr, NOPRINT_EXC_FILE_NOT_FOUND | NOPRINT_EXC_ACCESS_CONTROL | NOPRINT_EXC_UNRESOLVED_LINK, @@ -1112,9 +1174,9 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, if ((accmode == O_WRONLY) && (flags & O_APPEND) == 0) { if (!replication) { - jthr = invokeMethod(env, &jVal, INSTANCE, jConfiguration, - HADOOP_CONF, "getInt", 
"(Ljava/lang/String;I)I", - jStrReplication, 1); + jthr = invokeMethod(env, &jVal, INSTANCE, jConfiguration, + JC_CONFIGURATION, "getInt", + "(Ljava/lang/String;I)I", jStrReplication, 1); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsOpenFile(%s): Configuration#getInt(dfs.replication)", @@ -1130,12 +1192,12 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, // READ? if (accmode == O_RDONLY) { - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, - method, signature, jPath, jBufferSize); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + method, signature, jPath, jBufferSize); } else if ((accmode == O_WRONLY) && (flags & O_APPEND)) { // WRITE/APPEND? - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, - method, signature, jPath); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + method, signature, jPath); } else { // WRITE/CREATE jboolean jOverWrite = 1; @@ -1148,9 +1210,9 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, goto done; } } - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, - method, signature, jPath, jOverWrite, - jBufferSize, jReplication, jBlockSize); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + method, signature, jPath, jOverWrite, jBufferSize, + jReplication, jBlockSize); } if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -1176,16 +1238,16 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, file->flags = 0; if ((flags & O_WRONLY) == 0) { - // Try a test read to see if we can do direct reads - char buf; - if (readDirect(fs, file, &buf, 0) == 0) { - // Success - 0-byte read should return 0 + // Check the StreamCapabilities of jFile to see if we can do direct + // reads + if (hdfsHasStreamCapability(jFile, "in:readbytebuffer")) { file->flags |= HDFS_FILE_SUPPORTS_DIRECT_READ; - } else if (errno != ENOTSUP) { - // Unexpected error. Clear it, don't set the direct flag. 
- fprintf(stderr, - "hdfsOpenFile(%s): WARN: Unexpected error %d when testing " - "for direct read compatibility\n", path, errno); + } + + // Check the StreamCapabilities of jFile to see if we can do direct + // preads + if (hdfsHasStreamCapability(jFile, "in:preadbytebuffer")) { + file->flags |= HDFS_FILE_SUPPORTS_DIRECT_PREAD; } } ret = 0; @@ -1193,9 +1255,9 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, done: destroyLocalReference(env, jStrBufferSize); destroyLocalReference(env, jStrReplication); - destroyLocalReference(env, jConfiguration); - destroyLocalReference(env, jPath); - destroyLocalReference(env, jFile); + destroyLocalReference(env, jConfiguration); + destroyLocalReference(env, jPath); + destroyLocalReference(env, jFile); if (ret) { if (file) { if (file->file) { @@ -1241,9 +1303,9 @@ int hdfsTruncateFile(hdfsFS fs, const char* path, tOffset newlength) return -1; } - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, - "truncate", JMETHOD2(JPARAM(HADOOP_PATH), "J", "Z"), - jPath, newlength); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "truncate", JMETHOD2(JPARAM(HADOOP_PATH), "J", "Z"), + jPath, newlength); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -1270,11 +1332,11 @@ int hdfsUnbufferFile(hdfsFile file) ret = ENOTSUP; goto done; } - jthr = invokeMethod(env, NULL, INSTANCE, file->file, HADOOP_ISTRM, - "unbuffer", "()V"); + jthr = invokeMethod(env, NULL, INSTANCE, file->file, + JC_FS_DATA_INPUT_STREAM, "unbuffer", "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, - HADOOP_ISTRM "#unbuffer failed:"); + HADOOP_FSDISTRM "#unbuffer failed:"); goto done; } ret = 0; @@ -1291,7 +1353,7 @@ int hdfsCloseFile(hdfsFS fs, hdfsFile file) // file.close //The interface whose 'close' method to be called - const char *interface; + CachedJavaClass cachedJavaClass; const char *interfaceShortName; //Caught exception @@ -1310,11 +1372,14 @@ int hdfsCloseFile(hdfsFS fs, hdfsFile file) return -1; } - interface = (file->type == HDFS_STREAM_INPUT) ? - HADOOP_ISTRM : HADOOP_OSTRM; + if (file->type == HDFS_STREAM_INPUT) { + cachedJavaClass = JC_FS_DATA_INPUT_STREAM; + } else { + cachedJavaClass = JC_FS_DATA_OUTPUT_STREAM; + } - jthr = invokeMethod(env, NULL, INSTANCE, file->file, interface, - "close", "()V"); + jthr = invokeMethod(env, NULL, INSTANCE, file->file, + cachedJavaClass, "close", "()V"); if (jthr) { interfaceShortName = (file->type == HDFS_STREAM_INPUT) ? "FSDataInputStream" : "FSDataOutputStream"; @@ -1358,7 +1423,7 @@ int hdfsExists(hdfsFS fs, const char *path) "hdfsExists: constructNewObjectOfPath"); return -1; } - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, "exists", JMETHOD1(JPARAM(HADOOP_PATH), "Z"), jPath); destroyLocalReference(env, jPath); if (jthr) { @@ -1397,11 +1462,17 @@ static int readPrepare(JNIEnv* env, hdfsFS fs, hdfsFile f, return 0; } +/** + * If the underlying stream supports the ByteBufferReadable interface then + * this method will transparently use read(ByteBuffer). This can help + * improve performance as it avoids unnecessarily copying data on to the Java + * heap. Instead the data will be directly copied from kernel space to the C + * heap. 
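The probe read removed above is replaced by a capability check at open time, but callers of the read path are unchanged; a hedged sketch of a plain read loop (illustrative only, not part of the diff):

    #include <fcntl.h>
    #include "hdfs/hdfs.h"

    /* Read a file sequentially; whether the direct ByteBuffer path is used
     * is decided at open time from the "in:readbytebuffer" capability. */
    int read_whole_file(hdfsFS fs, const char *path)
    {
        char buf[4096];
        tSize n;
        hdfsFile f = hdfsOpenFile(fs, path, O_RDONLY, 0, 0, 0);
        if (!f) {
            return -1;
        }
        /* hdfsRead returns the number of bytes read, 0 at EOF, -1 on error. */
        while ((n = hdfsRead(fs, f, buf, sizeof(buf))) > 0) {
            /* consume n bytes from buf here */
        }
        hdfsCloseFile(fs, f);
        return (n < 0) ? -1 : 0;
    }
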
+ */ tSize hdfsRead(hdfsFS fs, hdfsFile f, void* buffer, tSize length) { jobject jInputStream; jbyteArray jbRarray; - jint noReadBytes = length; jvalue jVal; jthrowable jthr; JNIEnv* env; @@ -1440,8 +1511,8 @@ tSize hdfsRead(hdfsFS fs, hdfsFile f, void* buffer, tSize length) return -1; } - jthr = invokeMethod(env, &jVal, INSTANCE, jInputStream, HADOOP_ISTRM, - "read", "([B)I", jbRarray); + jthr = invokeMethod(env, &jVal, INSTANCE, jInputStream, + JC_FS_DATA_INPUT_STREAM, "read", "([B)I", jbRarray); if (jthr) { destroyLocalReference(env, jbRarray); errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -1457,7 +1528,12 @@ tSize hdfsRead(hdfsFS fs, hdfsFile f, void* buffer, tSize length) errno = EINTR; return -1; } - (*env)->GetByteArrayRegion(env, jbRarray, 0, noReadBytes, buffer); + // We only copy the portion of the jbRarray that was actually filled by + // the call to FsDataInputStream#read; #read is not guaranteed to fill the + // entire buffer, instead it returns the number of bytes read into the + // buffer; we use the return value as the input in GetByteArrayRegion to + // ensure don't copy more bytes than necessary + (*env)->GetByteArrayRegion(env, jbRarray, 0, jVal.i, buffer); destroyLocalReference(env, jbRarray); if ((*env)->ExceptionCheck(env)) { errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL, @@ -1467,12 +1543,11 @@ tSize hdfsRead(hdfsFS fs, hdfsFile f, void* buffer, tSize length) return jVal.i; } -// Reads using the read(ByteBuffer) API, which does fewer copies tSize readDirect(hdfsFS fs, hdfsFile f, void* buffer, tSize length) { // JAVA EQUIVALENT: - // ByteBuffer bbuffer = ByteBuffer.allocateDirect(length) // wraps C buffer - // fis.read(bbuffer); + // ByteBuffer buf = ByteBuffer.allocateDirect(length) // wraps C buffer + // fis.read(buf); jobject jInputStream; jvalue jVal; @@ -1499,16 +1574,33 @@ tSize readDirect(hdfsFS fs, hdfsFile f, void* buffer, tSize length) } jthr = invokeMethod(env, &jVal, INSTANCE, jInputStream, - HADOOP_ISTRM, "read", "(Ljava/nio/ByteBuffer;)I", bb); + JC_FS_DATA_INPUT_STREAM, "read", + "(Ljava/nio/ByteBuffer;)I", bb); destroyLocalReference(env, bb); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "readDirect: FSDataInputStream#read"); return -1; } - return (jVal.i < 0) ? 0 : jVal.i; + // Reached EOF, return 0 + if (jVal.i < 0) { + return 0; + } + // 0 bytes read, return error + if (jVal.i == 0) { + errno = EINTR; + return -1; + } + return jVal.i; } +/** + * If the underlying stream supports the ByteBufferPositionedReadable + * interface then this method will transparently use read(long, ByteBuffer). + * This can help improve performance as it avoids unnecessarily copying data + * on to the Java heap. Instead the data will be directly copied from kernel + * space to the C heap. 
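Because hdfsPread, like the Java read it wraps, may return fewer bytes than requested, callers that need an exact byte count loop over it (or use hdfsPreadFully, added later in this patch); a hedged sketch, not part of the diff:

    #include "hdfs/hdfs.h"

    /* Positional read of exactly len bytes, retrying on short reads. */
    int pread_exact(hdfsFS fs, hdfsFile f, tOffset pos, char *buf, tSize len)
    {
        tSize total = 0;
        while (total < len) {
            tSize n = hdfsPread(fs, f, pos + total, buf + total, len - total);
            if (n <= 0) {
                return -1; /* error, or EOF before len bytes were available */
            }
            total += n;
        }
        return 0;
    }
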
+ */ tSize hdfsPread(hdfsFS fs, hdfsFile f, tOffset position, void* buffer, tSize length) { @@ -1528,6 +1620,10 @@ tSize hdfsPread(hdfsFS fs, hdfsFile f, tOffset position, return -1; } + if (f->flags & HDFS_FILE_SUPPORTS_DIRECT_PREAD) { + return preadDirect(fs, f, position, buffer, length); + } + env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; @@ -1550,8 +1646,10 @@ tSize hdfsPread(hdfsFS fs, hdfsFile f, tOffset position, "hdfsPread: NewByteArray"); return -1; } - jthr = invokeMethod(env, &jVal, INSTANCE, f->file, HADOOP_ISTRM, - "read", "(J[BII)I", position, jbRarray, 0, length); + + jthr = invokeMethod(env, &jVal, INSTANCE, f->file, + JC_FS_DATA_INPUT_STREAM, "read", "(J[BII)I", position, + jbRarray, 0, length); if (jthr) { destroyLocalReference(env, jbRarray); errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -1577,6 +1675,173 @@ tSize hdfsPread(hdfsFS fs, hdfsFile f, tOffset position, return jVal.i; } +tSize preadDirect(hdfsFS fs, hdfsFile f, tOffset position, void* buffer, + tSize length) +{ + // JAVA EQUIVALENT: + // ByteBuffer buf = ByteBuffer.allocateDirect(length) // wraps C buffer + // fis.read(position, buf); + + jvalue jVal; + jthrowable jthr; + jobject bb; + + //Get the JNIEnv* corresponding to current thread + JNIEnv* env = getJNIEnv(); + if (env == NULL) { + errno = EINTERNAL; + return -1; + } + + //Error checking... make sure that this file is 'readable' + if (f->type != HDFS_STREAM_INPUT) { + fprintf(stderr, "Cannot read from a non-InputStream object!\n"); + errno = EINVAL; + return -1; + } + + //Read the requisite bytes + bb = (*env)->NewDirectByteBuffer(env, buffer, length); + if (bb == NULL) { + errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL, + "readDirect: NewDirectByteBuffer"); + return -1; + } + + jthr = invokeMethod(env, &jVal, INSTANCE, f->file, + JC_FS_DATA_INPUT_STREAM, "read", "(JLjava/nio/ByteBuffer;)I", + position, bb); + destroyLocalReference(env, bb); + if (jthr) { + errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "preadDirect: FSDataInputStream#read"); + return -1; + } + // Reached EOF, return 0 + if (jVal.i < 0) { + return 0; + } + // 0 bytes read, return error + if (jVal.i == 0) { + errno = EINTR; + return -1; + } + return jVal.i; +} + +/** + * Like hdfsPread, if the underlying stream supports the + * ByteBufferPositionedReadable interface then this method will transparently + * use readFully(long, ByteBuffer). + */ +int hdfsPreadFully(hdfsFS fs, hdfsFile f, tOffset position, + void* buffer, tSize length) { + JNIEnv* env; + jbyteArray jbRarray; + jthrowable jthr; + + if (length == 0) { + return 0; + } else if (length < 0) { + errno = EINVAL; + return -1; + } + if (!f || f->type == HDFS_STREAM_UNINITIALIZED) { + errno = EBADF; + return -1; + } + + if (f->flags & HDFS_FILE_SUPPORTS_DIRECT_PREAD) { + return preadFullyDirect(fs, f, position, buffer, length); + } + + env = getJNIEnv(); + if (env == NULL) { + errno = EINTERNAL; + return -1; + } + + //Error checking... 
make sure that this file is 'readable' + if (f->type != HDFS_STREAM_INPUT) { + fprintf(stderr, "Cannot read from a non-InputStream object!\n"); + errno = EINVAL; + return -1; + } + + // JAVA EQUIVALENT: + // byte [] bR = new byte[length]; + // fis.read(pos, bR, 0, length); + jbRarray = (*env)->NewByteArray(env, length); + if (!jbRarray) { + errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL, + "hdfsPread: NewByteArray"); + return -1; + } + + jthr = invokeMethod(env, NULL, INSTANCE, f->file, + JC_FS_DATA_INPUT_STREAM, "readFully", "(J[BII)V", + position, jbRarray, 0, length); + if (jthr) { + destroyLocalReference(env, jbRarray); + errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsPread: FSDataInputStream#read"); + return -1; + } + + (*env)->GetByteArrayRegion(env, jbRarray, 0, length, buffer); + destroyLocalReference(env, jbRarray); + if ((*env)->ExceptionCheck(env)) { + errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL, + "hdfsPread: GetByteArrayRegion"); + return -1; + } + return 0; +} + +int preadFullyDirect(hdfsFS fs, hdfsFile f, tOffset position, void* buffer, + tSize length) +{ + // JAVA EQUIVALENT: + // ByteBuffer buf = ByteBuffer.allocateDirect(length) // wraps C buffer + // fis.read(position, buf); + + jthrowable jthr; + jobject bb; + + //Get the JNIEnv* corresponding to current thread + JNIEnv* env = getJNIEnv(); + if (env == NULL) { + errno = EINTERNAL; + return -1; + } + + //Error checking... make sure that this file is 'readable' + if (f->type != HDFS_STREAM_INPUT) { + fprintf(stderr, "Cannot read from a non-InputStream object!\n"); + errno = EINVAL; + return -1; + } + + //Read the requisite bytes + bb = (*env)->NewDirectByteBuffer(env, buffer, length); + if (bb == NULL) { + errno = printPendingExceptionAndFree(env, PRINT_EXC_ALL, + "readDirect: NewDirectByteBuffer"); + return -1; + } + + jthr = invokeMethod(env, NULL, INSTANCE, f->file, + JC_FS_DATA_INPUT_STREAM, "readFully", + "(JLjava/nio/ByteBuffer;)V", position, bb); + destroyLocalReference(env, bb); + if (jthr) { + errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "preadDirect: FSDataInputStream#read"); + return -1; + } + return 0; +} + tSize hdfsWrite(hdfsFS fs, hdfsFile f, const void* buffer, tSize length) { // JAVA EQUIVALENT @@ -1636,7 +1901,8 @@ tSize hdfsWrite(hdfsFS fs, hdfsFile f, const void* buffer, tSize length) return -1; } jthr = invokeMethod(env, NULL, INSTANCE, jOutputStream, - HADOOP_OSTRM, "write", "([B)V", jbWarray); + JC_FS_DATA_OUTPUT_STREAM, "write", "([B)V", + jbWarray); destroyLocalReference(env, jbWarray); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -1671,7 +1937,7 @@ int hdfsSeek(hdfsFS fs, hdfsFile f, tOffset desiredPos) jInputStream = f->file; jthr = invokeMethod(env, NULL, INSTANCE, jInputStream, - HADOOP_ISTRM, "seek", "(J)V", desiredPos); + JC_FS_DATA_INPUT_STREAM, "seek", "(J)V", desiredPos); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsSeek(desiredPos=%" PRId64 ")" @@ -1681,15 +1947,13 @@ int hdfsSeek(hdfsFS fs, hdfsFile f, tOffset desiredPos) return 0; } - - tOffset hdfsTell(hdfsFS fs, hdfsFile f) { // JAVA EQUIVALENT // pos = f.getPos(); jobject jStream; - const char *interface; + CachedJavaClass cachedJavaClass; jvalue jVal; jthrowable jthr; @@ -1708,10 +1972,13 @@ tOffset hdfsTell(hdfsFS fs, hdfsFile f) //Parameters jStream = f->file; - interface = (f->type == HDFS_STREAM_INPUT) ? 
- HADOOP_ISTRM : HADOOP_OSTRM; + if (f->type == HDFS_STREAM_INPUT) { + cachedJavaClass = JC_FS_DATA_INPUT_STREAM; + } else { + cachedJavaClass = JC_FS_DATA_OUTPUT_STREAM; + } jthr = invokeMethod(env, &jVal, INSTANCE, jStream, - interface, "getPos", "()J"); + cachedJavaClass, "getPos", "()J"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsTell: %s#getPos", @@ -1742,7 +2009,7 @@ int hdfsFlush(hdfsFS fs, hdfsFile f) return -1; } jthr = invokeMethod(env, NULL, INSTANCE, f->file, - HADOOP_OSTRM, "flush", "()V"); + JC_FS_DATA_OUTPUT_STREAM, "flush", "()V"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsFlush: FSDataInputStream#flush"); @@ -1771,7 +2038,7 @@ int hdfsHFlush(hdfsFS fs, hdfsFile f) jOutputStream = f->file; jthr = invokeMethod(env, NULL, INSTANCE, jOutputStream, - HADOOP_OSTRM, "hflush", "()V"); + JC_FS_DATA_OUTPUT_STREAM, "hflush", "()V"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsHFlush: FSDataOutputStream#hflush"); @@ -1800,7 +2067,7 @@ int hdfsHSync(hdfsFS fs, hdfsFile f) jOutputStream = f->file; jthr = invokeMethod(env, NULL, INSTANCE, jOutputStream, - HADOOP_OSTRM, "hsync", "()V"); + JC_FS_DATA_OUTPUT_STREAM, "hsync", "()V"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsHSync: FSDataOutputStream#hsync"); @@ -1834,7 +2101,7 @@ int hdfsAvailable(hdfsFS fs, hdfsFile f) //Parameters jInputStream = f->file; jthr = invokeMethod(env, &jVal, INSTANCE, jInputStream, - HADOOP_ISTRM, "available", "()I"); + JC_FS_DATA_INPUT_STREAM, "available", "()I"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsAvailable: FSDataInputStream#available"); @@ -1879,8 +2146,8 @@ static int hdfsCopyImpl(hdfsFS srcFS, const char *src, hdfsFS dstFS, } //Create the org.apache.hadoop.conf.Configuration object - jthr = constructNewObjectOfClass(env, &jConfiguration, - HADOOP_CONF, "()V"); + jthr = constructNewObjectOfCachedClass(env, &jConfiguration, + JC_CONFIGURATION, "()V"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsCopyImpl: Configuration constructor"); @@ -1888,8 +2155,8 @@ static int hdfsCopyImpl(hdfsFS srcFS, const char *src, hdfsFS dstFS, } //FileUtil#copy - jthr = invokeMethod(env, &jVal, STATIC, - NULL, "org/apache/hadoop/fs/FileUtil", "copy", + jthr = invokeMethod(env, &jVal, STATIC, NULL, JC_FILE_UTIL, + "copy", "(Lorg/apache/hadoop/fs/FileSystem;Lorg/apache/hadoop/fs/Path;" "Lorg/apache/hadoop/fs/FileSystem;Lorg/apache/hadoop/fs/Path;" "ZLorg/apache/hadoop/conf/Configuration;)Z", @@ -1955,9 +2222,9 @@ int hdfsDelete(hdfsFS fs, const char *path, int recursive) return -1; } jRecursive = recursive ? JNI_TRUE : JNI_FALSE; - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, - "delete", "(Lorg/apache/hadoop/fs/Path;Z)Z", - jPath, jRecursive); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "delete", "(Lorg/apache/hadoop/fs/Path;Z)Z", jPath, + jRecursive); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -2009,9 +2276,9 @@ int hdfsRename(hdfsFS fs, const char *oldPath, const char *newPath) // Rename the file // TODO: use rename2 here? 
(See HDFS-3592) - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, "rename", - JMETHOD2(JPARAM(HADOOP_PATH), JPARAM(HADOOP_PATH), "Z"), - jOldPath, jNewPath); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "rename", JMETHOD2(JPARAM(HADOOP_PATH), JPARAM + (HADOOP_PATH), "Z"), jOldPath, jNewPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsRename(oldPath=%s, newPath=%s): FileSystem#rename", @@ -2054,9 +2321,8 @@ char* hdfsGetWorkingDirectory(hdfsFS fs, char* buffer, size_t bufferSize) } //FileSystem#getWorkingDirectory() - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, - HADOOP_FS, "getWorkingDirectory", - "()Lorg/apache/hadoop/fs/Path;"); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "getWorkingDirectory", "()Lorg/apache/hadoop/fs/Path;"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetWorkingDirectory: FileSystem#getWorkingDirectory"); @@ -2071,9 +2337,8 @@ char* hdfsGetWorkingDirectory(hdfsFS fs, char* buffer, size_t bufferSize) } //Path#toString() - jthr = invokeMethod(env, &jVal, INSTANCE, jPath, - "org/apache/hadoop/fs/Path", "toString", - "()Ljava/lang/String;"); + jthr = invokeMethod(env, &jVal, INSTANCE, jPath, JC_PATH, "toString", + "()Ljava/lang/String;"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetWorkingDirectory: Path#toString"); @@ -2137,9 +2402,9 @@ int hdfsSetWorkingDirectory(hdfsFS fs, const char *path) } //FileSystem#setWorkingDirectory() - jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS, - "setWorkingDirectory", - "(Lorg/apache/hadoop/fs/Path;)V", jPath); + jthr = invokeMethod(env, NULL, INSTANCE, jFS, JC_FILE_SYSTEM, + "setWorkingDirectory", "(Lorg/apache/hadoop/fs/Path;)V", + jPath); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, NOPRINT_EXC_ILLEGAL_ARGUMENT, @@ -2179,9 +2444,8 @@ int hdfsCreateDirectory(hdfsFS fs, const char *path) //Create the directory jVal.z = 0; - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, - "mkdirs", "(Lorg/apache/hadoop/fs/Path;)Z", - jPath); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "mkdirs", "(Lorg/apache/hadoop/fs/Path;)Z", jPath); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, @@ -2229,9 +2493,9 @@ int hdfsSetReplication(hdfsFS fs, const char *path, int16_t replication) } //Create the directory - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, - "setReplication", "(Lorg/apache/hadoop/fs/Path;S)Z", - jPath, replication); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "setReplication", "(Lorg/apache/hadoop/fs/Path;S)Z", + jPath, replication); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -2292,8 +2556,8 @@ int hdfsChown(hdfsFS fs, const char *path, const char *owner, const char *group) } //Create the directory - jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS, - "setOwner", JMETHOD3(JPARAM(HADOOP_PATH), + jthr = invokeMethod(env, NULL, INSTANCE, jFS, JC_FILE_SYSTEM, + "setOwner", JMETHOD3(JPARAM(HADOOP_PATH), JPARAM(JAVA_STRING), JPARAM(JAVA_STRING), JAVA_VOID), jPath, jOwner, jGroup); if (jthr) { @@ -2337,12 +2601,12 @@ int hdfsChmod(hdfsFS fs, const char *path, short mode) } // construct jPerm = FsPermission.createImmutable(short mode); - jthr = constructNewObjectOfClass(env, &jPermObj, - HADOOP_FSPERM,"(S)V",jmode); + jthr = constructNewObjectOfCachedClass(env, &jPermObj, 
JC_FS_PERMISSION, + "(S)V",jmode); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, - "constructNewObjectOfClass(%s)", HADOOP_FSPERM); - return -1; + "constructNewObjectOfCachedClass(%s)", HADOOP_FSPERM); + goto done; } //Create an object of org.apache.hadoop.fs.Path @@ -2354,10 +2618,9 @@ int hdfsChmod(hdfsFS fs, const char *path, short mode) } //Create the directory - jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS, - "setPermission", - JMETHOD2(JPARAM(HADOOP_PATH), JPARAM(HADOOP_FSPERM), JAVA_VOID), - jPath, jPermObj); + jthr = invokeMethod(env, NULL, INSTANCE, jFS, JC_FILE_SYSTEM, + "setPermission", JMETHOD2(JPARAM(HADOOP_PATH), + JPARAM(HADOOP_FSPERM), JAVA_VOID), jPath, jPermObj); if (jthr) { ret = printExceptionAndFree(env, jthr, NOPRINT_EXC_ACCESS_CONTROL | NOPRINT_EXC_FILE_NOT_FOUND | @@ -2407,9 +2670,9 @@ int hdfsUtime(hdfsFS fs, const char *path, tTime mtime, tTime atime) jmtime = (mtime == NO_CHANGE) ? -1 : (mtime * (jlong)1000); jatime = (atime == NO_CHANGE) ? -1 : (atime * (jlong)1000); - jthr = invokeMethod(env, NULL, INSTANCE, jFS, HADOOP_FS, - "setTimes", JMETHOD3(JPARAM(HADOOP_PATH), "J", "J", JAVA_VOID), - jPath, jmtime, jatime); + jthr = invokeMethod(env, NULL, INSTANCE, jFS, JC_FILE_SYSTEM, + "setTimes", JMETHOD3(JPARAM(HADOOP_PATH), "J", "J", + JAVA_VOID), jPath, jmtime, jatime); destroyLocalReference(env, jPath); if (jthr) { errno = printExceptionAndFree(env, jthr, @@ -2485,6 +2748,8 @@ int hadoopRzOptionsSetByteBufferPool( JNIEnv *env; jthrowable jthr; jobject byteBufferPool = NULL; + jobject globalByteBufferPool = NULL; + int ret; env = getJNIEnv(); if (!env) { @@ -2501,15 +2766,37 @@ int hadoopRzOptionsSetByteBufferPool( if (jthr) { printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hadoopRzOptionsSetByteBufferPool(className=%s): ", className); - errno = EINVAL; - return -1; + ret = EINVAL; + goto done; } - } - if (opts->byteBufferPool) { - // Delete any previous ByteBufferPool we had. + // Only set opts->byteBufferPool if creating a global reference is + // successful + globalByteBufferPool = (*env)->NewGlobalRef(env, byteBufferPool); + if (!globalByteBufferPool) { + printPendingExceptionAndFree(env, PRINT_EXC_ALL, + "hadoopRzOptionsSetByteBufferPool(className=%s): ", + className); + ret = EINVAL; + goto done; + } + // Delete any previous ByteBufferPool we had before setting a new one. + if (opts->byteBufferPool) { + (*env)->DeleteGlobalRef(env, opts->byteBufferPool); + } + opts->byteBufferPool = globalByteBufferPool; + } else if (opts->byteBufferPool) { + // If the specified className is NULL, delete any previous + // ByteBufferPool we had. 
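For context, a typical zero-copy read sequence around hadoopRzOptionsSetByteBufferPool looks roughly like the sketch below (illustrative only, not part of the diff; it assumes the hadoopRzOptionsAlloc/hadoopReadZero/hadoopRzBuffer* declarations and the ELASTIC_BYTE_BUFFER_POOL_CLASS constant from hdfs.h, none of which appear in this hunk). The fix above keeps the pool object valid across these calls by holding it through a JNI global reference.

    #include <string.h>
    #include "hdfs/hdfs.h"

    /* Zero-copy read of up to maxLength bytes into out; returns bytes read,
     * or -1 on error. */
    int zero_copy_read(hdfsFile f, char *out, int32_t maxLength)
    {
        int nread = -1;
        struct hadoopRzOptions *opts = hadoopRzOptionsAlloc();
        if (!opts) {
            return -1;
        }
        if (hadoopRzOptionsSetByteBufferPool(opts, ELASTIC_BYTE_BUFFER_POOL_CLASS)) {
            hadoopRzOptionsFree(opts);
            return -1;
        }
        struct hadoopRzBuffer *rzbuf = hadoopReadZero(f, opts, maxLength);
        if (rzbuf) {
            nread = (int) hadoopRzBufferLength(rzbuf);
            if (nread > 0) {
                memcpy(out, hadoopRzBufferGet(rzbuf), (size_t) nread);
            }
            hadoopRzBufferFree(f, rzbuf);
        }
        hadoopRzOptionsFree(opts);
        return nread;
    }
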
(*env)->DeleteGlobalRef(env, opts->byteBufferPool); + opts->byteBufferPool = NULL; + } + ret = 0; +done: + destroyLocalReference(env, byteBufferPool); + if (ret) { + errno = ret; + return -1; } - opts->byteBufferPool = byteBufferPool; return 0; } @@ -2549,28 +2836,28 @@ static jthrowable hadoopRzOptionsGetEnumSet(JNIEnv *env, goto done; } if (opts->skipChecksums) { - jthr = fetchEnumInstance(env, READ_OPTION, + jthr = fetchEnumInstance(env, HADOOP_RO, "SKIP_CHECKSUMS", &enumInst); if (jthr) { goto done; } - jthr = invokeMethod(env, &jVal, STATIC, NULL, - "java/util/EnumSet", "of", - "(Ljava/lang/Enum;)Ljava/util/EnumSet;", enumInst); + jthr = invokeMethod(env, &jVal, STATIC, NULL, JC_ENUM_SET, + "of", "(Ljava/lang/Enum;)Ljava/util/EnumSet;", enumInst); if (jthr) { goto done; } enumSetObj = jVal.l; } else { - jclass clazz = (*env)->FindClass(env, READ_OPTION); + jclass clazz = (*env)->FindClass(env, HADOOP_RO); if (!clazz) { - jthr = newRuntimeError(env, "failed " - "to find class for %s", READ_OPTION); + jthr = getPendingExceptionAndClear(env); + goto done; + } + jthr = invokeMethod(env, &jVal, STATIC, NULL, JC_ENUM_SET, + "noneOf", "(Ljava/lang/Class;)Ljava/util/EnumSet;", clazz); + if (jthr) { goto done; } - jthr = invokeMethod(env, &jVal, STATIC, NULL, - "java/util/EnumSet", "noneOf", - "(Ljava/lang/Class;)Ljava/util/EnumSet;", clazz); enumSetObj = jVal.l; } // create global ref @@ -2599,7 +2886,7 @@ static int hadoopReadZeroExtractBuffer(JNIEnv *env, jarray array = NULL; jthr = invokeMethod(env, &jVal, INSTANCE, buffer->byteBuffer, - "java/nio/ByteBuffer", "remaining", "()I"); + JC_BYTE_BUFFER, "remaining", "()I"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hadoopReadZeroExtractBuffer: ByteBuffer#remaining failed: "); @@ -2607,7 +2894,7 @@ static int hadoopReadZeroExtractBuffer(JNIEnv *env, } buffer->length = jVal.i; jthr = invokeMethod(env, &jVal, INSTANCE, buffer->byteBuffer, - "java/nio/ByteBuffer", "position", "()I"); + JC_BYTE_BUFFER, "position", "()I"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hadoopReadZeroExtractBuffer: ByteBuffer#position failed: "); @@ -2638,7 +2925,7 @@ static int hadoopReadZeroExtractBuffer(JNIEnv *env, } // Get the backing array object of this buffer. 
jthr = invokeMethod(env, &jVal, INSTANCE, buffer->byteBuffer, - "java/nio/ByteBuffer", "array", "()[B"); + JC_BYTE_BUFFER, "array", "()[B"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hadoopReadZeroExtractBuffer: ByteBuffer#array failed: "); @@ -2691,6 +2978,7 @@ static int translateZCRException(JNIEnv *env, jthrowable exc) } if (!strcmp(className, "java.lang.UnsupportedOperationException")) { ret = EPROTONOSUPPORT; + destroyLocalReference(env, exc); goto done; } ret = printExceptionAndFree(env, exc, PRINT_EXC_ALL, @@ -2731,9 +3019,10 @@ struct hadoopRzBuffer* hadoopReadZero(hdfsFile file, "hadoopReadZero: hadoopRzOptionsGetEnumSet failed: "); goto done; } - jthr = invokeMethod(env, &jVal, INSTANCE, file->file, HADOOP_ISTRM, "read", - "(Lorg/apache/hadoop/io/ByteBufferPool;ILjava/util/EnumSet;)" - "Ljava/nio/ByteBuffer;", opts->byteBufferPool, maxLength, enumSet); + jthr = invokeMethod(env, &jVal, INSTANCE, file->file, + JC_FS_DATA_INPUT_STREAM, "read", + "(Lorg/apache/hadoop/io/ByteBufferPool;ILjava/util/EnumSet;)" + "Ljava/nio/ByteBuffer;", opts->byteBufferPool, maxLength, enumSet); if (jthr) { ret = translateZCRException(env, jthr); goto done; @@ -2796,8 +3085,8 @@ void hadoopRzBufferFree(hdfsFile file, struct hadoopRzBuffer *buffer) } if (buffer->byteBuffer) { jthr = invokeMethod(env, &jVal, INSTANCE, file->file, - HADOOP_ISTRM, "releaseBuffer", - "(Ljava/nio/ByteBuffer;)V", buffer->byteBuffer); + JC_FS_DATA_INPUT_STREAM, "releaseBuffer", + "(Ljava/nio/ByteBuffer;)V", buffer->byteBuffer); if (jthr) { printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hadoopRzBufferFree: releaseBuffer failed: "); @@ -2846,8 +3135,8 @@ hdfsGetHosts(hdfsFS fs, const char *path, tOffset start, tOffset length) "hdfsGetHosts(path=%s): constructNewObjectOfPath", path); goto done; } - jthr = invokeMethod(env, &jFSVal, INSTANCE, jFS, - HADOOP_FS, "getFileStatus", "(Lorg/apache/hadoop/fs/Path;)" + jthr = invokeMethod(env, &jFSVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "getFileStatus", "(Lorg/apache/hadoop/fs/Path;)" "Lorg/apache/hadoop/fs/FileStatus;", jPath); if (jthr) { ret = printExceptionAndFree(env, jthr, NOPRINT_EXC_FILE_NOT_FOUND, @@ -2859,11 +3148,11 @@ hdfsGetHosts(hdfsFS fs, const char *path, tOffset start, tOffset length) jFileStatus = jFSVal.l; //org.apache.hadoop.fs.FileSystem#getFileBlockLocations - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, - HADOOP_FS, "getFileBlockLocations", - "(Lorg/apache/hadoop/fs/FileStatus;JJ)" - "[Lorg/apache/hadoop/fs/BlockLocation;", - jFileStatus, start, length); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "getFileBlockLocations", + "(Lorg/apache/hadoop/fs/FileStatus;JJ)" + "[Lorg/apache/hadoop/fs/BlockLocation;", jFileStatus, start, + length); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):" @@ -2890,15 +3179,17 @@ hdfsGetHosts(hdfsFS fs, const char *path, tOffset start, tOffset length) for (i = 0; i < jNumFileBlocks; ++i) { jFileBlock = (*env)->GetObjectArrayElement(env, jBlockLocations, i); - if (!jFileBlock) { - ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, + jthr = (*env)->ExceptionOccurred(env); + if (jthr || !jFileBlock) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):" "GetObjectArrayElement(%d)", path, start, length, i); goto done; } - jthr = invokeMethod(env, &jVal, INSTANCE, jFileBlock, HADOOP_BLK_LOC, - "getHosts", "()[Ljava/lang/String;"); + jthr = 
invokeMethod(env, &jVal, INSTANCE, jFileBlock, + JC_BLOCK_LOCATION, "getHosts", + "()[Ljava/lang/String;"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"):" @@ -2924,8 +3215,9 @@ hdfsGetHosts(hdfsFS fs, const char *path, tOffset start, tOffset length) //Now parse each hostname for (j = 0; j < jNumBlockHosts; ++j) { jHost = (*env)->GetObjectArrayElement(env, jFileBlockHosts, j); - if (!jHost) { - ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, + jthr = (*env)->ExceptionOccurred(env); + if (jthr || !jHost) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetHosts(path=%s, start=%"PRId64", length=%"PRId64"): " "NewByteArray", path, start, length); goto done; @@ -3002,8 +3294,8 @@ tOffset hdfsGetDefaultBlockSize(hdfsFS fs) } //FileSystem#getDefaultBlockSize() - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, - "getDefaultBlockSize", "()J"); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "getDefaultBlockSize", "()J"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetDefaultBlockSize: FileSystem#getDefaultBlockSize"); @@ -3066,16 +3358,16 @@ tOffset hdfsGetCapacity(hdfsFS fs) } //FileSystem#getStatus - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, - "getStatus", "()Lorg/apache/hadoop/fs/FsStatus;"); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "getStatus", "()Lorg/apache/hadoop/fs/FsStatus;"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetCapacity: FileSystem#getStatus"); return -1; } fss = (jobject)jVal.l; - jthr = invokeMethod(env, &jVal, INSTANCE, fss, HADOOP_FSSTATUS, - "getCapacity", "()J"); + jthr = invokeMethod(env, &jVal, INSTANCE, fss, + JC_FS_STATUS, "getCapacity", "()J"); destroyLocalReference(env, fss); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -3106,16 +3398,16 @@ tOffset hdfsGetUsed(hdfsFS fs) } //FileSystem#getStatus - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, - "getStatus", "()Lorg/apache/hadoop/fs/FsStatus;"); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "getStatus", "()Lorg/apache/hadoop/fs/FsStatus;"); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsGetUsed: FileSystem#getStatus"); return -1; } fss = (jobject)jVal.l; - jthr = invokeMethod(env, &jVal, INSTANCE, fss, HADOOP_FSSTATUS, - "getUsed", "()J"); + jthr = invokeMethod(env, &jVal, INSTANCE, fss, JC_FS_STATUS, + HADOOP_FSSTATUS,"getUsed", "()J"); destroyLocalReference(env, fss); if (jthr) { errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -3173,46 +3465,46 @@ getFileInfoFromStat(JNIEnv *env, jobject jStat, hdfsFileInfo *fileInfo) struct hdfsExtendedFileInfo *extInfo; size_t extOffset; - jthr = invokeMethod(env, &jVal, INSTANCE, jStat, - HADOOP_STAT, "isDir", "()Z"); + jthr = invokeMethod(env, &jVal, INSTANCE, jStat, JC_FILE_STATUS, "isDir", + "()Z"); if (jthr) goto done; fileInfo->mKind = jVal.z ? 
kObjectKindDirectory : kObjectKindFile; - jthr = invokeMethod(env, &jVal, INSTANCE, jStat, - HADOOP_STAT, "getReplication", "()S"); + jthr = invokeMethod(env, &jVal, INSTANCE, jStat, JC_FILE_STATUS, + "getReplication", "()S"); if (jthr) goto done; fileInfo->mReplication = jVal.s; - jthr = invokeMethod(env, &jVal, INSTANCE, jStat, - HADOOP_STAT, "getBlockSize", "()J"); + jthr = invokeMethod(env, &jVal, INSTANCE, jStat, JC_FILE_STATUS, + "getBlockSize", "()J"); if (jthr) goto done; fileInfo->mBlockSize = jVal.j; - jthr = invokeMethod(env, &jVal, INSTANCE, jStat, - HADOOP_STAT, "getModificationTime", "()J"); + jthr = invokeMethod(env, &jVal, INSTANCE, jStat, JC_FILE_STATUS, + "getModificationTime", "()J"); if (jthr) goto done; fileInfo->mLastMod = jVal.j / 1000; - jthr = invokeMethod(env, &jVal, INSTANCE, jStat, - HADOOP_STAT, "getAccessTime", "()J"); + jthr = invokeMethod(env, &jVal, INSTANCE, jStat, JC_FILE_STATUS, + "getAccessTime", "()J"); if (jthr) goto done; fileInfo->mLastAccess = (tTime) (jVal.j / 1000); if (fileInfo->mKind == kObjectKindFile) { - jthr = invokeMethod(env, &jVal, INSTANCE, jStat, - HADOOP_STAT, "getLen", "()J"); + jthr = invokeMethod(env, &jVal, INSTANCE, jStat, JC_FILE_STATUS, + "getLen", "()J"); if (jthr) goto done; fileInfo->mSize = jVal.j; } - jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, - "getPath", "()Lorg/apache/hadoop/fs/Path;"); + jthr = invokeMethod(env, &jVal, INSTANCE, jStat, JC_FILE_STATUS, + "getPath", "()Lorg/apache/hadoop/fs/Path;"); if (jthr) goto done; jPath = jVal.l; @@ -3222,8 +3514,8 @@ getFileInfoFromStat(JNIEnv *env, jobject jStat, hdfsFileInfo *fileInfo) goto done; } - jthr = invokeMethod(env, &jVal, INSTANCE, jPath, HADOOP_PATH, - "toString", "()Ljava/lang/String;"); + jthr = invokeMethod(env, &jVal, INSTANCE, jPath, JC_PATH, "toString", + "()Ljava/lang/String;"); if (jthr) goto done; jPathName = jVal.l; @@ -3235,8 +3527,8 @@ getFileInfoFromStat(JNIEnv *env, jobject jStat, hdfsFileInfo *fileInfo) } fileInfo->mName = strdup(cPathName); (*env)->ReleaseStringUTFChars(env, jPathName, cPathName); - jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, - "getOwner", "()Ljava/lang/String;"); + jthr = invokeMethod(env, &jVal, INSTANCE, jStat, JC_FILE_STATUS, "getOwner", + "()Ljava/lang/String;"); if (jthr) goto done; jUserName = jVal.l; @@ -3256,16 +3548,16 @@ getFileInfoFromStat(JNIEnv *env, jobject jStat, hdfsFileInfo *fileInfo) (*env)->ReleaseStringUTFChars(env, jUserName, cUserName); extInfo = getExtendedFileInfo(fileInfo); memset(extInfo, 0, sizeof(*extInfo)); - jthr = invokeMethod(env, &jVal, INSTANCE, jStat, - HADOOP_STAT, "isEncrypted", "()Z"); + jthr = invokeMethod(env, &jVal, INSTANCE, jStat, JC_FILE_STATUS, + "isEncrypted", "()Z"); if (jthr) { goto done; } if (jVal.z == JNI_TRUE) { extInfo->flags |= HDFS_EXTENDED_FILE_INFO_ENCRYPTED; } - jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, - "getGroup", "()Ljava/lang/String;"); + jthr = invokeMethod(env, &jVal, INSTANCE, jStat, JC_FILE_STATUS, + "getGroup", "()Ljava/lang/String;"); if (jthr) goto done; jGroupName = jVal.l; @@ -3277,19 +3569,19 @@ getFileInfoFromStat(JNIEnv *env, jobject jStat, hdfsFileInfo *fileInfo) fileInfo->mGroup = strdup(cGroupName); (*env)->ReleaseStringUTFChars(env, jGroupName, cGroupName); - jthr = invokeMethod(env, &jVal, INSTANCE, jStat, HADOOP_STAT, + jthr = invokeMethod(env, &jVal, INSTANCE, jStat, JC_FILE_STATUS, "getPermission", "()Lorg/apache/hadoop/fs/permission/FsPermission;"); if (jthr) goto done; if (jVal.l == NULL) { jthr 
= newRuntimeError(env, "%s#getPermission returned NULL!", - HADOOP_STAT); + HADOOP_FILESTAT); goto done; } jPermission = jVal.l; - jthr = invokeMethod(env, &jVal, INSTANCE, jPermission, HADOOP_FSPERM, - "toShort", "()S"); + jthr = invokeMethod(env, &jVal, INSTANCE, jPermission, + JC_FS_PERMISSION, "toShort", "()S"); if (jthr) goto done; fileInfo->mPermissions = jVal.s; @@ -3303,7 +3595,6 @@ getFileInfoFromStat(JNIEnv *env, jobject jStat, hdfsFileInfo *fileInfo) destroyLocalReference(env, jUserName); destroyLocalReference(env, jGroupName); destroyLocalReference(env, jPermission); - destroyLocalReference(env, jPath); return jthr; } @@ -3323,18 +3614,17 @@ getFileInfo(JNIEnv *env, jobject jFS, jobject jPath, hdfsFileInfo **fileInfo) jvalue jVal; jthrowable jthr; - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_FS, - "exists", JMETHOD1(JPARAM(HADOOP_PATH), "Z"), - jPath); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, "exists", + JMETHOD1(JPARAM(HADOOP_PATH), "Z"), jPath); if (jthr) return jthr; if (jVal.z == 0) { *fileInfo = NULL; return NULL; } - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, - HADOOP_FS, "getFileStatus", - JMETHOD1(JPARAM(HADOOP_PATH), JPARAM(HADOOP_STAT)), jPath); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "getFileStatus", JMETHOD1(JPARAM(HADOOP_PATH), JPARAM + (HADOOP_FILESTAT)), jPath); if (jthr) return jthr; jStat = jVal.l; @@ -3384,9 +3674,9 @@ hdfsFileInfo* hdfsListDirectory(hdfsFS fs, const char *path, int *numEntries) goto done; } - jthr = invokeMethod(env, &jVal, INSTANCE, jFS, HADOOP_DFS, "listStatus", - JMETHOD1(JPARAM(HADOOP_PATH), JARRPARAM(HADOOP_STAT)), - jPath); + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, + JC_DISTRIBUTED_FILE_SYSTEM, "listStatus", + JMETHOD1(JPARAM(HADOOP_PATH), JARRPARAM(HADOOP_FILESTAT)), jPath); if (jthr) { ret = printExceptionAndFree(env, jthr, NOPRINT_EXC_ACCESS_CONTROL | NOPRINT_EXC_FILE_NOT_FOUND | @@ -3413,8 +3703,9 @@ hdfsFileInfo* hdfsListDirectory(hdfsFS fs, const char *path, int *numEntries) //Save path information in pathList for (i=0; i < jPathListSize; ++i) { tmpStat = (*env)->GetObjectArrayElement(env, jPathList, i); - if (!tmpStat) { - ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, + jthr = (*env)->ExceptionOccurred(env); + if (jthr || !tmpStat) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsListDirectory(%s): GetObjectArrayElement(%d out of %d)", path, i, jPathListSize); goto done; diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/include/hdfs/hdfs.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/include/hdfs/hdfs.h index 7e45634d4e02b..e58a6232d205a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/include/hdfs/hdfs.h +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/include/hdfs/hdfs.h @@ -600,7 +600,8 @@ extern "C" { tSize hdfsRead(hdfsFS fs, hdfsFile file, void* buffer, tSize length); /** - * hdfsPread - Positional read of data from an open file. + * hdfsPread - Positional read of data from an open file. Reads up to the + * number of specified bytes in length. * @param fs The configured filesystem handle. * @param file The file handle. * @param position Position from which to read @@ -612,6 +613,24 @@ extern "C" { tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position, void* buffer, tSize length); + /** + * hdfsPreadFully - Positional read of data from an open file. 
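A hedged caller-side sketch of the API documented here (illustrative only, not part of the diff): because hdfsPreadFully either fills the whole buffer or fails, no short-read loop is needed.

    #include <errno.h>
    #include <stdio.h>
    #include "hdfs/hdfs.h"

    /* Read exactly hdrLen bytes from offset 0 into hdr. */
    int read_header(hdfsFS fs, hdfsFile f, char *hdr, tSize hdrLen)
    {
        if (hdfsPreadFully(fs, f, 0, hdr, hdrLen)) {
            /* errno is EINTR if EOF was hit before hdrLen bytes were read */
            fprintf(stderr, "hdfsPreadFully failed, errno=%d\n", errno);
            return -1;
        }
        return 0; /* hdr now holds exactly hdrLen bytes */
    }
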
Reads the + * number of specified bytes in length, or until the end of the data is + * reached. Unlike hdfsRead and hdfsPread, this method does not return + * the number of bytes read because either (1) the entire length of the + * buffer is filled, or (2) the end of the file is reached. If the eof is + * reached, an exception is thrown and errno is set to EINTR. + * @param fs The configured filesystem handle. + * @param file The file handle. + * @param position Position from which to read + * @param buffer The buffer to copy read bytes into. + * @param length The length of the buffer. + * @return Returns 0 on success, -1 on error. + */ + LIBHDFS_EXTERNAL + int hdfsPreadFully(hdfsFS fs, hdfsFile file, tOffset position, + void* buffer, tSize length); + /** * hdfsWrite - Write data into an open file. diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_shim.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_shim.c index 54d4cf651eb9e..bda27b9a43202 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_shim.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_shim.c @@ -317,6 +317,12 @@ tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position, return ret; } +int hdfsPreadFully(hdfsFS fs, hdfsFile file, tOffset position, + void* buffer, tSize length) { + return libhdfs_hdfsPreadFully(fs->libhdfsRep, file->libhdfsRep, position, + buffer, length); +} + tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void* buffer, tSize length) { return libhdfs_hdfsWrite(fs->libhdfsRep, file->libhdfsRep, buffer, length); diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_defines.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_defines.h index b90776893f6b8..0d014341b4c57 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_defines.h +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_defines.h @@ -47,6 +47,7 @@ #define hdfsTell libhdfs_hdfsTell #define hdfsRead libhdfs_hdfsRead #define hdfsPread libhdfs_hdfsPread +#define hdfsPreadFully libhdfs_hdfsPreadFully #define hdfsWrite libhdfs_hdfsWrite #define hdfsFlush libhdfs_hdfsFlush #define hdfsHFlush libhdfs_hdfsHFlush diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_undefs.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_undefs.h index fce0e823ddeb8..d46768c02ad39 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_undefs.h +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_undefs.h @@ -47,6 +47,7 @@ #undef hdfsTell #undef hdfsRead #undef hdfsPread +#undef hdfsPreadFully #undef hdfsWrite #undef hdfsFlush #undef hdfsHFlush diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfspp_wrapper_defines.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfspp_wrapper_defines.h index d0411c2126c88..4b08d0556c3aa 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfspp_wrapper_defines.h +++ 
b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfspp_wrapper_defines.h @@ -47,6 +47,7 @@ #define hdfsTell libhdfspp_hdfsTell #define hdfsRead libhdfspp_hdfsRead #define hdfsPread libhdfspp_hdfsPread +#define hdfsPreadFully libhdfspp_hdfsPreadFully #define hdfsWrite libhdfspp_hdfsWrite #define hdfsFlush libhdfspp_hdfsFlush #define hdfsHFlush libhdfspp_hdfsHFlush diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestByteBufferPread.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestByteBufferPread.java new file mode 100644 index 0000000000000..1c7f1500f3689 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestByteBufferPread.java @@ -0,0 +1,290 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.Random; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +/** + * This class tests the DFS positional read functionality on a single node + * mini-cluster. These tests are inspired from {@link TestPread}. The tests + * are much less comprehensive than other pread tests because pread already + * internally uses {@link ByteBuffer}s. 
+ */ +public class TestByteBufferPread { + + private static MiniDFSCluster cluster; + private static FileSystem fs; + private static byte[] fileContents; + private static Path testFile; + private static Random rand; + + private static final long SEED = 0xDEADBEEFL; + private static final int BLOCK_SIZE = 4096; + private static final int FILE_SIZE = 12 * BLOCK_SIZE; + + @BeforeClass + public static void setup() throws IOException { + // Setup the cluster with a small block size so we can create small files + // that span multiple blocks + Configuration conf = new Configuration(); + conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE); + cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build(); + fs = cluster.getFileSystem(); + + // Create a test file that spans 12 blocks, and contains a bunch of random + // bytes + fileContents = new byte[FILE_SIZE]; + rand = new Random(SEED); + rand.nextBytes(fileContents); + testFile = new Path("/byte-buffer-pread-test.dat"); + try (FSDataOutputStream out = fs.create(testFile, (short) 3)) { + out.write(fileContents); + } + } + + /** + * Test preads with {@link java.nio.HeapByteBuffer}s. + */ + @Test + public void testPreadWithHeapByteBuffer() throws IOException { + testPreadWithByteBuffer(ByteBuffer.allocate(FILE_SIZE)); + testPreadWithFullByteBuffer(ByteBuffer.allocate(FILE_SIZE)); + testPreadWithPositionedByteBuffer(ByteBuffer.allocate(FILE_SIZE)); + testPreadWithLimitedByteBuffer(ByteBuffer.allocate(FILE_SIZE)); + testPositionedPreadWithByteBuffer(ByteBuffer.allocate(FILE_SIZE)); + testPreadFullyWithByteBuffer(ByteBuffer.allocate(FILE_SIZE)); + } + + /** + * Test preads with {@link java.nio.DirectByteBuffer}s. + */ + @Test + public void testPreadWithDirectByteBuffer() throws IOException { + testPreadWithByteBuffer(ByteBuffer.allocateDirect(FILE_SIZE)); + testPreadWithFullByteBuffer(ByteBuffer.allocateDirect(FILE_SIZE)); + testPreadWithPositionedByteBuffer(ByteBuffer.allocateDirect(FILE_SIZE)); + testPreadWithLimitedByteBuffer(ByteBuffer.allocateDirect(FILE_SIZE)); + testPositionedPreadWithByteBuffer(ByteBuffer.allocateDirect(FILE_SIZE)); + testPreadFullyWithByteBuffer(ByteBuffer.allocateDirect(FILE_SIZE)); + } + + /** + * Reads the entire testFile using the pread API and validates that its + * contents are properly loaded into the supplied {@link ByteBuffer}. + */ + private void testPreadWithByteBuffer(ByteBuffer buffer) throws IOException { + int bytesRead; + int totalBytesRead = 0; + try (FSDataInputStream in = fs.open(testFile)) { + while ((bytesRead = in.read(totalBytesRead, buffer)) > 0) { + totalBytesRead += bytesRead; + // Check that each call to read changes the position of the ByteBuffer + // correctly + assertEquals(totalBytesRead, buffer.position()); + } + + // Make sure the buffer is full + assertFalse(buffer.hasRemaining()); + // Make sure the contents of the read buffer equal the contents of the + // file + buffer.position(0); + byte[] bufferContents = new byte[FILE_SIZE]; + buffer.get(bufferContents); + assertArrayEquals(bufferContents, fileContents); + } + } + + /** + * Attempts to read the testFile into a {@link ByteBuffer} that is already + * full, and validates that doing so does not change the contents of the + * supplied {@link ByteBuffer}. 
+ */ + private void testPreadWithFullByteBuffer(ByteBuffer buffer) + throws IOException { + // Load some dummy data into the buffer + byte[] existingBufferBytes = new byte[FILE_SIZE]; + rand.nextBytes(existingBufferBytes); + buffer.put(existingBufferBytes); + // Make sure the buffer is full + assertFalse(buffer.hasRemaining()); + + try (FSDataInputStream in = fs.open(testFile)) { + // Attempt to read into the buffer, 0 bytes should be read since the + // buffer is full + assertEquals(0, in.read(buffer)); + + // Double check the buffer is still full and its contents have not + // changed + assertFalse(buffer.hasRemaining()); + buffer.position(0); + byte[] bufferContents = new byte[FILE_SIZE]; + buffer.get(bufferContents); + assertArrayEquals(bufferContents, existingBufferBytes); + } + } + + /** + * Reads half of the testFile into the {@link ByteBuffer} by setting a + * {@link ByteBuffer#limit()} on the buffer. Validates that only half of the + * testFile is loaded into the buffer. + */ + private void testPreadWithLimitedByteBuffer( + ByteBuffer buffer) throws IOException { + int bytesRead; + int totalBytesRead = 0; + // Set the buffer limit to half the size of the file + buffer.limit(FILE_SIZE / 2); + + try (FSDataInputStream in = fs.open(testFile)) { + while ((bytesRead = in.read(totalBytesRead, buffer)) > 0) { + totalBytesRead += bytesRead; + // Check that each call to read changes the position of the ByteBuffer + // correctly + assertEquals(totalBytesRead, buffer.position()); + } + + // Since we set the buffer limit to half the size of the file, we should + // have only read half of the file into the buffer + assertEquals(totalBytesRead, FILE_SIZE / 2); + // Check that the buffer is full and the contents equal the first half of + // the file + assertFalse(buffer.hasRemaining()); + buffer.position(0); + byte[] bufferContents = new byte[FILE_SIZE / 2]; + buffer.get(bufferContents); + assertArrayEquals(bufferContents, + Arrays.copyOfRange(fileContents, 0, FILE_SIZE / 2)); + } + } + + /** + * Reads half of the testFile into the {@link ByteBuffer} by setting the + * {@link ByteBuffer#position()} the half the size of the file. Validates that + * only half of the testFile is loaded into the buffer. + */ + private void testPreadWithPositionedByteBuffer( + ByteBuffer buffer) throws IOException { + int bytesRead; + int totalBytesRead = 0; + // Set the buffer position to half the size of the file + buffer.position(FILE_SIZE / 2); + + try (FSDataInputStream in = fs.open(testFile)) { + while ((bytesRead = in.read(totalBytesRead, buffer)) > 0) { + totalBytesRead += bytesRead; + // Check that each call to read changes the position of the ByteBuffer + // correctly + assertEquals(totalBytesRead + FILE_SIZE / 2, buffer.position()); + } + + // Since we set the buffer position to half the size of the file, we + // should have only read half of the file into the buffer + assertEquals(totalBytesRead, FILE_SIZE / 2); + // Check that the buffer is full and the contents equal the first half of + // the file + assertFalse(buffer.hasRemaining()); + buffer.position(FILE_SIZE / 2); + byte[] bufferContents = new byte[FILE_SIZE / 2]; + buffer.get(bufferContents); + assertArrayEquals(bufferContents, + Arrays.copyOfRange(fileContents, 0, FILE_SIZE / 2)); + } + } + + /** + * Reads half of the testFile into the {@link ByteBuffer} by specifying a + * position for the pread API that is half of the file size. Validates that + * only half of the testFile is loaded into the buffer. 
+ */ + private void testPositionedPreadWithByteBuffer( + ByteBuffer buffer) throws IOException { + int bytesRead; + int totalBytesRead = 0; + + try (FSDataInputStream in = fs.open(testFile)) { + // Start reading from halfway through the file + while ((bytesRead = in.read(totalBytesRead + FILE_SIZE / 2, + buffer)) > 0) { + totalBytesRead += bytesRead; + // Check that each call to read changes the position of the ByteBuffer + // correctly + assertEquals(totalBytesRead, buffer.position()); + } + + // Since we starting reading halfway through the file, the buffer should + // only be half full + assertEquals(totalBytesRead, FILE_SIZE / 2); + assertEquals(buffer.position(), FILE_SIZE / 2); + assertTrue(buffer.hasRemaining()); + // Check that the buffer contents equal the second half of the file + buffer.position(0); + byte[] bufferContents = new byte[FILE_SIZE / 2]; + buffer.get(bufferContents); + assertArrayEquals(bufferContents, + Arrays.copyOfRange(fileContents, FILE_SIZE / 2, FILE_SIZE)); + } + } + + /** + * Reads the entire testFile using the preadFully API and validates that its + * contents are properly loaded into the supplied {@link ByteBuffer}. + */ + private void testPreadFullyWithByteBuffer(ByteBuffer buffer) + throws IOException { + int totalBytesRead = 0; + try (FSDataInputStream in = fs.open(testFile)) { + in.readFully(totalBytesRead, buffer); + // Make sure the buffer is full + assertFalse(buffer.hasRemaining()); + // Make sure the contents of the read buffer equal the contents of the + // file + buffer.position(0); + byte[] bufferContents = new byte[FILE_SIZE]; + buffer.get(bufferContents); + assertArrayEquals(bufferContents, fileContents); + } + } + + @AfterClass + public static void shutdown() throws IOException { + try { + fs.delete(testFile, false); + fs.close(); + } finally { + cluster.shutdown(true); + } + } +} From 57202ccdad87f11ff94526e964aa1d162b0c91b4 Mon Sep 17 00:00:00 2001 From: Sahil Takiar Date: Wed, 1 May 2019 03:22:16 +0530 Subject: [PATCH 02/40] HDFS-3246: pRead equivalent for direct read path (#597) HDFS-3246: pRead equivalent for direct read path Contributed by Sahil Takiar --- .../hadoop/crypto/CryptoInputStream.java | 47 ++++- .../apache/hadoop/fs/FSDataInputStream.java | 3 +- .../apache/hadoop/fs/StreamCapabilities.java | 12 ++ .../hadoop/crypto/CryptoStreamsTestBase.java | 185 +++++++++++++++++- .../hadoop/crypto/TestCryptoStreams.java | 2 + .../crypto/TestCryptoStreamsNormal.java | 17 +- .../src/main/native/libhdfs-tests/hdfs_test.h | 18 ++ .../hadoop/hdfs/TestByteBufferPread.java | 2 +- 8 files changed, 267 insertions(+), 19 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java index b2ee0c184a490..67997b1a9066a 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java @@ -566,7 +566,52 @@ private void decrypt(ByteBuffer buf, int n, int start) } buf.position(pos); } - + + private void decrypt(long filePosition, ByteBuffer buf, int length, int start) + throws IOException { + ByteBuffer localInBuffer = null; + ByteBuffer localOutBuffer = null; + + // Duplicate the buffer so we don't have to worry about resetting the + // original position and limit at the end of the method + buf = buf.duplicate(); + + int decryptedBytes = 0; + 
Decryptor localDecryptor = null; + try { + localInBuffer = getBuffer(); + localOutBuffer = getBuffer(); + localDecryptor = getDecryptor(); + byte[] localIV = initIV.clone(); + updateDecryptor(localDecryptor, filePosition, localIV); + byte localPadding = getPadding(filePosition); + // Set proper filePosition for inputdata. + localInBuffer.position(localPadding); + + while (decryptedBytes < length) { + buf.position(start + decryptedBytes); + buf.limit(start + decryptedBytes + + Math.min(length - decryptedBytes, localInBuffer.remaining())); + localInBuffer.put(buf); + // Do decryption + try { + decrypt(localDecryptor, localInBuffer, localOutBuffer, localPadding); + buf.position(start + decryptedBytes); + buf.limit(start + length); + decryptedBytes += localOutBuffer.remaining(); + buf.put(localOutBuffer); + } finally { + localPadding = afterDecryption(localDecryptor, localInBuffer, + filePosition + length, localIV); + } + } + } finally { + returnBuffer(localInBuffer); + returnBuffer(localOutBuffer); + returnDecryptor(localDecryptor); + } + } + @Override public int available() throws IOException { checkStream(); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java index 3b5fd7c370cef..e15d744935fd0 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java @@ -19,6 +19,7 @@ package org.apache.hadoop.fs; import java.io.DataInputStream; +import java.io.EOFException; import java.io.FileDescriptor; import java.io.FileInputStream; import java.io.IOException; @@ -266,4 +267,4 @@ public void readFully(long position, ByteBuffer buf) throws IOException { "unsupported by " + in.getClass().getCanonicalName()); } } -} +} \ No newline at end of file diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java index 3549cdc4fa392..e68e7b351ed78 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java @@ -59,6 +59,18 @@ public interface StreamCapabilities { */ String UNBUFFER = "in:unbuffer"; + /** + * Stream read(ByteBuffer) capability implemented by + * {@link ByteBufferReadable#read(java.nio.ByteBuffer)}. + */ + String READBYTEBUFFER = "in:readbytebuffer"; + + /** + * Stream read(long, ByteBuffer) capability implemented by + * {@link ByteBufferPositionedReadable#read(long, java.nio.ByteBuffer)}. + */ + String PREADBYTEBUFFER = "in:preadbytebuffer"; + /** * Capabilities that a stream can support and be queried for. 
*/ diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/CryptoStreamsTestBase.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/CryptoStreamsTestBase.java index 7e5fe7071610e..64bb966b15b0f 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/CryptoStreamsTestBase.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/CryptoStreamsTestBase.java @@ -26,6 +26,7 @@ import java.util.EnumSet; import java.util.Random; +import org.apache.hadoop.fs.ByteBufferPositionedReadable; import org.apache.hadoop.fs.ByteBufferReadable; import org.apache.hadoop.fs.CanUnbuffer; import org.apache.hadoop.fs.FSDataOutputStream; @@ -129,6 +130,32 @@ private void preadCheck(PositionedReadable in) throws Exception { Assert.assertArrayEquals(result, expectedData); } + private int byteBufferPreadAll(ByteBufferPositionedReadable in, + ByteBuffer buf) throws IOException { + int n = 0; + int total = 0; + while (n != -1) { + total += n; + if (!buf.hasRemaining()) { + break; + } + n = in.read(total, buf); + } + + return total; + } + + private void byteBufferPreadCheck(ByteBufferPositionedReadable in) + throws Exception { + ByteBuffer result = ByteBuffer.allocate(dataLen); + int n = byteBufferPreadAll(in, result); + + Assert.assertEquals(dataLen, n); + ByteBuffer expectedData = ByteBuffer.allocate(n); + expectedData.put(data, 0, n); + Assert.assertArrayEquals(result.array(), expectedData.array()); + } + protected OutputStream getOutputStream(int bufferSize) throws IOException { return getOutputStream(bufferSize, key, iv); } @@ -288,20 +315,36 @@ private int readAll(InputStream in, long pos, byte[] b, int off, int len) return total; } + + private int readAll(InputStream in, long pos, ByteBuffer buf) + throws IOException { + int n = 0; + int total = 0; + while (n != -1) { + total += n; + if (!buf.hasRemaining()) { + break; + } + n = ((ByteBufferPositionedReadable) in).read(pos + total, buf); + } + + return total; + } /** Test positioned read. */ @Test(timeout=120000) public void testPositionedRead() throws Exception { - OutputStream out = getOutputStream(defaultBufferSize); - writeData(out); + try (OutputStream out = getOutputStream(defaultBufferSize)) { + writeData(out); + } - InputStream in = getInputStream(defaultBufferSize); - // Pos: 1/3 dataLen - positionedReadCheck(in , dataLen / 3); + try (InputStream in = getInputStream(defaultBufferSize)) { + // Pos: 1/3 dataLen + positionedReadCheck(in, dataLen / 3); - // Pos: 1/2 dataLen - positionedReadCheck(in, dataLen / 2); - in.close(); + // Pos: 1/2 dataLen + positionedReadCheck(in, dataLen / 2); + } } private void positionedReadCheck(InputStream in, int pos) throws Exception { @@ -315,6 +358,35 @@ private void positionedReadCheck(InputStream in, int pos) throws Exception { System.arraycopy(data, pos, expectedData, 0, n); Assert.assertArrayEquals(readData, expectedData); } + + /** Test positioned read with ByteBuffers. 
*/ + @Test(timeout=120000) + public void testPositionedReadWithByteBuffer() throws Exception { + try (OutputStream out = getOutputStream(defaultBufferSize)) { + writeData(out); + } + + try (InputStream in = getInputStream(defaultBufferSize)) { + // Pos: 1/3 dataLen + positionedReadCheckWithByteBuffer(in, dataLen / 3); + + // Pos: 1/2 dataLen + positionedReadCheckWithByteBuffer(in, dataLen / 2); + } + } + + private void positionedReadCheckWithByteBuffer(InputStream in, int pos) + throws Exception { + ByteBuffer result = ByteBuffer.allocate(dataLen); + int n = readAll(in, pos, result); + + Assert.assertEquals(dataLen, n + pos); + byte[] readData = new byte[n]; + System.arraycopy(result.array(), 0, readData, 0, n); + byte[] expectedData = new byte[n]; + System.arraycopy(data, pos, expectedData, 0, n); + Assert.assertArrayEquals(readData, expectedData); + } /** Test read fully. */ @Test(timeout=120000) @@ -558,12 +630,40 @@ private void byteBufferReadCheck(InputStream in, ByteBuffer buf, System.arraycopy(data, 0, expectedData, 0, n); Assert.assertArrayEquals(readData, expectedData); } + + private void byteBufferPreadCheck(InputStream in, ByteBuffer buf, + int bufPos) throws Exception { + // Test reading from position 0 + buf.position(bufPos); + int n = ((ByteBufferPositionedReadable) in).read(0, buf); + Assert.assertEquals(bufPos + n, buf.position()); + byte[] readData = new byte[n]; + buf.rewind(); + buf.position(bufPos); + buf.get(readData); + byte[] expectedData = new byte[n]; + System.arraycopy(data, 0, expectedData, 0, n); + Assert.assertArrayEquals(readData, expectedData); + + // Test reading from half way through the data + buf.position(bufPos); + n = ((ByteBufferPositionedReadable) in).read(dataLen / 2, buf); + Assert.assertEquals(bufPos + n, buf.position()); + readData = new byte[n]; + buf.rewind(); + buf.position(bufPos); + buf.get(readData); + expectedData = new byte[n]; + System.arraycopy(data, dataLen / 2, expectedData, 0, n); + Assert.assertArrayEquals(readData, expectedData); + } /** Test byte buffer read with different buffer size. */ @Test(timeout=120000) public void testByteBufferRead() throws Exception { - OutputStream out = getOutputStream(defaultBufferSize); - writeData(out); + try (OutputStream out = getOutputStream(defaultBufferSize)) { + writeData(out); + } // Default buffer size, initial buffer position is 0 InputStream in = getInputStream(defaultBufferSize); @@ -613,6 +713,53 @@ public void testByteBufferRead() throws Exception { byteBufferReadCheck(in, buf, 11); in.close(); } + + /** Test byte buffer pread with different buffer size. 
*/ + @Test(timeout=120000) + public void testByteBufferPread() throws Exception { + try (OutputStream out = getOutputStream(defaultBufferSize)) { + writeData(out); + } + + try (InputStream defaultBuf = getInputStream(defaultBufferSize); + InputStream smallBuf = getInputStream(smallBufferSize)) { + + ByteBuffer buf = ByteBuffer.allocate(dataLen + 100); + + // Default buffer size, initial buffer position is 0 + byteBufferPreadCheck(defaultBuf, buf, 0); + + // Default buffer size, initial buffer position is not 0 + buf.clear(); + byteBufferPreadCheck(defaultBuf, buf, 11); + + // Small buffer size, initial buffer position is 0 + buf.clear(); + byteBufferPreadCheck(smallBuf, buf, 0); + + // Small buffer size, initial buffer position is not 0 + buf.clear(); + byteBufferPreadCheck(smallBuf, buf, 11); + + // Test with direct ByteBuffer + buf = ByteBuffer.allocateDirect(dataLen + 100); + + // Direct buffer, default buffer size, initial buffer position is 0 + byteBufferPreadCheck(defaultBuf, buf, 0); + + // Direct buffer, default buffer size, initial buffer position is not 0 + buf.clear(); + byteBufferPreadCheck(defaultBuf, buf, 11); + + // Direct buffer, small buffer size, initial buffer position is 0 + buf.clear(); + byteBufferPreadCheck(smallBuf, buf, 0); + + // Direct buffer, small buffer size, initial buffer position is not 0 + buf.clear(); + byteBufferPreadCheck(smallBuf, buf, 11); + } + } @Test(timeout=120000) public void testCombinedOp() throws Exception { @@ -850,5 +997,23 @@ public void testUnbuffer() throws Exception { // The close will be called when exiting this try-with-resource block } } + + // Test ByteBuffer pread + try (InputStream in = getInputStream(smallBufferSize)) { + if (in instanceof ByteBufferPositionedReadable) { + ByteBufferPositionedReadable bbpin = (ByteBufferPositionedReadable) in; + + // Test unbuffer after pread + byteBufferPreadCheck(bbpin); + ((CanUnbuffer) in).unbuffer(); + + // Test pread again after unbuffer + byteBufferPreadCheck(bbpin); + + // Test close after unbuffer + ((CanUnbuffer) in).unbuffer(); + // The close will be called when exiting this try-with-resource block + } + } } } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreams.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreams.java index 514c54080a0a6..73c6249612387 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreams.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreams.java @@ -429,6 +429,8 @@ public boolean hasCapability(String capability) { case StreamCapabilities.READAHEAD: case StreamCapabilities.DROPBEHIND: case StreamCapabilities.UNBUFFER: + case StreamCapabilities.READBYTEBUFFER: + case StreamCapabilities.PREADBYTEBUFFER: return true; default: return false; diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreamsNormal.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreamsNormal.java index df7dc72cf886a..3114ca18325a7 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreamsNormal.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/crypto/TestCryptoStreamsNormal.java @@ -105,27 +105,32 @@ public void testByteBufferReadFully() throws Exception {} @Override @Test(timeout=10000) public void testReadFully() throws IOException {} 
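The `in:readbytebuffer` and `in:preadbytebuffer` capability strings wired up above are what a caller probes before taking the ByteBuffer read paths. On the native side, the same "direct pread" optimization is surfaced to tests through `hdfsFileUsesDirectPread()` / `hdfsFileDisableDirectPread()`, declared in `hdfs_test.h` later in this patch. The following is only a rough sketch of how those test hooks are meant to be used; the path and buffer size are placeholders and error handling is abbreviated:

    #include "hdfs/hdfs.h"
    #include "hdfs_test.h"   /* test-only helpers added by this patch */
    #include <stdio.h>

    /* Sketch: confirm a freshly opened file takes the direct pread path,
     * then disable the optimization and pread through the fallback path. */
    static void checkDirectPread(hdfsFS fs, const char *path)
    {
        char buf[128];
        hdfsFile f = hdfsOpenFile(fs, path, O_RDONLY, 0, 0, 0);
        if (!f) {
            return;
        }
        if (hdfsFileUsesDirectPread(f)) {
            printf("direct pread is enabled for %s\n", path);
        }
        hdfsFileDisableDirectPread(f);
        /* This positioned read now goes through the byte[]-based path. */
        (void)hdfsPread(fs, f, 0, buf, (tSize)sizeof(buf));
        hdfsCloseFile(fs, f);
    }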
- + @Ignore("Wrapped stream doesn't support Seek") @Override @Test(timeout=10000) public void testSeek() throws IOException {} - + @Ignore("Wrapped stream doesn't support ByteBufferRead") @Override @Test(timeout=10000) public void testByteBufferRead() throws IOException {} - + + @Ignore("Wrapped stream doesn't support ByteBufferPositionedReadable") + @Override + @Test(timeout=10000) + public void testByteBufferPread() throws IOException {} + @Ignore("Wrapped stream doesn't support ByteBufferRead, Seek") @Override @Test(timeout=10000) public void testCombinedOp() throws IOException {} - + @Ignore("Wrapped stream doesn't support SeekToNewSource") @Override @Test(timeout=10000) public void testSeekToNewSource() throws IOException {} - + @Ignore("Wrapped stream doesn't support HasEnhancedByteBufferAccess") @Override @Test(timeout=10000) @@ -135,4 +140,4 @@ public void testHasEnhancedByteBufferAccess() throws IOException {} @Override @Test public void testUnbuffer() throws Exception {} -} +} \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/hdfs_test.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/hdfs_test.h index 0eab9a68aea7f..f00326317f24a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/hdfs_test.h +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/hdfs_test.h @@ -49,6 +49,24 @@ extern "C" { */ void hdfsFileDisableDirectRead(struct hdfsFile_internal *file); + /** + * Determine if a file is using the "direct pread" optimization. + * + * @param file The HDFS file + * @return 1 if the file is using the direct pread optimization, + * 0 otherwise. + */ + int hdfsFileUsesDirectPread(struct hdfsFile_internal *file); + + /** + * Disable the direct pread optimization for a file. + * + * This is mainly provided for unit testing purposes. + * + * @param file The HDFS file + */ + void hdfsFileDisableDirectPread(struct hdfsFile_internal *file); + /** * Disable domain socket security checks. * diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestByteBufferPread.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestByteBufferPread.java index 1c7f1500f3689..0692f5a12893c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestByteBufferPread.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestByteBufferPread.java @@ -287,4 +287,4 @@ public static void shutdown() throws IOException { cluster.shutdown(true); } } -} +} \ No newline at end of file From 5a6d88bd7daaf77d1a9634d8eb5b643496d83b14 Mon Sep 17 00:00:00 2001 From: Sunil G Date: Wed, 31 Oct 2018 12:32:49 +0530 Subject: [PATCH 03/40] HDFS-14033. [libhdfs++] Disable libhdfs++ build on systems that do not support thread_local. Contributed by Anatoli Shein. (cherry picked from commit 9c438abe52d4ee0b25345a4b7ec1697dd66f85e9) --- .../src/CMakeLists.txt | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/CMakeLists.txt b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/CMakeLists.txt index 18396c7855477..026be9f7b0bcf 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/CMakeLists.txt +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/CMakeLists.txt @@ -58,21 +58,11 @@ if(WIN32) # Omit unneeded headers. 
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWIN32_LEAN_AND_MEAN") set(OS_DIR ${CMAKE_SOURCE_DIR}/main/native/libhdfs/os/windows) - - # IMPORTANT: OUT_DIR MUST be relative to maven's - # project.build.directory (=target) and match dist-copynativelibs - # in order to be in a release - set(OUT_DIR bin) + set(OUT_DIR target/bin) else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden") - # using old default behavior on GCC >= 10.0 - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fcommon") set(OS_DIR ${CMAKE_SOURCE_DIR}/main/native/libhdfs/os/posix) - - # IMPORTANT: OUT_DIR MUST be relative to maven's - # project.build.directory (=target) and match dist-copynativelibs - # in order to be in a release - set(OUT_DIR native/target/usr/local/lib) + set(OUT_DIR target/usr/local/lib) endif() # Configure JNI. @@ -152,8 +142,7 @@ add_subdirectory(main/native/libhdfs-tests) # Temporary fix to disable Libhdfs++ build on older systems that do not support thread_local include(CheckCXXSourceCompiles) unset (THREAD_LOCAL_SUPPORTED CACHE) -set (CMAKE_CXX_STANDARD 11) -set (CMAKE_CXX_STANDARD_REQUIRED ON) +set (CMAKE_REQUIRED_DEFINITIONS "-std=c++11") set (CMAKE_REQUIRED_LIBRARIES ${CMAKE_THREAD_LIBS_INIT}) check_cxx_source_compiles( "#include From d1664a79277b6879b4cc1a29d92f63ff416dcea3 Mon Sep 17 00:00:00 2001 From: Sahil Takiar Date: Thu, 21 Feb 2019 01:06:37 +0530 Subject: [PATCH 04/40] HDFS-14267. Add test_libhdfs_ops to libhdfs tests, mark libhdfs_read/write.c as examples. Contributed by Sahil Takiar. Signed-off-by: Wei-Chiu Chuang --- .../src/CMakeLists.txt | 17 ++++++++-- .../native/libhdfs-examples/CMakeLists.txt | 34 +++++++++++++++++++ .../main/native/libhdfs-examples/README.md | 24 +++++++++++++ .../libhdfs_read.c} | 15 +++++--- .../libhdfs_write.c} | 13 ++++--- .../native/libhdfs-examples}/test-libhdfs.sh | 6 ++-- .../src/main/native/libhdfs/CMakeLists.txt | 8 ++--- 7 files changed, 97 insertions(+), 20 deletions(-) create mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/CMakeLists.txt create mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/README.md rename hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/{libhdfs-tests/test_libhdfs_read.c => libhdfs-examples/libhdfs_read.c} (91%) rename hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/{libhdfs-tests/test_libhdfs_write.c => libhdfs-examples/libhdfs_write.c} (93%) rename hadoop-hdfs-project/{hadoop-hdfs/src/main/native/tests => hadoop-hdfs-native-client/src/main/native/libhdfs-examples}/test-libhdfs.sh (98%) diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/CMakeLists.txt b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/CMakeLists.txt index 026be9f7b0bcf..df40502e6d3db 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/CMakeLists.txt +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/CMakeLists.txt @@ -58,11 +58,21 @@ if(WIN32) # Omit unneeded headers. 
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWIN32_LEAN_AND_MEAN") set(OS_DIR ${CMAKE_SOURCE_DIR}/main/native/libhdfs/os/windows) - set(OUT_DIR target/bin) + + # IMPORTANT: OUT_DIR MUST be relative to maven's + # project.build.directory (=target) and match dist-copynativelibs + # in order to be in a release + set(OUT_DIR bin) else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden") + # using old default behavior on GCC >= 10.0 + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fcommon") set(OS_DIR ${CMAKE_SOURCE_DIR}/main/native/libhdfs/os/posix) - set(OUT_DIR target/usr/local/lib) + + # IMPORTANT: OUT_DIR MUST be relative to maven's + # project.build.directory (=target) and match dist-copynativelibs + # in order to be in a release + set(OUT_DIR native/target/usr/local/lib) endif() # Configure JNI. @@ -138,6 +148,7 @@ endif() add_subdirectory(main/native/libhdfs) add_subdirectory(main/native/libhdfs-tests) +add_subdirectory(main/native/libhdfs-examples) # Temporary fix to disable Libhdfs++ build on older systems that do not support thread_local include(CheckCXXSourceCompiles) @@ -181,4 +192,4 @@ else() if(REQUIRE_FUSE) message(FATAL_ERROR "Required component fuse_dfs could not be built.") endif() -endif() +endif() \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/CMakeLists.txt b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/CMakeLists.txt new file mode 100644 index 0000000000000..1d33639f3db68 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/CMakeLists.txt @@ -0,0 +1,34 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +cmake_minimum_required(VERSION 3.1 FATAL_ERROR) + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../libhdfs/include + ${GENERATED_JAVAH} + ${CMAKE_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/../libhdfs + ${JNI_INCLUDE_DIRS} + ${OS_DIR} +) + +add_executable(hdfs_read libhdfs_read.c) +target_link_libraries(hdfs_read hdfs) + +add_executable(hdfs_write libhdfs_write.c) +target_link_libraries(hdfs_write hdfs) \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/README.md b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/README.md new file mode 100644 index 0000000000000..c962feba526c7 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/README.md @@ -0,0 +1,24 @@ + + +The files in this directory are purely meant to provide additional examples for how to use libhdfs. They are compiled as +part of the build and are thus guaranteed to compile against the associated version of lidhdfs. 
However, no tests exists +for these examples so their functionality is not guaranteed. + +The examples are written to run against a mini-dfs cluster. The script `test-libhdfs.sh` can setup a mini DFS cluster +that the examples can run against. Again, none of this is tested and is thus not guaranteed to work. \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_read.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/libhdfs_read.c similarity index 91% rename from hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_read.c rename to hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/libhdfs_read.c index 4b90f2a4ab0be..419be1268b284 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_read.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/libhdfs_read.c @@ -16,11 +16,16 @@ * limitations under the License. */ -#include "hdfs/hdfs.h" +#include "hdfs/hdfs.h" #include #include +/** + * An example of using libhdfs to read files. The usage of this file is as follows: + * + * Usage: hdfs_read + */ int main(int argc, char **argv) { hdfsFS fs; const char *rfile = argv[1]; @@ -33,12 +38,12 @@ int main(int argc, char **argv) { fprintf(stderr, "Usage: hdfs_read \n"); exit(-1); } - + fs = hdfsConnect("default", 0); if (!fs) { fprintf(stderr, "Oops! Failed to connect to hdfs!\n"); exit(-1); - } + } readFile = hdfsOpenFile(fs, rfile, O_RDONLY, bufferSize, 0, 0); if (!readFile) { @@ -51,13 +56,13 @@ int main(int argc, char **argv) { if(buffer == NULL) { return -2; } - + // read from the file curSize = bufferSize; for (; curSize == bufferSize;) { curSize = hdfsRead(fs, readFile, (void*)buffer, curSize); } - + free(buffer); hdfsCloseFile(fs, readFile); diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_write.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/libhdfs_write.c similarity index 93% rename from hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_write.c rename to hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/libhdfs_write.c index c55c8e330c33b..8fbf87e524439 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_write.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/libhdfs_write.c @@ -16,13 +16,18 @@ * limitations under the License. */ -#include "hdfs/hdfs.h" +#include "hdfs/hdfs.h" #include #include #include #include +/** + * An example of using libhdfs to write files. The usage of this file is as follows: + * + * Usage: hdfs_write + */ int main(int argc, char **argv) { hdfsFS fs; const char *writeFileName = argv[1]; @@ -40,12 +45,12 @@ int main(int argc, char **argv) { fprintf(stderr, "Usage: hdfs_write \n"); exit(-1); } - + fs = hdfsConnect("default", 0); if (!fs) { fprintf(stderr, "Oops! Failed to connect to hdfs!\n"); exit(-1); - } + } // sanity check if(fileTotalSize == ULONG_MAX && errno == ERANGE) { @@ -79,7 +84,7 @@ int main(int argc, char **argv) { // write to the file for (nrRemaining = fileTotalSize; nrRemaining > 0; nrRemaining -= bufferSize ) { - curSize = ( bufferSize < nrRemaining ) ? bufferSize : (tSize)nrRemaining; + curSize = ( bufferSize < nrRemaining ) ? 
bufferSize : (tSize)nrRemaining; if ((written = hdfsWrite(fs, writeFile, (void*)buffer, curSize)) != curSize) { fprintf(stderr, "ERROR: hdfsWrite returned an error on write: %d\n", written); exit(-3); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/native/tests/test-libhdfs.sh b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/test-libhdfs.sh similarity index 98% rename from hadoop-hdfs-project/hadoop-hdfs/src/main/native/tests/test-libhdfs.sh rename to hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/test-libhdfs.sh index 3407e9cf8e26a..e43b0a52903dd 100755 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/native/tests/test-libhdfs.sh +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-examples/test-libhdfs.sh @@ -70,7 +70,7 @@ $HADOOP_HOME/share/hadoop/common/ $HADOOP_HOME/share/hadoop/hdfs $HADOOP_HOME/share/hadoop/hdfs/lib/" -for d in $JAR_DIRS; do +for d in $JAR_DIRS; do for j in $d/*.jar; do CLASSPATH=${CLASSPATH}:$j done; @@ -114,14 +114,14 @@ LIB_JVM_DIR=`findlibjvm` echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" echo LIB_JVM_DIR = $LIB_JVM_DIR echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" -# Put delays to ensure hdfs is up and running and also shuts down +# Put delays to ensure hdfs is up and running and also shuts down # after the tests are complete rm $HDFS_TEST_CONF_DIR/core-site.xml $HADOOP_HOME/bin/hadoop jar $HDFS_TEST_JAR \ org.apache.hadoop.test.MiniDFSClusterManager \ -format -nnport 20300 -writeConfig $HDFS_TEST_CONF_DIR/core-site.xml \ - > /tmp/libhdfs-test-cluster.out 2>&1 & + > /tmp/libhdfs-test-cluster.out 2>&1 & MINI_CLUSTER_PID=$! for i in {1..15}; do diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/CMakeLists.txt b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/CMakeLists.txt index 08765f5e28046..77fbea31d46aa 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/CMakeLists.txt +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/CMakeLists.txt @@ -55,11 +55,9 @@ set_target_properties(hdfs PROPERTIES SOVERSION ${LIBHDFS_VERSION}) build_libhdfs_test(test_libhdfs_ops hdfs_static test_libhdfs_ops.c) -link_libhdfs_test(test_libhdfs_ops hdfs_static ${JAVA_JVM_LIBRARY}) -build_libhdfs_test(test_libhdfs_reads hdfs_static test_libhdfs_read.c) -link_libhdfs_test(test_libhdfs_reads hdfs_static ${JAVA_JVM_LIBRARY}) -build_libhdfs_test(test_libhdfs_write hdfs_static test_libhdfs_write.c) -link_libhdfs_test(test_libhdfs_write hdfs_static ${JAVA_JVM_LIBRARY}) +link_libhdfs_test(test_libhdfs_ops hdfs_static native_mini_dfs ${JAVA_JVM_LIBRARY}) +add_libhdfs_test(test_libhdfs_ops hdfs_static) + build_libhdfs_test(test_libhdfs_threaded hdfs_static expect.c test_libhdfs_threaded.c ${OS_DIR}/thread.c) link_libhdfs_test(test_libhdfs_threaded hdfs_static native_mini_dfs) add_libhdfs_test(test_libhdfs_threaded hdfs_static) From 350ffec00d0bab02b21abbc2cc17d23e3fb196a3 Mon Sep 17 00:00:00 2001 From: Sahil Takiar Date: Tue, 19 Mar 2019 22:50:56 +0530 Subject: [PATCH 05/40] HDFS-14304: High lock contention on hdfsHashMutex in libhdfs This closes #595 Signed-off-by: Todd Lipcon --- .../main/native/libhdfs-tests/CMakeLists.txt | 5 +- .../native/libhdfs-tests/native_mini_dfs.c | 42 ++- .../main/native/libhdfs-tests/test_htable.c | 100 ------ .../src/main/native/libhdfs/CMakeLists.txt | 2 +- .../src/main/native/libhdfs/common/htable.c 
| 287 ------------------ .../src/main/native/libhdfs/common/htable.h | 161 ---------- .../src/main/native/libhdfs/exception.c | 6 +- .../src/main/native/libhdfs/hdfs.c | 59 ++-- .../src/main/native/libhdfs/jclasses.c | 136 +++++++++ .../src/main/native/libhdfs/jclasses.h | 112 +++++++ .../src/main/native/libhdfs/jni_helper.c | 223 +++++++------- .../src/main/native/libhdfs/jni_helper.h | 37 ++- .../src/main/native/libhdfs/os/mutexes.h | 6 +- .../main/native/libhdfs/os/posix/mutexes.c | 2 +- .../libhdfs/os/posix/thread_local_storage.c | 99 +++++- .../main/native/libhdfs/os/windows/mutexes.c | 4 +- 16 files changed, 549 insertions(+), 732 deletions(-) delete mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_htable.c delete mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/common/htable.c delete mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/common/htable.h create mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.c create mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.h diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/CMakeLists.txt b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/CMakeLists.txt index 08fc030bbbbcc..f16cc9eb1b033 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/CMakeLists.txt +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/CMakeLists.txt @@ -29,8 +29,8 @@ include_directories( add_library(native_mini_dfs native_mini_dfs.c - ../libhdfs/common/htable.c ../libhdfs/exception.c + ../libhdfs/jclasses.c ../libhdfs/jni_helper.c ${OS_DIR}/mutexes.c ${OS_DIR}/thread_local_storage.c @@ -39,6 +39,3 @@ add_library(native_mini_dfs add_executable(test_native_mini_dfs test_native_mini_dfs.c) target_link_libraries(test_native_mini_dfs native_mini_dfs ${JAVA_JVM_LIBRARY}) add_test(test_test_native_mini_dfs test_native_mini_dfs) - -add_executable(test_htable ../libhdfs/common/htable.c test_htable.c) -target_link_libraries(test_htable ${OS_LINK_LIBRARIES}) diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/native_mini_dfs.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/native_mini_dfs.c index 6938109d53e4d..3af56f1e4f96e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/native_mini_dfs.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/native_mini_dfs.c @@ -17,6 +17,7 @@ */ #include "exception.h" +#include "jclasses.h" #include "jni_helper.h" #include "native_mini_dfs.h" #include "platform.h" @@ -36,9 +37,7 @@ #define MINIDFS_CLUSTER_BUILDER "org/apache/hadoop/hdfs/MiniDFSCluster$Builder" #define MINIDFS_CLUSTER "org/apache/hadoop/hdfs/MiniDFSCluster" -#define HADOOP_CONF "org/apache/hadoop/conf/Configuration" #define HADOOP_NAMENODE "org/apache/hadoop/hdfs/server/namenode/NameNode" -#define JAVA_INETSOCKETADDRESS "java/net/InetSocketAddress" struct NativeMiniDfsCluster { /** @@ -60,8 +59,7 @@ static int hdfsDisableDomainSocketSecurity(void) errno = EINTERNAL; return -1; } - jthr = invokeMethod(env, NULL, STATIC, NULL, - "org/apache/hadoop/net/unix/DomainSocket", + jthr = invokeMethod(env, NULL, STATIC, NULL, JC_DOMAIN_SOCKET, "disableBindPathValidation", "()V"); if (jthr) { errno = printExceptionAndFree(env, jthr, 
PRINT_EXC_ALL, @@ -126,11 +124,6 @@ struct NativeMiniDfsCluster* nmdCreate(struct NativeMiniDfsConf *conf) "nmdCreate: new Configuration"); goto error; } - if (jthr) { - printExceptionAndFree(env, jthr, PRINT_EXC_ALL, - "nmdCreate: Configuration::setBoolean"); - goto error; - } // Disable 'minimum block size' -- it's annoying in tests. (*env)->DeleteLocalRef(env, jconfStr); jconfStr = NULL; @@ -140,8 +133,9 @@ struct NativeMiniDfsCluster* nmdCreate(struct NativeMiniDfsConf *conf) "nmdCreate: new String"); goto error; } - jthr = invokeMethod(env, NULL, INSTANCE, cobj, HADOOP_CONF, - "setLong", "(Ljava/lang/String;J)V", jconfStr, 0LL); + jthr = invokeMethod(env, NULL, INSTANCE, cobj, + JC_CONFIGURATION, "setLong", "(Ljava/lang/String;J)V", jconfStr, + 0LL); if (jthr) { printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "nmdCreate: Configuration::setLong"); @@ -163,7 +157,7 @@ struct NativeMiniDfsCluster* nmdCreate(struct NativeMiniDfsConf *conf) goto error; } } - jthr = invokeMethod(env, &val, INSTANCE, bld, MINIDFS_CLUSTER_BUILDER, + jthr = findClassAndInvokeMethod(env, &val, INSTANCE, bld, MINIDFS_CLUSTER_BUILDER, "format", "(Z)L" MINIDFS_CLUSTER_BUILDER ";", conf->doFormat); if (jthr) { printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "nmdCreate: " @@ -172,7 +166,7 @@ struct NativeMiniDfsCluster* nmdCreate(struct NativeMiniDfsConf *conf) } (*env)->DeleteLocalRef(env, val.l); if (conf->webhdfsEnabled) { - jthr = invokeMethod(env, &val, INSTANCE, bld, MINIDFS_CLUSTER_BUILDER, + jthr = findClassAndInvokeMethod(env, &val, INSTANCE, bld, MINIDFS_CLUSTER_BUILDER, "nameNodeHttpPort", "(I)L" MINIDFS_CLUSTER_BUILDER ";", conf->namenodeHttpPort); if (jthr) { @@ -183,7 +177,7 @@ struct NativeMiniDfsCluster* nmdCreate(struct NativeMiniDfsConf *conf) (*env)->DeleteLocalRef(env, val.l); } if (conf->numDataNodes) { - jthr = invokeMethod(env, &val, INSTANCE, bld, MINIDFS_CLUSTER_BUILDER, + jthr = findClassAndInvokeMethod(env, &val, INSTANCE, bld, MINIDFS_CLUSTER_BUILDER, "numDataNodes", "(I)L" MINIDFS_CLUSTER_BUILDER ";", conf->numDataNodes); if (jthr) { printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "nmdCreate: " @@ -192,7 +186,7 @@ struct NativeMiniDfsCluster* nmdCreate(struct NativeMiniDfsConf *conf) } } (*env)->DeleteLocalRef(env, val.l); - jthr = invokeMethod(env, &val, INSTANCE, bld, MINIDFS_CLUSTER_BUILDER, + jthr = findClassAndInvokeMethod(env, &val, INSTANCE, bld, MINIDFS_CLUSTER_BUILDER, "build", "()L" MINIDFS_CLUSTER ";"); if (jthr) { printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -242,7 +236,7 @@ int nmdShutdown(struct NativeMiniDfsCluster* cl) fprintf(stderr, "nmdShutdown: getJNIEnv failed\n"); return -EIO; } - jthr = invokeMethod(env, NULL, INSTANCE, cl->obj, + jthr = findClassAndInvokeMethod(env, NULL, INSTANCE, cl->obj, MINIDFS_CLUSTER, "shutdown", "()V"); if (jthr) { printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -260,7 +254,7 @@ int nmdWaitClusterUp(struct NativeMiniDfsCluster *cl) fprintf(stderr, "nmdWaitClusterUp: getJNIEnv failed\n"); return -EIO; } - jthr = invokeMethod(env, NULL, INSTANCE, cl->obj, + jthr = findClassAndInvokeMethod(env, NULL, INSTANCE, cl->obj, MINIDFS_CLUSTER, "waitClusterUp", "()V"); if (jthr) { printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -282,7 +276,7 @@ int nmdGetNameNodePort(const struct NativeMiniDfsCluster *cl) } // Note: this will have to be updated when HA nativeMiniDfs clusters are // supported - jthr = invokeMethod(env, &jVal, INSTANCE, cl->obj, + jthr = findClassAndInvokeMethod(env, &jVal, INSTANCE, cl->obj, MINIDFS_CLUSTER, 
"getNameNodePort", "()I"); if (jthr) { printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -307,7 +301,7 @@ int nmdGetNameNodeHttpAddress(const struct NativeMiniDfsCluster *cl, return -EIO; } // First get the (first) NameNode of the cluster - jthr = invokeMethod(env, &jVal, INSTANCE, cl->obj, MINIDFS_CLUSTER, + jthr = findClassAndInvokeMethod(env, &jVal, INSTANCE, cl->obj, MINIDFS_CLUSTER, "getNameNode", "()L" HADOOP_NAMENODE ";"); if (jthr) { printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -318,8 +312,8 @@ int nmdGetNameNodeHttpAddress(const struct NativeMiniDfsCluster *cl, jNameNode = jVal.l; // Then get the http address (InetSocketAddress) of the NameNode - jthr = invokeMethod(env, &jVal, INSTANCE, jNameNode, HADOOP_NAMENODE, - "getHttpAddress", "()L" JAVA_INETSOCKETADDRESS ";"); + jthr = findClassAndInvokeMethod(env, &jVal, INSTANCE, jNameNode, HADOOP_NAMENODE, + "getHttpAddress", "()L" JAVA_NET_ISA ";"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "nmdGetNameNodeHttpAddress: " @@ -328,8 +322,8 @@ int nmdGetNameNodeHttpAddress(const struct NativeMiniDfsCluster *cl, } jAddress = jVal.l; - jthr = invokeMethod(env, &jVal, INSTANCE, jAddress, - JAVA_INETSOCKETADDRESS, "getPort", "()I"); + jthr = findClassAndInvokeMethod(env, &jVal, INSTANCE, jAddress, + JAVA_NET_ISA, "getPort", "()I"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "nmdGetNameNodeHttpAddress: " @@ -338,7 +332,7 @@ int nmdGetNameNodeHttpAddress(const struct NativeMiniDfsCluster *cl, } *port = jVal.i; - jthr = invokeMethod(env, &jVal, INSTANCE, jAddress, JAVA_INETSOCKETADDRESS, + jthr = findClassAndInvokeMethod(env, &jVal, INSTANCE, jAddress, JAVA_NET_ISA, "getHostName", "()Ljava/lang/String;"); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_htable.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_htable.c deleted file mode 100644 index 0c3861bfa7f9a..0000000000000 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_htable.c +++ /dev/null @@ -1,100 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "common/htable.h" -#include "expect.h" -#include "hdfs_test.h" - -#include -#include -#include -#include -#include - -// Disable type cast and loss of precision warnings, because the test -// manipulates void* values manually on purpose. 
-#ifdef WIN32 -#pragma warning(disable: 4244 4306) -#endif - -static uint32_t simple_hash(const void *key, uint32_t size) -{ - uintptr_t k = (uintptr_t)key; - return ((13 + k) * 6367) % size; -} - -static int simple_compare(const void *a, const void *b) -{ - return a == b; -} - -static void expect_102(void *f, void *k, void *v) -{ - int *found_102 = f; - uintptr_t key = (uintptr_t)k; - uintptr_t val = (uintptr_t)v; - - if ((key == 2) && (val == 102)) { - *found_102 = 1; - } else { - abort(); - } -} - -static void *htable_pop_val(struct htable *ht, void *key) -{ - void *old_key, *old_val; - - htable_pop(ht, key, &old_key, &old_val); - return old_val; -} - -int main(void) -{ - struct htable *ht; - int found_102 = 0; - - ht = htable_alloc(4, simple_hash, simple_compare); - EXPECT_INT_EQ(0, htable_used(ht)); - EXPECT_INT_EQ(4, htable_capacity(ht)); - EXPECT_NULL(htable_get(ht, (void*)123)); - EXPECT_NULL(htable_pop_val(ht, (void*)123)); - EXPECT_ZERO(htable_put(ht, (void*)123, (void*)456)); - EXPECT_INT_EQ(456, (uintptr_t)htable_get(ht, (void*)123)); - EXPECT_INT_EQ(456, (uintptr_t)htable_pop_val(ht, (void*)123)); - EXPECT_NULL(htable_pop_val(ht, (void*)123)); - - // Enlarge the hash table - EXPECT_ZERO(htable_put(ht, (void*)1, (void*)101)); - EXPECT_ZERO(htable_put(ht, (void*)2, (void*)102)); - EXPECT_ZERO(htable_put(ht, (void*)3, (void*)103)); - EXPECT_INT_EQ(3, htable_used(ht)); - EXPECT_INT_EQ(8, htable_capacity(ht)); - EXPECT_INT_EQ(102, (uintptr_t)htable_get(ht, (void*)2)); - EXPECT_INT_EQ(101, (uintptr_t)htable_pop_val(ht, (void*)1)); - EXPECT_INT_EQ(103, (uintptr_t)htable_pop_val(ht, (void*)3)); - EXPECT_INT_EQ(1, htable_used(ht)); - htable_visit(ht, expect_102, &found_102); - EXPECT_INT_EQ(1, found_102); - htable_free(ht); - - fprintf(stderr, "SUCCESS.\n"); - return EXIT_SUCCESS; -} - -// vim: ts=4:sw=4:tw=79:et diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/CMakeLists.txt b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/CMakeLists.txt index 77fbea31d46aa..a7fb311125110 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/CMakeLists.txt +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/CMakeLists.txt @@ -35,7 +35,7 @@ hadoop_add_dual_library(hdfs exception.c jni_helper.c hdfs.c - common/htable.c + jclasses.c ${OS_DIR}/mutexes.c ${OS_DIR}/thread_local_storage.c ) diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/common/htable.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/common/htable.c deleted file mode 100644 index 50c89ea9cf707..0000000000000 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/common/htable.c +++ /dev/null @@ -1,287 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "common/htable.h" - -#include -#include -#include -#include -#include - -struct htable_pair { - void *key; - void *val; -}; - -/** - * A hash table which uses linear probing. - */ -struct htable { - uint32_t capacity; - uint32_t used; - htable_hash_fn_t hash_fun; - htable_eq_fn_t eq_fun; - struct htable_pair *elem; -}; - -/** - * An internal function for inserting a value into the hash table. - * - * Note: this function assumes that you have made enough space in the table. - * - * @param nelem The new element to insert. - * @param capacity The capacity of the hash table. - * @param hash_fun The hash function to use. - * @param key The key to insert. - * @param val The value to insert. - */ -static void htable_insert_internal(struct htable_pair *nelem, - uint32_t capacity, htable_hash_fn_t hash_fun, void *key, - void *val) -{ - uint32_t i; - - i = hash_fun(key, capacity); - while (1) { - if (!nelem[i].key) { - nelem[i].key = key; - nelem[i].val = val; - return; - } - i++; - if (i == capacity) { - i = 0; - } - } -} - -static int htable_realloc(struct htable *htable, uint32_t new_capacity) -{ - struct htable_pair *nelem; - uint32_t i, old_capacity = htable->capacity; - htable_hash_fn_t hash_fun = htable->hash_fun; - - nelem = calloc(new_capacity, sizeof(struct htable_pair)); - if (!nelem) { - return ENOMEM; - } - for (i = 0; i < old_capacity; i++) { - struct htable_pair *pair = htable->elem + i; - if (pair->key) { - htable_insert_internal(nelem, new_capacity, hash_fun, - pair->key, pair->val); - } - } - free(htable->elem); - htable->elem = nelem; - htable->capacity = new_capacity; - return 0; -} - -static uint32_t round_up_to_power_of_2(uint32_t i) -{ - if (i == 0) { - return 1; - } - i--; - i |= i >> 1; - i |= i >> 2; - i |= i >> 4; - i |= i >> 8; - i |= i >> 16; - i++; - return i; -} - -struct htable *htable_alloc(uint32_t size, - htable_hash_fn_t hash_fun, htable_eq_fn_t eq_fun) -{ - struct htable *htable; - - htable = calloc(1, sizeof(*htable)); - if (!htable) { - return NULL; - } - size = round_up_to_power_of_2(size); - if (size < HTABLE_MIN_SIZE) { - size = HTABLE_MIN_SIZE; - } - htable->hash_fun = hash_fun; - htable->eq_fun = eq_fun; - htable->used = 0; - if (htable_realloc(htable, size)) { - free(htable); - return NULL; - } - return htable; -} - -void htable_visit(struct htable *htable, visitor_fn_t fun, void *ctx) -{ - uint32_t i; - - for (i = 0; i != htable->capacity; ++i) { - struct htable_pair *elem = htable->elem + i; - if (elem->key) { - fun(ctx, elem->key, elem->val); - } - } -} - -void htable_free(struct htable *htable) -{ - if (htable) { - free(htable->elem); - free(htable); - } -} - -int htable_put(struct htable *htable, void *key, void *val) -{ - int ret; - uint32_t nused; - - // NULL is not a valid key value. - // This helps us implement htable_get_internal efficiently, since we know - // that we can stop when we encounter the first NULL key. - if (!key) { - return EINVAL; - } - // NULL is not a valid value. Otherwise the results of htable_get would - // be confusing (does a NULL return mean entry not found, or that the - // entry was found and was NULL?) 
- if (!val) { - return EINVAL; - } - // Re-hash if we have used more than half of the hash table - nused = htable->used + 1; - if (nused >= (htable->capacity / 2)) { - ret = htable_realloc(htable, htable->capacity * 2); - if (ret) - return ret; - } - htable_insert_internal(htable->elem, htable->capacity, - htable->hash_fun, key, val); - htable->used++; - return 0; -} - -static int htable_get_internal(const struct htable *htable, - const void *key, uint32_t *out) -{ - uint32_t start_idx, idx; - - start_idx = htable->hash_fun(key, htable->capacity); - idx = start_idx; - while (1) { - struct htable_pair *pair = htable->elem + idx; - if (!pair->key) { - // We always maintain the invariant that the entries corresponding - // to a given key are stored in a contiguous block, not separated - // by any NULLs. So if we encounter a NULL, our search is over. - return ENOENT; - } else if (htable->eq_fun(pair->key, key)) { - *out = idx; - return 0; - } - idx++; - if (idx == htable->capacity) { - idx = 0; - } - if (idx == start_idx) { - return ENOENT; - } - } -} - -void *htable_get(const struct htable *htable, const void *key) -{ - uint32_t idx; - - if (htable_get_internal(htable, key, &idx)) { - return NULL; - } - return htable->elem[idx].val; -} - -void htable_pop(struct htable *htable, const void *key, - void **found_key, void **found_val) -{ - uint32_t hole, i; - const void *nkey; - - if (htable_get_internal(htable, key, &hole)) { - *found_key = NULL; - *found_val = NULL; - return; - } - i = hole; - htable->used--; - // We need to maintain the compactness invariant used in - // htable_get_internal. This invariant specifies that the entries for any - // given key are never separated by NULLs (although they may be separated - // by entries for other keys.) - while (1) { - i++; - if (i == htable->capacity) { - i = 0; - } - nkey = htable->elem[i].key; - if (!nkey) { - *found_key = htable->elem[hole].key; - *found_val = htable->elem[hole].val; - htable->elem[hole].key = NULL; - htable->elem[hole].val = NULL; - return; - } else if (htable->eq_fun(key, nkey)) { - htable->elem[hole].key = htable->elem[i].key; - htable->elem[hole].val = htable->elem[i].val; - hole = i; - } - } -} - -uint32_t htable_used(const struct htable *htable) -{ - return htable->used; -} - -uint32_t htable_capacity(const struct htable *htable) -{ - return htable->capacity; -} - -uint32_t ht_hash_string(const void *str, uint32_t max) -{ - const char *s = str; - uint32_t hash = 0; - - while (*s) { - hash = (hash * 31) + *s; - s++; - } - return hash % max; -} - -int ht_compare_string(const void *a, const void *b) -{ - return strcmp(a, b) == 0; -} - -// vim: ts=4:sw=4:tw=79:et diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/common/htable.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/common/htable.h deleted file mode 100644 index 33f1229051582..0000000000000 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/common/htable.h +++ /dev/null @@ -1,161 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef HADOOP_CORE_COMMON_HASH_TABLE -#define HADOOP_CORE_COMMON_HASH_TABLE - -#include -#include -#include - -#define HTABLE_MIN_SIZE 4 - -struct htable; - -/** - * An HTable hash function. - * - * @param key The key. - * @param capacity The total capacity. - * - * @return The hash slot. Must be less than the capacity. - */ -typedef uint32_t (*htable_hash_fn_t)(const void *key, uint32_t capacity); - -/** - * An HTable equality function. Compares two keys. - * - * @param a First key. - * @param b Second key. - * - * @return nonzero if the keys are equal. - */ -typedef int (*htable_eq_fn_t)(const void *a, const void *b); - -/** - * Allocate a new hash table. - * - * @param capacity The minimum suggested starting capacity. - * @param hash_fun The hash function to use in this hash table. - * @param eq_fun The equals function to use in this hash table. - * - * @return The new hash table on success; NULL on OOM. - */ -struct htable *htable_alloc(uint32_t capacity, htable_hash_fn_t hash_fun, - htable_eq_fn_t eq_fun); - -typedef void (*visitor_fn_t)(void *ctx, void *key, void *val); - -/** - * Visit all of the entries in the hash table. - * - * @param htable The hash table. - * @param fun The callback function to invoke on each key and value. - * @param ctx Context pointer to pass to the callback. - */ -void htable_visit(struct htable *htable, visitor_fn_t fun, void *ctx); - -/** - * Free the hash table. - * - * It is up the calling code to ensure that the keys and values inside the - * table are de-allocated, if that is necessary. - * - * @param htable The hash table. - */ -void htable_free(struct htable *htable); - -/** - * Add an entry to the hash table. - * - * @param htable The hash table. - * @param key The key to add. This cannot be NULL. - * @param fun The value to add. This cannot be NULL. - * - * @return 0 on success; - * EEXIST if the value already exists in the table; - * ENOMEM if there is not enough memory to add the element. - * EFBIG if the hash table has too many entries to fit in 32 - * bits. - */ -int htable_put(struct htable *htable, void *key, void *val); - -/** - * Get an entry from the hash table. - * - * @param htable The hash table. - * @param key The key to find. - * - * @return NULL if there is no such entry; the entry otherwise. - */ -void *htable_get(const struct htable *htable, const void *key); - -/** - * Get an entry from the hash table and remove it. - * - * @param htable The hash table. - * @param key The key for the entry find and remove. - * @param found_key (out param) NULL if the entry was not found; the found key - * otherwise. - * @param found_val (out param) NULL if the entry was not found; the found - * value otherwise. - */ -void htable_pop(struct htable *htable, const void *key, - void **found_key, void **found_val); - -/** - * Get the number of entries used in the hash table. - * - * @param htable The hash table. - * - * @return The number of entries used in the hash table. - */ -uint32_t htable_used(const struct htable *htable); - -/** - * Get the capacity of the hash table. - * - * @param htable The hash table. 
- * - * @return The capacity of the hash table. - */ -uint32_t htable_capacity(const struct htable *htable); - -/** - * Hash a string. - * - * @param str The string. - * @param max Maximum hash value - * - * @return A number less than max. - */ -uint32_t ht_hash_string(const void *str, uint32_t max); - -/** - * Compare two strings. - * - * @param a The first string. - * @param b The second string. - * - * @return 1 if the strings are identical; 0 otherwise. - */ -int ht_compare_string(const void *a, const void *b); - -#endif - -// vim: ts=4:sw=4:tw=79:et diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/exception.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/exception.c index bcbb851534d88..fec9a103b4e23 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/exception.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/exception.c @@ -18,6 +18,7 @@ #include "exception.h" #include "hdfs/hdfs.h" +#include "jclasses.h" #include "jni_helper.h" #include "platform.h" @@ -129,9 +130,8 @@ static char* getExceptionUtilString(JNIEnv *env, jthrowable exc, char *methodNam jvalue jVal; jstring jStr = NULL; char *excString = NULL; - jthr = invokeMethod(env, &jVal, STATIC, NULL, - "org/apache/commons/lang3/exception/ExceptionUtils", - methodName, "(Ljava/lang/Throwable;)Ljava/lang/String;", exc); + jthr = invokeMethod(env, &jVal, STATIC, NULL, JC_EXCEPTION_UTILS, + methodName, "(Ljava/lang/Throwable;)Ljava/lang/String;", exc); if (jthr) { destroyLocalReference(env, jthr); return NULL; diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c index c25d354be0f42..03924927793cd 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c @@ -351,7 +351,7 @@ typedef struct /** * Helper function to create a org.apache.hadoop.fs.Path object. - * @param env: The JNIEnv pointer. + * @param env: The JNIEnv pointer. * @param path: The file-path for which to construct org.apache.hadoop.fs.Path * object. * @return Returns a jobject on success and NULL on error. @@ -520,7 +520,7 @@ int hdfsBuilderConfSetStr(struct hdfsBuilder *bld, const char *key, const char *val) { struct hdfsBuilderConfOpt *opt, *next; - + opt = calloc(1, sizeof(struct hdfsBuilderConfOpt)); if (!opt) return -ENOMEM; @@ -721,7 +721,7 @@ hdfsFS hdfsBuilderConnect(struct hdfsBuilder *bld) goto done; } } - + //Check what type of FileSystem the caller wants... if (bld->nn == NULL) { // Get a local filesystem. 
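The hdfs.c hunks around this point show the two invocation styles the patch leaves behind: `invokeMethod()` now takes a `CachedJavaClass` constant for classes pre-loaded into the jclasses cache, while `findClassAndInvokeMethod()` keeps the old resolve-by-name behavior for classes that are not cached (as in native_mini_dfs.c earlier in this patch). As an illustrative sketch only, not part of the change itself, and with a method chosen purely for demonstration:

    #include "jclasses.h"
    #include "jni_helper.h"

    /* Illustrative only: returns InputStream#available() on a wrapped
     * FSDataInputStream object, or -1 if the call raised an exception. */
    static int availableBytes(JNIEnv *env, jobject jInputStream)
    {
        jvalue jVal;
        jthrowable jthr;

        /* JC_FS_DATA_INPUT_STREAM indexes the jclass cache that is filled
         * once after the JVM is created, so no per-call class lookup or
         * global-reference creation is needed. */
        jthr = invokeMethod(env, &jVal, INSTANCE, jInputStream,
                JC_FS_DATA_INPUT_STREAM, "available", "()I");
        if (jthr) {
            destroyLocalReference(env, jthr);
            return -1;
        }
        return jVal.i;
    }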
@@ -1109,13 +1109,15 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, errno = ENOTSUP; return NULL; } else { - fprintf(stderr, "ERROR: cannot open an hdfs file in mode 0x%x\n", accmode); + fprintf(stderr, "ERROR: cannot open an hdfs file in mode 0x%x\n", + accmode); errno = EINVAL; return NULL; } if ((flags & O_CREAT) && (flags & O_EXCL)) { - fprintf(stderr, "WARN: hdfs does not truly support O_CREATE && O_EXCL\n"); + fprintf(stderr, + "WARN: hdfs does not truly support O_CREATE && O_EXCL\n"); } if (accmode == O_RDONLY) { @@ -1147,7 +1149,7 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, } jConfiguration = jVal.l; - jStrBufferSize = (*env)->NewStringUTF(env, "io.file.buffer.size"); + jStrBufferSize = (*env)->NewStringUTF(env, "io.file.buffer.size"); if (!jStrBufferSize) { ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, "OOM"); goto done; @@ -1186,7 +1188,7 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, jReplication = (jshort)jVal.i; } } - + /* Create and return either the FSDataInputStream or FSDataOutputStream references jobject jStream */ @@ -1230,7 +1232,7 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, file->file = (*env)->NewGlobalRef(env, jFile); if (!file->file) { ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, - "hdfsOpenFile(%s): NewGlobalRef", path); + "hdfsOpenFile(%s): NewGlobalRef", path); goto done; } file->type = (((flags & O_WRONLY) == 0) ? HDFS_STREAM_INPUT : @@ -1350,7 +1352,7 @@ int hdfsCloseFile(hdfsFS fs, hdfsFile file) { int ret; // JAVA EQUIVALENT: - // file.close + // file.close //The interface whose 'close' method to be called CachedJavaClass cachedJavaClass; @@ -1377,11 +1379,11 @@ int hdfsCloseFile(hdfsFS fs, hdfsFile file) } else { cachedJavaClass = JC_FS_DATA_OUTPUT_STREAM; } - + jthr = invokeMethod(env, NULL, INSTANCE, file->file, cachedJavaClass, "close", "()V"); if (jthr) { - interfaceShortName = (file->type == HDFS_STREAM_INPUT) ? + interfaceShortName = (file->type == HDFS_STREAM_INPUT) ? 
"FSDataInputStream" : "FSDataOutputStream"; ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "%s#close", interfaceShortName); @@ -1412,7 +1414,7 @@ int hdfsExists(hdfsFS fs, const char *path) errno = EINTERNAL; return -1; } - + if (path == NULL) { errno = EINVAL; return -1; @@ -1866,7 +1868,7 @@ tSize hdfsWrite(hdfsFS fs, hdfsFile f, const void* buffer, tSize length) } jOutputStream = f->file; - + if (length < 0) { errno = EINVAL; return -1; @@ -1914,7 +1916,7 @@ tSize hdfsWrite(hdfsFS fs, hdfsFile f, const void* buffer, tSize length) return length; } -int hdfsSeek(hdfsFS fs, hdfsFile f, tOffset desiredPos) +int hdfsSeek(hdfsFS fs, hdfsFile f, tOffset desiredPos) { // JAVA EQUIVALENT // fis.seek(pos); @@ -1989,7 +1991,7 @@ tOffset hdfsTell(hdfsFS fs, hdfsFile f) return jVal.j; } -int hdfsFlush(hdfsFS fs, hdfsFile f) +int hdfsFlush(hdfsFS fs, hdfsFile f) { // JAVA EQUIVALENT // fos.flush(); @@ -2160,7 +2162,7 @@ static int hdfsCopyImpl(hdfsFS srcFS, const char *src, hdfsFS dstFS, "(Lorg/apache/hadoop/fs/FileSystem;Lorg/apache/hadoop/fs/Path;" "Lorg/apache/hadoop/fs/FileSystem;Lorg/apache/hadoop/fs/Path;" "ZLorg/apache/hadoop/conf/Configuration;)Z", - jSrcFS, jSrcPath, jDstFS, jDstPath, deleteSource, + jSrcFS, jSrcPath, jDstFS, jDstPath, deleteSource, jConfiguration); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, @@ -2178,7 +2180,7 @@ static int hdfsCopyImpl(hdfsFS srcFS, const char *src, hdfsFS dstFS, destroyLocalReference(env, jConfiguration); destroyLocalReference(env, jSrcPath); destroyLocalReference(env, jDstPath); - + if (ret) { errno = ret; return -1; @@ -2302,7 +2304,7 @@ int hdfsRename(hdfsFS fs, const char *oldPath, const char *newPath) char* hdfsGetWorkingDirectory(hdfsFS fs, char* buffer, size_t bufferSize) { // JAVA EQUIVALENT: - // Path p = fs.getWorkingDirectory(); + // Path p = fs.getWorkingDirectory(); // return p.toString() jobject jPath = NULL; @@ -2379,7 +2381,7 @@ char* hdfsGetWorkingDirectory(hdfsFS fs, char* buffer, size_t bufferSize) int hdfsSetWorkingDirectory(hdfsFS fs, const char *path) { // JAVA EQUIVALENT: - // fs.setWorkingDirectory(Path(path)); + // fs.setWorkingDirectory(Path(path)); jobject jFS = (jobject)fs; jthrowable jthr; @@ -2542,7 +2544,7 @@ int hdfsChown(hdfsFS fs, const char *path, const char *owner, const char *group) goto done; } - jthr = newJavaStr(env, owner, &jOwner); + jthr = newJavaStr(env, owner, &jOwner); if (jthr) { ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "hdfsChown(path=%s): newJavaStr(%s)", path, owner); @@ -3077,7 +3079,7 @@ void hadoopRzBufferFree(hdfsFile file, struct hadoopRzBuffer *buffer) jvalue jVal; jthrowable jthr; JNIEnv* env; - + env = getJNIEnv(); if (env == NULL) { errno = EINTERNAL; @@ -3186,7 +3188,7 @@ hdfsGetHosts(hdfsFS fs, const char *path, tOffset start, tOffset length) "GetObjectArrayElement(%d)", path, start, length, i); goto done; } - + jthr = invokeMethod(env, &jVal, INSTANCE, jFileBlock, JC_BLOCK_LOCATION, "getHosts", "()[Ljava/lang/String;"); @@ -3378,7 +3380,7 @@ tOffset hdfsGetCapacity(hdfsFS fs) } - + tOffset hdfsGetUsed(hdfsFS fs) { // JAVA EQUIVALENT: @@ -3416,7 +3418,7 @@ tOffset hdfsGetUsed(hdfsFS fs) } return jVal.j; } - + /** * We cannot add new fields to the hdfsFileInfo structure because it would break * binary compatibility. 
The reason is because we return an array @@ -3595,6 +3597,7 @@ getFileInfoFromStat(JNIEnv *env, jobject jStat, hdfsFileInfo *fileInfo) destroyLocalReference(env, jUserName); destroyLocalReference(env, jGroupName); destroyLocalReference(env, jPermission); + destroyLocalReference(env, jPath); return jthr; } @@ -3633,7 +3636,7 @@ getFileInfo(JNIEnv *env, jobject jFS, jobject jPath, hdfsFileInfo **fileInfo) destroyLocalReference(env, jStat); return newRuntimeError(env, "getFileInfo: OOM allocating hdfsFileInfo"); } - jthr = getFileInfoFromStat(env, jStat, *fileInfo); + jthr = getFileInfoFromStat(env, jStat, *fileInfo); destroyLocalReference(env, jStat); return jthr; } @@ -3645,13 +3648,13 @@ hdfsFileInfo* hdfsListDirectory(hdfsFS fs, const char *path, int *numEntries) // JAVA EQUIVALENT: // Path p(path); // Path []pathList = fs.listPaths(p) - // foreach path in pathList + // foreach path in pathList // getFileInfo(path) jobject jFS = (jobject)fs; jthrowable jthr; jobject jPath = NULL; - hdfsFileInfo *pathList = NULL; + hdfsFileInfo *pathList = NULL; jobjectArray jPathList = NULL; jvalue jVal; jsize jPathListSize = 0; @@ -3821,4 +3824,4 @@ char* hdfsGetLastExceptionStackTrace() /** * vim: ts=4: sw=4: et: - */ + */ \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.c new file mode 100644 index 0000000000000..cf880e91b7596 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.c @@ -0,0 +1,136 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "exception.h" +#include "jclasses.h" +#include "jni_helper.h" +#include "os/mutexes.h" + +#include + +/** + * Whether initCachedClasses has been called or not. Protected by the mutex + * jclassInitMutex. + */ +static int jclassesInitialized = 0; + +typedef struct { + jclass javaClass; + const char *className; +} javaClassAndName; + +/** + * A collection of commonly used jclass objects that are used throughout + * libhdfs. The jclasses are loaded immediately after the JVM is created (see + * initCachedClasses). The array is indexed using CachedJavaClass. + */ +javaClassAndName cachedJavaClasses[NUM_CACHED_CLASSES]; + +/** + * Helper method that creates and sets a jclass object given a class name. + * Returns a jthrowable on error, NULL otherwise. 
+ */ +static jthrowable initCachedClass(JNIEnv *env, const char *className, + jclass *cachedJclass) { + assert(className != NULL && "Found a CachedJavaClass without a class " + "name"); + jthrowable jthr = NULL; + jclass tempLocalClassRef; + tempLocalClassRef = (*env)->FindClass(env, className); + if (!tempLocalClassRef) { + jthr = getPendingExceptionAndClear(env); + goto done; + } + *cachedJclass = (jclass) (*env)->NewGlobalRef(env, tempLocalClassRef); + if (!*cachedJclass) { + jthr = getPendingExceptionAndClear(env); + goto done; + } +done: + destroyLocalReference(env, tempLocalClassRef); + return jthr; +} + +jthrowable initCachedClasses(JNIEnv* env) { + mutexLock(&jclassInitMutex); + if (!jclassesInitialized) { + // Set all the class names + cachedJavaClasses[JC_CONFIGURATION].className = + "org/apache/hadoop/conf/Configuration"; + cachedJavaClasses[JC_PATH].className = + "org/apache/hadoop/fs/Path"; + cachedJavaClasses[JC_FILE_SYSTEM].className = + "org/apache/hadoop/fs/FileSystem"; + cachedJavaClasses[JC_FS_STATUS].className = + "org/apache/hadoop/fs/FsStatus"; + cachedJavaClasses[JC_FILE_UTIL].className = + "org/apache/hadoop/fs/FileUtil"; + cachedJavaClasses[JC_BLOCK_LOCATION].className = + "org/apache/hadoop/fs/BlockLocation"; + cachedJavaClasses[JC_DFS_HEDGED_READ_METRICS].className = + "org/apache/hadoop/hdfs/DFSHedgedReadMetrics"; + cachedJavaClasses[JC_DISTRIBUTED_FILE_SYSTEM].className = + "org/apache/hadoop/hdfs/DistributedFileSystem"; + cachedJavaClasses[JC_FS_DATA_INPUT_STREAM].className = + "org/apache/hadoop/fs/FSDataInputStream"; + cachedJavaClasses[JC_FS_DATA_OUTPUT_STREAM].className = + "org/apache/hadoop/fs/FSDataOutputStream"; + cachedJavaClasses[JC_FILE_STATUS].className = + "org/apache/hadoop/fs/FileStatus"; + cachedJavaClasses[JC_FS_PERMISSION].className = + "org/apache/hadoop/fs/permission/FsPermission"; + cachedJavaClasses[JC_READ_STATISTICS].className = + "org/apache/hadoop/hdfs/ReadStatistics"; + cachedJavaClasses[JC_HDFS_DATA_INPUT_STREAM].className = + "org/apache/hadoop/hdfs/client/HdfsDataInputStream"; + cachedJavaClasses[JC_DOMAIN_SOCKET].className = + "org/apache/hadoop/net/unix/DomainSocket"; + cachedJavaClasses[JC_URI].className = + "java/net/URI"; + cachedJavaClasses[JC_BYTE_BUFFER].className = + "java/nio/ByteBuffer"; + cachedJavaClasses[JC_ENUM_SET].className = + "java/util/EnumSet"; + cachedJavaClasses[JC_EXCEPTION_UTILS].className = + "org/apache/commons/lang3/exception/ExceptionUtils"; + + // Create and set the jclass objects based on the class names set above + jthrowable jthr; + int numCachedClasses = + sizeof(cachedJavaClasses) / sizeof(javaClassAndName); + for (int i = 0; i < numCachedClasses; i++) { + jthr = initCachedClass(env, cachedJavaClasses[i].className, + &cachedJavaClasses[i].javaClass); + if (jthr) { + mutexUnlock(&jclassInitMutex); + return jthr; + } + } + jclassesInitialized = 1; + } + mutexUnlock(&jclassInitMutex); + return NULL; +} + +jclass getJclass(CachedJavaClass cachedJavaClass) { + return cachedJavaClasses[cachedJavaClass].javaClass; +} + +const char *getClassName(CachedJavaClass cachedJavaClass) { + return cachedJavaClasses[cachedJavaClass].className; +} diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.h new file mode 100644 index 0000000000000..92cdd542e2371 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.h @@ -0,0 +1,112 @@ +/** + * Licensed to 
the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBHDFS_JCLASSES_H +#define LIBHDFS_JCLASSES_H + +#include + +/** + * Encapsulates logic to cache jclass objects so they can re-used across + * calls to FindClass. Creating jclass objects every time libhdfs has to + * invoke a method can hurt performance. By cacheing jclass objects we avoid + * this overhead. + * + * We use the term "cached" here loosely; jclasses are not truly cached, + * instead they are created once during JVM load and are kept alive until the + * process shutdowns. There is no eviction of jclass objects. + * + * @see https://www.ibm.com/developerworks/library/j-jni/index.html#notc + */ + +/** + * Each enum value represents one jclass that is cached. Enum values should + * be passed to getJclass or getName to get the jclass object or class name + * represented by the enum value. + */ +typedef enum { + JC_CONFIGURATION, + JC_PATH, + JC_FILE_SYSTEM, + JC_FS_STATUS, + JC_FILE_UTIL, + JC_BLOCK_LOCATION, + JC_DFS_HEDGED_READ_METRICS, + JC_DISTRIBUTED_FILE_SYSTEM, + JC_FS_DATA_INPUT_STREAM, + JC_FS_DATA_OUTPUT_STREAM, + JC_FILE_STATUS, + JC_FS_PERMISSION, + JC_READ_STATISTICS, + JC_HDFS_DATA_INPUT_STREAM, + JC_DOMAIN_SOCKET, + JC_URI, + JC_BYTE_BUFFER, + JC_ENUM_SET, + JC_EXCEPTION_UTILS, + // A special marker enum that counts the number of cached jclasses + NUM_CACHED_CLASSES +} CachedJavaClass; + +/** + * Internally initializes all jclass objects listed in the CachedJavaClass + * enum. This method is idempotent and thread-safe. 
+ */ +jthrowable initCachedClasses(JNIEnv* env); + +/** + * Return the jclass object represented by the given CachedJavaClass + */ +jclass getJclass(CachedJavaClass cachedJavaClass); + +/** + * Return the class name represented by the given CachedJavaClass + */ +const char *getClassName(CachedJavaClass cachedJavaClass); + +/* Some frequently used HDFS class names */ +#define HADOOP_CONF "org/apache/hadoop/conf/Configuration" +#define HADOOP_PATH "org/apache/hadoop/fs/Path" +#define HADOOP_LOCALFS "org/apache/hadoop/fs/LocalFileSystem" +#define HADOOP_FS "org/apache/hadoop/fs/FileSystem" +#define HADOOP_FSSTATUS "org/apache/hadoop/fs/FsStatus" +#define HADOOP_FILEUTIL "org/apache/hadoop/fs/FileUtil" +#define HADOOP_BLK_LOC "org/apache/hadoop/fs/BlockLocation" +#define HADOOP_DFS_HRM "org/apache/hadoop/hdfs/DFSHedgedReadMetrics" +#define HADOOP_DFS "org/apache/hadoop/hdfs/DistributedFileSystem" +#define HADOOP_FSDISTRM "org/apache/hadoop/fs/FSDataInputStream" +#define HADOOP_FSDOSTRM "org/apache/hadoop/fs/FSDataOutputStream" +#define HADOOP_FILESTAT "org/apache/hadoop/fs/FileStatus" +#define HADOOP_FSPERM "org/apache/hadoop/fs/permission/FsPermission" +#define HADOOP_RSTAT "org/apache/hadoop/hdfs/ReadStatistics" +#define HADOOP_HDISTRM "org/apache/hadoop/hdfs/client/HdfsDataInputStream" +#define HADOOP_RO "org/apache/hadoop/fs/ReadOption" +#define HADOOP_DS "org/apache/hadoop/net/unix/DomainSocket" + +/* Some frequently used Java class names */ +#define JAVA_NET_ISA "java/net/InetSocketAddress" +#define JAVA_NET_URI "java/net/URI" +#define JAVA_BYTEBUFFER "java/nio/ByteBuffer" +#define JAVA_STRING "java/lang/String" +#define JAVA_ENUMSET "java/util/EnumSet" + +/* Some frequently used third-party class names */ + +#define EXCEPTION_UTILS "org/apache/commons/lang3/exception/ExceptionUtils" + +#endif /*LIBHDFS_JCLASSES_H*/ diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jni_helper.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jni_helper.c index 91a3c1cafc8f4..ccc1e3f6b8f0b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jni_helper.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jni_helper.c @@ -18,9 +18,9 @@ #include "config.h" #include "exception.h" +#include "jclasses.h" #include "jni_helper.h" #include "platform.h" -#include "common/htable.h" #include "os/mutexes.h" #include "os/thread_local_storage.h" @@ -29,8 +29,6 @@ #include #include -static struct htable *gClassRefHTable = NULL; - /** The Native return types that methods could return */ #define JVOID 'V' #define JOBJECT 'L' @@ -44,13 +42,6 @@ static struct htable *gClassRefHTable = NULL; #define JFLOAT 'F' #define JDOUBLE 'D' - -/** - * MAX_HASH_TABLE_ELEM: The maximum no. of entries in the hashtable. - * It's set to 4096 to account for (classNames + No. of threads) - */ -#define MAX_HASH_TABLE_ELEM 4096 - /** * Length of buffer for retrieving created JVMs. (We only ever create one.) */ @@ -108,32 +99,27 @@ jthrowable newCStr(JNIEnv *env, jstring jstr, char **out) return NULL; } -jthrowable invokeMethod(JNIEnv *env, jvalue *retval, MethType methType, - jobject instObj, const char *className, - const char *methName, const char *methSignature, ...) +/** + * Does the work to actually execute a Java method. Takes in an existing jclass + * object and a va_list of arguments for the Java method to be invoked. 
+ */ +static jthrowable invokeMethodOnJclass(JNIEnv *env, jvalue *retval, + MethType methType, jobject instObj, jclass cls, const char *className, + const char *methName, const char *methSignature, va_list args) { - va_list args; - jclass cls; jmethodID mid; jthrowable jthr; - const char *str; + const char *str; char returnType; - - jthr = validateMethodType(env, methType); - if (jthr) - return jthr; - jthr = globalClassReference(className, env, &cls); - if (jthr) - return jthr; - jthr = methodIdFromClass(className, methName, methSignature, - methType, env, &mid); + + jthr = methodIdFromClass(cls, className, methName, methSignature, methType, + env, &mid); if (jthr) return jthr; str = methSignature; while (*str != ')') str++; str++; returnType = *str; - va_start(args, methSignature); if (returnType == JOBJECT || returnType == JARRAYOBJECT) { jobject jobj = NULL; if (methType == STATIC) { @@ -192,7 +178,6 @@ jthrowable invokeMethod(JNIEnv *env, jvalue *retval, MethType methType, } retval->i = ji; } - va_end(args); jthr = (*env)->ExceptionOccurred(env); if (jthr) { @@ -202,43 +187,115 @@ jthrowable invokeMethod(JNIEnv *env, jvalue *retval, MethType methType, return NULL; } -jthrowable constructNewObjectOfClass(JNIEnv *env, jobject *out, const char *className, - const char *ctorSignature, ...) +jthrowable findClassAndInvokeMethod(JNIEnv *env, jvalue *retval, + MethType methType, jobject instObj, const char *className, + const char *methName, const char *methSignature, ...) { + jclass cls = NULL; + jthrowable jthr = NULL; + va_list args; - jclass cls; - jmethodID mid; + va_start(args, methSignature); + + jthr = validateMethodType(env, methType); + if (jthr) { + goto done; + } + + cls = (*env)->FindClass(env, className); + if (!cls) { + jthr = getPendingExceptionAndClear(env); + goto done; + } + + jthr = invokeMethodOnJclass(env, retval, methType, instObj, cls, + className, methName, methSignature, args); + +done: + va_end(args); + destroyLocalReference(env, cls); + return jthr; +} + +jthrowable invokeMethod(JNIEnv *env, jvalue *retval, MethType methType, + jobject instObj, CachedJavaClass class, + const char *methName, const char *methSignature, ...) +{ + jthrowable jthr; + + va_list args; + va_start(args, methSignature); + + jthr = invokeMethodOnJclass(env, retval, methType, instObj, + getJclass(class), getClassName(class), methName, methSignature, + args); + + va_end(args); + return jthr; +} + +static jthrowable constructNewObjectOfJclass(JNIEnv *env, + jobject *out, jclass cls, const char *className, + const char *ctorSignature, va_list args) { + jmethodID mid; jobject jobj; jthrowable jthr; - jthr = globalClassReference(className, env, &cls); + jthr = methodIdFromClass(cls, className, "", ctorSignature, INSTANCE, + env, &mid); if (jthr) return jthr; - jthr = methodIdFromClass(className, "", ctorSignature, - INSTANCE, env, &mid); - if (jthr) - return jthr; - va_start(args, ctorSignature); jobj = (*env)->NewObjectV(env, cls, mid, args); - va_end(args); if (!jobj) return getPendingExceptionAndClear(env); *out = jobj; return NULL; } - -jthrowable methodIdFromClass(const char *className, const char *methName, - const char *methSignature, MethType methType, - JNIEnv *env, jmethodID *out) +jthrowable constructNewObjectOfClass(JNIEnv *env, jobject *out, + const char *className, const char *ctorSignature, ...) 
{ + va_list args; jclass cls; + jthrowable jthr = NULL; + + cls = (*env)->FindClass(env, className); + if (!cls) { + jthr = getPendingExceptionAndClear(env); + goto done; + } + + va_start(args, ctorSignature); + jthr = constructNewObjectOfJclass(env, out, cls, className, + ctorSignature, args); + va_end(args); +done: + destroyLocalReference(env, cls); + return jthr; +} + +jthrowable constructNewObjectOfCachedClass(JNIEnv *env, jobject *out, + CachedJavaClass cachedJavaClass, const char *ctorSignature, ...) +{ + jthrowable jthr = NULL; + va_list args; + va_start(args, ctorSignature); + + jthr = constructNewObjectOfJclass(env, out, + getJclass(cachedJavaClass), getClassName(cachedJavaClass), + ctorSignature, args); + + va_end(args); + return jthr; +} + +jthrowable methodIdFromClass(jclass cls, const char *className, + const char *methName, const char *methSignature, MethType methType, + JNIEnv *env, jmethodID *out) +{ jthrowable jthr; jmethodID mid = 0; - jthr = globalClassReference(className, env, &cls); - if (jthr) - return jthr; jthr = validateMethodType(env, methType); if (jthr) return jthr; @@ -257,54 +314,6 @@ jthrowable methodIdFromClass(const char *className, const char *methName, return NULL; } -jthrowable globalClassReference(const char *className, JNIEnv *env, jclass *out) -{ - jthrowable jthr = NULL; - jclass local_clazz = NULL; - jclass clazz = NULL; - int ret; - - mutexLock(&hdfsHashMutex); - if (!gClassRefHTable) { - gClassRefHTable = htable_alloc(MAX_HASH_TABLE_ELEM, ht_hash_string, - ht_compare_string); - if (!gClassRefHTable) { - jthr = newRuntimeError(env, "htable_alloc failed\n"); - goto done; - } - } - clazz = htable_get(gClassRefHTable, className); - if (clazz) { - *out = clazz; - goto done; - } - local_clazz = (*env)->FindClass(env,className); - if (!local_clazz) { - jthr = getPendingExceptionAndClear(env); - goto done; - } - clazz = (*env)->NewGlobalRef(env, local_clazz); - if (!clazz) { - jthr = getPendingExceptionAndClear(env); - goto done; - } - ret = htable_put(gClassRefHTable, (void*)className, clazz); - if (ret) { - jthr = newRuntimeError(env, "htable_put failed with error " - "code %d\n", ret); - goto done; - } - *out = clazz; - jthr = NULL; -done: - mutexUnlock(&hdfsHashMutex); - (*env)->DeleteLocalRef(env, local_clazz); - if (jthr && clazz) { - (*env)->DeleteGlobalRef(env, clazz); - } - return jthr; -} - jthrowable classNameOfObject(jobject jobj, JNIEnv *env, char **name) { jthrowable jthr; @@ -358,7 +367,6 @@ jthrowable classNameOfObject(jobject jobj, JNIEnv *env, char **name) return jthr; } - /** * For the given path, expand it by filling in with all *.jar or *.JAR files, * separated by PATH_SEPARATOR. 
Assumes that expanded is big enough to hold the @@ -731,14 +739,17 @@ static JNIEnv* getGlobalJNIEnv(void) "with error: %d\n", rv); return NULL; } - jthr = invokeMethod(env, NULL, STATIC, NULL, - "org/apache/hadoop/fs/FileSystem", - "loadFileSystems", "()V"); + + // We use findClassAndInvokeMethod here because the jclasses in + // jclasses.h have not loaded yet + jthr = findClassAndInvokeMethod(env, NULL, STATIC, NULL, HADOOP_FS, + "loadFileSystems", "()V"); if (jthr) { - printExceptionAndFree(env, jthr, PRINT_EXC_ALL, "loadFileSystems"); + printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "FileSystem: loadFileSystems failed"); + return NULL; } - } - else { + } else { //Attach this thread to the VM vm = vmBuf[0]; rv = (*vm)->AttachCurrentThread(vm, (void*)&env, 0); @@ -809,6 +820,15 @@ JNIEnv* getJNIEnv(void) state->env = getGlobalJNIEnv(); mutexUnlock(&jvmMutex); + + jthrowable jthr = NULL; + jthr = initCachedClasses(state->env); + if (jthr) { + printExceptionAndFree(state->env, jthr, PRINT_EXC_ALL, + "initCachedClasses failed"); + goto fail; + } + if (!state->env) { goto fail; } @@ -898,8 +918,7 @@ jthrowable hadoopConfSetStr(JNIEnv *env, jobject jConfiguration, if (jthr) goto done; jthr = invokeMethod(env, NULL, INSTANCE, jConfiguration, - "org/apache/hadoop/conf/Configuration", "set", - "(Ljava/lang/String;Ljava/lang/String;)V", + JC_CONFIGURATION, "set", "(Ljava/lang/String;Ljava/lang/String;)V", jkey, jvalue); if (jthr) goto done; diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jni_helper.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jni_helper.h index f0d06d72fc040..41d6fab2a75ae 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jni_helper.h +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jni_helper.h @@ -19,6 +19,8 @@ #ifndef LIBHDFS_JNI_HELPER_H #define LIBHDFS_JNI_HELPER_H +#include "jclasses.h" + #include #include @@ -36,7 +38,6 @@ // #define _LIBHDFS_JNI_HELPER_DEBUGGING_ON_ - /** Denote the method we want to invoke as STATIC or INSTANCE */ typedef enum { STATIC, @@ -74,12 +75,12 @@ jthrowable newJavaStr(JNIEnv *env, const char *str, jstring *out); void destroyLocalReference(JNIEnv *env, jobject jObject); /** invokeMethod: Invoke a Static or Instance method. - * className: Name of the class where the method can be found * methName: Name of the method * methSignature: the signature of the method "(arg-types)ret-type" * methType: The type of the method (STATIC or INSTANCE) * instObj: Required if the methType is INSTANCE. The object to invoke the method on. + * class: The CachedJavaClass to call the method on. * env: The JNIEnv pointer * retval: The pointer to a union type which will contain the result of the method invocation, e.g. if the method returns an Object, retval will be @@ -91,17 +92,33 @@ void destroyLocalReference(JNIEnv *env, jobject jObject); a valid exception reference, and the result stored at retval is undefined. */ jthrowable invokeMethod(JNIEnv *env, jvalue *retval, MethType methType, - jobject instObj, const char *className, const char *methName, - const char *methSignature, ...); + jobject instObj, CachedJavaClass class, + const char *methName, const char *methSignature, ...); -jthrowable constructNewObjectOfClass(JNIEnv *env, jobject *out, const char *className, - const char *ctorSignature, ...); +/** + * findClassAndInvokeMethod: Same as invokeMethod, but it calls FindClass on + * the given className first and then calls invokeMethod. 
This method exists + * mainly for test infrastructure, any production code should use + * invokeMethod. Calling FindClass repeatedly can introduce performance + * overhead, so users should prefer invokeMethod and supply a CachedJavaClass. + */ +jthrowable findClassAndInvokeMethod(JNIEnv *env, jvalue *retval, + MethType methType, jobject instObj, const char *className, + const char *methName, const char *methSignature, ...); -jthrowable methodIdFromClass(const char *className, const char *methName, - const char *methSignature, MethType methType, - JNIEnv *env, jmethodID *out); +jthrowable constructNewObjectOfClass(JNIEnv *env, jobject *out, + const char *className, const char *ctorSignature, ...); + +/** + * Same as constructNewObjectOfClass but it takes in a CachedJavaClass + * rather than a className. This avoids an extra call to FindClass. + */ +jthrowable constructNewObjectOfCachedClass(JNIEnv *env, jobject *out, + CachedJavaClass cachedJavaClass, const char *ctorSignature, ...); -jthrowable globalClassReference(const char *className, JNIEnv *env, jclass *out); +jthrowable methodIdFromClass(jclass cls, const char *className, + const char *methName, const char *methSignature, MethType methType, + JNIEnv *env, jmethodID *out); /** classNameOfObject: Get an object's class name. * @param jobj: The object. diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/mutexes.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/mutexes.h index da30bf4974f77..92afabd7c75c6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/mutexes.h +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/mutexes.h @@ -30,12 +30,12 @@ #include "platform.h" -/** Mutex protecting the class reference hash table. */ -extern mutex hdfsHashMutex; - /** Mutex protecting singleton JVM instance. */ extern mutex jvmMutex; +/** Mutex protecting initialization of jclasses in jclasses.h. */ +extern mutex jclassInitMutex; + /** * Locks a mutex. * diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/posix/mutexes.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/posix/mutexes.c index 20dafaa020b99..5c6b429d5ec03 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/posix/mutexes.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/posix/mutexes.c @@ -21,8 +21,8 @@ #include #include -mutex hdfsHashMutex = PTHREAD_MUTEX_INITIALIZER; mutex jvmMutex; +mutex jclassInitMutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutexattr_t jvmMutexAttr; __attribute__((constructor)) static void init() { diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/posix/thread_local_storage.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/posix/thread_local_storage.c index 110c71a855853..a55dc35f2b296 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/posix/thread_local_storage.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/posix/thread_local_storage.c @@ -23,12 +23,20 @@ #include #include +#include "exception.h" +#include "jni_helper.h" + +#define UNKNOWN "UNKNOWN" +#define MAXTHRID 256 + /** Key that allows us to retrieve thread-local storage */ static pthread_key_t gTlsKey; /** nonzero if we succeeded in initializing gTlsKey. 
Protected by the jvmMutex */ static int gTlsKeyInitialized = 0; +static void get_current_thread_id(JNIEnv* env, char* id, int max); + /** * The function that is called whenever a thread with libhdfs thread local data * is destroyed. @@ -41,16 +49,35 @@ void hdfsThreadDestructor(void *v) struct ThreadLocalState *state = (struct ThreadLocalState*)v; JNIEnv *env = state->env;; jint ret; + jthrowable jthr; + char thr_name[MAXTHRID]; /* Detach the current thread from the JVM */ - if ((env != NULL) && (*env != NULL)) { + if (env) { ret = (*env)->GetJavaVM(env, &vm); - if (ret) { + + if (ret != 0) { fprintf(stderr, "hdfsThreadDestructor: GetJavaVM failed with error %d\n", ret); - (*env)->ExceptionDescribe(env); + jthr = (*env)->ExceptionOccurred(env); + if (jthr) { + (*env)->ExceptionDescribe(env); + (*env)->ExceptionClear(env); + } } else { - (*vm)->DetachCurrentThread(vm); + ret = (*vm)->DetachCurrentThread(vm); + + if (ret != JNI_OK) { + jthr = (*env)->ExceptionOccurred(env); + if (jthr) { + (*env)->ExceptionDescribe(env); + (*env)->ExceptionClear(env); + } + get_current_thread_id(env, thr_name, MAXTHRID); + + fprintf(stderr, "hdfsThreadDestructor: Unable to detach thread %s " + "from the JVM. Error code: %d\n", thr_name, ret); + } } } @@ -62,13 +89,73 @@ void hdfsThreadDestructor(void *v) free(state); } +static void get_current_thread_id(JNIEnv* env, char* id, int max) { + jvalue jVal; + jobject thr = NULL; + jstring thr_name = NULL; + jlong thr_id = 0; + jthrowable jthr = NULL; + const char *thr_name_str; + + jthr = findClassAndInvokeMethod(env, &jVal, STATIC, NULL, "java/lang/Thread", + "currentThread", "()Ljava/lang/Thread;"); + if (jthr) { + snprintf(id, max, "%s", UNKNOWN); + printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "get_current_thread_id: Thread#currentThread failed: "); + goto done; + } + thr = jVal.l; + + jthr = findClassAndInvokeMethod(env, &jVal, INSTANCE, thr, + "java/lang/Thread", "getId", "()J"); + if (jthr) { + snprintf(id, max, "%s", UNKNOWN); + printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "get_current_thread_id: Thread#getId failed: "); + goto done; + } + thr_id = jVal.j; + + jthr = findClassAndInvokeMethod(env, &jVal, INSTANCE, thr, + "java/lang/Thread", "toString", "()Ljava/lang/String;"); + if (jthr) { + snprintf(id, max, "%s:%ld", UNKNOWN, thr_id); + printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "get_current_thread_id: Thread#toString failed: "); + goto done; + } + thr_name = jVal.l; + + thr_name_str = (*env)->GetStringUTFChars(env, thr_name, NULL); + if (!thr_name_str) { + printPendingExceptionAndFree(env, PRINT_EXC_ALL, + "get_current_thread_id: GetStringUTFChars failed: "); + snprintf(id, max, "%s:%ld", UNKNOWN, thr_id); + goto done; + } + + // Treating the jlong as a long *should* be safe + snprintf(id, max, "%s:%ld", thr_name_str, thr_id); + + // Release the char* + (*env)->ReleaseStringUTFChars(env, thr_name, thr_name_str); + +done: + destroyLocalReference(env, thr); + destroyLocalReference(env, thr_name); + + // Make sure the id is null terminated in case we overflow the max length + id[max - 1] = '\0'; +} + struct ThreadLocalState* threadLocalStorageCreate() { struct ThreadLocalState *state; state = (struct ThreadLocalState*)malloc(sizeof(struct ThreadLocalState)); if (state == NULL) { fprintf(stderr, - "threadLocalStorageSet: OOM - Unable to allocate thread local state\n"); + "threadLocalStorageCreate: OOM - Unable to allocate thread local state\n"); return NULL; } state->lastExceptionStackTrace = NULL; @@ -103,4 +190,4 @@ int 
threadLocalStorageSet(struct ThreadLocalState *state) hdfsThreadDestructor(state); } return ret; -} +} \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/windows/mutexes.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/windows/mutexes.c index 875f03386a817..ac7f9fd35b8af 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/windows/mutexes.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/os/windows/mutexes.c @@ -20,8 +20,8 @@ #include -mutex hdfsHashMutex; mutex jvmMutex; +mutex jclassInitMutex; /** * Unfortunately, there is no simple static initializer for a critical section. @@ -34,8 +34,8 @@ mutex jvmMutex; * http://msdn.microsoft.com/en-us/library/bb918180.aspx */ static void __cdecl initializeMutexes(void) { - InitializeCriticalSection(&hdfsHashMutex); InitializeCriticalSection(&jvmMutex); + InitializeCriticalSection(&jclassInitMutex); } #pragma section(".CRT$XCU", read) __declspec(allocate(".CRT$XCU")) From a277946063622e44825e90beccfb2992a2074924 Mon Sep 17 00:00:00 2001 From: Akira Ajisaka Date: Fri, 16 Apr 2021 09:56:45 +0530 Subject: [PATCH 06/40] HDFS-15977. Call explicit_bzero only if it is available. (#2914) Reviewed-by: Masatake Iwasaki Reviewed-by: Inigo Goiri (cherry picked from commit f0241ec2161f6eccdb9bdaf1cbcbee55be379217) Conflicts: hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/lib/x-platform/syscall_linux.cc --- .../src/main/native/libhdfspp/CMakeLists.txt | 6 ++++++ .../src/main/native/libhdfspp/lib/bindings/c/hdfs.cc | 4 ++++ .../src/main/native/libhdfspp/tests/hdfs_ext_test.cc | 4 ++++ .../src/main/native/libhdfspp/tests/hdfspp_mini_dfs.h | 4 ++++ 4 files changed, 18 insertions(+) diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/CMakeLists.txt b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/CMakeLists.txt index 2da5b6bbe52e3..f64eec10a8b98 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/CMakeLists.txt +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/CMakeLists.txt @@ -48,6 +48,7 @@ find_package(GSasl) find_package(Threads) include(CheckCXXSourceCompiles) +include(CheckSymbolExists) # Check if thread_local is supported unset (THREAD_LOCAL_SUPPORTED CACHE) @@ -141,6 +142,11 @@ else (NOT NO_SASL) message(STATUS "Compiling with NO SASL SUPPORT") endif (NOT NO_SASL) +check_symbol_exists(explicit_bzero "string.h" HAVE_EXPLICIT_BZERO) +if(HAVE_EXPLICIT_BZERO) + add_definitions(-DHAVE_EXPLICIT_BZERO) +endif() + add_definitions(-DASIO_STANDALONE -DASIO_CPP11_DATE_TIME) # Disable optimizations if compiling debug diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/lib/bindings/c/hdfs.cc b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/lib/bindings/c/hdfs.cc index 6b2468fd5dbdc..549da93c2aa89 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/lib/bindings/c/hdfs.cc +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/lib/bindings/c/hdfs.cc @@ -1402,7 +1402,11 @@ int hdfsGetBlockLocations(hdfsFS fs, const char *path, struct hdfsBlockLocations hdfsBlockLocations *locations = new struct hdfsBlockLocations(); (*locations_out) = locations; +#ifdef HAVE_EXPLICIT_BZERO + explicit_bzero(locations, sizeof(*locations)); +#else bzero(locations, sizeof(*locations)); +#endif 
locations->fileLength = ppLocations->getFileLength(); locations->isLastBlockComplete = ppLocations->isLastBlockComplete(); locations->isUnderConstruction = ppLocations->isUnderConstruction(); diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_ext_test.cc b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_ext_test.cc index f364d0e15a978..29255ef282882 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_ext_test.cc +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_ext_test.cc @@ -475,7 +475,11 @@ TEST_F(HdfsExtTest, TestReadStats) { hdfsFile file = hdfsOpenFile(fs, path.c_str(), O_WRONLY, 0, 0, 0); EXPECT_NE(nullptr, file); void * buf = malloc(size); +#ifdef HAVE_EXPLICIT_BZERO + explicit_bzero(buf, size); +#else bzero(buf, size); +#endif EXPECT_EQ(size, hdfsWrite(fs, file, buf, size)); free(buf); EXPECT_EQ(0, hdfsCloseFile(fs, file)); diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfspp_mini_dfs.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfspp_mini_dfs.h index aecced1a8b6e5..320a958b10c0b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfspp_mini_dfs.h +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfspp_mini_dfs.h @@ -92,7 +92,11 @@ class HdfsHandle { hdfsFile file = hdfsOpenFile(*this, path.c_str(), O_WRONLY, 0, 0, 0); EXPECT_NE(nullptr, file); void * buf = malloc(size); +#ifdef HAVE_EXPLICIT_BZERO + explicit_bzero(buf, size); +#else bzero(buf, size); +#endif EXPECT_EQ(1024, hdfsWrite(*this, file, buf, size)); EXPECT_EQ(0, hdfsCloseFile(*this, file)); free(buf); From b1a36dcbe09007bbb9b6d379d1f930753cc04ddc Mon Sep 17 00:00:00 2001 From: Deepak Damri Date: Tue, 29 Oct 2024 12:11:44 +0530 Subject: [PATCH 07/40] ODP-2663: Comment override in CryptoInputStream.java for readFully method --- .../main/java/org/apache/hadoop/crypto/CryptoInputStream.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java index 67997b1a9066a..a2f6a5cc211c1 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java @@ -346,7 +346,7 @@ public int read(long position, byte[] buffer, int offset, int length) /** * Positioned readFully using {@link ByteBuffer}s. This method is thread-safe. 
*/ - @Override + // @Override public void readFully(long position, final ByteBuffer buf) throws IOException { checkStream(); From 2a6fd6cecef6b439b3f7811af95e5e66c8a47b06 Mon Sep 17 00:00:00 2001 From: Deepak Damri Date: Mon, 2 Dec 2024 17:05:36 +0530 Subject: [PATCH 08/40] HDFS-14846: libhdfs tests are failing on trunk due to jni usage bugs --- .../src/org/apache/hadoop/security/JniBasedUnixGroupsMapping.c | 3 --- .../src/main/native/libhdfs-tests/native_mini_dfs.c | 2 +- .../hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/security/JniBasedUnixGroupsMapping.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/security/JniBasedUnixGroupsMapping.c index 402ffd5bb20a6..b463679fcdb6f 100644 --- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/security/JniBasedUnixGroupsMapping.c +++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/security/JniBasedUnixGroupsMapping.c @@ -199,8 +199,5 @@ Java_org_apache_hadoop_security_JniBasedUnixGroupsMapping_getGroupsForUser if (ginfo) { hadoop_group_info_free(ginfo); } - if (jgroupname) { - (*env)->DeleteLocalRef(env, jgroupname); - } return jgroups; } diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/native_mini_dfs.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/native_mini_dfs.c index 3af56f1e4f96e..a69c6efe0c763 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/native_mini_dfs.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/native_mini_dfs.c @@ -184,8 +184,8 @@ struct NativeMiniDfsCluster* nmdCreate(struct NativeMiniDfsConf *conf) "Builder::numDataNodes"); goto error; } + (*env)->DeleteLocalRef(env, val.l); } - (*env)->DeleteLocalRef(env, val.l); jthr = findClassAndInvokeMethod(env, &val, INSTANCE, bld, MINIDFS_CLUSTER_BUILDER, "build", "()L" MINIDFS_CLUSTER ";"); if (jthr) { diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c index 03924927793cd..5eb84ed2ec7d6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c @@ -3597,7 +3597,6 @@ getFileInfoFromStat(JNIEnv *env, jobject jStat, hdfsFileInfo *fileInfo) destroyLocalReference(env, jUserName); destroyLocalReference(env, jGroupName); destroyLocalReference(env, jPermission); - destroyLocalReference(env, jPath); return jthr; } From 4eb7768c3b1d32fe8b4fdd0c4071376190d00760 Mon Sep 17 00:00:00 2001 From: Deepak Damri Date: Mon, 2 Dec 2024 17:08:47 +0530 Subject: [PATCH 09/40] HDFS-14111: hdfsOpenFile on HDFS causes unnecessary IO from file offset 0 --- .../hadoop/crypto/CryptoInputStream.java | 1 + .../apache/hadoop/fs/StreamCapabilities.java | 1 - .../apache/hadoop/hdfs/DFSInputStream.java | 1 + .../src/main/native/libhdfs/hdfs.c | 37 +++++++++++++------ .../native/libhdfspp/tests/hdfs_ext_test.cc | 5 ++- 5 files changed, 32 insertions(+), 13 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java index a2f6a5cc211c1..2603ae342a2ad 100644 
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java @@ -806,6 +806,7 @@ public boolean hasCapability(String capability) { case StreamCapabilities.READAHEAD: case StreamCapabilities.DROPBEHIND: case StreamCapabilities.UNBUFFER: + case StreamCapabilities.READBYTEBUFFER: return true; default: return false; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java index e68e7b351ed78..9d4b6fe7bc2ae 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java @@ -64,7 +64,6 @@ public interface StreamCapabilities { * {@link ByteBufferReadable#read(java.nio.ByteBuffer)}. */ String READBYTEBUFFER = "in:readbytebuffer"; - /** * Stream read(long, ByteBuffer) capability implemented by * {@link ByteBufferPositionedReadable#read(long, java.nio.ByteBuffer)}. diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java index 6a7a400121973..8375ffd9d5aaf 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java @@ -1893,6 +1893,7 @@ public boolean hasCapability(String capability) { case StreamCapabilities.READAHEAD: case StreamCapabilities.DROPBEHIND: case StreamCapabilities.UNBUFFER: + case StreamCapabilities.READBYTEBUFFER: return true; default: return false; diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c index 5eb84ed2ec7d6..eb5f88f2f6405 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c @@ -1080,7 +1080,7 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, return f{is|os}; */ int accmode = flags & O_ACCMODE; - jstring jStrBufferSize = NULL, jStrReplication = NULL; + jstring jStrBufferSize = NULL, jStrReplication = NULL, jCapabilityString = NULL; jobject jConfiguration = NULL, jPath = NULL, jFile = NULL; jobject jFS = (jobject)fs; jthrowable jthr; @@ -1240,18 +1240,31 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, file->flags = 0; if ((flags & O_WRONLY) == 0) { - // Check the StreamCapabilities of jFile to see if we can do direct - // reads - if (hdfsHasStreamCapability(jFile, "in:readbytebuffer")) { - file->flags |= HDFS_FILE_SUPPORTS_DIRECT_READ; + // Try a test read to see if we can do direct reads + char buf; + if (readDirect(fs, file, &buf, 0) == 0) { + // Success - 0-byte read should return 0 + // Check the StreamCapabilities of jFile to see if we can do direct reads + jthr = newJavaStr(env, "in:readbytebuffer", &jCapabilityString); + if (jthr) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsOpenFile(%s): newJavaStr", path); + goto done; } - - // Check the StreamCapabilities of jFile to see if we can do direct - // preads - if 
(hdfsHasStreamCapability(jFile, "in:preadbytebuffer")) { - file->flags |= HDFS_FILE_SUPPORTS_DIRECT_PREAD; + jthr = invokeMethod(env, &jVal, INSTANCE, jFile, HADOOP_ISTRM, + "hasCapability", "(Ljava/lang/String;)Z", jCapabilityString); + if (jthr) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsOpenFile(%s): FSDataInputStream#hasCapability", path); + goto done; } - } + if (jVal.z) { + file->flags |= HDFS_FILE_SUPPORTS_DIRECT_READ; + } else if (errno != ENOTSUP) { + // Unexpected error. Clear it, don't set the direct flag. + fprintf(stderr, + "hdfsOpenFile(%s): WARN: Unexpected error %d when testing " + "for direct read compatibility\n", path, errno); ret = 0; done: @@ -1260,6 +1273,8 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, destroyLocalReference(env, jConfiguration); destroyLocalReference(env, jPath); destroyLocalReference(env, jFile); + destroyLocalReference(env, jCapabilityString); + if (ret) { if (file) { if (file->file) { diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_ext_test.cc b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_ext_test.cc index 29255ef282882..fba82b817ecb4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_ext_test.cc +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_ext_test.cc @@ -507,7 +507,10 @@ TEST_F(HdfsExtTest, TestReadStats) { hdfsFileFreeReadStatistics(stats); EXPECT_EQ(0, hdfsCloseFile(fs, file)); - EXPECT_EQ(0, errno); + // Since libhdfs is not guaranteed to set errno to 0 on successful + // operations, we disable this check for now, see HDFS-14325 for a + // long term solution to this problem + // EXPECT_EQ(0, errno); } //Testing working directory From f1c610fef6ea6d7d0f2c779166041891fb53cb50 Mon Sep 17 00:00:00 2001 From: Deepak Damri Date: Mon, 2 Dec 2024 17:13:43 +0530 Subject: [PATCH 10/40] HDFS-14478: Add libhdfs APIs for openFile --- .../native/libhdfs-tests/test_libhdfs_ops.c | 62 +++ .../src/main/native/libhdfs/hdfs.c | 522 +++++++++++++++++- .../main/native/libhdfs/include/hdfs/hdfs.h | 135 +++++ .../src/main/native/libhdfs/jclasses.c | 4 + .../src/main/native/libhdfs/jclasses.h | 7 + .../main/native/libhdfspp/tests/hdfs_shim.c | 59 ++ .../libhdfspp/tests/libhdfs_wrapper_defines.h | 17 + .../libhdfspp/tests/libhdfs_wrapper_undefs.h | 17 + .../tests/libhdfspp_wrapper_defines.h | 17 + 9 files changed, 817 insertions(+), 23 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_ops.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_ops.c index 1e92e21ee9692..dd3e122695669 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_ops.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs-tests/test_libhdfs_ops.c @@ -454,6 +454,68 @@ int main(int argc, char **argv) { hdfsCloseFile(lfs, localFile); } + + { + // HDFS Open File Builder tests + + exists = hdfsExists(fs, readPath); + + if (exists) { + fprintf(stderr, "Failed to validate existence of %s\n", readPath); + shutdown_and_exit(cl, -1); + } + + hdfsOpenFileBuilder *builder; + builder = hdfsOpenFileBuilderAlloc(fs, readPath); + hdfsOpenFileBuilderOpt(builder, "hello", "world"); + + hdfsOpenFileFuture *future; + future = hdfsOpenFileBuilderBuild(builder); + + readFile = hdfsOpenFileFutureGet(future); + if 
(!hdfsOpenFileFutureCancel(future, 0)) { + fprintf(stderr, "Cancel on a completed Future should return false"); + shutdown_and_exit(cl, -1); + } + hdfsOpenFileFutureFree(future); + + memset(buffer, 0, sizeof(buffer)); + num_read_bytes = hdfsRead(fs, readFile, (void *) buffer, + sizeof(buffer)); + if (strncmp(fileContents, buffer, strlen(fileContents)) != 0) { + fprintf(stderr, + "Failed to read. Expected %s but got %s (%d bytes)\n", + fileContents, buffer, num_read_bytes); + shutdown_and_exit(cl, -1); + } + hdfsCloseFile(fs, readFile); + + builder = hdfsOpenFileBuilderAlloc(fs, readPath); + hdfsOpenFileBuilderOpt(builder, "hello", "world"); + + future = hdfsOpenFileBuilderBuild(builder); + + readFile = hdfsOpenFileFutureGetWithTimeout(future, 1, jDays); + if (!hdfsOpenFileFutureCancel(future, 0)) { + fprintf(stderr, "Cancel on a completed Future should return " + "false"); + shutdown_and_exit(cl, -1); + } + hdfsOpenFileFutureFree(future); + + memset(buffer, 0, sizeof(buffer)); + num_read_bytes = hdfsRead(fs, readFile, (void*)buffer, + sizeof(buffer)); + if (strncmp(fileContents, buffer, strlen(fileContents)) != 0) { + fprintf(stderr, "Failed to read. Expected %s but got " + "%s (%d bytes)\n", fileContents, buffer, + num_read_bytes); + shutdown_and_exit(cl, -1); + } + memset(buffer, 0, strlen(fileContents + 1)); + hdfsCloseFile(fs, readFile); + } + totalResult = 0; result = 0; { diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c index eb5f88f2f6405..0c1a021b5f8e0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/hdfs.c @@ -38,6 +38,10 @@ #define KERBEROS_TICKET_CACHE_PATH "hadoop.security.kerberos.ticket.cache.path" +// StreamCapability flags taken from o.a.h.fs.StreamCapabilities +#define IS_READ_BYTE_BUFFER_CAPABILITY "in:readbytebuffer" +#define IS_PREAD_BYTE_BUFFER_CAPABILITY "in:preadbytebuffer" + // Bit fields for hdfsFile_internal flags #define HDFS_FILE_SUPPORTS_DIRECT_READ (1<<0) #define HDFS_FILE_SUPPORTS_DIRECT_PREAD (1<<1) @@ -1070,6 +1074,27 @@ static int hdfsHasStreamCapability(jobject jFile, return 0; } +/** + * Sets the flags of the given hdfsFile based on the capabilities of the + * underlying stream. 
+ * + * @param file file->flags will be updated based on the capabilities of jFile + * @param jFile the underlying stream to check for capabilities + */ +static void setFileFlagCapabilities(hdfsFile file, jobject jFile) { + // Check the StreamCapabilities of jFile to see if we can do direct + // reads + if (hdfsHasStreamCapability(jFile, IS_READ_BYTE_BUFFER_CAPABILITY)) { + file->flags |= HDFS_FILE_SUPPORTS_DIRECT_READ; + } + + // Check the StreamCapabilities of jFile to see if we can do direct + // preads + if (hdfsHasStreamCapability(jFile, IS_PREAD_BYTE_BUFFER_CAPABILITY)) { + file->flags |= HDFS_FILE_SUPPORTS_DIRECT_PREAD; + } +} + static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, int32_t bufferSize, int16_t replication, int64_t blockSize) { @@ -1240,31 +1265,19 @@ static hdfsFile hdfsOpenFileImpl(hdfsFS fs, const char *path, int flags, file->flags = 0; if ((flags & O_WRONLY) == 0) { - // Try a test read to see if we can do direct reads - char buf; - if (readDirect(fs, file, &buf, 0) == 0) { - // Success - 0-byte read should return 0 - // Check the StreamCapabilities of jFile to see if we can do direct reads - jthr = newJavaStr(env, "in:readbytebuffer", &jCapabilityString); - if (jthr) { - ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, - "hdfsOpenFile(%s): newJavaStr", path); - goto done; + // Check the StreamCapabilities of jFile to see if we can do direct + // reads + if (hdfsHasStreamCapability(jFile, "in:readbytebuffer")) { + file->flags |= HDFS_FILE_SUPPORTS_DIRECT_READ; } - jthr = invokeMethod(env, &jVal, INSTANCE, jFile, HADOOP_ISTRM, - "hasCapability", "(Ljava/lang/String;)Z", jCapabilityString); - if (jthr) { - ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, - "hdfsOpenFile(%s): FSDataInputStream#hasCapability", path); - goto done; + + // Check the StreamCapabilities of jFile to see if we can do direct + // preads + if (hdfsHasStreamCapability(jFile, "in:preadbytebuffer")) { + file->flags |= HDFS_FILE_SUPPORTS_DIRECT_PREAD; + } + setFileFlagCapabilities(file, jFile); } - if (jVal.z) { - file->flags |= HDFS_FILE_SUPPORTS_DIRECT_READ; - } else if (errno != ENOTSUP) { - // Unexpected error. Clear it, don't set the direct flag. - fprintf(stderr, - "hdfsOpenFile(%s): WARN: Unexpected error %d when testing " - "for direct read compatibility\n", path, errno); ret = 0; done: @@ -1298,6 +1311,469 @@ hdfsFile hdfsStreamBuilderBuild(struct hdfsStreamBuilder *bld) return file; } +/** + * A wrapper around o.a.h.fs.FutureDataInputStreamBuilder and the file name + * associated with the builder. + */ +struct hdfsOpenFileBuilder { + jobject jBuilder; + const char *path; +}; + +/** + * A wrapper around a java.util.concurrent.Future (created by calling + * FutureDataInputStreamBuilder#build) and the file name associated with the + * builder. 
+ */ +struct hdfsOpenFileFuture { + jobject jFuture; + const char *path; +}; + +hdfsOpenFileBuilder *hdfsOpenFileBuilderAlloc(hdfsFS fs, + const char *path) { + int ret = 0; + jthrowable jthr; + jvalue jVal; + jobject jFS = (jobject) fs; + + jobject jPath = NULL; + jobject jBuilder = NULL; + + JNIEnv *env = getJNIEnv(); + if (!env) { + errno = EINTERNAL; + return NULL; + } + + hdfsOpenFileBuilder *builder; + builder = calloc(1, sizeof(hdfsOpenFileBuilder)); + if (!builder) { + fprintf(stderr, "hdfsOpenFileBuilderAlloc(%s): OOM when creating " + "hdfsOpenFileBuilder\n", path); + errno = ENOMEM; + goto done; + } + builder->path = path; + + jthr = constructNewObjectOfPath(env, path, &jPath); + if (jthr) { + errno = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsOpenFileBuilderAlloc(%s): constructNewObjectOfPath", + path); + goto done; + } + + jthr = invokeMethod(env, &jVal, INSTANCE, jFS, JC_FILE_SYSTEM, + "openFile", JMETHOD1(JPARAM(HADOOP_PATH), JPARAM(HADOOP_FDISB)), + jPath); + if (jthr) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsOpenFileBuilderAlloc(%s): %s#openFile(Path) failed", + HADOOP_FS, path); + goto done; + } + jBuilder = jVal.l; + + builder->jBuilder = (*env)->NewGlobalRef(env, jBuilder); + if (!builder->jBuilder) { + printPendingExceptionAndFree(env, PRINT_EXC_ALL, + "hdfsOpenFileBuilderAlloc(%s): NewGlobalRef(%s) failed", path, + HADOOP_FDISB); + ret = EINVAL; + goto done; + } + +done: + destroyLocalReference(env, jPath); + destroyLocalReference(env, jBuilder); + if (ret) { + if (builder) { + if (builder->jBuilder) { + (*env)->DeleteGlobalRef(env, builder->jBuilder); + } + free(builder); + } + errno = ret; + return NULL; + } + return builder; +} + +/** + * Used internally by hdfsOpenFileBuilderWithOption to switch between + * FSBuilder#must and #opt. + */ +typedef enum { must, opt } openFileBuilderOptionType; + +/** + * Shared implementation of hdfsOpenFileBuilderMust and hdfsOpenFileBuilderOpt + * that switches between each method depending on the value of + * openFileBuilderOptionType. 
+ */ +static hdfsOpenFileBuilder *hdfsOpenFileBuilderWithOption( + hdfsOpenFileBuilder *builder, const char *key, + const char *value, openFileBuilderOptionType optionType) { + int ret = 0; + jthrowable jthr; + jvalue jVal; + jobject localJBuilder = NULL; + jobject globalJBuilder; + jstring jKeyString = NULL; + jstring jValueString = NULL; + + // If the builder was not previously created by a prior call to + // hdfsOpenFileBuilderAlloc then exit + if (builder == NULL || builder->jBuilder == NULL) { + errno = EINVAL; + return NULL; + } + + JNIEnv *env = getJNIEnv(); + if (!env) { + errno = EINTERNAL; + return NULL; + } + jthr = newJavaStr(env, key, &jKeyString); + if (jthr) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsOpenFileBuilderWithOption(%s): newJavaStr(%s)", + builder->path, key); + goto done; + } + jthr = newJavaStr(env, value, &jValueString); + if (jthr) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsOpenFileBuilderWithOption(%s): newJavaStr(%s)", + builder->path, value); + goto done; + } + + const char *optionTypeMethodName; + switch (optionType) { + case must: + optionTypeMethodName = "must"; + break; + case opt: + optionTypeMethodName = "opt"; + break; + default: + ret = EINTERNAL; + goto done; + } + + jthr = invokeMethod(env, &jVal, INSTANCE, builder->jBuilder, + JC_FUTURE_DATA_IS_BUILDER, optionTypeMethodName, + JMETHOD2(JPARAM(JAVA_STRING), JPARAM(JAVA_STRING), + JPARAM(HADOOP_FS_BLDR)), jKeyString, + jValueString); + if (jthr) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsOpenFileBuilderWithOption(%s): %s#%s(%s, %s) failed", + builder->path, HADOOP_FS_BLDR, optionTypeMethodName, key, + value); + goto done; + } + + localJBuilder = jVal.l; + globalJBuilder = (*env)->NewGlobalRef(env, localJBuilder); + if (!globalJBuilder) { + printPendingExceptionAndFree(env, PRINT_EXC_ALL, + "hdfsOpenFileBuilderWithOption(%s): NewGlobalRef(%s) failed", + builder->path, HADOOP_FDISB); + ret = EINVAL; + goto done; + } + (*env)->DeleteGlobalRef(env, builder->jBuilder); + builder->jBuilder = globalJBuilder; + +done: + destroyLocalReference(env, jKeyString); + destroyLocalReference(env, jValueString); + destroyLocalReference(env, localJBuilder); + if (ret) { + errno = ret; + return NULL; + } + return builder; +} + +hdfsOpenFileBuilder *hdfsOpenFileBuilderMust(hdfsOpenFileBuilder *builder, + const char *key, const char *value) { + openFileBuilderOptionType optionType; + optionType = must; + return hdfsOpenFileBuilderWithOption(builder, key, value, optionType); +} + +hdfsOpenFileBuilder *hdfsOpenFileBuilderOpt(hdfsOpenFileBuilder *builder, + const char *key, const char *value) { + openFileBuilderOptionType optionType; + optionType = opt; + return hdfsOpenFileBuilderWithOption(builder, key, value, optionType); +} + +hdfsOpenFileFuture *hdfsOpenFileBuilderBuild(hdfsOpenFileBuilder *builder) { + int ret = 0; + jthrowable jthr; + jvalue jVal; + + jobject jFuture = NULL; + + // If the builder was not previously created by a prior call to + // hdfsOpenFileBuilderAlloc then exit + if (builder == NULL || builder->jBuilder == NULL) { + ret = EINVAL; + return NULL; + } + + JNIEnv *env = getJNIEnv(); + if (!env) { + errno = EINTERNAL; + return NULL; + } + + hdfsOpenFileFuture *future; + future = calloc(1, sizeof(hdfsOpenFileFuture)); + if (!future) { + fprintf(stderr, "hdfsOpenFileBuilderBuild: OOM when creating " + "hdfsOpenFileFuture\n"); + errno = ENOMEM; + goto done; + } + future->path = builder->path; + + jthr = invokeMethod(env, &jVal, INSTANCE, 
builder->jBuilder, + JC_FUTURE_DATA_IS_BUILDER, "build", + JMETHOD1("", JPARAM(JAVA_CFUTURE))); + if (jthr) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsOpenFileBuilderBuild(%s): %s#build() failed", + builder->path, HADOOP_FDISB); + goto done; + } + jFuture = jVal.l; + + future->jFuture = (*env)->NewGlobalRef(env, jFuture); + if (!future->jFuture) { + printPendingExceptionAndFree(env, PRINT_EXC_ALL, + "hdfsOpenFileBuilderBuild(%s): NewGlobalRef(%s) failed", + builder->path, JAVA_CFUTURE); + ret = EINVAL; + goto done; + } + +done: + destroyLocalReference(env, jFuture); + if (ret) { + if (future) { + if (future->jFuture) { + (*env)->DeleteGlobalRef(env, future->jFuture); + } + free(future); + } + hdfsOpenFileBuilderFree(builder); + errno = ret; + return NULL; + } + hdfsOpenFileBuilderFree(builder); + return future; +} + +void hdfsOpenFileBuilderFree(hdfsOpenFileBuilder *builder) { + JNIEnv *env; + env = getJNIEnv(); + if (!env) { + return; + } + if (builder->jBuilder) { + (*env)->DeleteGlobalRef(env, builder->jBuilder); + builder->jBuilder = NULL; + } + free(builder); +} + +/** + * Shared implementation of hdfsOpenFileFutureGet and + * hdfsOpenFileFutureGetWithTimeout. If a timeout is specified, calls + * Future#get() otherwise it calls Future#get(long, TimeUnit). + */ +static hdfsFile fileFutureGetWithTimeout(hdfsOpenFileFuture *future, + int64_t timeout, jobject jTimeUnit) { + int ret = 0; + jthrowable jthr; + jvalue jVal; + + hdfsFile file = NULL; + jobject jFile = NULL; + + JNIEnv *env = getJNIEnv(); + if (!env) { + ret = EINTERNAL; + return NULL; + } + + if (!jTimeUnit) { + jthr = invokeMethod(env, &jVal, INSTANCE, future->jFuture, + JC_CFUTURE, "get", JMETHOD1("", JPARAM(JAVA_OBJECT))); + } else { + jthr = invokeMethod(env, &jVal, INSTANCE, future->jFuture, + JC_CFUTURE, "get", JMETHOD2("J", + JPARAM(JAVA_TIMEUNIT), JPARAM(JAVA_OBJECT)), timeout, + jTimeUnit); + } + if (jthr) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsOpenFileFutureGet(%s): %s#get failed", future->path, + JAVA_CFUTURE); + goto done; + } + + file = calloc(1, sizeof(struct hdfsFile_internal)); + if (!file) { + fprintf(stderr, "hdfsOpenFileFutureGet(%s): OOM when creating " + "hdfsFile\n", future->path); + ret = ENOMEM; + goto done; + } + jFile = jVal.l; + file->file = (*env)->NewGlobalRef(env, jFile); + if (!file->file) { + ret = printPendingExceptionAndFree(env, PRINT_EXC_ALL, + "hdfsOpenFileFutureGet(%s): NewGlobalRef(jFile) failed", + future->path); + goto done; + } + + file->type = HDFS_STREAM_INPUT; + file->flags = 0; + + setFileFlagCapabilities(file, jFile); + +done: + destroyLocalReference(env, jTimeUnit); + destroyLocalReference(env, jFile); + if (ret) { + if (file) { + if (file->file) { + (*env)->DeleteGlobalRef(env, file->file); + } + free(file); + } + errno = ret; + return NULL; + } + return file; +} + +hdfsFile hdfsOpenFileFutureGet(hdfsOpenFileFuture *future) { + return fileFutureGetWithTimeout(future, -1, NULL); +} + +hdfsFile hdfsOpenFileFutureGetWithTimeout(hdfsOpenFileFuture *future, + int64_t timeout, javaConcurrentTimeUnit timeUnit) { + int ret = 0; + jthrowable jthr; + jobject jTimeUnit = NULL; + + JNIEnv *env = getJNIEnv(); + if (!env) { + ret = EINTERNAL; + return NULL; + } + + const char *timeUnitEnumName; + switch (timeUnit) { + case jNanoseconds: + timeUnitEnumName = "NANOSECONDS"; + break; + case jMicroseconds: + timeUnitEnumName = "MICROSECONDS"; + break; + case jMilliseconds: + timeUnitEnumName = "MILLISECONDS"; + break; + case jSeconds: + 
timeUnitEnumName = "SECONDS"; + break; + case jMinutes: + timeUnitEnumName = "MINUTES"; + break; + case jHours: + timeUnitEnumName = "HOURS"; + break; + case jDays: + timeUnitEnumName = "DAYS"; + break; + default: + ret = EINTERNAL; + goto done; + } + + jthr = fetchEnumInstance(env, JAVA_TIMEUNIT, timeUnitEnumName, &jTimeUnit); + + if (jthr) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsOpenFileFutureGet(%s): %s#get failed", future->path, + JAVA_CFUTURE); + goto done; + } + return fileFutureGetWithTimeout(future, timeout, jTimeUnit); + +done: + if (ret) { + errno = ret; + } + return NULL; +} + +int hdfsOpenFileFutureCancel(hdfsOpenFileFuture *future, + int mayInterruptIfRunning) { + int ret = 0; + jthrowable jthr; + jvalue jVal; + + jboolean jMayInterruptIfRunning; + + JNIEnv *env = getJNIEnv(); + if (!env) { + ret = EINTERNAL; + return -1; + } + + jMayInterruptIfRunning = mayInterruptIfRunning ? JNI_TRUE : JNI_FALSE; + jthr = invokeMethod(env, &jVal, INSTANCE, future->jFuture, JC_CFUTURE, + "cancel", JMETHOD1("Z", "Z"), jMayInterruptIfRunning); + if (jthr) { + ret = printExceptionAndFree(env, jthr, PRINT_EXC_ALL, + "hdfsOpenFileFutureCancel(%s): %s#cancel failed", future->path, + JAVA_CFUTURE); + goto done; + } + +done: + if (ret) { + errno = ret; + return -1; + } + if (!jVal.z) { + return -1; + } + return 0; +} + +void hdfsOpenFileFutureFree(hdfsOpenFileFuture *future) { + JNIEnv *env; + env = getJNIEnv(); + if (!env) { + return; + } + if (future->jFuture) { + (*env)->DeleteGlobalRef(env, future->jFuture); + future->jFuture = NULL; + } + free(future); +} + int hdfsTruncateFile(hdfsFS fs, const char* path, tOffset newlength) { jobject jFS = (jobject)fs; diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/include/hdfs/hdfs.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/include/hdfs/hdfs.h index e58a6232d205a..eba50ff6eb277 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/include/hdfs/hdfs.h +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/include/hdfs/hdfs.h @@ -82,6 +82,29 @@ extern "C" { } tObjectKind; struct hdfsStreamBuilder; + /** + * The C reflection of the enum values from java.util.concurrent.TimeUnit . + */ + typedef enum javaConcurrentTimeUnit { + jNanoseconds, + jMicroseconds, + jMilliseconds, + jSeconds, + jMinutes, + jHours, + jDays, + } javaConcurrentTimeUnit; + + /** + * The C reflection of java.util.concurrent.Future specifically used for + * opening HDFS files asynchronously. + */ + typedef struct hdfsOpenFileFuture hdfsOpenFileFuture; + + /** + * The C reflection of o.a.h.fs.FutureDataInputStreamBuilder . + */ + typedef struct hdfsOpenFileBuilder hdfsOpenFileBuilder; /** * The C reflection of org.apache.org.hadoop.FileSystem . @@ -429,6 +452,118 @@ extern "C" { hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags, int bufferSize, short replication, tSize blocksize); + /** + * hdfsOpenFileBuilderAlloc - Allocate a HDFS open file builder. + * + * @param fs The configured filesystem handle. + * @param path The full path to the file. + * @return Returns the hdfsOpenFileBuilder, or NULL on error. + */ + LIBHDFS_EXTERNAL + hdfsOpenFileBuilder *hdfsOpenFileBuilderAlloc(hdfsFS fs, + const char *path); + + /** + * hdfsOpenFileBuilderMust - Specifies a mandatory parameter for the open + * file builder. 
While the underlying FsBuilder supports various various + * types for the value (boolean, int, float, double), currently only + * strings are supported. + * + * @param builder The open file builder to set the config for. + * @param key The config key + * @param value The config value + * @return Returns the hdfsOpenFileBuilder, or NULL on error. + */ + LIBHDFS_EXTERNAL + hdfsOpenFileBuilder *hdfsOpenFileBuilderMust(hdfsOpenFileBuilder *builder, + const char *key, const char *value); + + /** + * hdfsOpenFileBuilderOpt - Specifies an optional parameter for the open + * file builder. While the underlying FsBuilder supports various various + * types for the value (boolean, int, float, double), currently only + * strings are supported. + * + * @param builder The open file builder to set the config for. + * @param key The config key + * @param value The config value + * @return Returns the hdfsOpenFileBuilder, or NULL on error. + */ + LIBHDFS_EXTERNAL + hdfsOpenFileBuilder *hdfsOpenFileBuilderOpt(hdfsOpenFileBuilder *builder, + const char *key, const char *value); + + /** + * hdfsOpenFileBuilderBuild - Builds the open file builder and returns a + * hdfsOpenFileFuture which tracks the asynchronous call to open the + * specified file. + * + * @param builder The open file builder to build. + * @return Returns the hdfsOpenFileFuture, or NULL on error. + */ + LIBHDFS_EXTERNAL + hdfsOpenFileFuture *hdfsOpenFileBuilderBuild(hdfsOpenFileBuilder *builder); + + /** + * hdfsOpenFileBuilderFree - Free a HDFS open file builder. + * + * It is normally not necessary to call this function since + * hdfsOpenFileBuilderBuild frees the builder. + * + * @param builder The hdfsOpenFileBuilder to free. + */ + LIBHDFS_EXTERNAL + void hdfsOpenFileBuilderFree(hdfsOpenFileBuilder *builder); + + /** + * hdfsOpenFileFutureGet - Call Future#get() on the underlying Java Future + * object. A call to #get() will block until the asynchronous operation has + * completed. In this case, until the open file call has completed. This + * method blocks indefinitely until blocking call completes. + * + * @param future The hdfsOpenFileFuture to call #get on + * @return Returns the opened hdfsFile, or NULL on error. + */ + LIBHDFS_EXTERNAL + hdfsFile hdfsOpenFileFutureGet(hdfsOpenFileFuture *future); + + /** + * hdfsOpenFileFutureGetWithTimeout - Call Future#get(long, TimeUnit) on + * the underlying Java Future object. A call to #get(long, TimeUnit) will + * block until the asynchronous operation has completed (in this case, + * until the open file call has completed) or the specified timeout has + * been reached. + * + * @param future The hdfsOpenFileFuture to call #get on + * @return Returns the opened hdfsFile, or NULL on error or if the timeout + * has been reached. + */ + LIBHDFS_EXTERNAL + hdfsFile hdfsOpenFileFutureGetWithTimeout(hdfsOpenFileFuture *future, + int64_t timeout, javaConcurrentTimeUnit timeUnit); + + /** + * hdfsOpenFileFutureCancel - Call Future#cancel(boolean) on the + * underlying Java Future object. The value of mayInterruptedIfRunning + * controls whether the Java thread running the Future should be + * interrupted or not. + * + * @param future The hdfsOpenFileFuture to call #cancel on + * @param mayInterruptIfRunning if true, interrupts the running thread + * @return Returns 0 if the thread was successfully cancelled, else -1 + */ + LIBHDFS_EXTERNAL + int hdfsOpenFileFutureCancel(hdfsOpenFileFuture *future, + int mayInterruptIfRunning); + + /** + * hdfsOpenFileFutureFree - Free a HDFS open file future. 
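     * A minimal end-to-end sketch of the builder/future API above (illustrative only;
     * it assumes an already-connected hdfsFS handle named fs and uses a made-up option key):
     *
     *   hdfsOpenFileBuilder *bld = hdfsOpenFileBuilderAlloc(fs, "/tmp/example.txt");
     *   if (bld) {
     *       hdfsOpenFileBuilderOpt(bld, "some.option.key", "some-value");
     *       hdfsOpenFileFuture *fut = hdfsOpenFileBuilderBuild(bld); // also frees bld
     *       if (fut) {
     *           hdfsFile file = hdfsOpenFileFutureGetWithTimeout(fut, 30, jSeconds);
     *           hdfsOpenFileFutureFree(fut);
     *           if (file) {
     *               // read with hdfsRead()/hdfsPread() as usual, then hdfsCloseFile(fs, file)
     *           }
     *       }
     *   }
     *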
+ * + * @param hdfsOpenFileFuture The hdfsOpenFileFuture to free. + */ + LIBHDFS_EXTERNAL + void hdfsOpenFileFutureFree(hdfsOpenFileFuture *future); + /** * hdfsStreamBuilderAlloc - Allocate an HDFS stream builder. * diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.c index cf880e91b7596..9f589ac257aa1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.c @@ -98,6 +98,8 @@ jthrowable initCachedClasses(JNIEnv* env) { "org/apache/hadoop/hdfs/ReadStatistics"; cachedJavaClasses[JC_HDFS_DATA_INPUT_STREAM].className = "org/apache/hadoop/hdfs/client/HdfsDataInputStream"; + cachedJavaClasses[JC_FUTURE_DATA_IS_BUILDER].className = + "org/apache/hadoop/fs/FutureDataInputStreamBuilder"; cachedJavaClasses[JC_DOMAIN_SOCKET].className = "org/apache/hadoop/net/unix/DomainSocket"; cachedJavaClasses[JC_URI].className = @@ -108,6 +110,8 @@ jthrowable initCachedClasses(JNIEnv* env) { "java/util/EnumSet"; cachedJavaClasses[JC_EXCEPTION_UTILS].className = "org/apache/commons/lang3/exception/ExceptionUtils"; + cachedJavaClasses[JC_CFUTURE].className = + "java/util/concurrent/CompletableFuture"; // Create and set the jclass objects based on the class names set above jthrowable jthr; diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.h index 92cdd542e2371..0b174e1fecc56 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.h +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/jclasses.h @@ -54,11 +54,13 @@ typedef enum { JC_FS_PERMISSION, JC_READ_STATISTICS, JC_HDFS_DATA_INPUT_STREAM, + JC_FUTURE_DATA_IS_BUILDER, JC_DOMAIN_SOCKET, JC_URI, JC_BYTE_BUFFER, JC_ENUM_SET, JC_EXCEPTION_UTILS, + JC_CFUTURE, // A special marker enum that counts the number of cached jclasses NUM_CACHED_CLASSES } CachedJavaClass; @@ -95,6 +97,8 @@ const char *getClassName(CachedJavaClass cachedJavaClass); #define HADOOP_FSPERM "org/apache/hadoop/fs/permission/FsPermission" #define HADOOP_RSTAT "org/apache/hadoop/hdfs/ReadStatistics" #define HADOOP_HDISTRM "org/apache/hadoop/hdfs/client/HdfsDataInputStream" +#define HADOOP_FDISB "org/apache/hadoop/fs/FutureDataInputStreamBuilder" +#define HADOOP_FS_BLDR "org/apache/hadoop/fs/FSBuilder" #define HADOOP_RO "org/apache/hadoop/fs/ReadOption" #define HADOOP_DS "org/apache/hadoop/net/unix/DomainSocket" @@ -104,6 +108,9 @@ const char *getClassName(CachedJavaClass cachedJavaClass); #define JAVA_BYTEBUFFER "java/nio/ByteBuffer" #define JAVA_STRING "java/lang/String" #define JAVA_ENUMSET "java/util/EnumSet" +#define JAVA_CFUTURE "java/util/concurrent/CompletableFuture" +#define JAVA_TIMEUNIT "java/util/concurrent/TimeUnit" +#define JAVA_OBJECT "java/lang/Object" /* Some frequently used third-party class names */ diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_shim.c b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_shim.c index bda27b9a43202..2d265b8f03c0c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_shim.c +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/hdfs_shim.c @@ -250,6 +250,65 @@ hdfsFile 
hdfsOpenFile(hdfsFS fs, const char* path, int flags, return ret; } +hdfsOpenFileBuilder *hdfsOpenFileBuilderAlloc(hdfsFS fs, + const char *path) { + return libhdfs_hdfsOpenFileBuilderAlloc(fs->libhdfsRep, path); +} + +hdfsOpenFileBuilder *hdfsOpenFileBuilderMust( + hdfsOpenFileBuilder *builder, const char *key, + const char *value) { + return libhdfs_hdfsOpenFileBuilderMust(builder, key, value); +} + +hdfsOpenFileBuilder *hdfsOpenFileBuilderOpt( + hdfsOpenFileBuilder *builder, const char *key, + const char *value) { + return libhdfs_hdfsOpenFileBuilderOpt(builder, key, value); +} + +hdfsOpenFileFuture *hdfsOpenFileBuilderBuild( + hdfsOpenFileBuilder *builder) { + return libhdfs_hdfsOpenFileBuilderBuild(builder); +} + +void hdfsOpenFileBuilderFree(hdfsOpenFileBuilder *builder) { + libhdfs_hdfsOpenFileBuilderFree(builder); +} + +hdfsFile hdfsOpenFileFutureGet(hdfsOpenFileFuture *future) { + hdfsFile ret = calloc(1, sizeof(struct hdfsFile_internal)); + ret->libhdfsppRep = 0; + ret->libhdfsRep = libhdfs_hdfsOpenFileFutureGet(future); + if (!ret->libhdfsRep) { + free(ret); + ret = NULL; + } + return ret; +} + +hdfsFile hdfsOpenFileFutureGetWithTimeout(hdfsOpenFileFuture *future, + int64_t timeout, javaConcurrentTimeUnit timeUnit) { + hdfsFile ret = calloc(1, sizeof(struct hdfsFile_internal)); + ret->libhdfsppRep = 0; + ret->libhdfsRep = libhdfs_hdfsOpenFileFutureGetWithTimeout(future, timeout, + timeUnit); + if (!ret->libhdfsRep) { + free(ret); + ret = NULL; + } + return ret; +} + +int hdfsOpenFileFutureCancel(hdfsOpenFileFuture *future, + int mayInterruptIfRunning) { + return libhdfs_hdfsOpenFileFutureCancel(future, mayInterruptIfRunning); +} + +void hdfsOpenFileFutureFree(hdfsOpenFileFuture *future) { + libhdfs_hdfsOpenFileFutureFree(future); +} + int hdfsTruncateFile(hdfsFS fs, const char* path, tOffset newlength) { return libhdfs_hdfsTruncateFile(fs->libhdfsRep, path, newlength); } diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_defines.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_defines.h index 0d014341b4c57..165744142558a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_defines.h +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_defines.h @@ -39,6 +39,23 @@ #define hdfsConfStrFree libhdfs_hdfsConfStrFree #define hdfsDisconnect libhdfs_hdfsDisconnect #define hdfsOpenFile libhdfs_hdfsOpenFile +#define hdfsOpenFileBuilderAlloc libhdfs_hdfsOpenFileBuilderAlloc +#define hdfsOpenFileBuilderMust libhdfs_hdfsOpenFileBuilderMust +#define hdfsOpenFileBuilderOpt libhdfs_hdfsOpenFileBuilderOpt +#define hdfsOpenFileBuilderBuild libhdfs_hdfsOpenFileBuilderBuild +#define hdfsOpenFileBuilderFree libhdfs_hdfsOpenFileBuilderFree +#define hdfsOpenFileFutureGet libhdfs_hdfsOpenFileFutureGet +#define javaConcurrentTimeUnit libhdfs_javaConcurrentTimeUnit +#define jNanoseconds libhdfs_jNanoseconds +#define jMicroseconds libhdfs_jMicroseconds +#define jMilliseconds libhdfs_jMilliseconds +#define jSeconds libhdfsj_jSeconds +#define jMinutes libhdfs_jMinutes +#define jHours libhdfs_jHours +#define jDays libhdfs_jDays +#define hdfsOpenFileFutureGetWithTimeout libhdfs_hdfsOpenFileFutureGetWithTimeout +#define hdfsOpenFileFutureCancel libhdfs_hdfsOpenFileFutureCancel +#define hdfsOpenFileFutureFree libhdfs_hdfsOpenFileFutureFree #define hdfsTruncateFile libhdfs_hdfsTruncateFile #define 
hdfsUnbufferFile libhdfs_hdfsUnbufferFile #define hdfsCloseFile libhdfs_hdfsCloseFile diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_undefs.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_undefs.h index d46768c02ad39..d84b8ba287525 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_undefs.h +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfs_wrapper_undefs.h @@ -39,6 +39,23 @@ #undef hdfsConfStrFree #undef hdfsDisconnect #undef hdfsOpenFile +#undef hdfsOpenFileBuilderAlloc +#undef hdfsOpenFileBuilderMust +#undef hdfsOpenFileBuilderOpt +#undef hdfsOpenFileBuilderBuild +#undef hdfsOpenFileBuilderFree +#undef hdfsOpenFileFutureGet +#undef javaConcurrentTimeUnit +#undef jNanoseconds +#undef jMicroseconds +#undef jMilliseconds +#undef jSeconds +#undef jMinutes +#undef jHours +#undef jDays +#undef hdfsOpenFileFutureGetWithTimeout +#undef hdfsOpenFileFutureCancel +#undef hdfsOpenFileFutureFree #undef hdfsTruncateFile #undef hdfsUnbufferFile #undef hdfsCloseFile diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfspp_wrapper_defines.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfspp_wrapper_defines.h index 4b08d0556c3aa..0a6d987409fec 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfspp_wrapper_defines.h +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/libhdfspp_wrapper_defines.h @@ -39,6 +39,23 @@ #define hdfsConfStrFree libhdfspp_hdfsConfStrFree #define hdfsDisconnect libhdfspp_hdfsDisconnect #define hdfsOpenFile libhdfspp_hdfsOpenFile +#define hdfsOpenFileBuilderAlloc libhdfspp_hdfsOpenFileBuilderAlloc +#define hdfsOpenFileBuilderMust libhdfspp_hdfsOpenFileBuilderMust +#define hdfsOpenFileBuilderOpt libhdfspp_hdfsOpenFileBuilderOpt +#define hdfsOpenFileBuilderBuild libhdfspp_hdfsOpenFileBuilderBuild +#define hdfsOpenFileBuilderFree libhdfspp_hdfsOpenFileBuilderFree +#define hdfsOpenFileFutureGet libhdfspp_hdfsOpenFileFutureGet +#define javaConcurrentTimeUnit libhdfspp_javaConcurrentTimeUnit +#define jNanoseconds libhdfspp_jNanoseconds +#define jMicroseconds libhdfspp_jMicroseconds +#define jMilliseconds libhdfspp_jMilliseconds +#define jSeconds libhdfspp_jSeconds +#define jMinutes libhdfspp_jMinutes +#define jHours libhdfspp_jHours +#define jDays libhdfspp_jDays +#define hdfsOpenFileFutureGetWithTimeout libhdfspp_hdfsOpenFileFutureGetWithTimeout +#define hdfsOpenFileFutureCancel libhdfspp_hdfsOpenFileFutureCancel +#define hdfsOpenFileFutureFree libhdfspp_hdfsOpenFileFutureFree #define hdfsTruncateFile libhdfspp_hdfsTruncateFile #define hdfsUnbufferFile libhdfspp_hdfsUnbufferFile #define hdfsCloseFile libhdfspp_hdfsCloseFile From 56442e7437ff81f64252549c03274b3013e789d8 Mon Sep 17 00:00:00 2001 From: basapuram-kumar Date: Wed, 13 Nov 2024 21:39:17 +0530 Subject: [PATCH 11/40] ODP-2583: HADOOP-11616 : bump up Curator to 5.2.0 #33 --- .../hadoop/util/curator/ChildReaper.java | 234 ------------------ .../hadoop/util/curator/TestChildReaper.java | 208 ---------------- hadoop-project/pom.xml | 2 +- 3 files changed, 1 insertion(+), 443 deletions(-) delete mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/curator/ChildReaper.java delete mode 100644 
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/curator/TestChildReaper.java diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/curator/ChildReaper.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/curator/ChildReaper.java deleted file mode 100644 index 86142fb6d3a4f..0000000000000 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/curator/ChildReaper.java +++ /dev/null @@ -1,234 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.hadoop.util.curator; - -import com.google.common.base.Preconditions; -import org.apache.curator.framework.recipes.locks.Reaper; -import org.apache.curator.utils.CloseableUtils; -import org.apache.curator.framework.CuratorFramework; -import org.apache.curator.utils.CloseableScheduledExecutorService; -import org.apache.curator.utils.ThreadUtils; -import org.apache.curator.utils.ZKPaths; -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; -import org.apache.zookeeper.data.Stat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.io.Closeable; -import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.Future; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicReference; -import org.apache.curator.utils.PathUtils; - -/** - * This is a copy of Curator 2.7.1's ChildReaper class, modified to work with - * Guava 11.0.2. The problem is the 'paths' Collection, which calls Guava's - * Sets.newConcurrentHashSet(), which was added in Guava 15.0. - *

- * Utility to reap empty child nodes of a parent node. Periodically calls getChildren on - * the node and adds empty nodes to an internally managed {@link Reaper} - */ -@InterfaceAudience.Private -@InterfaceStability.Unstable -public class ChildReaper implements Closeable -{ - private final Logger log = LoggerFactory.getLogger(getClass()); - private final Reaper reaper; - private final AtomicReference state = new AtomicReference(State.LATENT); - private final CuratorFramework client; - private final Collection paths = newConcurrentHashSet(); - private final Reaper.Mode mode; - private final CloseableScheduledExecutorService executor; - private final int reapingThresholdMs; - - private volatile Future task; - - // This is copied from Curator's Reaper class - static final int DEFAULT_REAPING_THRESHOLD_MS = (int)TimeUnit.MILLISECONDS.convert(5, TimeUnit.MINUTES); - - // This is copied from Guava - /** - * Creates a thread-safe set backed by a hash map. The set is backed by a - * {@link ConcurrentHashMap} instance, and thus carries the same concurrency - * guarantees. - * - *

Unlike {@code HashSet}, this class does NOT allow {@code null} to be - * used as an element. The set is serializable. - * - * @return a new, empty thread-safe {@code Set} - * @since 15.0 - */ - public static Set newConcurrentHashSet() { - return Collections.newSetFromMap(new ConcurrentHashMap()); - } - - private enum State - { - LATENT, - STARTED, - CLOSED - } - - /** - * @param client the client - * @param path path to reap children from - * @param mode reaping mode - */ - public ChildReaper(CuratorFramework client, String path, Reaper.Mode mode) - { - this(client, path, mode, newExecutorService(), DEFAULT_REAPING_THRESHOLD_MS, null); - } - - /** - * @param client the client - * @param path path to reap children from - * @param reapingThresholdMs threshold in milliseconds that determines that a path can be deleted - * @param mode reaping mode - */ - public ChildReaper(CuratorFramework client, String path, Reaper.Mode mode, int reapingThresholdMs) - { - this(client, path, mode, newExecutorService(), reapingThresholdMs, null); - } - - /** - * @param client the client - * @param path path to reap children from - * @param executor executor to use for background tasks - * @param reapingThresholdMs threshold in milliseconds that determines that a path can be deleted - * @param mode reaping mode - */ - public ChildReaper(CuratorFramework client, String path, Reaper.Mode mode, ScheduledExecutorService executor, int reapingThresholdMs) - { - this(client, path, mode, executor, reapingThresholdMs, null); - } - - /** - * @param client the client - * @param path path to reap children from - * @param executor executor to use for background tasks - * @param reapingThresholdMs threshold in milliseconds that determines that a path can be deleted - * @param mode reaping mode - * @param leaderPath if not null, uses a leader selection so that only 1 reaper is active in the cluster - */ - public ChildReaper(CuratorFramework client, String path, Reaper.Mode mode, ScheduledExecutorService executor, int reapingThresholdMs, String leaderPath) - { - this.client = client; - this.mode = mode; - this.executor = new CloseableScheduledExecutorService(executor); - this.reapingThresholdMs = reapingThresholdMs; - this.reaper = new Reaper(client, executor, reapingThresholdMs, leaderPath); - addPath(path); - } - - /** - * The reaper must be started - * - * @throws Exception errors - */ - public void start() throws Exception - { - Preconditions.checkState(state.compareAndSet(State.LATENT, State.STARTED), "Cannot be started more than once"); - - task = executor.scheduleWithFixedDelay - ( - new Runnable() - { - @Override - public void run() - { - doWork(); - } - }, - reapingThresholdMs, - reapingThresholdMs, - TimeUnit.MILLISECONDS - ); - - reaper.start(); - } - - @Override - public void close() throws IOException - { - if ( state.compareAndSet(State.STARTED, State.CLOSED) ) - { - CloseableUtils.closeQuietly(reaper); - task.cancel(true); - } - } - - /** - * Add a path to reap children from - * - * @param path the path - * @return this for chaining - */ - public ChildReaper addPath(String path) - { - paths.add(PathUtils.validatePath(path)); - return this; - } - - /** - * Remove a path from reaping - * - * @param path the path - * @return true if the path existed and was removed - */ - public boolean removePath(String path) - { - return paths.remove(PathUtils.validatePath(path)); - } - - private static ScheduledExecutorService newExecutorService() - { - return ThreadUtils.newFixedThreadScheduledPool(2, "ChildReaper"); - } - - 
private void doWork() - { - for ( String path : paths ) - { - try - { - List children = client.getChildren().forPath(path); - for ( String name : children ) - { - String thisPath = ZKPaths.makePath(path, name); - Stat stat = client.checkExists().forPath(thisPath); - if ( (stat != null) && (stat.getNumChildren() == 0) ) - { - reaper.addPath(thisPath, mode); - } - } - } - catch ( Exception e ) - { - log.error("Could not get children for path: " + path, e); - } - } - } -} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/curator/TestChildReaper.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/curator/TestChildReaper.java deleted file mode 100644 index 11b254fc697eb..0000000000000 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/curator/TestChildReaper.java +++ /dev/null @@ -1,208 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.util.curator; - -import org.apache.curator.framework.recipes.locks.Reaper; -import org.apache.curator.test.TestingServer; -import org.apache.curator.utils.CloseableUtils; -import org.apache.curator.framework.CuratorFramework; -import org.apache.curator.framework.CuratorFrameworkFactory; -import org.apache.curator.retry.RetryOneTime; -import org.apache.curator.test.Timing; -import org.apache.zookeeper.data.Stat; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.net.BindException; -import java.util.Random; - -/** - * This is a copy of Curator 2.7.1's TestChildReaper class, with minor - * modifications to make it work with JUnit (some setup code taken from - * Curator's BaseClassForTests). This is to ensure that the ChildReaper - * class we modified is still correct. 
- */ -public class TestChildReaper -{ - protected TestingServer server; - - @Before - public void setup() throws Exception { - while(this.server == null) { - try { - this.server = new TestingServer(); - } catch (BindException var2) { - System.err.println("Getting bind exception - retrying to allocate server"); - this.server = null; - } - } - } - - @After - public void teardown() throws Exception { - this.server.close(); - this.server = null; - } - - @Test - public void testSomeNodes() throws Exception - { - - Timing timing = new Timing(); - ChildReaper reaper = null; - CuratorFramework client = CuratorFrameworkFactory.newClient(server.getConnectString(), timing.session(), timing.connection(), new RetryOneTime(1)); - try - { - client.start(); - - Random r = new Random(); - int nonEmptyNodes = 0; - for ( int i = 0; i < 10; ++i ) - { - client.create().creatingParentsIfNeeded().forPath("/test/" + Integer.toString(i)); - if ( r.nextBoolean() ) - { - client.create().forPath("/test/" + Integer.toString(i) + "/foo"); - ++nonEmptyNodes; - } - } - - reaper = new ChildReaper(client, "/test", Reaper.Mode.REAP_UNTIL_DELETE, 1); - reaper.start(); - - timing.forWaiting().sleepABit(); - - Stat stat = client.checkExists().forPath("/test"); - Assert.assertEquals(stat.getNumChildren(), nonEmptyNodes); - } - finally - { - CloseableUtils.closeQuietly(reaper); - CloseableUtils.closeQuietly(client); - } - } - - @Test - public void testSimple() throws Exception - { - Timing timing = new Timing(); - ChildReaper reaper = null; - CuratorFramework client = CuratorFrameworkFactory.newClient(server.getConnectString(), timing.session(), timing.connection(), new RetryOneTime(1)); - try - { - client.start(); - - for ( int i = 0; i < 10; ++i ) - { - client.create().creatingParentsIfNeeded().forPath("/test/" + Integer.toString(i)); - } - - reaper = new ChildReaper(client, "/test", Reaper.Mode.REAP_UNTIL_DELETE, 1); - reaper.start(); - - timing.forWaiting().sleepABit(); - - Stat stat = client.checkExists().forPath("/test"); - Assert.assertEquals(stat.getNumChildren(), 0); - } - finally - { - CloseableUtils.closeQuietly(reaper); - CloseableUtils.closeQuietly(client); - } - } - - @Test - public void testMultiPath() throws Exception - { - Timing timing = new Timing(); - ChildReaper reaper = null; - CuratorFramework client = CuratorFrameworkFactory.newClient(server.getConnectString(), timing.session(), timing.connection(), new RetryOneTime(1)); - try - { - client.start(); - - for ( int i = 0; i < 10; ++i ) - { - client.create().creatingParentsIfNeeded().forPath("/test1/" + Integer.toString(i)); - client.create().creatingParentsIfNeeded().forPath("/test2/" + Integer.toString(i)); - client.create().creatingParentsIfNeeded().forPath("/test3/" + Integer.toString(i)); - } - - reaper = new ChildReaper(client, "/test2", Reaper.Mode.REAP_UNTIL_DELETE, 1); - reaper.start(); - reaper.addPath("/test1"); - - timing.forWaiting().sleepABit(); - - Stat stat = client.checkExists().forPath("/test1"); - Assert.assertEquals(stat.getNumChildren(), 0); - stat = client.checkExists().forPath("/test2"); - Assert.assertEquals(stat.getNumChildren(), 0); - stat = client.checkExists().forPath("/test3"); - Assert.assertEquals(stat.getNumChildren(), 10); - } - finally - { - CloseableUtils.closeQuietly(reaper); - CloseableUtils.closeQuietly(client); - } - } - - @Test - public void testNamespace() throws Exception - { - Timing timing = new Timing(); - ChildReaper reaper = null; - CuratorFramework client = CuratorFrameworkFactory.builder() - 
.connectString(server.getConnectString()) - .sessionTimeoutMs(timing.session()) - .connectionTimeoutMs(timing.connection()) - .retryPolicy(new RetryOneTime(1)) - .namespace("foo") - .build(); - try - { - client.start(); - - for ( int i = 0; i < 10; ++i ) - { - client.create().creatingParentsIfNeeded().forPath("/test/" + Integer.toString(i)); - } - - reaper = new ChildReaper(client, "/test", Reaper.Mode.REAP_UNTIL_DELETE, 1); - reaper.start(); - - timing.forWaiting().sleepABit(); - - Stat stat = client.checkExists().forPath("/test"); - Assert.assertEquals(stat.getNumChildren(), 0); - - stat = client.usingNamespace(null).checkExists().forPath("/foo/test"); - Assert.assertNotNull(stat); - Assert.assertEquals(stat.getNumChildren(), 0); - } - finally - { - CloseableUtils.closeQuietly(reaper); - CloseableUtils.closeQuietly(client); - } - } -} diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index e9aa199a8bf6b..26787bf6081b0 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -96,7 +96,7 @@ ${hadoop-thirdparty-shaded-prefix}.com.google.common 3.5.10.3.2.3.3-2 - 4.2.0 + 5.2.0 3.0.5 2.1.7 From 66ff1bdfd9b614b67da6185d63426bc594e959f5 Mon Sep 17 00:00:00 2001 From: Syed Shameerur Rahman Date: Wed, 25 Sep 2024 09:40:15 +0530 Subject: [PATCH 12/40] ODP-2633: YARN-11702: Fix Yarn over allocating containers (#6990) Contributed by Syed Shameerur Rahman. Reviewed-by: Akira Ajisaka Signed-off-by: Shilun Fan (cherry picked from commit 3b9faf6b21b9f76f8583f73ffb620089e229e641) --- .../hadoop/yarn/conf/YarnConfiguration.java | 11 + .../src/main/resources/yarn-default.xml | 15 + .../scheduler/AbstractYarnScheduler.java | 218 +++++++++++ .../SchedulerApplicationAttempt.java | 3 +- .../scheduler/capacity/CapacityScheduler.java | 4 + .../scheduler/fair/FairScheduler.java | 5 + .../scheduler/TestAbstractYarnScheduler.java | 355 ++++++++++++++++++ .../scheduler/capacity/TestUtils.java | 14 + 8 files changed, 624 insertions(+), 1 deletion(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 013d33ddf7fe9..8beea5a64bb50 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -1406,6 +1406,17 @@ public static boolean isAclEnabled(Configuration conf) { public static final int DEFAULT_RM_MAX_LOG_AGGREGATION_DIAGNOSTICS_IN_MEMORY = 10; + /** + * The configuration key for enabling or disabling the auto-correction of container allocation. + */ + public static final String RM_SCHEDULER_AUTOCORRECT_CONTAINER_ALLOCATION = RM_PREFIX + + "scheduler.autocorrect.container.allocation"; + + /** + * Default value: {@value}. 
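   * The feature is off by default; a minimal sketch of enabling it programmatically
   * (assumes a standard YarnConfiguration instance):
   * <pre>
   *   Configuration conf = new YarnConfiguration();
   *   conf.setBoolean(RM_SCHEDULER_AUTOCORRECT_CONTAINER_ALLOCATION, true);
   * </pre>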
+ */ + public static final boolean DEFAULT_RM_SCHEDULER_AUTOCORRECT_CONTAINER_ALLOCATION = false; + /** Whether to enable log aggregation */ public static final String LOG_AGGREGATION_ENABLED = YARN_PREFIX + "log-aggregation-enable"; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 360945db726a5..3f35d20ddd3e8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -144,6 +144,21 @@ yarn.resourcemanager.principal + + + This configuration key enables or disables the auto-correction of container allocation in + YARN. Due to the asynchronous nature of container request and allocation, YARN may sometimes + over-allocate more containers than requested. The auto-correction feature addresses this by + automatically adjusting the number of requested containers based on those already allocated, + preventing extra containers from being allocated. + While the extra allocated containers will be released by the client within a few seconds, + this may not be a concern in normal circumstances. However, if the user is worried about + resource contention due to over-allocation, enabling this feature can help avoid such cases. + + yarn.resourcemanager.scheduler.autocorrect.container.allocation + false + + The address of the scheduler interface. yarn.resourcemanager.scheduler.address diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java index cc3998bf3d760..dd3e0bc26fd95 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java @@ -22,7 +22,10 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.EnumSet; + +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; @@ -33,6 +36,11 @@ import com.google.gson.Gson; import com.google.gson.reflect.TypeToken; + +import org.apache.commons.lang3.builder.EqualsBuilder; +import org.apache.commons.lang3.builder.HashCodeBuilder; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.classification.InterfaceAudience.Private; @@ -142,6 +150,7 @@ public abstract class AbstractYarnScheduler Thread updateThread; private final Object updateThreadMonitor = new Object(); private Timer releaseCache; + private boolean autoCorrectContainerAllocation; /* * All schedulers which are inheriting AbstractYarnScheduler should use @@ -196,6 +205,10 @@ public void serviceInit(Configuration conf) throws Exception { nmHeartbeatInterval = 
conf.getLong(YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS, YarnConfiguration.DEFAULT_RM_NM_HEARTBEAT_INTERVAL_MS); + skipNodeInterval = YarnConfiguration.getSkipNodeInterval(conf); + autoCorrectContainerAllocation = + conf.getBoolean(YarnConfiguration.RM_SCHEDULER_AUTOCORRECT_CONTAINER_ALLOCATION, + YarnConfiguration.DEFAULT_RM_SCHEDULER_AUTOCORRECT_CONTAINER_ALLOCATION); long configuredMaximumAllocationWaitTime = conf.getLong(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS, YarnConfiguration.DEFAULT_RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS); @@ -589,6 +602,106 @@ public void recoverContainersOnNode(List containerReports, } } + /** + * Autocorrect container resourceRequests by decrementing the number of newly allocated containers + * from the current container request. This also updates the newlyAllocatedContainers to be within + * the limits of the current container resourceRequests. + * ResourceRequests locality/resourceName is not considered while autocorrecting the container + * request, hence when there are two types of resourceRequest which is same except for the + * locality/resourceName, it is counted as same {@link ContainerObjectType} and the container + * ask and number of newly allocated container is decremented accordingly. + * For example when a client requests for 4 containers with locality/resourceName + * as "node1", AMRMClientaugments the resourceRequest into two + * where R1(numContainer=4,locality=*) and R2(numContainer=4,locality=node1), + * if Yarn allocated 6 containers previously, it will release 2 containers as well as + * update the container ask to 0. + * + * If there is a client which directly calls Yarn (without AMRMClient) with + * two where R1(numContainer=4,locality=*) and R2(numContainer=4,locality=node1) + * the autocorrection may not work as expected. The use case of such client is very rare. + * + *
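   * A worked illustration with hypothetical numbers: if the current ask is 4 containers of a
   * given type and 6 matching containers are already sitting in newlyAllocatedContainers, the
   * ask is rewritten to 0 and the 2 surplus containers are expired and dropped from
   * newlyAllocatedContainers; if only 2 matching containers had been allocated, the ask would
   * simply be reduced to 4 - 2 = 2 and nothing would be released.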

+ * This method is called from {@link AbstractYarnScheduler#allocate} method. It is package private + * to be used within the scheduler package only. + * @param resourceRequests List of resources to be allocated + * @param application ApplicationAttempt + */ + @VisibleForTesting + protected void autoCorrectContainerAllocation(List resourceRequests, + SchedulerApplicationAttempt application) { + + // if there is no resourceRequests for containers or no newly allocated container from + // the previous request there is nothing to do. + if (!autoCorrectContainerAllocation || resourceRequests.isEmpty() || + application.newlyAllocatedContainers.isEmpty()) { + return; + } + + // iterate newlyAllocatedContainers and form a mapping of container type + // and number of its occurrence. + Map> allocatedContainerMap = new HashMap<>(); + for (RMContainer rmContainer : application.newlyAllocatedContainers) { + Container container = rmContainer.getContainer(); + ContainerObjectType containerObjectType = new ContainerObjectType( + container.getAllocationRequestId(), container.getPriority(), + container.getExecutionType(), container.getResource()); + allocatedContainerMap.computeIfAbsent(containerObjectType, + k -> new ArrayList<>()).add(rmContainer); + } + + Map extraContainerAllocatedMap = new HashMap<>(); + // iterate through resourceRequests and update the request by + // decrementing the already allocated containers. + for (ResourceRequest request : resourceRequests) { + ContainerObjectType containerObjectType = + new ContainerObjectType(request.getAllocationRequestId(), + request.getPriority(), request.getExecutionTypeRequest().getExecutionType(), + request.getCapability()); + int numContainerAllocated = allocatedContainerMap.getOrDefault(containerObjectType, + Collections.emptyList()).size(); + if (numContainerAllocated > 0) { + int numContainerAsk = request.getNumContainers(); + int updatedContainerRequest = numContainerAsk - numContainerAllocated; + if (updatedContainerRequest < 0) { + // add an entry to extra allocated map + extraContainerAllocatedMap.put(containerObjectType, Math.abs(updatedContainerRequest)); + LOG.debug("{} container of the resource type: {} will be released", + Math.abs(updatedContainerRequest), request); + // if newlyAllocatedContainer count is more than the current container + // resourceRequests, reset it to 0. + updatedContainerRequest = 0; + } + + // update the request + LOG.debug("Updating container resourceRequests from {} to {} for the resource type: {}", + numContainerAsk, updatedContainerRequest, request); + request.setNumContainers(updatedContainerRequest); + } + } + + // Iterate over the entries in extraContainerAllocatedMap + for (Map.Entry entry : extraContainerAllocatedMap.entrySet()) { + ContainerObjectType containerObjectType = entry.getKey(); + int extraContainers = entry.getValue(); + + // Get the list of allocated containers for the current ContainerObjectType + List allocatedContainers = allocatedContainerMap.get(containerObjectType); + if (allocatedContainers != null) { + for (RMContainer rmContainer : allocatedContainers) { + if (extraContainers > 0) { + // Change the state of the container from ALLOCATED to EXPIRED since it is not required. 
+ LOG.debug("Removing extra container:{}", rmContainer.getContainer()); + completedContainer(rmContainer, SchedulerUtils.createAbnormalContainerStatus( + rmContainer.getContainerId(), SchedulerUtils.EXPIRED_CONTAINER), + RMContainerEventType.EXPIRE); + application.newlyAllocatedContainers.remove(rmContainer); + extraContainers--; + } + } + } + } + } + private RMContainer recoverAndCreateContainer(NMContainerStatus status, RMNode node, String queueName) { Container container = @@ -623,6 +736,14 @@ private void recoverResourceRequestForContainer(RMContainer rmContainer) { return; } + // when auto correct container allocation is enabled, there can be a case when extra containers + // go to expired state from allocated state. When such scenario happens do not re-attempt the + // container request since this is expected. + if (autoCorrectContainerAllocation && + RMContainerState.EXPIRED.equals(rmContainer.getState())) { + return; + } + // Add resource request back to Scheduler ApplicationAttempt. // We lookup the application-attempt here again using @@ -1514,4 +1635,101 @@ public boolean attemptAllocationOnNode(SchedulerApplicationAttempt appAttempt, public void resetSchedulerMetrics() { // reset scheduler metrics } + + /** + * Gets the apps from a given queue. + * + * Mechanics: + * 1. Get all {@link ApplicationAttemptId}s in the given queue by + * {@link #getAppsInQueue(String)} method. + * 2. Always need to check validity for the given queue by the returned + * values. + * + * @param queueName queue name + * @return a collection of app attempt ids in the given queue, it maybe empty. + * @throws YarnException if {@link #getAppsInQueue(String)} return null, will + * throw this exception. + */ + private List getAppsFromQueue(String queueName) + throws YarnException { + List apps = getAppsInQueue(queueName); + if (apps == null) { + throw new YarnException("The specified queue: " + queueName + + " doesn't exist"); + } + return apps; + } + + /** + * ContainerObjectType is a container object with the following properties. + * Namely allocationId, priority, executionType and resourceType. 
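   * For instance, two ResourceRequests that are identical in these four fields but differ only
   * in resourceName (say "*" versus "node1") map to the same ContainerObjectType key, because
   * locality is intentionally excluded from equals() and hashCode().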
+ */ + protected class ContainerObjectType extends Object { + private final long allocationId; + private final Priority priority; + private final ExecutionType executionType; + private final Resource resource; + + public ContainerObjectType(long allocationId, Priority priority, + ExecutionType executionType, Resource resource) { + this.allocationId = allocationId; + this.priority = priority; + this.executionType = executionType; + this.resource = resource; + } + + public long getAllocationId() { + return allocationId; + } + + public Priority getPriority() { + return priority; + } + + public ExecutionType getExecutionType() { + return executionType; + } + + public Resource getResource() { + return resource; + } + + @Override + public int hashCode() { + return new HashCodeBuilder(17, 37) + .append(allocationId) + .append(priority) + .append(executionType) + .append(resource) + .toHashCode(); + } + + @Override + public boolean equals(Object obj) { + if (obj == null) { + return false; + } + if (obj.getClass() != this.getClass()) { + return false; + } + + ContainerObjectType other = (ContainerObjectType) obj; + return new EqualsBuilder() + .append(allocationId, other.getAllocationId()) + .append(priority, other.getPriority()) + .append(executionType, other.getExecutionType()) + .append(resource, other.getResource()) + .isEquals(); + } + + @Override + public String toString() { + return "{ContainerObjectType: " + + ", Priority: " + getPriority() + + ", Allocation Id: " + getAllocationId() + + ", Execution Type: " + getExecutionType() + + ", Resource: " + getResource() + + "}"; + } + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java index e1d0138b2bd69..e3bd20b63c8ef 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java @@ -839,7 +839,8 @@ protected synchronized void addToUpdateContainerErrors( updateContainerErrors.add(error); } - protected synchronized void addToNewlyAllocatedContainers( + @VisibleForTesting + public synchronized void addToNewlyAllocatedContainers( SchedulerNode node, RMContainer rmContainer) { ContainerId matchedContainerId = getUpdateContext().matchContainerToOutstandingIncreaseReq( diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java index cc1a2bf265886..caf2c8bc220f0 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java @@ -1236,6 +1236,10 @@ public Allocation allocate(ApplicationAttemptId applicationAttemptId, application.showRequests(); } + // update the current container ask by considering the already allocated + // containers from previous allocation request and return updatedNewlyAllocatedContainers. + autoCorrectContainerAllocation(ask, application); + // Update application requests if (application.updateResourceRequests(ask) || application .updateSchedulingRequests(schedulingRequests)) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java index ac6750a0198a8..b0a3120f3f832 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java @@ -970,6 +970,11 @@ public Allocation allocate(ApplicationAttemptId appAttemptId, } application.showRequests(); + // update the current container ask by considering the already allocated containers + // from previous allocation request as well as populate the updatedNewlyAllocatedContainers + // list according the to the current ask. 
+ autoCorrectContainerAllocation(ask, application); + // Update application requests application.updateResourceRequests(ask); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestAbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestAbstractYarnScheduler.java index e66d5390382a8..cf427be091e96 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestAbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestAbstractYarnScheduler.java @@ -19,6 +19,8 @@ package org.apache.hadoop.yarn.server.resourcemanager.scheduler; import static org.apache.hadoop.yarn.server.resourcemanager.MockNM.createMockNodeStatus; +import static org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.TestUtils.createResourceRequest; +import static org.junit.Assert.assertEquals; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -60,16 +62,23 @@ import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; import org.apache.hadoop.yarn.server.resourcemanager.ResourceTrackerService; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; +import org.apache.hadoop.yarn.server.resourcemanager.placement.ApplicationPlacementContext; import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.MockRMApp; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl; +import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptMetrics; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; +import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEventType; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent; @@ -262,6 +271,352 @@ private void testMaximumAllocationVCoresHelper( Assert.assertEquals(0, scheduler.getNumClusterNodes()); } + /** + * Test for testing autocorrect container allocation feature. 
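   * The scenarios below drive autoCorrectContainerAllocation with different combinations of
   * outstanding ask versus newly allocated containers (1/0, 1/1, 0/1, 4/8 and 4/6) and verify
   * that the ask is reduced and any surplus containers are expired.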
+ */ + @Test + public void testAutoCorrectContainerAllocation() { + Configuration conf = new Configuration(getConf()); + conf.setBoolean(YarnConfiguration.RM_SCHEDULER_AUTOCORRECT_CONTAINER_ALLOCATION, true); + conf.setBoolean("yarn.scheduler.capacity.root.auto-create-child-queue.enabled", + true); + MockRM rm = new MockRM(conf); + rm.start(); + AbstractYarnScheduler scheduler = (AbstractYarnScheduler) rm.getResourceScheduler(); + + String host = "127.0.0.1"; + RMNode node = + MockNodes.newNodeInfo(0, MockNodes.newResource(4 * 1024), 1, host); + scheduler.handle(new NodeAddedSchedulerEvent(node)); + + //add app begin + ApplicationId appId1 = BuilderUtils.newApplicationId(100, 1); + ApplicationAttemptId appAttemptId = BuilderUtils.newApplicationAttemptId( + appId1, 1); + + RMAppAttemptMetrics attemptMetric1 = + new RMAppAttemptMetrics(appAttemptId, rm.getRMContext()); + RMAppImpl app1 = mock(RMAppImpl.class); + when(app1.getApplicationId()).thenReturn(appId1); + RMAppAttemptImpl attempt1 = mock(RMAppAttemptImpl.class); + Container container = mock(Container.class); + when(attempt1.getMasterContainer()).thenReturn(container); + ApplicationSubmissionContext submissionContext = mock( + ApplicationSubmissionContext.class); + when(attempt1.getSubmissionContext()).thenReturn(submissionContext); + when(attempt1.getAppAttemptId()).thenReturn(appAttemptId); + when(attempt1.getRMAppAttemptMetrics()).thenReturn(attemptMetric1); + when(app1.getCurrentAppAttempt()).thenReturn(attempt1); + + rm.getRMContext().getRMApps().put(appId1, app1); + + ApplicationPlacementContext apc = new ApplicationPlacementContext("user", + "root"); + SchedulerEvent addAppEvent1 = + new AppAddedSchedulerEvent(appId1, "user", "user", apc); + scheduler.handle(addAppEvent1); + SchedulerEvent addAttemptEvent1 = + new AppAttemptAddedSchedulerEvent(appAttemptId, false); + scheduler.handle(addAttemptEvent1); + + SchedulerApplicationAttempt application = scheduler.getApplicationAttempt(appAttemptId); + SchedulerNode schedulerNode = scheduler.getSchedulerNode(node.getNodeID()); + Priority priority = Priority.newInstance(0); + NodeId nodeId = NodeId.newInstance("foo.bar.org", 1234); + + // test different container ask and newly allocated container. + testContainerAskAndNewlyAllocatedContainerZero(scheduler, application, priority); + testContainerAskAndNewlyAllocatedContainerOne(scheduler, application, schedulerNode, + nodeId, priority, app1.getCurrentAppAttempt().getAppAttemptId()); + testContainerAskZeroAndNewlyAllocatedContainerOne(scheduler, application, schedulerNode, + nodeId, priority, app1.getCurrentAppAttempt().getAppAttemptId()); + testContainerAskFourAndNewlyAllocatedContainerEight(scheduler, application, schedulerNode, + nodeId, priority, app1.getCurrentAppAttempt().getAppAttemptId()); + testContainerAskFourAndNewlyAllocatedContainerSix(scheduler, application, schedulerNode, + nodeId, priority, app1.getCurrentAppAttempt().getAppAttemptId()); + } + + /** + * Creates a mock instance of {@link RMContainer} with the specified parameters. 
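 * Only the fields the correction logic and these tests rely on are stubbed:
 * capability, priority, container id, node id, allocation request id,
 * execution type and a placeholder container token.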
+ * + * @param containerId The ID of the container + * @param nodeId The NodeId of the node where the container is allocated + * @param appAttemptId The ApplicationAttemptId of the application attempt + * @param allocationId The allocation ID of the container + * @param memory The amount of memory (in MB) requested for the container + * @param priority The priority of the container request + * @param executionType The execution type of the container request + * @return A mock instance of RMContainer with the specified parameters + */ + private RMContainer createMockRMContainer(int containerId, NodeId nodeId, + ApplicationAttemptId appAttemptId, long allocationId, int memory, + Priority priority, ExecutionType executionType) { + // Create a mock instance of Container + Container container = mock(Container.class); + + // Mock the Container instance with the specified parameters + when(container.getResource()).thenReturn(Resource.newInstance(memory, 1)); + when(container.getPriority()).thenReturn(priority); + when(container.getId()).thenReturn(ContainerId.newContainerId(appAttemptId, containerId)); + when(container.getNodeId()).thenReturn(nodeId); + when(container.getAllocationRequestId()).thenReturn(allocationId); + when(container.getExecutionType()).thenReturn(executionType); + when(container.getContainerToken()).thenReturn(Token.newInstance(new byte[0], "kind", + new byte[0], "service")); + + // Create a mock instance of RMContainerImpl + RMContainer rmContainer = mock(RMContainerImpl.class); + + // Set up the behavior of the mock RMContainer + when(rmContainer.getContainer()).thenReturn(container); + when(rmContainer.getContainerId()).thenReturn( + ContainerId.newContainerId(appAttemptId, containerId)); + + return rmContainer; + } + + /** + * Tests the behavior when the container ask is 1 and there are no newly allocated containers. + * + * @param scheduler The AbstractYarnScheduler instance to test. + * @param application The SchedulerApplicationAttempt instance representing the application. + * @param priority The priority of the resource request. + */ + private void testContainerAskAndNewlyAllocatedContainerZero(AbstractYarnScheduler scheduler, + SchedulerApplicationAttempt application, Priority priority) { + // Create a resource request with 1 container, 1024 MB memory, and GUARANTEED execution type + ResourceRequest resourceRequest = createResourceRequest(1024, 1, 1, + priority, 0, + ExecutionTypeRequest.newInstance(ExecutionType.GUARANTEED), ResourceRequest.ANY); + + // Create a list with the resource request + List containerAsk = new ArrayList<>(); + containerAsk.add(resourceRequest); + + // Call the autoCorrectContainerAllocation method + scheduler.autoCorrectContainerAllocation(containerAsk, application); + + // Assert that the container ask remains unchanged (1 container) + assertEquals(1, containerAsk.get(0).getNumContainers()); + + // Assert that there are no newly allocated containers + assertEquals(0, application.pullNewlyAllocatedContainers().size()); + } + + /** + * Tests the behavior when the container ask is 1 and there is one newly allocated container. 
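 * The expected outcome is that the ask is corrected down to 0 while the
 * already allocated container is still returned by
 * {@code pullNewlyAllocatedContainers()}.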
+ * + * @param scheduler The AbstractYarnScheduler instance to test + * @param application The SchedulerApplicationAttempt instance representing the application + * @param schedulerNode The SchedulerNode instance representing the node + * @param nodeId The NodeId of the node + * @param priority The priority of the resource request + * @param appAttemptId The ApplicationAttemptId of the application attempt + */ + private void testContainerAskAndNewlyAllocatedContainerOne(AbstractYarnScheduler scheduler, + SchedulerApplicationAttempt application, + SchedulerNode schedulerNode, NodeId nodeId, + Priority priority, ApplicationAttemptId appAttemptId) { + // Create a resource request with 1 container, 1024 MB memory, and GUARANTEED execution type + ResourceRequest resourceRequest = createResourceRequest(1024, 1, 1, + priority, 0L, ExecutionTypeRequest.newInstance(ExecutionType.GUARANTEED), + ResourceRequest.ANY); + List containerAsk = new ArrayList<>(); + containerAsk.add(resourceRequest); + + // Create an RMContainer with the specified parameters + RMContainer rmContainer = createMockRMContainer(1, nodeId, appAttemptId, + 0L, 1024, priority, ExecutionType.GUARANTEED); + + // Add the RMContainer to the newly allocated containers of the application + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer); + + // Call the autoCorrectContainerAllocation method + scheduler.autoCorrectContainerAllocation(containerAsk, application); + + // Assert that the container ask is updated to 0 + assertEquals(0, containerAsk.get(0).getNumContainers()); + + // Assert that there is one newly allocated container + assertEquals(1, application.pullNewlyAllocatedContainers().size()); + } + + /** + * Tests the behavior when the container ask is 0 and there is one newly allocated container. 
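 * Since nothing is being asked for, the surplus container is expected to be
 * dropped from the newly allocated list, so the subsequent pull returns an
 * empty list.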
+ * + * @param scheduler The AbstractYarnScheduler instance to test + * @param application The SchedulerApplicationAttempt instance representing the application + * @param schedulerNode The SchedulerNode instance representing the node + * @param nodeId The NodeId of the node + * @param priority The priority of the resource request + * @param appAttemptId The ApplicationAttemptId of the application attempt + */ + private void testContainerAskZeroAndNewlyAllocatedContainerOne(AbstractYarnScheduler scheduler, + SchedulerApplicationAttempt application, SchedulerNode schedulerNode, NodeId nodeId, + Priority priority, ApplicationAttemptId appAttemptId) { + // Create a resource request with 0 containers, 1024 MB memory, and GUARANTEED execution type + ResourceRequest resourceRequest = createResourceRequest(1024, 1, + 0, priority, 0L, + ExecutionTypeRequest.newInstance(ExecutionType.GUARANTEED), ResourceRequest.ANY); + List containerAsk = new ArrayList<>(); + containerAsk.add(resourceRequest); + + // Create an RMContainer with the specified parameters + RMContainer rmContainer1 = createMockRMContainer(1, nodeId, appAttemptId, + 0L, 1024, priority, ExecutionType.GUARANTEED); + + // Add the RMContainer to the newly allocated containers of the application + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer1); + + // Call the autoCorrectContainerAllocation method + scheduler.autoCorrectContainerAllocation(containerAsk, application); + + // Assert that the container ask remains 0 + assertEquals(0, resourceRequest.getNumContainers()); + + // Assert that there are no newly allocated containers + assertEquals(0, application.pullNewlyAllocatedContainers().size()); + } + + /** + * Tests the behavior when the container ask consists of four unique resource requests + * and there are eight newly allocated containers (two containers for each resource request type). 
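 * With one container asked per request and two allocated per request, every
 * ask should drop to 0 and exactly four containers (one per request) should
 * remain to be pulled.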
+ * + * @param scheduler The AbstractYarnScheduler instance to test + * @param application The SchedulerApplicationAttempt instance representing the application + * @param schedulerNode The SchedulerNode instance representing the node + * @param nodeId The NodeId of the node + * @param priority The priority of the resource requests + * @param appAttemptId The ApplicationAttemptId of the application attempt + */ + private void testContainerAskFourAndNewlyAllocatedContainerEight(AbstractYarnScheduler scheduler, + SchedulerApplicationAttempt application, SchedulerNode schedulerNode, + NodeId nodeId, Priority priority, ApplicationAttemptId appAttemptId) { + // Create four unique resource requests + ResourceRequest resourceRequest1 = createResourceRequest(1024, 1, 1, + priority, 0L, + ExecutionTypeRequest.newInstance(ExecutionType.GUARANTEED), ResourceRequest.ANY); + ResourceRequest resourceRequest2 = createResourceRequest(2048, 1, 1, + priority, 0L, + ExecutionTypeRequest.newInstance(ExecutionType.GUARANTEED), ResourceRequest.ANY); + ResourceRequest resourceRequest3 = createResourceRequest(1024, 1, 1, + priority, 1L, + ExecutionTypeRequest.newInstance(ExecutionType.GUARANTEED), ResourceRequest.ANY); + ResourceRequest resourceRequest4 = createResourceRequest(1024, 1, 1, + priority, 0L, + ExecutionTypeRequest.newInstance(ExecutionType.OPPORTUNISTIC), ResourceRequest.ANY); + + // Add the resource requests to a list + List ask4 = new ArrayList<>(); + ask4.add(resourceRequest1); + ask4.add(resourceRequest2); + ask4.add(resourceRequest3); + ask4.add(resourceRequest4); + + // Create eight RMContainers (two for each resource request type) + RMContainer rmContainer1 = createMockRMContainer(1, nodeId, appAttemptId, + 0L, 1024, priority, ExecutionType.GUARANTEED); + RMContainer rmContainer2 = createMockRMContainer(2, nodeId, appAttemptId, + 0L, 1024, priority, ExecutionType.GUARANTEED); + RMContainer rmContainer3 = createMockRMContainer(3, nodeId, appAttemptId, + 0L, 2048, priority, ExecutionType.GUARANTEED); + RMContainer rmContainer4 = createMockRMContainer(4, nodeId, appAttemptId, + 0L, 2048, priority, ExecutionType.GUARANTEED); + RMContainer rmContainer5 = createMockRMContainer(5, nodeId, appAttemptId, + 1L, 1024, priority, ExecutionType.GUARANTEED); + RMContainer rmContainer6 = createMockRMContainer(6, nodeId, appAttemptId, + 1L, 1024, priority, ExecutionType.GUARANTEED); + RMContainer rmContainer7 = createMockRMContainer(7, nodeId, appAttemptId, + 0L, 1024, priority, ExecutionType.OPPORTUNISTIC); + RMContainer rmContainer8 = createMockRMContainer(8, nodeId, appAttemptId, + 0L, 1024, priority, ExecutionType.OPPORTUNISTIC); + + // Add the RMContainers to the newly allocated containers of the application + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer1); + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer2); + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer3); + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer4); + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer5); + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer6); + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer7); + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer8); + + // Call the autoCorrectContainerAllocation method + scheduler.autoCorrectContainerAllocation(ask4, application); + + // Assert that all resource requests have 0 containers + for (ResourceRequest rr : ask4) { + 
assertEquals(0, rr.getNumContainers()); + } + + // Assert that there are four newly allocated containers + assertEquals(4, application.pullNewlyAllocatedContainers().size()); + } + + /** + * Tests the behavior when the container ask consists of two resource requests. + * i.e one for any host and one for a specific host , + * each requesting four containers, and there are six newly allocated containers. + * + * @param scheduler The AbstractYarnScheduler instance to test + * @param application The SchedulerApplicationAttempt instance representing the application + * @param schedulerNode The SchedulerNode instance representing the node + * @param nodeId The NodeId of the node + * @param priority The priority of the resource requests + * @param appAttemptId The ApplicationAttemptId of the application attempt + */ + private void testContainerAskFourAndNewlyAllocatedContainerSix(AbstractYarnScheduler scheduler, + SchedulerApplicationAttempt application, SchedulerNode schedulerNode, + NodeId nodeId, Priority priority, ApplicationAttemptId appAttemptId) { + // Create a resource request for any host, requesting 4 containers + ResourceRequest resourceRequest1 = createResourceRequest(1024, 1, 4, + priority, 0L, + ExecutionTypeRequest.newInstance(ExecutionType.GUARANTEED), ResourceRequest.ANY); + + // Create a resource request for a specific host, requesting 4 containers + ResourceRequest resourceRequest2 = createResourceRequest(1024, 1, 4, + priority, 0L, + ExecutionTypeRequest.newInstance(ExecutionType.GUARANTEED), nodeId.getHost()); + + // Add the resource requests to a list + List containerAsk = new ArrayList<>(); + containerAsk.add(resourceRequest1); + containerAsk.add(resourceRequest2); + + // Create six RMContainers with the specified parameters + RMContainer rmContainer1 = createMockRMContainer(1, nodeId, appAttemptId, + 0L, 1024, priority, ExecutionType.GUARANTEED); + RMContainer rmContainer2 = createMockRMContainer(2, nodeId, appAttemptId, + 0L, 1024, priority, ExecutionType.GUARANTEED); + RMContainer rmContainer3 = createMockRMContainer(3, nodeId, appAttemptId, + 0L, 1024, priority, ExecutionType.GUARANTEED); + RMContainer rmContainer4 = createMockRMContainer(4, nodeId, appAttemptId, + 0L, 1024, priority, ExecutionType.GUARANTEED); + RMContainer rmContainer5 = createMockRMContainer(5, nodeId, appAttemptId, + 0L, 1024, priority, ExecutionType.GUARANTEED); + RMContainer rmContainer6 = createMockRMContainer(6, nodeId, appAttemptId, + 0L, 1024, priority, ExecutionType.GUARANTEED); + + // Add the RMContainers to the newly allocated containers of the application + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer1); + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer2); + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer3); + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer4); + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer5); + application.addToNewlyAllocatedContainers(schedulerNode, rmContainer6); + + // Call the autoCorrectContainerAllocation method + scheduler.autoCorrectContainerAllocation(containerAsk, application); + + // Assert that all resource requests have 0 containers + for (ResourceRequest resourceRequest : containerAsk) { + assertEquals(0, resourceRequest.getNumContainers()); + } + + // Assert that there are four newly allocated containers + assertEquals(4, application.pullNewlyAllocatedContainers().size()); + } + @Test public void testUpdateMaxAllocationUsesTotal() throws IOException 
{ final int configuredMaxVCores = 20; diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestUtils.java index 418f85983a8e2..e07ffd20ad50a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestUtils.java @@ -179,6 +179,20 @@ public static ResourceRequest createResourceRequest(String resourceName, request.setNodeLabelExpression(labelExpression); return request; } + + public static ResourceRequest createResourceRequest(int memory, int vcores, int numContainers, + Priority priority, long allocationId, ExecutionTypeRequest type, String resourceName) { + ResourceRequest request = + recordFactory.newRecordInstance(ResourceRequest.class); + Resource capability = Resources.createResource(memory, vcores); + request.setNumContainers(numContainers); + request.setCapability(capability); + request.setPriority(priority); + request.setAllocationRequestId(allocationId); + request.setExecutionTypeRequest(type); + request.setResourceName(resourceName); + return request; + } public static ResourceRequest createResourceRequest( String resourceName, int memory, int numContainers, boolean relaxLocality, From 4f26b365135e4592d88378d64da2cedddec476df Mon Sep 17 00:00:00 2001 From: Prabhjyot Singh Date: Wed, 20 Nov 2024 10:17:18 -0500 Subject: [PATCH 13/40] ODP-2635: HADOOP-16167: Fixed Hadoop shell script for Ubuntu 18. (#44) Contributed by Daniel Templeton (cherry picked from commit 5446e3cb8a4d9b6aa517fc8437ba194a9ae9b193) (cherry picked from commit 11d6c4eccc46e3a1c5887e0bf6bac009542482df) Co-authored-by: Eric Yang --- .../hadoop-common/src/main/bin/hadoop-functions.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh b/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh index 31bc607e14db5..6eadf09b94c64 100755 --- a/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh +++ b/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh @@ -2361,6 +2361,10 @@ function hadoop_verify_user_perm declare command=$2 declare uvar + if [[ ${command} =~ \. ]]; then + return 1 + fi + uvar=$(hadoop_build_custom_subcmd_var "${program}" "${command}" USER) if [[ -n ${!uvar} ]]; then @@ -2392,6 +2396,10 @@ function hadoop_need_reexec return 1 fi + if [[ ${command} =~ \. ]]; then + return 1 + fi + # if we have privilege, and the _USER is defined, and _USER is # set to someone who isn't us, then yes, we should re-exec. # otherwise no, don't re-exec and let the system deal with it. @@ -2428,6 +2436,10 @@ function hadoop_subcommand_opts return 1 fi + if [[ ${command} =~ \. ]]; then + return 1 + fi + # bash 4 and up have built-in ways to upper and lower # case the contents of vars. This is faster than # calling tr. 
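Looking back at the scheduler-test changes above: the new TestUtils.createResourceRequest overload is how those tests build their asks. The following stand-alone sketch (illustrative values only, and it assumes the resourcemanager test classpath is available) shows the shape of a request it produces:

import org.apache.hadoop.yarn.api.records.ExecutionType;
import org.apache.hadoop.yarn.api.records.ExecutionTypeRequest;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.TestUtils;

public class CreateResourceRequestSketch {
  public static void main(String[] args) {
    // One GUARANTEED container of 1024 MB / 1 vcore, priority 0,
    // allocation request id 0, placeable on any host.
    ResourceRequest ask = TestUtils.createResourceRequest(1024, 1, 1,
        Priority.newInstance(0), 0L,
        ExecutionTypeRequest.newInstance(ExecutionType.GUARANTEED),
        ResourceRequest.ANY);
    System.out.println(ask.getNumContainers() + " x " + ask.getCapability()
        + " (" + ask.getExecutionTypeRequest().getExecutionType() + ")");
  }
}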
From 57b8078daecebdce55f996f425245a6ebfd21407 Mon Sep 17 00:00:00 2001 From: Prabhjyot Singh Date: Wed, 20 Nov 2024 10:18:44 -0500 Subject: [PATCH 14/40] ODP-2634: YARN-10352: Skip schedule on not heartbeated nodes in Multi Node Placement. Contributed by Prabhu Joseph and Qi Zhu (#43) (cherry picked from commit bc815b3ddff2fa4499e5a8b2ffe9ea0d3e8e712d) --- .../hadoop/yarn/conf/YarnConfiguration.java | 244 ++++++++++-------- .../src/main/resources/yarn-default.xml | 71 ++--- .../scheduler/AbstractYarnScheduler.java | 5 + .../scheduler/SchedulerUtils.java | 14 +- .../scheduler/capacity/CapacityScheduler.java | 99 +++++-- .../scheduler/placement/MultiNodeSorter.java | 2 +- .../placement/MultiNodeSortingManager.java | 43 ++- 7 files changed, 307 insertions(+), 171 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java index 8beea5a64bb50..acb7fbb7fe52d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java @@ -162,27 +162,27 @@ private static void addDeprecatedKeys() { /** Factory to create client IPC classes.*/ public static final String IPC_CLIENT_FACTORY_CLASS = IPC_PREFIX + "client.factory.class"; - public static final String DEFAULT_IPC_CLIENT_FACTORY_CLASS = + public static final String DEFAULT_IPC_CLIENT_FACTORY_CLASS = "org.apache.hadoop.yarn.factories.impl.pb.RpcClientFactoryPBImpl"; /** Factory to create server IPC classes.*/ - public static final String IPC_SERVER_FACTORY_CLASS = + public static final String IPC_SERVER_FACTORY_CLASS = IPC_PREFIX + "server.factory.class"; - public static final String DEFAULT_IPC_SERVER_FACTORY_CLASS = + public static final String DEFAULT_IPC_SERVER_FACTORY_CLASS = "org.apache.hadoop.yarn.factories.impl.pb.RpcServerFactoryPBImpl"; /** Factory to create serializable records.*/ - public static final String IPC_RECORD_FACTORY_CLASS = + public static final String IPC_RECORD_FACTORY_CLASS = IPC_PREFIX + "record.factory.class"; - public static final String DEFAULT_IPC_RECORD_FACTORY_CLASS = + public static final String DEFAULT_IPC_RECORD_FACTORY_CLASS = "org.apache.hadoop.yarn.factories.impl.pb.RecordFactoryPBImpl"; /** RPC class implementation*/ public static final String IPC_RPC_IMPL = IPC_PREFIX + "rpc.class"; - public static final String DEFAULT_IPC_RPC_IMPL = + public static final String DEFAULT_IPC_RPC_IMPL = "org.apache.hadoop.yarn.ipc.HadoopYarnProtoRPC"; - + //////////////////////////////// // Resource Manager Configs //////////////////////////////// @@ -201,7 +201,7 @@ private static void addDeprecatedKeys() { public static final long DEFAULT_RM_EPOCH_RANGE = 0; /** The address of the applications manager interface in the RM.*/ - public static final String RM_ADDRESS = + public static final String RM_ADDRESS = RM_PREFIX + "address"; public static final int DEFAULT_RM_PORT = 8032; public static final String DEFAULT_RM_ADDRESS = @@ -248,9 +248,9 @@ private static void addDeprecatedKeys() { /** The Kerberos principal for the resource manager.*/ public static final String RM_PRINCIPAL = RM_PREFIX + "principal"; - + /** The address of the scheduler interface.*/ - public static final String RM_SCHEDULER_ADDRESS = + public static final String RM_SCHEDULER_ADDRESS = 
RM_PREFIX + "scheduler.address"; public static final int DEFAULT_RM_SCHEDULER_PORT = 8030; public static final String DEFAULT_RM_SCHEDULER_ADDRESS = "0.0.0.0:" + @@ -278,12 +278,12 @@ private static void addDeprecatedKeys() { public static final int DEFAULT_RM_SCHEDULER_CLIENT_THREAD_COUNT = 50; /** If the port should be included or not in the node name. The node name - * is used by the scheduler for resource requests allocation location + * is used by the scheduler for resource requests allocation location * matching. Typically this is just the hostname, using the port is needed * when using minicluster and specific NM are required.*/ public static final String RM_SCHEDULER_INCLUDE_PORT_IN_NODE_NAME = YARN_PREFIX + "scheduler.include-port-in-node-name"; - public static final boolean DEFAULT_RM_SCHEDULER_USE_PORT_FOR_NODE_NAME = + public static final boolean DEFAULT_RM_SCHEDULER_USE_PORT_FOR_NODE_NAME = false; /** Configured scheduler queue placement rules. */ @@ -338,19 +338,19 @@ private static void addDeprecatedKeys() { RM_PREFIX + "scheduler.monitor.policies"; /** The address of the RM web application.*/ - public static final String RM_WEBAPP_ADDRESS = + public static final String RM_WEBAPP_ADDRESS = RM_PREFIX + "webapp.address"; public static final int DEFAULT_RM_WEBAPP_PORT = 8088; public static final String DEFAULT_RM_WEBAPP_ADDRESS = "0.0.0.0:" + DEFAULT_RM_WEBAPP_PORT; - + /** The https address of the RM web application.*/ public static final String RM_WEBAPP_HTTPS_ADDRESS = RM_PREFIX + "webapp.https.address"; public static final boolean YARN_SSL_CLIENT_HTTPS_NEED_AUTH_DEFAULT = false; public static final String YARN_SSL_SERVER_RESOURCE_DEFAULT = "ssl-server.xml"; - + public static final int DEFAULT_RM_WEBAPP_HTTPS_PORT = 8090; public static final String DEFAULT_RM_WEBAPP_HTTPS_ADDRESS = "0.0.0.0:" + DEFAULT_RM_WEBAPP_HTTPS_PORT; @@ -378,17 +378,17 @@ private static void addDeprecatedKeys() { "0.0.0.0:" + DEFAULT_RM_RESOURCE_TRACKER_PORT; /** The expiry interval for application master reporting.*/ - public static final String RM_AM_EXPIRY_INTERVAL_MS = + public static final String RM_AM_EXPIRY_INTERVAL_MS = YARN_PREFIX + "am.liveness-monitor.expiry-interval-ms"; public static final int DEFAULT_RM_AM_EXPIRY_INTERVAL_MS = 600000; /** How long to wait until a node manager is considered dead.*/ - public static final String RM_NM_EXPIRY_INTERVAL_MS = + public static final String RM_NM_EXPIRY_INTERVAL_MS = YARN_PREFIX + "nm.liveness-monitor.expiry-interval-ms"; public static final int DEFAULT_RM_NM_EXPIRY_INTERVAL_MS = 600000; /** Are acls enabled.*/ - public static final String YARN_ACL_ENABLE = + public static final String YARN_ACL_ENABLE = YARN_PREFIX + "acl.enable"; public static final boolean DEFAULT_YARN_ACL_ENABLE = false; @@ -402,10 +402,10 @@ public static boolean isAclEnabled(Configuration conf) { } /** ACL of who can be admin of YARN cluster.*/ - public static final String YARN_ADMIN_ACL = + public static final String YARN_ADMIN_ACL = YARN_PREFIX + "admin.acl"; public static final String DEFAULT_YARN_ADMIN_ACL = "*"; - + /** ACL used in case none is found. Allows nothing. 
*/ public static final String DEFAULT_YARN_APP_ACL = " "; @@ -491,17 +491,17 @@ public static boolean isAclEnabled(Configuration conf) { public static final boolean DEFAULT_YARN_INTERMEDIATE_DATA_ENCRYPTION = false; /** The address of the RM admin interface.*/ - public static final String RM_ADMIN_ADDRESS = + public static final String RM_ADMIN_ADDRESS = RM_PREFIX + "admin.address"; public static final int DEFAULT_RM_ADMIN_PORT = 8033; public static final String DEFAULT_RM_ADMIN_ADDRESS = "0.0.0.0:" + DEFAULT_RM_ADMIN_PORT; - + /**Number of threads used to handle RM admin interface.*/ public static final String RM_ADMIN_CLIENT_THREAD_COUNT = RM_PREFIX + "admin.client.thread-count"; public static final int DEFAULT_RM_ADMIN_CLIENT_THREAD_COUNT = 1; - + /** * The maximum number of application attempts for * an application, if unset by user. @@ -516,15 +516,15 @@ public static boolean isAclEnabled(Configuration conf) { */ public static final String GLOBAL_RM_AM_MAX_ATTEMPTS = RM_PREFIX + "am.global.max-attempts"; - + /** The keytab for the resource manager.*/ - public static final String RM_KEYTAB = + public static final String RM_KEYTAB = RM_PREFIX + "keytab"; /**The kerberos principal to be used for spnego filter for RM.*/ public static final String RM_WEBAPP_SPNEGO_USER_NAME_KEY = RM_PREFIX + "webapp.spnego-principal"; - + /**The kerberos keytab to be used for spnego filter for RM.*/ public static final String RM_WEBAPP_SPNEGO_KEYTAB_FILE_KEY = RM_PREFIX + "webapp.spnego-keytab-file"; @@ -546,12 +546,12 @@ public static boolean isAclEnabled(Configuration conf) { public static final boolean DEFAULT_RM_WEBAPP_ENABLE_CORS_FILTER = false; /** How long to wait until a container is considered dead.*/ - public static final String RM_CONTAINER_ALLOC_EXPIRY_INTERVAL_MS = + public static final String RM_CONTAINER_ALLOC_EXPIRY_INTERVAL_MS = RM_PREFIX + "rm.container-allocation.expiry-interval-ms"; public static final int DEFAULT_RM_CONTAINER_ALLOC_EXPIRY_INTERVAL_MS = 600000; - + /** Path to file with nodes to include.*/ - public static final String RM_NODES_INCLUDE_FILE_PATH = + public static final String RM_NODES_INCLUDE_FILE_PATH = RM_PREFIX + "nodes.include-path"; public static final String DEFAULT_RM_NODES_INCLUDE_FILE_PATH = ""; @@ -572,19 +572,19 @@ public static boolean isAclEnabled(Configuration conf) { RM_PREFIX + "submission-preprocessor.file-refresh-interval-ms"; public static final int DEFAULT_RM_SUBMISSION_PREPROCESSOR_REFRESH_INTERVAL_MS = 0; - + /** Path to file with nodes to exclude.*/ - public static final String RM_NODES_EXCLUDE_FILE_PATH = + public static final String RM_NODES_EXCLUDE_FILE_PATH = RM_PREFIX + "nodes.exclude-path"; public static final String DEFAULT_RM_NODES_EXCLUDE_FILE_PATH = ""; - + /** Number of threads to handle resource tracker calls.*/ public static final String RM_RESOURCE_TRACKER_CLIENT_THREAD_COUNT = RM_PREFIX + "resource-tracker.client.thread-count"; public static final int DEFAULT_RM_RESOURCE_TRACKER_CLIENT_THREAD_COUNT = 50; - + /** The class to use as the resource scheduler.*/ - public static final String RM_SCHEDULER = + public static final String RM_SCHEDULER = RM_PREFIX + "scheduler.class"; /** @@ -663,8 +663,8 @@ public static boolean isAclEnabled(Configuration conf) { public static final int DEFAULT_RM_PLACEMENT_CONSTRAINTS_SCHEDULER_POOL_SIZE = 1; - - public static final String DEFAULT_RM_SCHEDULER = + + public static final String DEFAULT_RM_SCHEDULER = "org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler"; /** RM 
set next Heartbeat interval for NM */ @@ -696,6 +696,14 @@ public static boolean isAclEnabled(Configuration conf) { public static final float DEFAULT_RM_NM_HEARTBEAT_INTERVAL_SLOWDOWN_FACTOR = 1.0f; + /** + * Number of consecutive missed heartbeats after which node will be + * skipped from scheduling. + */ + public static final String SCHEDULER_SKIP_NODE_MULTIPLIER = + YARN_PREFIX + "scheduler.skip.node.multiplier"; + public static final int DEFAULT_SCHEDULER_SKIP_NODE_MULTIPLIER = 2; + /** Number of worker threads that write the history data. */ public static final String RM_HISTORY_WRITER_MULTI_THREADED_DISPATCHER_POOL_SIZE = RM_PREFIX + "history-writer.multi-threaded-dispatcher.pool-size"; @@ -1017,7 +1025,7 @@ public static boolean isAclEnabled(Configuration conf) { //////////////////////////////// /** The class to use as the persistent store.*/ public static final String RM_STORE = RM_PREFIX + "store.class"; - + /** URI for FileSystemRMStateStore */ public static final String FS_RM_STATE_STORE_URI = RM_PREFIX + "fs.state-store.uri"; @@ -1075,7 +1083,7 @@ public static boolean isAclEnabled(Configuration conf) { /** Default application type length */ public static final int APPLICATION_TYPE_LENGTH = 20; - + /** Default queue name */ public static final String DEFAULT_QUEUE_NAME = "default"; @@ -1088,7 +1096,7 @@ public static boolean isAclEnabled(Configuration conf) { /** * Default sizes of the runtime metric buckets in minutes. */ - public static final String DEFAULT_RM_METRICS_RUNTIME_BUCKETS = + public static final String DEFAULT_RM_METRICS_RUNTIME_BUCKETS = "60,300,1440"; public static final String RM_AMRM_TOKEN_MASTER_KEY_ROLLING_INTERVAL_SECS = RM_PREFIX @@ -1105,7 +1113,7 @@ public static boolean isAclEnabled(Configuration conf) { public static final String RM_NMTOKEN_MASTER_KEY_ROLLING_INTERVAL_SECS = RM_PREFIX + "nm-tokens.master-key-rolling-interval-secs"; - + public static final long DEFAULT_RM_NMTOKEN_MASTER_KEY_ROLLING_INTERVAL_SECS = 24 * 60 * 60; @@ -1184,7 +1192,7 @@ public static boolean isAclEnabled(Configuration conf) { //////////////////////////////// // Node Manager Configs //////////////////////////////// - + /** Prefix for all node manager configs.*/ public static final String NM_PREFIX = "yarn.nodemanager."; @@ -1221,13 +1229,13 @@ public static boolean isAclEnabled(Configuration conf) { ApplicationConstants.Environment.HADOOP_CONF_DIR.key(), ApplicationConstants.Environment.CLASSPATH_PREPEND_DISTCACHE.key(), ApplicationConstants.Environment.HADOOP_YARN_HOME.key())); - + /** address of node manager IPC.*/ public static final String NM_ADDRESS = NM_PREFIX + "address"; public static final int DEFAULT_NM_PORT = 0; public static final String DEFAULT_NM_ADDRESS = "0.0.0.0:" + DEFAULT_NM_PORT; - + /** The actual bind address for the NM.*/ public static final String NM_BIND_HOST = NM_PREFIX + "bind-host"; @@ -1240,42 +1248,42 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NM_CONTAINER_STATE_TRANSITION_LISTENERS = NM_PREFIX + "container-state-transition-listener.classes"; - /** + /** * Adjustment to make to the container os scheduling priority. * The valid values for this could vary depending on the platform. - * On Linux, higher values mean run the containers at a less - * favorable priority than the NM. + * On Linux, higher values mean run the containers at a less + * favorable priority than the NM. * The value specified is an int. 
*/ - public static final String NM_CONTAINER_EXECUTOR_SCHED_PRIORITY = + public static final String NM_CONTAINER_EXECUTOR_SCHED_PRIORITY = NM_PREFIX + "container-executor.os.sched.priority.adjustment"; public static final int DEFAULT_NM_CONTAINER_EXECUTOR_SCHED_PRIORITY = 0; - + /** Number of threads container manager uses.*/ public static final String NM_CONTAINER_MGR_THREAD_COUNT = NM_PREFIX + "container-manager.thread-count"; public static final int DEFAULT_NM_CONTAINER_MGR_THREAD_COUNT = 20; - + /** Number of threads container manager uses.*/ public static final String NM_COLLECTOR_SERVICE_THREAD_COUNT = NM_PREFIX + "collector-service.thread-count"; public static final int DEFAULT_NM_COLLECTOR_SERVICE_THREAD_COUNT = 5; /** Number of threads used in cleanup.*/ - public static final String NM_DELETE_THREAD_COUNT = + public static final String NM_DELETE_THREAD_COUNT = NM_PREFIX + "delete.thread-count"; public static final int DEFAULT_NM_DELETE_THREAD_COUNT = 4; - + /** Keytab for NM.*/ public static final String NM_KEYTAB = NM_PREFIX + "keytab"; - + /**List of directories to store localized files in.*/ public static final String NM_LOCAL_DIRS = NM_PREFIX + "local-dirs"; public static final String DEFAULT_NM_LOCAL_DIRS = "/tmp/nm-local-dir"; /** * Number of files in each localized directories - * Avoid tuning this too low. + * Avoid tuning this too low. */ public static final String NM_LOCAL_CACHE_MAX_FILES_PER_DIRECTORY = NM_PREFIX + "local-cache.max-files-per-directory"; @@ -1287,7 +1295,7 @@ public static boolean isAclEnabled(Configuration conf) { public static final int DEFAULT_NM_LOCALIZER_PORT = 8040; public static final String DEFAULT_NM_LOCALIZER_ADDRESS = "0.0.0.0:" + DEFAULT_NM_LOCALIZER_PORT; - + /** Address where the collector service IPC is.*/ public static final String NM_COLLECTOR_SERVICE_ADDRESS = NM_PREFIX + "collector-service.address"; @@ -1308,9 +1316,9 @@ public static boolean isAclEnabled(Configuration conf) { /** Interval in between cache cleanups.*/ public static final String NM_LOCALIZER_CACHE_CLEANUP_INTERVAL_MS = NM_PREFIX + "localizer.cache.cleanup.interval-ms"; - public static final long DEFAULT_NM_LOCALIZER_CACHE_CLEANUP_INTERVAL_MS = + public static final long DEFAULT_NM_LOCALIZER_CACHE_CLEANUP_INTERVAL_MS = 10 * 60 * 1000; - + /** * Target size of localizer cache in MB, per nodemanager. 
It is a target * retention size that only includes resources with PUBLIC and PRIVATE @@ -1319,14 +1327,14 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NM_LOCALIZER_CACHE_TARGET_SIZE_MB = NM_PREFIX + "localizer.cache.target-size-mb"; public static final long DEFAULT_NM_LOCALIZER_CACHE_TARGET_SIZE_MB = 10 * 1024; - + /** Number of threads to handle localization requests.*/ public static final String NM_LOCALIZER_CLIENT_THREAD_COUNT = NM_PREFIX + "localizer.client.thread-count"; public static final int DEFAULT_NM_LOCALIZER_CLIENT_THREAD_COUNT = 5; - + /** Number of threads to use for localization fetching.*/ - public static final String NM_LOCALIZER_FETCH_THREAD_COUNT = + public static final String NM_LOCALIZER_FETCH_THREAD_COUNT = NM_PREFIX + "localizer.fetch.thread-count"; public static final int DEFAULT_NM_LOCALIZER_FETCH_THREAD_COUNT = 4; @@ -1381,7 +1389,7 @@ public static boolean isAclEnabled(Configuration conf) { RM_PREFIX + "delayed.delegation-token.removal-interval-ms"; public static final long DEFAULT_RM_DELAYED_DELEGATION_TOKEN_REMOVAL_INTERVAL_MS = 30000l; - + /** Delegation Token renewer thread count */ public static final String RM_DELEGATION_TOKEN_RENEWER_THREAD_COUNT = RM_PREFIX + "delegation-token-renewer.thread-count"; @@ -1432,7 +1440,7 @@ public static boolean isAclEnabled(Configuration conf) { public static final String LOG_AGGREGATION_REMOTE_APP_LOG_DIR_SUFFIX_FMT = YARN_PREFIX + "log-aggregation.%s.remote-app-log-dir-suffix"; - /** + /** * How long to wait before deleting aggregated logs, -1 disables. * Be careful set this too small and you will spam the name node. */ @@ -1444,7 +1452,7 @@ public static boolean isAclEnabled(Configuration conf) { + "log-aggregation.debug.filesize"; public static final long DEFAULT_LOG_AGGREGATION_DEBUG_FILESIZE = 100 * 1024 * 1024; - + /** * How long to wait between aggregated log retention checks. If set to * a value {@literal <=} 0 then the value is computed as one-tenth of the @@ -1506,12 +1514,12 @@ public static boolean isAclEnabled(Configuration conf) { * Number of threads used in log cleanup. 
Only applicable if Log aggregation * is disabled */ - public static final String NM_LOG_DELETION_THREADS_COUNT = + public static final String NM_LOG_DELETION_THREADS_COUNT = NM_PREFIX + "log.deletion-threads-count"; public static final int DEFAULT_NM_LOG_DELETE_THREAD_COUNT = 4; /** Where to aggregate logs to.*/ - public static final String NM_REMOTE_APP_LOG_DIR = + public static final String NM_REMOTE_APP_LOG_DIR = NM_PREFIX + "remote-app-log-dir"; public static final String DEFAULT_NM_REMOTE_APP_LOG_DIR = "/tmp/logs"; @@ -1519,7 +1527,7 @@ public static boolean isAclEnabled(Configuration conf) { * The remote log dir will be created at * NM_REMOTE_APP_LOG_DIR/${user}/NM_REMOTE_APP_LOG_DIR_SUFFIX/${appId} */ - public static final String NM_REMOTE_APP_LOG_DIR_SUFFIX = + public static final String NM_REMOTE_APP_LOG_DIR_SUFFIX = NM_PREFIX + "remote-app-log-dir-suffix"; public static final String DEFAULT_NM_REMOTE_APP_LOG_DIR_SUFFIX="logs"; @@ -1529,7 +1537,7 @@ public static boolean isAclEnabled(Configuration conf) { public static final String YARN_LOG_SERVER_WEBSERVICE_URL = YARN_PREFIX + "log.server.web-service.url"; - public static final String YARN_TRACKING_URL_GENERATOR = + public static final String YARN_TRACKING_URL_GENERATOR = YARN_PREFIX + "tracking.url.generator"; /** Amount of memory in MB that can be allocated for containers.*/ @@ -1829,7 +1837,7 @@ public static boolean isAclEnabled(Configuration conf) { + "webapp.https.address"; public static final int DEFAULT_NM_WEBAPP_HTTPS_PORT = 8044; public static final String DEFAULT_NM_WEBAPP_HTTPS_ADDRESS = "0.0.0.0:" - + DEFAULT_NM_WEBAPP_HTTPS_PORT; + + DEFAULT_NM_WEBAPP_HTTPS_PORT; /** Enable/disable CORS filter. */ public static final String NM_WEBAPP_ENABLE_CORS_FILTER = @@ -2010,14 +2018,14 @@ public static boolean isAclEnabled(Configuration conf) { public static final long DEFAULT_NM_MIN_PER_DISK_FREE_SPACE_MB = 0; /** Frequency of running node health script.*/ - public static final String NM_HEALTH_CHECK_INTERVAL_MS = + public static final String NM_HEALTH_CHECK_INTERVAL_MS = NM_PREFIX + "health-checker.interval-ms"; public static final long DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS = 10 * 60 * 1000; - /** Health check script time out period.*/ - public static final String NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS = + /** Health check script time out period.*/ + public static final String NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS = NM_PREFIX + "health-checker.script.timeout-ms"; - public static final long DEFAULT_NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS = + public static final long DEFAULT_NM_HEALTH_CHECK_SCRIPT_TIMEOUT_MS = 2 * DEFAULT_NM_HEALTH_CHECK_INTERVAL_MS; /** Whether or not to run the node health script before the NM @@ -2026,13 +2034,13 @@ public static boolean isAclEnabled(Configuration conf) { NM_PREFIX + "health-checker.run-before-startup"; public static final boolean DEFAULT_NM_HEALTH_CHECK_RUN_BEFORE_STARTUP = false; - + /** The health check script to run.*/ - public static final String NM_HEALTH_CHECK_SCRIPT_PATH = + public static final String NM_HEALTH_CHECK_SCRIPT_PATH = NM_PREFIX + "health-checker.script.path"; - + /** The arguments to pass to the health check script.*/ - public static final String NM_HEALTH_CHECK_SCRIPT_OPTS = + public static final String NM_HEALTH_CHECK_SCRIPT_OPTS = NM_PREFIX + "health-checker.script.opts"; /** The JVM options used on forking ContainerLocalizer process @@ -2268,30 +2276,30 @@ public static boolean isAclEnabled(Configuration conf) { public static final String DEFAULT_NM_NONSECURE_MODE_LOCAL_USER = 
"nobody"; /** - * The allowed pattern for UNIX user names enforced by - * Linux-container-executor when used in nonsecure mode (use case for this + * The allowed pattern for UNIX user names enforced by + * Linux-container-executor when used in nonsecure mode (use case for this * is using cgroups). The default value is taken from /usr/sbin/adduser */ public static final String NM_NONSECURE_MODE_USER_PATTERN_KEY = NM_PREFIX + "linux-container-executor.nonsecure-mode.user-pattern"; - public static final String DEFAULT_NM_NONSECURE_MODE_USER_PATTERN = + public static final String DEFAULT_NM_NONSECURE_MODE_USER_PATTERN = "^[_.A-Za-z0-9][-@_.A-Za-z0-9]{0,255}?[$]?$"; /** The type of resource enforcement to use with the * linux container executor. */ - public static final String NM_LINUX_CONTAINER_RESOURCES_HANDLER = + public static final String NM_LINUX_CONTAINER_RESOURCES_HANDLER = NM_PREFIX + "linux-container-executor.resources-handler.class"; - + /** The path the linux container executor should use for cgroups */ public static final String NM_LINUX_CONTAINER_CGROUPS_HIERARCHY = NM_PREFIX + "linux-container-executor.cgroups.hierarchy"; - + /** Whether the linux container executor should mount cgroups if not found */ public static final String NM_LINUX_CONTAINER_CGROUPS_MOUNT = NM_PREFIX + "linux-container-executor.cgroups.mount"; - + /** Where the linux container executor should mount cgroups if not found */ public static final String NM_LINUX_CONTAINER_CGROUPS_MOUNT_PATH = NM_PREFIX + "linux-container-executor.cgroups.mount-path"; @@ -2315,7 +2323,7 @@ public static boolean isAclEnabled(Configuration conf) { /** * Interval of time the linux container executor should try cleaning up - * cgroups entry when cleaning up a container. This is required due to what + * cgroups entry when cleaning up a container. This is required due to what * it seems a race condition because the SIGTERM/SIGKILL is asynch. */ public static final String NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT = @@ -2345,24 +2353,24 @@ public static boolean isAclEnabled(Configuration conf) { NM_PREFIX + "windows-container.cpu-limit.enabled"; public static final boolean DEFAULT_NM_WINDOWS_CONTAINER_CPU_LIMIT_ENABLED = false; - /** + /** /* The Windows group that the windows-secure-container-executor should run as. 
*/ public static final String NM_WINDOWS_SECURE_CONTAINER_GROUP = NM_PREFIX + "windows-secure-container-executor.group"; /** T-file compression types used to compress aggregated logs.*/ - public static final String NM_LOG_AGG_COMPRESSION_TYPE = + public static final String NM_LOG_AGG_COMPRESSION_TYPE = NM_PREFIX + "log-aggregation.compression-type"; public static final String DEFAULT_NM_LOG_AGG_COMPRESSION_TYPE = "none"; - + /** The kerberos principal for the node manager.*/ public static final String NM_PRINCIPAL = NM_PREFIX + "principal"; - - public static final String NM_AUX_SERVICES = + + public static final String NM_AUX_SERVICES = NM_PREFIX + "aux-services"; - + public static final String NM_AUX_SERVICE_FMT = NM_PREFIX + "aux-services.%s.class"; @@ -2392,11 +2400,11 @@ public static boolean isAclEnabled(Configuration conf) { /**The kerberos principal to be used for spnego filter for NM.*/ public static final String NM_WEBAPP_SPNEGO_USER_NAME_KEY = NM_PREFIX + "webapp.spnego-principal"; - + /**The kerberos keytab to be used for spnego filter for NM.*/ public static final String NM_WEBAPP_SPNEGO_KEYTAB_FILE_KEY = NM_PREFIX + "webapp.spnego-keytab-file"; - + public static final String DEFAULT_NM_USER_HOME_DIR= "/home/"; public static final String NM_RECOVERY_PREFIX = NM_PREFIX + "recovery."; @@ -2427,44 +2435,44 @@ public static boolean isAclEnabled(Configuration conf) { // Web Proxy Configs //////////////////////////////// public static final String PROXY_PREFIX = "yarn.web-proxy."; - + /** The kerberos principal for the proxy.*/ public static final String PROXY_PRINCIPAL = PROXY_PREFIX + "principal"; - + /** Keytab for Proxy.*/ public static final String PROXY_KEYTAB = PROXY_PREFIX + "keytab"; - + /** The address for the web proxy.*/ public static final String PROXY_ADDRESS = PROXY_PREFIX + "address"; public static final int DEFAULT_PROXY_PORT = 9099; public static final String DEFAULT_PROXY_ADDRESS = "0.0.0.0:" + DEFAULT_PROXY_PORT; - + /** * YARN Service Level Authorization */ - public static final String + public static final String YARN_SECURITY_SERVICE_AUTHORIZATION_RESOURCETRACKER_PROTOCOL = "security.resourcetracker.protocol.acl"; - public static final String + public static final String YARN_SECURITY_SERVICE_AUTHORIZATION_APPLICATIONCLIENT_PROTOCOL = "security.applicationclient.protocol.acl"; - public static final String + public static final String YARN_SECURITY_SERVICE_AUTHORIZATION_RESOURCEMANAGER_ADMINISTRATION_PROTOCOL = "security.resourcemanager-administration.protocol.acl"; - public static final String + public static final String YARN_SECURITY_SERVICE_AUTHORIZATION_APPLICATIONMASTER_PROTOCOL = "security.applicationmaster.protocol.acl"; public static final String YARN_SECURITY_SERVICE_AUTHORIZATION_DISTRIBUTEDSCHEDULING_PROTOCOL = "security.distributedscheduling.protocol.acl"; - public static final String + public static final String YARN_SECURITY_SERVICE_AUTHORIZATION_CONTAINER_MANAGEMENT_PROTOCOL = "security.containermanagement.protocol.acl"; - public static final String + public static final String YARN_SECURITY_SERVICE_AUTHORIZATION_RESOURCE_LOCALIZER = "security.resourcelocalizer.protocol.acl"; @@ -3069,7 +3077,7 @@ public static boolean isAclEnabled(Configuration conf) { public static final String TIMELINE_SERVICE_HANDLER_THREAD_COUNT = TIMELINE_SERVICE_PREFIX + "handler-thread-count"; public static final int DEFAULT_TIMELINE_SERVICE_CLIENT_THREAD_COUNT = 10; - + /** The address of the timeline service web application.*/ public static final String 
TIMELINE_SERVICE_WEBAPP_ADDRESS = @@ -3290,7 +3298,7 @@ public static boolean isAclEnabled(Configuration conf) { public static final String SHARED_CACHE_NESTED_LEVEL = SHARED_CACHE_PREFIX + "nested-level"; public static final int DEFAULT_SHARED_CACHE_NESTED_LEVEL = 3; - + // Shared Cache Manager Configs public static final String SCM_STORE_PREFIX = SHARED_CACHE_PREFIX + "store."; @@ -3324,7 +3332,7 @@ public static boolean isAclEnabled(Configuration conf) { "0.0.0.0:" + DEFAULT_SCM_WEBAPP_PORT; // In-memory SCM store configuration - + public static final String IN_MEMORY_STORE_PREFIX = SCM_STORE_PREFIX + "in-memory."; @@ -3345,7 +3353,7 @@ public static boolean isAclEnabled(Configuration conf) { public static final String IN_MEMORY_INITIAL_DELAY_MINS = IN_MEMORY_STORE_PREFIX + "initial-delay-mins"; public static final int DEFAULT_IN_MEMORY_INITIAL_DELAY_MINS = 10; - + /** * The frequency at which the in-memory store checks to remove dead initial * applications. Specified in minutes. @@ -3731,13 +3739,13 @@ public static boolean isAclEnabled(Configuration conf) { * Node-labels configurations */ public static final String NODE_LABELS_PREFIX = YARN_PREFIX + "node-labels."; - + /** Node label store implementation class */ public static final String FS_NODE_LABELS_STORE_IMPL_CLASS = NODE_LABELS_PREFIX + "fs-store.impl.class"; public static final String DEFAULT_FS_NODE_LABELS_STORE_IMPL_CLASS = "org.apache.hadoop.yarn.nodelabels.FileSystemNodeLabelsStore"; - + /** URI for NodeLabelManager */ public static final String FS_NODE_LABELS_STORE_ROOT_DIR = NODE_LABELS_PREFIX + "fs-store.root-dir"; @@ -3765,10 +3773,10 @@ public static boolean isAclEnabled(Configuration conf) { public static final String NODE_LABELS_ENABLED = NODE_LABELS_PREFIX + "enabled"; public static final boolean DEFAULT_NODE_LABELS_ENABLED = false; - + public static final String NODELABEL_CONFIGURATION_TYPE = NODE_LABELS_PREFIX + "configuration-type"; - + public static final String CENTRALIZED_NODELABEL_CONFIGURATION_TYPE = "centralized"; @@ -3777,7 +3785,7 @@ public static boolean isAclEnabled(Configuration conf) { public static final String DISTRIBUTED_NODELABEL_CONFIGURATION_TYPE = "distributed"; - + public static final String DEFAULT_NODELABEL_CONFIGURATION_TYPE = CENTRALIZED_NODELABEL_CONFIGURATION_TYPE; @@ -4102,7 +4110,7 @@ public static boolean areNodeLabelsEnabled( public YarnConfiguration() { super(); } - + public YarnConfiguration(Configuration conf) { super(conf); if (! (conf instanceof YarnConfiguration)) { @@ -4337,6 +4345,20 @@ public static boolean numaAwarenessEnabled(Configuration conf) { DEFAULT_NM_NUMA_AWARENESS_ENABLED); } + /** + * Returns Timeout to skip node from scheduling if not heartbeated. + * @param conf the configuration + * @return timeout in milliseconds. + */ + public static long getSkipNodeInterval(Configuration conf) { + long heartbeatIntvl = conf.getLong( + YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS, + YarnConfiguration.DEFAULT_RM_NM_HEARTBEAT_INTERVAL_MS); + int multiplier = conf.getInt(SCHEDULER_SKIP_NODE_MULTIPLIER, + DEFAULT_SCHEDULER_SKIP_NODE_MULTIPLIER); + return multiplier * heartbeatIntvl; + } + /* For debugging. mp configurations to system output as XML format. 
*/ public static void main(String[] args) throws Exception { new YarnConfiguration(new Configuration()).writeXml(System.out); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml index 3f35d20ddd3e8..2ebed85564a37 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml @@ -46,15 +46,15 @@ yarn.ipc.rpc.class org.apache.hadoop.yarn.ipc.HadoopYarnProtoRPC - + The hostname of the RM. yarn.resourcemanager.hostname 0.0.0.0 - - + + The address of the applications manager interface in the RM. yarn.resourcemanager.address @@ -919,6 +919,13 @@ 1.0 + + The Number of consecutive missed heartbeats after which node will be + skipped from scheduling + yarn.scheduler.skip.node.multiplier + 2 + + The minimum allowed version of a connecting nodemanager. The valid values are NONE (no version checking), EqualToRM (the nodemanager's version is equal to @@ -1140,7 +1147,7 @@ yarn.nodemanager.hostname 0.0.0.0 - + The address of the container manager in the NM. yarn.nodemanager.address @@ -1229,13 +1236,13 @@ - Number of seconds after an application finishes before the nodemanager's + Number of seconds after an application finishes before the nodemanager's DeletionService will delete the application's localized file directory and log directory. - + To diagnose YARN application problems, set this property's value large enough (for example, to 600 = 10 minutes) to permit examination of these - directories. After changing the property's value, you must restart the + directories. After changing the property's value, you must restart the nodemanager in order for it to have an effect. The roots of YARN applications' work directories is configurable with @@ -1254,7 +1261,7 @@ - List of directories to store localized files in. An + List of directories to store localized files in. An application's localized file directory will be found in: ${yarn.nodemanager.local-dirs}/usercache/${user}/appcache/application_${appid}. Individual containers' work directories, called container_${contid}, will @@ -1312,7 +1319,7 @@ Target size of localizer cache in MB, per nodemanager. It is - a target retention size that only includes resources with PUBLIC and + a target retention size that only includes resources with PUBLIC and PRIVATE visibility and excludes resources with APPLICATION visibility yarn.nodemanager.localizer.cache.target-size-mb @@ -1350,7 +1357,7 @@ Where to store container logs. An application's localized log directory will be found in ${yarn.nodemanager.log-dirs}/application_${appid}. - Individual containers' log directories will be below this, in directories + Individual containers' log directories will be below this, in directories named container_{$contid}. Each container directory will contain the files stderr, stdin, and syslog generated by that container. @@ -1382,12 +1389,12 @@ - How long to keep aggregation logs before deleting them. -1 disables. + How long to keep aggregation logs before deleting them. -1 disables. Be careful set this too small and you will spam the name node. yarn.log-aggregation.retain-seconds -1 - - + + How long to wait between aggregated log retention checks. 
If set to 0 or a negative value then the value is computed as one-tenth @@ -1451,7 +1458,7 @@ /tmp/logs - The remote log dir will be created at + The remote log dir will be created at {yarn.nodemanager.remote-app-log-dir}/${user}/{thisParam} yarn.nodemanager.remote-app-log-dir-suffix @@ -1471,7 +1478,7 @@ - Amount of physical memory, in MB, that can be allocated + Amount of physical memory, in MB, that can be allocated for containers. If set to -1 and yarn.nodemanager.resource.detect-hardware-capabilities is true, it is automatically calculated(in case of Windows and Linux). @@ -1762,9 +1769,9 @@ - The maximum percentage of disk space utilization allowed after - which a disk is marked as bad. Values can range from 0.0 to 100.0. - If the value is greater than or equal to 100, the nodemanager will check + The maximum percentage of disk space utilization allowed after + which a disk is marked as bad. Values can range from 0.0 to 100.0. + If the value is greater than or equal to 100, the nodemanager will check for full disk. This applies to yarn.nodemanager.local-dirs and yarn.nodemanager.log-dirs when yarn.nodemanager.disk-health-checker.disk-utilization-threshold.enabled is true. @@ -2120,8 +2127,8 @@ - The minimum allowed version of a resourcemanager that a nodemanager will connect to. - The valid values are NONE (no version checking), EqualToNM (the resourcemanager's version is + The minimum allowed version of a resourcemanager that a nodemanager will connect to. + The valid values are NONE (no version checking), EqualToNM (the resourcemanager's version is equal to or greater than the NM version), or a Version String. yarn.nodemanager.resourcemanager.minimum.version NONE @@ -2202,7 +2209,7 @@ yarn.client.max-cached-nodemanagers-proxies 0 - + Enable the node manager to recover after starting yarn.nodemanager.recovery.enabled @@ -2314,13 +2321,13 @@ yarn.web-proxy.principal - + - Keytab for WebAppProxy, if the proxy is not running as part of + Keytab for WebAppProxy, if the proxy is not running as part of the RM. yarn.web-proxy.keytab - + The address for the web proxy as HOST:PORT, if this is not given then the proxy will run as part of the RM @@ -2334,7 +2341,7 @@ CLASSPATH for YARN applications. A comma-separated list of CLASSPATH entries. When this value is empty, the following default - CLASSPATH for YARN applications would be used. + CLASSPATH for YARN applications would be used. For Linux: $HADOOP_CONF_DIR, $HADOOP_COMMON_HOME/share/hadoop/common/*, @@ -2849,7 +2856,7 @@ yarn.sharedcache.app-checker.class org.apache.hadoop.yarn.server.sharedcachemanager.RemoteAppChecker - + A resource in the in-memory store is considered stale if the time since the last reference exceeds the staleness period. @@ -2857,21 +2864,21 @@ yarn.sharedcache.store.in-memory.staleness-period-mins 10080 - + Initial delay before the in-memory store runs its first check to remove dead initial applications. Specified in minutes. yarn.sharedcache.store.in-memory.initial-delay-mins 10 - + The frequency at which the in-memory store checks to remove dead initial applications. Specified in minutes. yarn.sharedcache.store.in-memory.check-period-mins 720 - + The address of the admin interface in the SCM (shared cache manager) yarn.sharedcache.admin.address @@ -3302,7 +3309,7 @@ Private_Dirty, Private_Clean, Shared_Dirty, Shared_Clean which can be used for computing more accurate RSS. When this flag is enabled, RSS is computed as Min(Shared_Dirty, Pss) + Private_Clean + Private_Dirty. 
It excludes - read-only shared mappings in RSS computation. + read-only shared mappings in RSS computation. yarn.nodemanager.container-monitor.procfs-tree.smaps-based-rss.enabled false @@ -3752,7 +3759,7 @@ yarn.timeline-service.http-cross-origin.enabled false - + Flag to enable cross-origin (CORS) support for timeline service v1.x or @@ -3870,7 +3877,7 @@ to specify details about the individual resource types. - + yarn.webapp.filter-entity-list-by-user false diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java index dd3e0bc26fd95..24bbaf8e6776a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java @@ -159,6 +159,7 @@ public abstract class AbstractYarnScheduler protected ConcurrentMap> applications; protected int nmExpireInterval; protected long nmHeartbeatInterval; + private long skipNodeInterval; private final static List EMPTY_CONTAINER_LIST = new ArrayList(); @@ -361,6 +362,10 @@ public long getLastNodeUpdateTime() { return lastNodeUpdateTime; } + public long getSkipNodeInterval(){ + return skipNodeInterval; + } + protected void containerLaunchedOnNode( ContainerId containerId, SchedulerNode node) { try { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerUtils.java index c3d2c431a8b3b..af5a4a17b78d9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerUtils.java @@ -61,6 +61,7 @@ import org.apache.hadoop.yarn.util.resource.ResourceCalculator; import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.apache.hadoop.yarn.util.resource.Resources; +import org.apache.hadoop.util.Time; import static org.apache.hadoop.yarn.exceptions .InvalidResourceRequestException @@ -72,7 +73,7 @@ .InvalidResourceRequestException.UNKNOWN_REASON_MESSAGE_TEMPLATE; /** - * Utilities shared by schedulers. + * Utilities shared by schedulers. */ @Private @Unstable @@ -136,7 +137,7 @@ public String toString() { * * @param containerId {@link ContainerId} of returned/released/lost container. 
* @param diagnostics diagnostic message - * @return ContainerStatus for an returned/released/lost + * @return ContainerStatus for an returned/released/lost * container */ public static ContainerStatus createAbnormalContainerStatus( @@ -179,7 +180,7 @@ public static ContainerStatus createPreemptedContainerStatus( * * @param containerId {@link ContainerId} of returned/released/lost container. * @param diagnostics diagnostic message - * @return ContainerStatus for an returned/released/lost + * @return ContainerStatus for an returned/released/lost * container */ private static ContainerStatus createAbnormalContainerStatus( @@ -604,4 +605,11 @@ public static RMContainer createOpportunisticRmContainer(RMContext rmContext, node.allocateContainer(rmContainer); return rmContainer; } + + public static boolean isNodeHeartbeated(SchedulerNode node, + long skipNodeInterval) { + long timeElapsedFromLastHeartbeat = + Time.monotonicNow() - node.getLastHeartbeatMonotonicTime(); + return timeElapsedFromLastHeartbeat <= skipNodeInterval; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java index caf2c8bc220f0..17897d1ecf1ad 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java @@ -67,6 +67,7 @@ import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.ResourceSizing; import org.apache.hadoop.yarn.api.records.SchedulingRequest; +import org.apache.hadoop.yarn.api.records.NodeState; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; @@ -233,7 +234,7 @@ public Configuration getConf() { private AppPriorityACLsManager appPriorityACLManager; private boolean multiNodePlacementEnabled; - private static boolean printedVerboseLoggingForAsyncScheduling = false; + private boolean printedVerboseLoggingForAsyncScheduling; /** * EXPERT @@ -513,22 +514,47 @@ long getAsyncScheduleInterval() { private final static Random random = new Random(System.currentTimeMillis()); - private static boolean shouldSkipNodeSchedule(FiCaSchedulerNode node, + @VisibleForTesting + public static boolean shouldSkipNodeSchedule(FiCaSchedulerNode node, CapacityScheduler cs, boolean printVerboseLog) { - // Skip node which missed 2 heartbeats since the node might be dead and - // we should not continue allocate containers on that. - long timeElapsedFromLastHeartbeat = - Time.monotonicNow() - node.getLastHeartbeatMonotonicTime(); - if (timeElapsedFromLastHeartbeat > cs.nmHeartbeatInterval * 2) { + // Skip node which missed YarnConfiguration.SCHEDULER_SKIP_NODE_MULTIPLIER + // heartbeats since the node might be dead and we should not continue + // allocate containers on that. 
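A standalone sketch (not part of the patch) of how this check is presumably wired together: the skip window comes from multiplying the NM heartbeat interval by the new yarn.scheduler.skip.node.multiplier property (default 2, per the yarn-default.xml entry earlier in this patch), and SchedulerUtils.isNodeHeartbeated compares the time since the last heartbeat against that window. The body of YarnConfiguration.getSkipNodeInterval() is not shown in this diff, so the derivation below is an assumption; the values in main() are illustrative, not cluster defaults.

public final class SkipNodeIntervalSketch {

  private SkipNodeIntervalSketch() {
  }

  // Assumed derivation: staleness window = heartbeat interval * configured multiplier.
  static long skipNodeInterval(long nmHeartbeatIntervalMs, long skipNodeMultiplier) {
    return nmHeartbeatIntervalMs * skipNodeMultiplier;
  }

  // Mirrors SchedulerUtils.isNodeHeartbeated: true while the last heartbeat is recent enough.
  static boolean isNodeHeartbeated(long lastHeartbeatMonotonicMs, long nowMonotonicMs,
      long skipNodeIntervalMs) {
    return (nowMonotonicMs - lastHeartbeatMonotonicMs) <= skipNodeIntervalMs;
  }

  public static void main(String[] args) {
    long interval = skipNodeInterval(1000L, 2L);                 // 1s heartbeats, multiplier 2
    System.out.println(isNodeHeartbeated(0L, 1500L, interval));  // true  -> node stays schedulable
    System.out.println(isNodeHeartbeated(0L, 2500L, interval));  // false -> node is skipped
  }
}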
+ if (!SchedulerUtils.isNodeHeartbeated(node, cs.getSkipNodeInterval())) { if (printVerboseLog && LOG.isDebugEnabled()) { - LOG.debug("Skip scheduling on node because it haven't heartbeated for " + long timeElapsedFromLastHeartbeat = + Time.monotonicNow() - node.getLastHeartbeatMonotonicTime(); + LOG.debug("Skip scheduling on node " + node.getNodeID() + + " because it haven't heartbeated for " + timeElapsedFromLastHeartbeat / 1000.0f + " secs"); } return true; } + + if (node.getRMNode().getState() != NodeState.RUNNING) { + if (printVerboseLog && LOG.isDebugEnabled()) { + LOG.debug("Skip scheduling on node because it is in " + + node.getRMNode().getState() + " state"); + } + return true; + } return false; } + private static boolean isPrintSkippedNodeLogging(CapacityScheduler cs) { + // To avoid too verbose DEBUG logging, only print debug log once for + // every 10 secs. + boolean printSkipedNodeLogging = false; + if (LOG.isDebugEnabled()) { + if (Time.monotonicNow() / 1000 % 10 == 0) { + printSkipedNodeLogging = (!cs.printedVerboseLoggingForAsyncScheduling); + } else { + cs.printedVerboseLoggingForAsyncScheduling = false; + } + } + return printSkipedNodeLogging; + } + /** * Schedule on all nodes by starting at a random point. * @param cs @@ -548,17 +574,12 @@ static void schedule(CapacityScheduler cs) throws InterruptedException{ // To avoid too verbose DEBUG logging, only print debug log once for // every 10 secs. - boolean printSkipedNodeLogging = false; - if (Time.monotonicNow() / 1000 % 10 == 0) { - printSkipedNodeLogging = (!printedVerboseLoggingForAsyncScheduling); - } else { - printedVerboseLoggingForAsyncScheduling = false; - } + boolean printSkippedNodeLogging = isPrintSkippedNodeLogging(cs); // Allocate containers of node [start, end) for (FiCaSchedulerNode node : nodes) { if (current++ >= start) { - if (shouldSkipNodeSchedule(node, cs, printSkipedNodeLogging)) { + if (shouldSkipNodeSchedule(node, cs, printSkippedNodeLogging)) { continue; } cs.allocateContainersToNode(node.getNodeID(), false); @@ -572,14 +593,14 @@ static void schedule(CapacityScheduler cs) throws InterruptedException{ if (current++ > start) { break; } - if (shouldSkipNodeSchedule(node, cs, printSkipedNodeLogging)) { + if (shouldSkipNodeSchedule(node, cs, printSkippedNodeLogging)) { continue; } cs.allocateContainersToNode(node.getNodeID(), false); } - if (printSkipedNodeLogging) { - printedVerboseLoggingForAsyncScheduling = true; + if (printSkippedNodeLogging) { + cs.printedVerboseLoggingForAsyncScheduling = true; } Thread.sleep(cs.getAsyncScheduleInterval()); @@ -1456,16 +1477,48 @@ private boolean canAllocateMore(CSAssignment assignment, int offswitchCount, || assignedContainers < maxAssignPerHeartbeat); } + private Map getNodesHeartbeated(String partition) { + Map nodesByPartition = new HashMap<>(); + boolean printSkippedNodeLogging = isPrintSkippedNodeLogging(this); + List nodes = nodeTracker + .getNodesPerPartition(partition); + + if (nodes != null && !nodes.isEmpty()) { + //Filter for node heartbeat too long + nodes.stream() + .filter(node -> + !shouldSkipNodeSchedule(node, this, printSkippedNodeLogging)) + .forEach(n -> nodesByPartition.put(n.getNodeID(), n)); + } + + if (printSkippedNodeLogging) { + printedVerboseLoggingForAsyncScheduling = true; + } + return nodesByPartition; + } + + private CandidateNodeSet getCandidateNodeSet( + String partition) { + CandidateNodeSet candidates = null; + Map nodesByPartition + = getNodesHeartbeated(partition); + + if (!nodesByPartition.isEmpty()) { + candidates = new 
SimpleCandidateNodeSet( + nodesByPartition, partition); + } + + return candidates; + } + private CandidateNodeSet getCandidateNodeSet( FiCaSchedulerNode node) { CandidateNodeSet candidates = null; candidates = new SimpleCandidateNodeSet<>(node); if (multiNodePlacementEnabled) { - Map nodesByPartition = new HashMap<>(); - List nodes = nodeTracker - .getNodesPerPartition(node.getPartition()); - if (nodes != null && !nodes.isEmpty()) { - nodes.forEach(n -> nodesByPartition.put(n.getNodeID(), n)); + Map nodesByPartition = + getNodesHeartbeated(node.getPartition()); + if (!nodesByPartition.isEmpty()) { candidates = new SimpleCandidateNodeSet( nodesByPartition, node.getPartition()); } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/placement/MultiNodeSorter.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/placement/MultiNodeSorter.java index a757ea527afff..3e3a73fd5c662 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/placement/MultiNodeSorter.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/placement/MultiNodeSorter.java @@ -135,7 +135,7 @@ public void reSortClusterNodes() { Map nodesByPartition = new HashMap<>(); List nodes = ((AbstractYarnScheduler) rmContext .getScheduler()).getNodeTracker().getNodesPerPartition(label); - if (nodes != null && !nodes.isEmpty()) { + if (nodes != null) { nodes.forEach(n -> nodesByPartition.put(n.getNodeID(), n)); multiNodePolicy.addAndRefreshNodesSet( (Collection) nodesByPartition.values(), label); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/placement/MultiNodeSortingManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/placement/MultiNodeSortingManager.java index c8a7e66f5fe03..8c5691f189f67 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/placement/MultiNodeSortingManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/placement/MultiNodeSortingManager.java @@ -21,6 +21,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.Map; +import java.util.NoSuchElementException; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; @@ -30,8 +31,10 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.service.AbstractService; +import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; /** * Node Sorting Manager which runs all sorter threads and policies. 
@@ -48,6 +51,7 @@ public class MultiNodeSortingManager private Set policySpecs = new HashSet(); private Configuration conf; private boolean multiNodePlacementEnabled; + private long skipNodeInterval; public MultiNodeSortingManager() { super("MultiNodeSortingManager"); @@ -59,6 +63,7 @@ public void serviceInit(Configuration configuration) throws Exception { LOG.info("Initializing NodeSortingService=" + getName()); super.serviceInit(configuration); this.conf = configuration; + this.skipNodeInterval = YarnConfiguration.getSkipNodeInterval(conf); } @Override @@ -134,6 +139,42 @@ public Iterator getMultiNodeSortIterator(Collection nodes, policy.addAndRefreshNodesSet(nodes, partition); } - return policy.getPreferredNodeIterator(nodes, partition); + Iterator nodesIterator = policy.getPreferredNodeIterator(nodes, + partition); + + // Skip node which missed YarnConfiguration.SCHEDULER_SKIP_NODE_MULTIPLIER + // heartbeats since the node might be dead and we should not continue + // allocate containers on that. + Iterator filteringIterator = new Iterator() { + private N cached; + private boolean hasCached; + @Override + public boolean hasNext() { + if (hasCached) { + return true; + } + while (nodesIterator.hasNext()) { + cached = nodesIterator.next(); + if (SchedulerUtils.isNodeHeartbeated(cached, skipNodeInterval)) { + hasCached = true; + return true; + } + } + return false; + } + + @Override + public N next() { + if (hasCached) { + hasCached = false; + return cached; + } + if (!hasNext()) { + throw new NoSuchElementException(); + } + return next(); + } + }; + return filteringIterator; } } From c56ed11cd9315469815623a065e0b8e3180c7cc0 Mon Sep 17 00:00:00 2001 From: Prabhjyot Singh Date: Wed, 20 Nov 2024 10:23:46 -0500 Subject: [PATCH 15/40] ODP-2636: MAPREDUCE-7431: ShuffleHandler refactor and fix after Netty4 upgrade. (#5311) (#47) * MAPREDUCE-7431. ShuffleHandler refactor and fix after Netty4 upgrade. 
(#5311) (cherry picked from commit 151b71d7affbbaadab5af7943f824f6ae6a6f47b) (cherry picked from commit 8271c035a82b395b2b926c582a35b9b6fae4af0b) * MAPREDUCE-7431: fix compile (cherry picked from commit 1aaf4a64279abe76afe937b2c6ab52095b0a3b29) --------- Co-authored-by: Tamas Domok --- .../hadoop-mapreduce-client-shuffle/pom.xml | 6 + .../hadoop/mapred/ShuffleChannelHandler.java | 715 +++++++ .../mapred/ShuffleChannelHandlerContext.java | 140 ++ .../mapred/ShuffleChannelInitializer.java | 74 + .../apache/hadoop/mapred/ShuffleHandler.java | 1055 ++-------- .../mapred/TestShuffleChannelHandler.java | 562 +++++ .../hadoop/mapred/TestShuffleHandler.java | 1853 ++--------------- .../hadoop/mapred/TestShuffleHandlerBase.java | 172 ++ .../src/test/resources/cert.pem | 27 + .../src/test/resources/key.pem | 52 + .../src/test/resources/log4j.properties | 4 +- 11 files changed, 2062 insertions(+), 2598 deletions(-) create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleChannelHandler.java create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleChannelHandlerContext.java create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleChannelInitializer.java create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleChannelHandler.java create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandlerBase.java create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/resources/cert.pem create mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/resources/key.pem diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/pom.xml index 1f9f16ff69ae9..cf428a7202a38 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/pom.xml @@ -55,6 +55,12 @@ org.fusesource.leveldbjni leveldbjni-all + + ch.qos.logback + logback-classic + 1.1.2 + test + diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleChannelHandler.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleChannelHandler.java new file mode 100644 index 0000000000000..49c0bb288b534 --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleChannelHandler.java @@ -0,0 +1,715 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.mapred; + +import io.netty.buffer.ByteBuf; +import io.netty.buffer.Unpooled; +import io.netty.channel.Channel; +import io.netty.channel.ChannelFuture; +import io.netty.channel.ChannelFutureListener; +import io.netty.channel.ChannelHandlerContext; +import io.netty.channel.ChannelPipeline; +import io.netty.channel.SimpleChannelInboundHandler; +import io.netty.handler.codec.TooLongFrameException; +import io.netty.handler.codec.http.DefaultFullHttpResponse; +import io.netty.handler.codec.http.DefaultHttpResponse; +import io.netty.handler.codec.http.FullHttpRequest; +import io.netty.handler.codec.http.FullHttpResponse; +import io.netty.handler.codec.http.HttpRequest; +import io.netty.handler.codec.http.HttpResponse; +import io.netty.handler.codec.http.HttpResponseStatus; +import io.netty.handler.codec.http.HttpUtil; +import io.netty.handler.codec.http.LastHttpContent; +import io.netty.handler.codec.http.QueryStringDecoder; +import io.netty.handler.ssl.SslHandler; +import io.netty.util.CharsetUtil; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.net.URL; +import java.nio.channels.ClosedChannelException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.atomic.AtomicInteger; + +import javax.crypto.SecretKey; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.io.SecureIOUtils; +import org.apache.hadoop.mapreduce.security.SecureShuffleUtils; +import org.apache.hadoop.mapreduce.task.reduce.ShuffleHeader; +import org.apache.hadoop.thirdparty.com.google.common.base.Charsets; +import org.eclipse.jetty.http.HttpHeader; + +import static io.netty.buffer.Unpooled.wrappedBuffer; +import static io.netty.handler.codec.http.HttpHeaderNames.CONTENT_TYPE; +import static io.netty.handler.codec.http.HttpMethod.GET; +import static io.netty.handler.codec.http.HttpResponseStatus.BAD_REQUEST; +import static io.netty.handler.codec.http.HttpResponseStatus.FORBIDDEN; +import static io.netty.handler.codec.http.HttpResponseStatus.INTERNAL_SERVER_ERROR; +import static io.netty.handler.codec.http.HttpResponseStatus.METHOD_NOT_ALLOWED; +import static io.netty.handler.codec.http.HttpResponseStatus.OK; +import static io.netty.handler.codec.http.HttpResponseStatus.UNAUTHORIZED; +import static io.netty.handler.codec.http.HttpVersion.HTTP_1_1; +import static org.apache.hadoop.mapred.ShuffleHandler.AUDITLOG; +import static org.apache.hadoop.mapred.ShuffleHandler.CONNECTION_CLOSE; +import static org.apache.hadoop.mapred.ShuffleHandler.FETCH_RETRY_DELAY; +import static org.apache.hadoop.mapred.ShuffleHandler.IGNORABLE_ERROR_MESSAGE; +import static org.apache.hadoop.mapred.ShuffleHandler.RETRY_AFTER_HEADER; +import static org.apache.hadoop.mapred.ShuffleHandler.TIMEOUT_HANDLER; +import static org.apache.hadoop.mapred.ShuffleHandler.TOO_MANY_REQ_STATUS; +import static org.apache.hadoop.mapred.ShuffleHandler.LOG; + +/** + 
* ShuffleChannelHandler verifies the map request then serves the attempts in an HTTP stream. + * Before each attempt a serialised ShuffleHeader object is written with the details. + * + *

+ * Example Request
+ * ===================
+ * GET /mapOutput?job=job_1111111111111_0001&reduce=0&
+ *     map=attempt_1111111111111_0001_m_000001_0,
+ *     attempt_1111111111111_0002_m_000002_0,
+ *     attempt_1111111111111_0003_m_000003_0 HTTP/1.1
+ * name: mapreduce
+ * version: 1.0.0
+ * UrlHash: 9zS++qE0/7/D2l1Rg0TqRoSguAk=
+ *
+ * Example Response
+ * ===================
+ * HTTP/1.1 200 OK
+ * ReplyHash: GcuojWkAxXUyhZHPnwoV/MW2tGA=
+ * name: mapreduce
+ * version: 1.0.0
+ * connection: close
+ * content-length: 138
+ *
+ * +--------+-------------------------------------------------+----------------+
+ * |00000000| 25 61 74 74 65 6d 70 74 5f 31 31 31 31 31 31 31 |%attempt_1111111|
+ * |00000010| 31 31 31 31 31 31 5f 30 30 30 31 5f 6d 5f 30 30 |111111_0001_m_00|
+ * |00000020| 30 30 30 31 5f 30 05 0a 00                      |0001_0...       |
+ * +--------+-------------------------------------------------+----------------+
+ * |00000000| 61 61 61 61 61                                  |aaaaa           |
+ * +--------+-------------------------------------------------+----------------+
+ * |00000000| 25 61 74 74 65 6d 70 74 5f 31 31 31 31 31 31 31 |%attempt_1111111|
+ * |00000010| 31 31 31 31 31 31 5f 30 30 30 32 5f 6d 5f 30 30 |111111_0002_m_00|
+ * |00000020| 30 30 30 32 5f 30 05 0a 00                      |0002_0...       |
+ * +--------+-------------------------------------------------+----------------+
+ * |00000000| 62 62 62 62 62                                  |bbbbb           |
+ * +--------+-------------------------------------------------+----------------+
+ * |00000000| 25 61 74 74 65 6d 70 74 5f 31 31 31 31 31 31 31 |%attempt_1111111|
+ * |00000010| 31 31 31 31 31 31 5f 30 30 30 33 5f 6d 5f 30 30 |111111_0003_m_00|
+ * |00000020| 30 30 30 33 5f 30 05 0a 00                      |0003_0...       |
+ * +--------+-------------------------------------------------+----------------+
+ * |00000000| 63 63 63 63 63                                  |ccccc           |
+ * +--------+-------------------------------------------------+----------------+
+ * 
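For reference, a small illustrative helper (not part of the patch) that assembles the request line and headers shown in the example above. The map list is comma-joined, mirroring how splitMaps() later splits it, and the UrlHash value is an HMAC over the URL computed with the job token (see verifyRequest below), so only a placeholder is printed here.

import java.util.Arrays;
import java.util.List;
import java.util.StringJoiner;

final class ShuffleRequestSketch {

  private ShuffleRequestSketch() {
  }

  // Builds the /mapOutput request line from the job id, reduce id and comma-joined map list,
  // matching the shape of the example request above.
  static String mapOutputUri(String jobId, int reduce, List<String> mapIds) {
    StringJoiner maps = new StringJoiner(",");
    mapIds.forEach(maps::add);
    return "/mapOutput?job=" + jobId + "&reduce=" + reduce + "&map=" + maps;
  }

  public static void main(String[] args) {
    String uri = mapOutputUri("job_1111111111111_0001", 0,
        Arrays.asList("attempt_1111111111111_0001_m_000001_0",
            "attempt_1111111111111_0002_m_000002_0"));
    System.out.println("GET " + uri + " HTTP/1.1");
    System.out.println("name: mapreduce");   // checked against ShuffleHeader.DEFAULT_HTTP_HEADER_NAME
    System.out.println("version: 1.0.0");    // checked against ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION
    System.out.println("UrlHash: <hmac-of-url-computed-with-the-job-token>");
  }
}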
+ */ +public class ShuffleChannelHandler extends SimpleChannelInboundHandler { + private final ShuffleChannelHandlerContext handlerCtx; + + ShuffleChannelHandler(ShuffleChannelHandlerContext ctx) { + handlerCtx = ctx; + } + + private List splitMaps(List mapq) { + if (null == mapq) { + return null; + } + final List ret = new ArrayList<>(); + for (String s : mapq) { + Collections.addAll(ret, s.split(",")); + } + return ret; + } + + @Override + public void channelActive(ChannelHandlerContext ctx) + throws Exception { + LOG.debug("Executing channelActive; channel='{}'", ctx.channel().id()); + int numConnections = handlerCtx.activeConnections.incrementAndGet(); + if ((handlerCtx.maxShuffleConnections > 0) && + (numConnections > handlerCtx.maxShuffleConnections)) { + LOG.info(String.format("Current number of shuffle connections (%d) is " + + "greater than the max allowed shuffle connections (%d)", + handlerCtx.allChannels.size(), handlerCtx.maxShuffleConnections)); + + Map headers = new HashMap<>(1); + // notify fetchers to backoff for a while before closing the connection + // if the shuffle connection limit is hit. Fetchers are expected to + // handle this notification gracefully, that is, not treating this as a + // fetch failure. + headers.put(RETRY_AFTER_HEADER, String.valueOf(FETCH_RETRY_DELAY)); + sendError(ctx, "", TOO_MANY_REQ_STATUS, headers); + } else { + super.channelActive(ctx); + handlerCtx.allChannels.add(ctx.channel()); + LOG.debug("Added channel: {}, channel id: {}. Accepted number of connections={}", + ctx.channel(), ctx.channel().id(), handlerCtx.activeConnections.get()); + } + } + + @Override + public void channelInactive(ChannelHandlerContext ctx) throws Exception { + LOG.debug("Executing channelInactive; channel='{}'", ctx.channel().id()); + super.channelInactive(ctx); + int noOfConnections = handlerCtx.activeConnections.decrementAndGet(); + LOG.debug("New value of Accepted number of connections={}", noOfConnections); + } + + @Override + public void channelRead0(ChannelHandlerContext ctx, FullHttpRequest request) { + Channel channel = ctx.channel(); + LOG.debug("Received HTTP request: {}, channel='{}'", request, channel.id()); + + if (request.method() != GET) { + sendError(ctx, METHOD_NOT_ALLOWED); + return; + } + // Check whether the shuffle version is compatible + String shuffleVersion = ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION; + String httpHeaderName = ShuffleHeader.DEFAULT_HTTP_HEADER_NAME; + if (request.headers() != null) { + shuffleVersion = request.headers().get(ShuffleHeader.HTTP_HEADER_VERSION); + httpHeaderName = request.headers().get(ShuffleHeader.HTTP_HEADER_NAME); + LOG.debug("Received from request header: ShuffleVersion={} header name={}, channel id: {}", + shuffleVersion, httpHeaderName, channel.id()); + } + if (request.headers() == null || + !ShuffleHeader.DEFAULT_HTTP_HEADER_NAME.equals(httpHeaderName) || + !ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION.equals(shuffleVersion)) { + sendError(ctx, "Incompatible shuffle request version", BAD_REQUEST); + return; + } + final Map> q = + new QueryStringDecoder(request.uri()).parameters(); + + final List keepAliveList = q.get("keepAlive"); + boolean keepAliveParam = false; + if (keepAliveList != null && keepAliveList.size() == 1) { + keepAliveParam = Boolean.parseBoolean(keepAliveList.get(0)); + if (LOG.isDebugEnabled()) { + LOG.debug("KeepAliveParam: {} : {}, channel id: {}", + keepAliveList, keepAliveParam, channel.id()); + } + } + final List mapIds = splitMaps(q.get("map")); + final List reduceQ = q.get("reduce"); 
+ final List jobQ = q.get("job"); + if (LOG.isDebugEnabled()) { + LOG.debug("RECV: " + request.uri() + + "\n mapId: " + mapIds + + "\n reduceId: " + reduceQ + + "\n jobId: " + jobQ + + "\n keepAlive: " + keepAliveParam + + "\n channel id: " + channel.id()); + } + + if (mapIds == null || reduceQ == null || jobQ == null) { + sendError(ctx, "Required param job, map and reduce", BAD_REQUEST); + return; + } + if (reduceQ.size() != 1 || jobQ.size() != 1) { + sendError(ctx, "Too many job/reduce parameters", BAD_REQUEST); + return; + } + + int reduceId; + String jobId; + try { + reduceId = Integer.parseInt(reduceQ.get(0)); + jobId = jobQ.get(0); + } catch (NumberFormatException e) { + sendError(ctx, "Bad reduce parameter", BAD_REQUEST); + return; + } catch (IllegalArgumentException e) { + sendError(ctx, "Bad job parameter", BAD_REQUEST); + return; + } + final String reqUri = request.uri(); + if (null == reqUri) { + // TODO? add upstream? + sendError(ctx, FORBIDDEN); + return; + } + HttpResponse response = new DefaultHttpResponse(HTTP_1_1, OK); + try { + verifyRequest(jobId, ctx, request, response, + new URL("http", "", handlerCtx.port, reqUri)); + } catch (IOException e) { + LOG.warn("Shuffle failure ", e); + sendError(ctx, e.getMessage(), UNAUTHORIZED); + return; + } + + Map mapOutputInfoMap = new HashMap<>(); + ChannelPipeline pipeline = channel.pipeline(); + ShuffleHandler.TimeoutHandler timeoutHandler = + (ShuffleHandler.TimeoutHandler)pipeline.get(TIMEOUT_HANDLER); + timeoutHandler.setEnabledTimeout(false); + String user = handlerCtx.userRsrc.get(jobId); + + try { + populateHeaders(mapIds, jobId, user, reduceId, + response, keepAliveParam, mapOutputInfoMap); + } catch(IOException e) { + LOG.error("Shuffle error while populating headers. Channel id: " + channel.id(), e); + sendError(ctx, getErrorMessage(e), INTERNAL_SERVER_ERROR); + return; + } + + channel.write(response); + + //Initialize one ReduceContext object per channelRead call + boolean keepAlive = keepAliveParam || handlerCtx.connectionKeepAliveEnabled; + ReduceContext reduceContext = new ReduceContext(mapIds, reduceId, ctx, + user, mapOutputInfoMap, jobId, keepAlive); + + sendMap(reduceContext); + } + + /** + * Calls sendMapOutput for the mapId pointed by ReduceContext.mapsToSend + * and increments it. This method is first called by messageReceived() + * maxSessionOpenFiles times and then on the completion of every + * sendMapOutput operation. This limits the number of open files on a node, + * which can get really large(exhausting file descriptors on the NM) if all + * sendMapOutputs are called in one go, as was done previous to this change. + * @param reduceContext used to call sendMapOutput with correct params. 
+ */ + public void sendMap(ReduceContext reduceContext) { + LOG.trace("Executing sendMap; channel='{}'", reduceContext.ctx.channel().id()); + if (reduceContext.getMapsToSend().get() < + reduceContext.getMapIds().size()) { + int nextIndex = reduceContext.getMapsToSend().getAndIncrement(); + String mapId = reduceContext.getMapIds().get(nextIndex); + + try { + MapOutputInfo info = reduceContext.getInfoMap().get(mapId); + if (info == null) { + info = getMapOutputInfo(mapId, reduceContext.getReduceId(), + reduceContext.getJobId(), reduceContext.getUser()); + } + LOG.trace("Calling sendMapOutput; channel='{}'", reduceContext.ctx.channel().id()); + ChannelFuture nextMap = sendMapOutput( + reduceContext.getCtx().channel(), + reduceContext.getUser(), mapId, + reduceContext.getReduceId(), info); + nextMap.addListener(new ReduceMapFileCount(this, reduceContext)); + } catch (IOException e) { + LOG.error("Shuffle error: {}; channel={}", e, reduceContext.ctx.channel().id()); + + // It is not possible to sendError, the success HttpResponse has been already sent + reduceContext.ctx.channel().close(); + } + } + } + + private String getErrorMessage(Throwable t) { + StringBuilder sb = new StringBuilder(t.getMessage()); + while (t.getCause() != null) { + sb.append(t.getCause().getMessage()); + t = t.getCause(); + } + return sb.toString(); + } + + protected MapOutputInfo getMapOutputInfo(String mapId, int reduce, String jobId, String user) + throws IOException { + ShuffleHandler.AttemptPathInfo pathInfo; + try { + ShuffleHandler.AttemptPathIdentifier identifier = new ShuffleHandler.AttemptPathIdentifier( + jobId, user, mapId); + pathInfo = handlerCtx.pathCache.get(identifier); + if (LOG.isDebugEnabled()) { + LOG.debug("Retrieved pathInfo for " + identifier + + " check for corresponding loaded messages to determine whether" + + " it was loaded or cached"); + } + } catch (ExecutionException e) { + if (e.getCause() instanceof IOException) { + throw (IOException) e.getCause(); + } else { + throw new RuntimeException(e.getCause()); + } + } + + IndexRecord info = + handlerCtx.indexCache.getIndexInformation(mapId, reduce, pathInfo.indexPath, user); + + if (LOG.isDebugEnabled()) { + LOG.debug("getMapOutputInfo: jobId=" + jobId + ", mapId=" + mapId + + ",dataFile=" + pathInfo.dataPath + ", indexFile=" + + pathInfo.indexPath); + LOG.debug("getMapOutputInfo: startOffset={}, partLength={} rawLength={}", + info.startOffset, info.partLength, info.rawLength); + } + + return new MapOutputInfo(pathInfo.dataPath, info); + } + + protected void populateHeaders(List mapIds, String jobId, + String user, int reduce, HttpResponse response, + boolean keepAliveParam, + Map mapOutputInfoMap) + throws IOException { + + long contentLength = 0; + for (String mapId : mapIds) { + MapOutputInfo outputInfo = getMapOutputInfo(mapId, reduce, jobId, user); + if (mapOutputInfoMap.size() < handlerCtx.mapOutputMetaInfoCacheSize) { + mapOutputInfoMap.put(mapId, outputInfo); + } + + ShuffleHeader header = + new ShuffleHeader(mapId, outputInfo.indexRecord.partLength, + outputInfo.indexRecord.rawLength, reduce); + DataOutputBuffer dob = new DataOutputBuffer(); + header.write(dob); + contentLength += outputInfo.indexRecord.partLength; + contentLength += dob.getLength(); + + // verify file access to data file to send an actually correct http error + final File spillFile = new File(outputInfo.mapOutputFileName.toString()); + RandomAccessFile r = SecureIOUtils.openForRandomRead(spillFile, "r", user, null); + r.close(); + } + + // Now set the response 
headers. + setResponseHeaders(response, keepAliveParam, contentLength); + + // this audit log is disabled by default, + // to turn it on please enable this audit log + // on log4j.properties by uncommenting the setting + if (AUDITLOG.isDebugEnabled()) { + StringBuilder sb = new StringBuilder("shuffle for "); + sb.append(jobId).append(" reducer ").append(reduce); + sb.append(" length ").append(contentLength); + if (AUDITLOG.isTraceEnabled()) { + // For trace level logging, append the list of mappers + sb.append(" mappers: ").append(mapIds); + AUDITLOG.trace(sb.toString()); + } else { + AUDITLOG.debug(sb.toString()); + } + } + } + + protected void setResponseHeaders(HttpResponse response, + boolean keepAliveParam, long contentLength) { + if (!handlerCtx.connectionKeepAliveEnabled && !keepAliveParam) { + response.headers().set(HttpHeader.CONNECTION.asString(), CONNECTION_CLOSE); + } else { + response.headers().set(HttpHeader.CONNECTION.asString(), + HttpHeader.KEEP_ALIVE.asString()); + response.headers().set(HttpHeader.KEEP_ALIVE.asString(), + "timeout=" + handlerCtx.connectionKeepAliveTimeOut); + } + + // Content length must be set (https://www.rfc-editor.org/rfc/rfc7230#section-3.3.3) + HttpUtil.setContentLength(response, contentLength); + } + + @SuppressWarnings("checkstyle:VisibilityModifier") + static class MapOutputInfo { + final Path mapOutputFileName; + final IndexRecord indexRecord; + + MapOutputInfo(Path mapOutputFileName, IndexRecord indexRecord) { + this.mapOutputFileName = mapOutputFileName; + this.indexRecord = indexRecord; + } + } + + protected void verifyRequest(String appid, ChannelHandlerContext ctx, + HttpRequest request, HttpResponse response, URL requestUri) + throws IOException { + SecretKey tokenSecret = handlerCtx.secretManager.retrieveTokenSecret(appid); + if (null == tokenSecret) { + LOG.info("Request for unknown token {}, channel id: {}", appid, ctx.channel().id()); + throw new IOException("Could not find jobid"); + } + // encrypting URL + String encryptedURL = SecureShuffleUtils.buildMsgFrom(requestUri); + // hash from the fetcher + String urlHashStr = + request.headers().get(SecureShuffleUtils.HTTP_HEADER_URL_HASH); + if (urlHashStr == null) { + LOG.info("Missing header hash for {}, channel id: {}", appid, ctx.channel().id()); + throw new IOException("fetcher cannot be authenticated"); + } + if (LOG.isDebugEnabled()) { + int len = urlHashStr.length(); + LOG.debug("Verifying request. encryptedURL:{}, hash:{}, channel id: " + + "{}", encryptedURL, + urlHashStr.substring(len - len / 2, len - 1), ctx.channel().id()); + } + // verify - throws exception + SecureShuffleUtils.verifyReply(urlHashStr, encryptedURL, tokenSecret); + // verification passed - encode the reply + String reply = SecureShuffleUtils.generateHash(urlHashStr.getBytes(Charsets.UTF_8), + tokenSecret); + response.headers().set( + SecureShuffleUtils.HTTP_HEADER_REPLY_URL_HASH, reply); + // Put shuffle version into http header + response.headers().set(ShuffleHeader.HTTP_HEADER_NAME, + ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); + response.headers().set(ShuffleHeader.HTTP_HEADER_VERSION, + ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); + if (LOG.isDebugEnabled()) { + int len = reply.length(); + LOG.debug("Fetcher request verified. 
" + + "encryptedURL: {}, reply: {}, channel id: {}", + encryptedURL, reply.substring(len - len / 2, len - 1), + ctx.channel().id()); + } + } + + public static ByteBuf shuffleHeaderToBytes(ShuffleHeader header) throws IOException { + final DataOutputBuffer dob = new DataOutputBuffer(); + header.write(dob); + return wrappedBuffer(dob.getData(), 0, dob.getLength()); + } + + protected ChannelFuture sendMapOutput(Channel ch, String user, String mapId, int reduce, + MapOutputInfo mapOutputInfo) + throws IOException { + final IndexRecord info = mapOutputInfo.indexRecord; + ch.write(shuffleHeaderToBytes( + new ShuffleHeader(mapId, info.partLength, info.rawLength, reduce))); + final File spillFile = + new File(mapOutputInfo.mapOutputFileName.toString()); + RandomAccessFile spill = SecureIOUtils.openForRandomRead(spillFile, "r", user, null); + ChannelFuture writeFuture; + if (ch.pipeline().get(SslHandler.class) == null) { + final FadvisedFileRegion partition = new FadvisedFileRegion(spill, + info.startOffset, info.partLength, handlerCtx.manageOsCache, handlerCtx.readaheadLength, + handlerCtx.readaheadPool, spillFile.getAbsolutePath(), + handlerCtx.shuffleBufferSize, handlerCtx.shuffleTransferToAllowed); + writeFuture = ch.writeAndFlush(partition); + // TODO error handling; distinguish IO/connection failures, + // attribute to appropriate spill output + writeFuture.addListener((ChannelFutureListener) future -> { + if (future.isSuccess()) { + partition.transferSuccessful(); + } + partition.deallocate(); + }); + } else { + // HTTPS cannot be done with zero copy. + final FadvisedChunkedFile chunk = new FadvisedChunkedFile(spill, + info.startOffset, info.partLength, handlerCtx.sslFileBufferSize, + handlerCtx.manageOsCache, handlerCtx.readaheadLength, handlerCtx.readaheadPool, + spillFile.getAbsolutePath()); + writeFuture = ch.writeAndFlush(chunk); + } + + handlerCtx.metrics.shuffleConnections.incr(); + handlerCtx.metrics.shuffleOutputBytes.incr(info.partLength); // optimistic + return writeFuture; + } + + protected void sendError(ChannelHandlerContext ctx, + HttpResponseStatus status) { + sendError(ctx, "", status); + } + + protected void sendError(ChannelHandlerContext ctx, String message, + HttpResponseStatus status) { + sendError(ctx, message, status, Collections.emptyMap()); + } + + protected void sendError(ChannelHandlerContext ctx, String msg, + HttpResponseStatus status, Map headers) { + FullHttpResponse response = new DefaultFullHttpResponse(HTTP_1_1, status, + Unpooled.copiedBuffer(msg, CharsetUtil.UTF_8)); + response.headers().set(CONTENT_TYPE, "text/plain; charset=UTF-8"); + // Put shuffle version into http header + response.headers().set(ShuffleHeader.HTTP_HEADER_NAME, + ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); + response.headers().set(ShuffleHeader.HTTP_HEADER_VERSION, + ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); + for (Map.Entry header : headers.entrySet()) { + response.headers().set(header.getKey(), header.getValue()); + } + HttpUtil.setContentLength(response, response.content().readableBytes()); + + // Close the connection as soon as the error message is sent. 
+ ctx.channel().writeAndFlush(response).addListener(ChannelFutureListener.CLOSE); + // TODO: missing keep-alive handling + } + + @Override + public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) + throws Exception { + Channel ch = ctx.channel(); + if (cause instanceof TooLongFrameException) { + LOG.trace("TooLongFrameException, channel id: {}", ch.id()); + sendError(ctx, BAD_REQUEST); + return; + } else if (cause instanceof IOException) { + if (cause instanceof ClosedChannelException) { + LOG.debug("Ignoring closed channel error, channel id: " + ch.id(), cause); + return; + } + String message = String.valueOf(cause.getMessage()); + if (IGNORABLE_ERROR_MESSAGE.matcher(message).matches()) { + LOG.debug("Ignoring client socket close, channel id: " + ch.id(), cause); + return; + } + } + + LOG.error("Shuffle error. Channel id: " + ch.id(), cause); + if (ch.isActive()) { + sendError(ctx, INTERNAL_SERVER_ERROR); + } + } + + /** + * Maintain parameters per messageReceived() Netty context. + * Allows sendMapOutput calls from operationComplete() + */ + public static class ReduceContext { + private final List mapIds; + private final AtomicInteger mapsToWait; + private final AtomicInteger mapsToSend; + private final int reduceId; + private final ChannelHandlerContext ctx; + private final String user; + private final Map infoMap; + private final String jobId; + private final boolean keepAlive; + + ReduceContext(List mapIds, int rId, + ChannelHandlerContext context, String usr, + Map mapOutputInfoMap, + String jobId, boolean keepAlive) { + + this.mapIds = mapIds; + this.reduceId = rId; + /* + * Atomic count for tracking the no. of map outputs that are yet to + * complete. Multiple futureListeners' operationComplete() can decrement + * this value asynchronously. It is used to decide when the channel should + * be closed. + */ + this.mapsToWait = new AtomicInteger(mapIds.size()); + /* + * Atomic count for tracking the no. of map outputs that have been sent. + * Multiple sendMap() calls can increment this value + * asynchronously. Used to decide which mapId should be sent next. + */ + this.mapsToSend = new AtomicInteger(0); + this.ctx = context; + this.user = usr; + this.infoMap = mapOutputInfoMap; + this.jobId = jobId; + this.keepAlive = keepAlive; + } + + public int getReduceId() { + return reduceId; + } + + public ChannelHandlerContext getCtx() { + return ctx; + } + + public String getUser() { + return user; + } + + public Map getInfoMap() { + return infoMap; + } + + public String getJobId() { + return jobId; + } + + public List getMapIds() { + return mapIds; + } + + public AtomicInteger getMapsToSend() { + return mapsToSend; + } + + public AtomicInteger getMapsToWait() { + return mapsToWait; + } + + public boolean getKeepAlive() { + return keepAlive; + } + } + + static class ReduceMapFileCount implements ChannelFutureListener { + private final ShuffleChannelHandler handler; + private final ReduceContext reduceContext; + + ReduceMapFileCount(ShuffleChannelHandler handler, ReduceContext rc) { + this.handler = handler; + this.reduceContext = rc; + } + + @Override + public void operationComplete(ChannelFuture future) throws Exception { + LOG.trace("SendMap operation complete; mapsToWait='{}', channel='{}'", + this.reduceContext.getMapsToWait().get(), future.channel().id()); + if (!future.isSuccess()) { + LOG.error("Future is unsuccessful. 
channel='{}' Cause: ", + future.channel().id(), future.cause()); + future.channel().close(); + return; + } + int waitCount = this.reduceContext.getMapsToWait().decrementAndGet(); + if (waitCount == 0) { + ChannelFuture lastContentFuture = + future.channel().writeAndFlush(LastHttpContent.EMPTY_LAST_CONTENT); + handler.handlerCtx.metrics.operationComplete(future); + + // Let the idle timer handler close keep-alive connections + if (reduceContext.getKeepAlive()) { + LOG.trace("SendMap operation complete, keeping alive the connection; channel='{}'", + future.channel().id()); + ChannelPipeline pipeline = future.channel().pipeline(); + ShuffleHandler.TimeoutHandler timeoutHandler = + (ShuffleHandler.TimeoutHandler)pipeline.get(TIMEOUT_HANDLER); + timeoutHandler.setEnabledTimeout(true); + } else { + LOG.trace("SendMap operation complete, closing connection; channel='{}'", + future.channel().id()); + lastContentFuture.addListener(ChannelFutureListener.CLOSE); + } + } else { + LOG.trace("SendMap operation complete, waitCount > 0, " + + "invoking sendMap with reduceContext; channel='{}'", + future.channel().id()); + handler.sendMap(reduceContext); + } + } + } +} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleChannelHandlerContext.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleChannelHandlerContext.java new file mode 100644 index 0000000000000..fa037e98e83f9 --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleChannelHandlerContext.java @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
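The ReduceMapFileCount listener above and sendMap together form a simple flow-control loop: up to maxSessionOpenFiles transfers are primed, and every completed write starts the next one, so the number of simultaneously open spill files stays bounded. A reduced, standalone sketch of that pattern follows; the class and method names are illustrative, not the patch's.

import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

final class BoundedSendSketch {

  private final List<String> mapIds;
  private final AtomicInteger nextToSend = new AtomicInteger(0);  // plays the role of mapsToSend
  private final AtomicInteger stillToFinish;                      // plays the role of mapsToWait

  BoundedSendSketch(List<String> mapIds) {
    this.mapIds = mapIds;
    this.stillToFinish = new AtomicInteger(mapIds.size());
  }

  // Prime at most maxOpenFiles transfers; completions start the rest one by one.
  void start(int maxOpenFiles) {
    for (int i = 0; i < maxOpenFiles; i++) {
      sendNext();
    }
  }

  private void sendNext() {
    int idx = nextToSend.getAndIncrement();
    if (idx >= mapIds.size()) {
      return;  // nothing left to start
    }
    System.out.println("sending " + mapIds.get(idx));
    // In the real handler this callback is a ChannelFutureListener that fires
    // asynchronously when the write completes, which is what keeps at most
    // maxOpenFiles spill files open at a time.
    onSendComplete();
  }

  private void onSendComplete() {
    if (stillToFinish.decrementAndGet() == 0) {
      System.out.println("all map outputs sent, finishing the response");
    } else {
      sendNext();  // completion of one transfer triggers the next
    }
  }

  public static void main(String[] args) {
    new BoundedSendSketch(Arrays.asList("m1", "m2", "m3", "m4")).start(2);
  }
}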
+ */ + +package org.apache.hadoop.mapred; + +import io.netty.channel.group.ChannelGroup; + +import org.apache.hadoop.thirdparty.com.google.common.cache.LoadingCache; + +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.ReadaheadPool; +import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager; +import org.apache.hadoop.util.Shell; + +import static org.apache.hadoop.mapred.ShuffleHandler.DEFAULT_MAX_SHUFFLE_CONNECTIONS; +import static org.apache.hadoop.mapred.ShuffleHandler.DEFAULT_SHUFFLE_BUFFER_SIZE; +import static org.apache.hadoop.mapred.ShuffleHandler.DEFAULT_SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED; +import static org.apache.hadoop.mapred.ShuffleHandler.DEFAULT_SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT; +import static org.apache.hadoop.mapred.ShuffleHandler.DEFAULT_SHUFFLE_MANAGE_OS_CACHE; +import static org.apache.hadoop.mapred.ShuffleHandler.DEFAULT_SHUFFLE_MAPOUTPUT_META_INFO_CACHE_SIZE; +import static org.apache.hadoop.mapred.ShuffleHandler.DEFAULT_SHUFFLE_MAX_SESSION_OPEN_FILES; +import static org.apache.hadoop.mapred.ShuffleHandler.DEFAULT_SHUFFLE_READAHEAD_BYTES; +import static org.apache.hadoop.mapred.ShuffleHandler.DEFAULT_SHUFFLE_TRANSFERTO_ALLOWED; +import static org.apache.hadoop.mapred.ShuffleHandler.DEFAULT_SUFFLE_SSL_FILE_BUFFER_SIZE; +import static org.apache.hadoop.mapred.ShuffleHandler.MAX_SHUFFLE_CONNECTIONS; +import static org.apache.hadoop.mapred.ShuffleHandler.SHUFFLE_BUFFER_SIZE; +import static org.apache.hadoop.mapred.ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED; +import static org.apache.hadoop.mapred.ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT; +import static org.apache.hadoop.mapred.ShuffleHandler.SHUFFLE_MANAGE_OS_CACHE; +import static org.apache.hadoop.mapred.ShuffleHandler.SHUFFLE_MAPOUTPUT_META_INFO_CACHE_SIZE; +import static org.apache.hadoop.mapred.ShuffleHandler.SHUFFLE_MAX_SESSION_OPEN_FILES; +import static org.apache.hadoop.mapred.ShuffleHandler.SHUFFLE_READAHEAD_BYTES; +import static org.apache.hadoop.mapred.ShuffleHandler.SHUFFLE_TRANSFERTO_ALLOWED; +import static org.apache.hadoop.mapred.ShuffleHandler.SUFFLE_SSL_FILE_BUFFER_SIZE_KEY; +import static org.apache.hadoop.mapred.ShuffleHandler.WINDOWS_DEFAULT_SHUFFLE_TRANSFERTO_ALLOWED; + +@SuppressWarnings("checkstyle:VisibilityModifier") +public class ShuffleChannelHandlerContext { + + public final Configuration conf; + public final JobTokenSecretManager secretManager; + public final Map userRsrc; + public final LoadingCache pathCache; + public final IndexCache indexCache; + public final ShuffleHandler.ShuffleMetrics metrics; + public final ChannelGroup allChannels; + + + public final boolean connectionKeepAliveEnabled; + public final int sslFileBufferSize; + public final int connectionKeepAliveTimeOut; + public final int mapOutputMetaInfoCacheSize; + + public final AtomicInteger activeConnections = new AtomicInteger(); + + /** + * Should the shuffle use posix_fadvise calls to manage the OS cache during + * sendfile. 
+ */ + public final boolean manageOsCache; + public final int readaheadLength; + public final int maxShuffleConnections; + public final int shuffleBufferSize; + public final boolean shuffleTransferToAllowed; + public final int maxSessionOpenFiles; + public final ReadaheadPool readaheadPool = ReadaheadPool.getInstance(); + + public int port = -1; + + public ShuffleChannelHandlerContext(Configuration conf, + Map userRsrc, + JobTokenSecretManager secretManager, + LoadingCache patCache, + IndexCache indexCache, + ShuffleHandler.ShuffleMetrics metrics, + ChannelGroup allChannels) { + this.conf = conf; + this.userRsrc = userRsrc; + this.secretManager = secretManager; + this.pathCache = patCache; + this.indexCache = indexCache; + this.metrics = metrics; + this.allChannels = allChannels; + + sslFileBufferSize = conf.getInt(SUFFLE_SSL_FILE_BUFFER_SIZE_KEY, + DEFAULT_SUFFLE_SSL_FILE_BUFFER_SIZE); + connectionKeepAliveEnabled = + conf.getBoolean(SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED, + DEFAULT_SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED); + connectionKeepAliveTimeOut = + Math.max(1, conf.getInt(SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT, + DEFAULT_SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT)); + mapOutputMetaInfoCacheSize = + Math.max(1, conf.getInt(SHUFFLE_MAPOUTPUT_META_INFO_CACHE_SIZE, + DEFAULT_SHUFFLE_MAPOUTPUT_META_INFO_CACHE_SIZE)); + + manageOsCache = conf.getBoolean(SHUFFLE_MANAGE_OS_CACHE, + DEFAULT_SHUFFLE_MANAGE_OS_CACHE); + + readaheadLength = conf.getInt(SHUFFLE_READAHEAD_BYTES, + DEFAULT_SHUFFLE_READAHEAD_BYTES); + + maxShuffleConnections = conf.getInt(MAX_SHUFFLE_CONNECTIONS, + DEFAULT_MAX_SHUFFLE_CONNECTIONS); + + shuffleBufferSize = conf.getInt(SHUFFLE_BUFFER_SIZE, + DEFAULT_SHUFFLE_BUFFER_SIZE); + + shuffleTransferToAllowed = conf.getBoolean(SHUFFLE_TRANSFERTO_ALLOWED, + (Shell.WINDOWS)?WINDOWS_DEFAULT_SHUFFLE_TRANSFERTO_ALLOWED: + DEFAULT_SHUFFLE_TRANSFERTO_ALLOWED); + + maxSessionOpenFiles = conf.getInt(SHUFFLE_MAX_SESSION_OPEN_FILES, + DEFAULT_SHUFFLE_MAX_SESSION_OPEN_FILES); + } + + void setPort(int port) { + this.port = port; + } +} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleChannelInitializer.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleChannelInitializer.java new file mode 100644 index 0000000000000..25f01322df934 --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleChannelInitializer.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
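The sendMapOutput method in ShuffleChannelHandler above picks between two transfer strategies: a zero-copy FileRegion when no SslHandler is installed, and a chunked, buffered transfer when the pipeline runs TLS, since zero copy cannot pass through the SSL engine. Below is a reduced sketch of that decision using plain Netty 4 types rather than the patch's FadvisedFileRegion/FadvisedChunkedFile wrappers; it assumes a ChunkedWriteHandler is present in the pipeline, as the ShuffleChannelInitializer that follows installs.

import io.netty.channel.Channel;
import io.netty.channel.ChannelFuture;
import io.netty.channel.DefaultFileRegion;
import io.netty.handler.ssl.SslHandler;
import io.netty.handler.stream.ChunkedFile;

import java.io.IOException;
import java.io.RandomAccessFile;

final class SpillTransferSketch {

  private SpillTransferSketch() {
  }

  // Writes one spill-file slice to the channel. Without TLS the bytes are moved with a
  // zero-copy FileRegion (sendfile/transferTo); with an SslHandler in the pipeline the data
  // must pass through the SSL engine, so a buffered ChunkedFile is used instead (handled by
  // the ChunkedWriteHandler installed by the channel initializer).
  static ChannelFuture writeSpill(Channel ch, RandomAccessFile spill,
      long offset, long length) throws IOException {
    if (ch.pipeline().get(SslHandler.class) == null) {
      return ch.writeAndFlush(new DefaultFileRegion(spill.getChannel(), offset, length));
    }
    return ch.writeAndFlush(new ChunkedFile(spill, offset, length, 8 * 1024));
  }
}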
+ */ + +package org.apache.hadoop.mapred; + +import io.netty.channel.ChannelInitializer; +import io.netty.channel.ChannelPipeline; +import io.netty.channel.socket.SocketChannel; +import io.netty.handler.codec.http.HttpObjectAggregator; +import io.netty.handler.codec.http.HttpServerCodec; +import io.netty.handler.ssl.SslHandler; +import io.netty.handler.stream.ChunkedWriteHandler; + +import java.io.IOException; +import java.security.GeneralSecurityException; + +import org.apache.hadoop.security.ssl.SSLFactory; + +import static org.apache.hadoop.mapred.ShuffleHandler.TIMEOUT_HANDLER; +import static org.apache.hadoop.mapred.ShuffleHandler.LOG; + +public class ShuffleChannelInitializer extends ChannelInitializer { + + public static final int MAX_CONTENT_LENGTH = 1 << 16; + + private final ShuffleChannelHandlerContext handlerContext; + private final SSLFactory sslFactory; + + + public ShuffleChannelInitializer(ShuffleChannelHandlerContext ctx, SSLFactory sslFactory) { + this.handlerContext = ctx; + this.sslFactory = sslFactory; + } + + @Override + public void initChannel(SocketChannel ch) throws GeneralSecurityException, IOException { + LOG.debug("ShuffleChannelInitializer init; channel='{}'", ch.id()); + + ChannelPipeline pipeline = ch.pipeline(); + if (sslFactory != null) { + pipeline.addLast("ssl", new SslHandler(sslFactory.createSSLEngine())); + } + pipeline.addLast("http", new HttpServerCodec()); + pipeline.addLast("aggregator", new HttpObjectAggregator(MAX_CONTENT_LENGTH)); + pipeline.addLast("chunking", new ChunkedWriteHandler()); + + // An EventExecutorGroup could be specified to run in a + // different thread than an I/O thread so that the I/O thread + // is not blocked by a time-consuming task: + // https://netty.io/4.1/api/io/netty/channel/ChannelPipeline.html + pipeline.addLast("shuffle", new ShuffleChannelHandler(handlerContext)); + + pipeline.addLast(TIMEOUT_HANDLER, + new ShuffleHandler.TimeoutHandler(handlerContext.connectionKeepAliveTimeOut)); + // TODO factor security manager into pipeline + // TODO factor out encode/decode to permit binary shuffle + // TODO factor out decode of index to permit alt. models + } +} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleHandler.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleHandler.java index fc755a67631f0..9971896e4cd6d 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleHandler.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/ShuffleHandler.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -18,94 +18,52 @@ package org.apache.hadoop.mapred; -import static io.netty.buffer.Unpooled.wrappedBuffer; -import static io.netty.handler.codec.http.HttpHeaderNames.CONTENT_TYPE; -import static io.netty.handler.codec.http.HttpMethod.GET; -import static io.netty.handler.codec.http.HttpResponseStatus.BAD_REQUEST; -import static io.netty.handler.codec.http.HttpResponseStatus.FORBIDDEN; -import static io.netty.handler.codec.http.HttpResponseStatus.INTERNAL_SERVER_ERROR; -import static io.netty.handler.codec.http.HttpResponseStatus.METHOD_NOT_ALLOWED; -import static io.netty.handler.codec.http.HttpResponseStatus.NOT_FOUND; -import static io.netty.handler.codec.http.HttpResponseStatus.OK; -import static io.netty.handler.codec.http.HttpResponseStatus.UNAUTHORIZED; -import static io.netty.handler.codec.http.HttpVersion.HTTP_1_1; -import static org.apache.hadoop.mapred.ShuffleHandler.NettyChannelHelper.*; import static org.fusesource.leveldbjni.JniDBFactory.asString; import static org.fusesource.leveldbjni.JniDBFactory.bytes; import java.io.File; -import java.io.FileNotFoundException; import java.io.IOException; -import java.io.RandomAccessFile; import java.net.InetSocketAddress; -import java.net.URL; import java.nio.ByteBuffer; -import java.nio.channels.ClosedChannelException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ExecutionException; import java.util.concurrent.ThreadFactory; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; -import javax.crypto.SecretKey; - import io.netty.bootstrap.ServerBootstrap; -import io.netty.buffer.Unpooled; import io.netty.channel.Channel; import io.netty.channel.ChannelFuture; import io.netty.channel.ChannelFutureListener; -import io.netty.channel.ChannelHandler; import io.netty.channel.ChannelHandlerContext; -import io.netty.channel.ChannelInboundHandlerAdapter; -import io.netty.channel.ChannelInitializer; import io.netty.channel.ChannelOption; -import io.netty.channel.ChannelOutboundHandlerAdapter; -import io.netty.channel.ChannelPipeline; -import io.netty.channel.ChannelPromise; import io.netty.channel.EventLoopGroup; import io.netty.channel.group.ChannelGroup; import io.netty.channel.group.DefaultChannelGroup; import io.netty.channel.nio.NioEventLoopGroup; -import io.netty.channel.socket.SocketChannel; import io.netty.channel.socket.nio.NioServerSocketChannel; -import io.netty.handler.codec.TooLongFrameException; -import io.netty.handler.codec.http.DefaultFullHttpResponse; -import io.netty.handler.codec.http.DefaultHttpResponse; -import io.netty.handler.codec.http.FullHttpResponse; -import io.netty.handler.codec.http.HttpObjectAggregator; -import io.netty.handler.codec.http.HttpRequest; -import io.netty.handler.codec.http.HttpRequestDecoder; -import io.netty.handler.codec.http.HttpResponse; -import io.netty.handler.codec.http.HttpResponseEncoder; import io.netty.handler.codec.http.HttpResponseStatus; -import io.netty.handler.codec.http.LastHttpContent; -import io.netty.handler.codec.http.QueryStringDecoder; -import io.netty.handler.ssl.SslHandler; -import io.netty.handler.stream.ChunkedWriteHandler; import io.netty.handler.timeout.IdleState; import io.netty.handler.timeout.IdleStateEvent; import io.netty.handler.timeout.IdleStateHandler; -import 
io.netty.util.CharsetUtil; -import io.netty.util.concurrent.DefaultEventExecutorGroup; +import io.netty.util.concurrent.GlobalEventExecutor; + +import javax.annotation.Nonnull; + +import org.apache.hadoop.thirdparty.com.google.common.cache.CacheBuilder; +import org.apache.hadoop.thirdparty.com.google.common.cache.CacheLoader; +import org.apache.hadoop.thirdparty.com.google.common.cache.LoadingCache; +import org.apache.hadoop.thirdparty.com.google.common.cache.RemovalListener; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.DataInputByteBuffer; import org.apache.hadoop.io.DataOutputBuffer; -import org.apache.hadoop.io.ReadaheadPool; -import org.apache.hadoop.io.SecureIOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.proto.ShuffleHandlerRecoveryProtos.JobShuffleInfoProto; import org.apache.hadoop.mapreduce.MRConfig; -import org.apache.hadoop.mapreduce.security.SecureShuffleUtils; import org.apache.hadoop.mapreduce.security.token.JobTokenIdentifier; import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager; -import org.apache.hadoop.mapreduce.task.reduce.ShuffleHeader; import org.apache.hadoop.metrics2.MetricsSystem; import org.apache.hadoop.metrics2.annotation.Metric; import org.apache.hadoop.metrics2.annotation.Metrics; @@ -116,8 +74,6 @@ import org.apache.hadoop.security.proto.SecurityProtos.TokenProto; import org.apache.hadoop.security.ssl.SSLFactory; import org.apache.hadoop.security.token.Token; -import org.apache.hadoop.util.DiskChecker; -import org.apache.hadoop.util.Shell; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.proto.YarnServerCommonProtos.VersionProto; import org.apache.hadoop.yarn.server.api.ApplicationInitializationContext; @@ -132,25 +88,17 @@ import org.iq80.leveldb.DB; import org.iq80.leveldb.DBException; import org.iq80.leveldb.Options; -import org.eclipse.jetty.http.HttpHeader; import org.slf4j.LoggerFactory; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Charsets; -import com.google.common.cache.CacheBuilder; -import com.google.common.cache.CacheLoader; -import com.google.common.cache.LoadingCache; -import com.google.common.cache.RemovalListener; -import com.google.common.cache.RemovalNotification; -import com.google.common.cache.Weigher; -import com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.apache.hadoop.classification.VisibleForTesting; +import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder; import com.google.protobuf.ByteString; public class ShuffleHandler extends AuxiliaryService { - private static final org.slf4j.Logger LOG = + public static final org.slf4j.Logger LOG = LoggerFactory.getLogger(ShuffleHandler.class); - private static final org.slf4j.Logger AUDITLOG = + public static final org.slf4j.Logger AUDITLOG = LoggerFactory.getLogger(ShuffleHandler.class.getName()+".audit"); public static final String SHUFFLE_MANAGE_OS_CACHE = "mapreduce.shuffle.manage.os.cache"; public static final boolean DEFAULT_SHUFFLE_MANAGE_OS_CACHE = true; @@ -172,7 +120,7 @@ public class ShuffleHandler extends AuxiliaryService { // pattern to identify errors related to the client closing the socket early // idea borrowed from Netty SslHandler - private static final Pattern IGNORABLE_ERROR_MESSAGE = Pattern.compile( + public static final Pattern IGNORABLE_ERROR_MESSAGE = Pattern.compile( 
"^.*(?:connection.*reset|connection.*closed|broken.*pipe).*$", Pattern.CASE_INSENSITIVE); @@ -189,37 +137,21 @@ public class ShuffleHandler extends AuxiliaryService { // This should be kept in sync with Fetcher.FETCH_RETRY_DELAY_DEFAULT public static final long FETCH_RETRY_DELAY = 1000L; public static final String RETRY_AFTER_HEADER = "Retry-After"; - static final String ENCODER_HANDLER_NAME = "encoder"; private int port; private EventLoopGroup bossGroup; private EventLoopGroup workerGroup; - private ServerBootstrap bootstrap; - private Channel ch; - private final ChannelGroup accepted = - new DefaultChannelGroup(new DefaultEventExecutorGroup(5).next()); - private final AtomicInteger activeConnections = new AtomicInteger(); - protected HttpPipelineFactory pipelineFact; - private int sslFileBufferSize; - - //TODO snemeth add a config option for these later, this is temporarily disabled for now. - private boolean useOutboundExceptionHandler = false; - private boolean useOutboundLogger = false; - - /** - * Should the shuffle use posix_fadvise calls to manage the OS cache during - * sendfile. - */ - private boolean manageOsCache; - private int readaheadLength; - private int maxShuffleConnections; - private int shuffleBufferSize; - private boolean shuffleTransferToAllowed; - private int maxSessionOpenFiles; - private ReadaheadPool readaheadPool = ReadaheadPool.getInstance(); - private Map userRsrc; - private JobTokenSecretManager secretManager; + @SuppressWarnings("checkstyle:VisibilityModifier") + protected final ChannelGroup allChannels = + new DefaultChannelGroup(GlobalEventExecutor.INSTANCE); + + private SSLFactory sslFactory; + + @SuppressWarnings("checkstyle:VisibilityModifier") + protected JobTokenSecretManager secretManager; + @SuppressWarnings("checkstyle:VisibilityModifier") + protected Map userRsrc; private DB stateDb = null; @@ -278,9 +210,6 @@ public class ShuffleHandler extends AuxiliaryService { "mapreduce.shuffle.max.session-open-files"; public static final int DEFAULT_SHUFFLE_MAX_SESSION_OPEN_FILES = 3; - boolean connectionKeepAliveEnabled = false; - private int connectionKeepAliveTimeOut; - private int mapOutputMetaInfoCacheSize; @Metrics(about="Shuffle output metrics", context="mapred") static class ShuffleMetrics implements ChannelFutureListener { @@ -304,170 +233,11 @@ public void operationComplete(ChannelFuture future) throws Exception { } } - static class NettyChannelHelper { - static ChannelFuture writeToChannel(Channel ch, Object obj) { - LOG.debug("Writing {} to channel: {}", obj.getClass().getSimpleName(), ch.id()); - return ch.writeAndFlush(obj); - } - - static ChannelFuture writeToChannelAndClose(Channel ch, Object obj) { - return writeToChannel(ch, obj).addListener(ChannelFutureListener.CLOSE); - } - - static ChannelFuture writeToChannelAndAddLastHttpContent(Channel ch, HttpResponse obj) { - writeToChannel(ch, obj); - return writeLastHttpContentToChannel(ch); - } - - static ChannelFuture writeLastHttpContentToChannel(Channel ch) { - LOG.debug("Writing LastHttpContent, channel id: {}", ch.id()); - return ch.writeAndFlush(LastHttpContent.EMPTY_LAST_CONTENT); - } - - static ChannelFuture closeChannel(Channel ch) { - LOG.debug("Closing channel, channel id: {}", ch.id()); - return ch.close(); - } - - static void closeChannels(ChannelGroup channelGroup) { - channelGroup.close().awaitUninterruptibly(10, TimeUnit.SECONDS); - } - - public static ChannelFuture closeAsIdle(Channel ch, int timeout) { - LOG.debug("Closing channel as writer was idle for {} seconds", timeout); - 
return closeChannel(ch); - } - - public static void channelActive(Channel ch) { - LOG.debug("Executing channelActive, channel id: {}", ch.id()); - } - - public static void channelInactive(Channel ch) { - LOG.debug("Executing channelInactive, channel id: {}", ch.id()); - } - } - - private final MetricsSystem ms; + @SuppressWarnings("checkstyle:VisibilityModifier") + protected final MetricsSystem ms; + @SuppressWarnings("checkstyle:VisibilityModifier") final ShuffleMetrics metrics; - class ReduceMapFileCount implements ChannelFutureListener { - - private ReduceContext reduceContext; - - ReduceMapFileCount(ReduceContext rc) { - this.reduceContext = rc; - } - - @Override - public void operationComplete(ChannelFuture future) throws Exception { - LOG.trace("operationComplete"); - if (!future.isSuccess()) { - LOG.error("Future is unsuccessful. Cause: ", future.cause()); - closeChannel(future.channel()); - return; - } - int waitCount = this.reduceContext.getMapsToWait().decrementAndGet(); - if (waitCount == 0) { - LOG.trace("Finished with all map outputs"); - //HADOOP-15327: Need to send an instance of LastHttpContent to define HTTP - //message boundaries. See details in jira. - writeLastHttpContentToChannel(future.channel()); - metrics.operationComplete(future); - // Let the idle timer handler close keep-alive connections - if (reduceContext.getKeepAlive()) { - ChannelPipeline pipeline = future.channel().pipeline(); - TimeoutHandler timeoutHandler = - (TimeoutHandler)pipeline.get(TIMEOUT_HANDLER); - timeoutHandler.setEnabledTimeout(true); - } else { - closeChannel(future.channel()); - } - } else { - LOG.trace("operationComplete, waitCount > 0, invoking sendMap with reduceContext"); - pipelineFact.getSHUFFLE().sendMap(reduceContext); - } - } - } - - /** - * Maintain parameters per messageReceived() Netty context. - * Allows sendMapOutput calls from operationComplete() - */ - private static class ReduceContext { - private List mapIds; - private AtomicInteger mapsToWait; - private AtomicInteger mapsToSend; - private int reduceId; - private ChannelHandlerContext ctx; - private String user; - private Map infoMap; - private String jobId; - private final boolean keepAlive; - - ReduceContext(List mapIds, int rId, - ChannelHandlerContext context, String usr, - Map mapOutputInfoMap, - String jobId, boolean keepAlive) { - - this.mapIds = mapIds; - this.reduceId = rId; - /** - * Atomic count for tracking the no. of map outputs that are yet to - * complete. Multiple futureListeners' operationComplete() can decrement - * this value asynchronously. It is used to decide when the channel should - * be closed. - */ - this.mapsToWait = new AtomicInteger(mapIds.size()); - /** - * Atomic count for tracking the no. of map outputs that have been sent. - * Multiple sendMap() calls can increment this value - * asynchronously. Used to decide which mapId should be sent next. 
- */ - this.mapsToSend = new AtomicInteger(0); - this.ctx = context; - this.user = usr; - this.infoMap = mapOutputInfoMap; - this.jobId = jobId; - this.keepAlive = keepAlive; - } - - public int getReduceId() { - return reduceId; - } - - public ChannelHandlerContext getCtx() { - return ctx; - } - - public String getUser() { - return user; - } - - public Map getInfoMap() { - return infoMap; - } - - public String getJobId() { - return jobId; - } - - public List getMapIds() { - return mapIds; - } - - public AtomicInteger getMapsToSend() { - return mapsToSend; - } - - public AtomicInteger getMapsToWait() { - return mapsToWait; - } - - public boolean getKeepAlive() { - return keepAlive; - } - } - ShuffleHandler(MetricsSystem ms) { super(MAPREDUCE_SHUFFLE_SERVICEID); this.ms = ms; @@ -482,18 +252,20 @@ public ShuffleHandler() { * Serialize the shuffle port into a ByteBuffer for use later on. * @param port the port to be sent to the ApplciationMaster * @return the serialized form of the port. + * @throws IOException on failure */ public static ByteBuffer serializeMetaData(int port) throws IOException { //TODO these bytes should be versioned - DataOutputBuffer port_dob = new DataOutputBuffer(); - port_dob.writeInt(port); - return ByteBuffer.wrap(port_dob.getData(), 0, port_dob.getLength()); + DataOutputBuffer portDob = new DataOutputBuffer(); + portDob.writeInt(port); + return ByteBuffer.wrap(portDob.getData(), 0, portDob.getLength()); } /** * A helper function to deserialize the metadata returned by ShuffleHandler. * @param meta the metadata returned by the ShuffleHandler * @return the port the Shuffle Handler is listening on to serve shuffle data. + * @throws IOException on failure */ public static int deserializeMetaData(ByteBuffer meta) throws IOException { //TODO this should be returning a class not just an int @@ -509,16 +281,18 @@ public static int deserializeMetaData(ByteBuffer meta) throws IOException { * @param jobToken the job token to be used for authentication of * shuffle data requests. * @return the serialized version of the jobToken. 
+ * @throws IOException on failure */ public static ByteBuffer serializeServiceData(Token jobToken) throws IOException { //TODO these bytes should be versioned - DataOutputBuffer jobToken_dob = new DataOutputBuffer(); - jobToken.write(jobToken_dob); - return ByteBuffer.wrap(jobToken_dob.getData(), 0, jobToken_dob.getLength()); + DataOutputBuffer jobTokenDob = new DataOutputBuffer(); + jobToken.write(jobTokenDob); + return ByteBuffer.wrap(jobTokenDob.getData(), 0, jobTokenDob.getLength()); } - static Token deserializeServiceData(ByteBuffer secret) throws IOException { + public static Token deserializeServiceData(ByteBuffer secret) + throws IOException { DataInputByteBuffer in = new DataInputByteBuffer(); in.reset(secret); Token jt = new Token(); @@ -558,14 +332,6 @@ public void stopApplication(ApplicationTerminationContext context) { @Override protected void serviceInit(Configuration conf) throws Exception { - manageOsCache = conf.getBoolean(SHUFFLE_MANAGE_OS_CACHE, - DEFAULT_SHUFFLE_MANAGE_OS_CACHE); - - readaheadLength = conf.getInt(SHUFFLE_READAHEAD_BYTES, - DEFAULT_SHUFFLE_READAHEAD_BYTES); - - maxShuffleConnections = conf.getInt(MAX_SHUFFLE_CONNECTIONS, - DEFAULT_MAX_SHUFFLE_CONNECTIONS); int maxShuffleThreads = conf.getInt(MAX_SHUFFLE_THREADS, DEFAULT_MAX_SHUFFLE_THREADS); // Since Netty 4.x, the value of 0 threads would default to: @@ -576,16 +342,6 @@ protected void serviceInit(Configuration conf) throws Exception { if (maxShuffleThreads == 0) { maxShuffleThreads = 2 * Runtime.getRuntime().availableProcessors(); } - - shuffleBufferSize = conf.getInt(SHUFFLE_BUFFER_SIZE, - DEFAULT_SHUFFLE_BUFFER_SIZE); - - shuffleTransferToAllowed = conf.getBoolean(SHUFFLE_TRANSFERTO_ALLOWED, - (Shell.WINDOWS)?WINDOWS_DEFAULT_SHUFFLE_TRANSFERTO_ALLOWED: - DEFAULT_SHUFFLE_TRANSFERTO_ALLOWED); - - maxSessionOpenFiles = conf.getInt(SHUFFLE_MAX_SESSION_OPEN_FILES, - DEFAULT_SHUFFLE_MAX_SESSION_OPEN_FILES); ThreadFactory bossFactory = new ThreadFactoryBuilder() .setNameFormat("ShuffleHandler Netty Boss #%d") @@ -594,66 +350,117 @@ protected void serviceInit(Configuration conf) throws Exception { .setNameFormat("ShuffleHandler Netty Worker #%d") .build(); - bossGroup = new NioEventLoopGroup(maxShuffleThreads, bossFactory); + bossGroup = new NioEventLoopGroup(1, bossFactory); workerGroup = new NioEventLoopGroup(maxShuffleThreads, workerFactory); super.serviceInit(new Configuration(conf)); } + protected ShuffleChannelHandlerContext createHandlerContext() { + Configuration conf = getConfig(); + + final LoadingCache pathCache = + CacheBuilder.newBuilder().expireAfterAccess( + conf.getInt(EXPIRE_AFTER_ACCESS_MINUTES, DEFAULT_EXPIRE_AFTER_ACCESS_MINUTES), + TimeUnit.MINUTES).softValues().concurrencyLevel(conf.getInt(CONCURRENCY_LEVEL, + DEFAULT_CONCURRENCY_LEVEL)). 
+ removalListener( + (RemovalListener) notification -> { + if (LOG.isDebugEnabled()) { + LOG.debug("PathCache Eviction: " + notification.getKey() + + ", Reason=" + notification.getCause()); + } + } + ).maximumWeight(conf.getInt(MAX_WEIGHT, DEFAULT_MAX_WEIGHT)).weigher( + (key, value) -> key.jobId.length() + key.user.length() + + key.attemptId.length()+ + value.indexPath.toString().length() + + value.dataPath.toString().length() + ).build(new CacheLoader() { + @Override + public AttemptPathInfo load(@Nonnull AttemptPathIdentifier key) throws + Exception { + String base = getBaseLocation(key.jobId, key.user); + String attemptBase = base + key.attemptId; + Path indexFileName = getAuxiliaryLocalPathHandler() + .getLocalPathForRead(attemptBase + "/" + INDEX_FILE_NAME); + Path mapOutputFileName = getAuxiliaryLocalPathHandler() + .getLocalPathForRead(attemptBase + "/" + DATA_FILE_NAME); + + if (LOG.isDebugEnabled()) { + LOG.debug("Loaded : " + key + " via loader"); + } + return new AttemptPathInfo(indexFileName, mapOutputFileName); + } + }); + + return new ShuffleChannelHandlerContext(conf, + userRsrc, + secretManager, + pathCache, + new IndexCache(new JobConf(conf)), + metrics, + allChannels + ); + } + // TODO change AbstractService to throw InterruptedException @Override protected void serviceStart() throws Exception { Configuration conf = getConfig(); - userRsrc = new ConcurrentHashMap(); + userRsrc = new ConcurrentHashMap<>(); secretManager = new JobTokenSecretManager(); recoverState(conf); - try { - pipelineFact = new HttpPipelineFactory(conf); - } catch (Exception ex) { - throw new RuntimeException(ex); + + if (conf.getBoolean(MRConfig.SHUFFLE_SSL_ENABLED_KEY, + MRConfig.SHUFFLE_SSL_ENABLED_DEFAULT)) { + LOG.info("Encrypted shuffle is enabled."); + sslFactory = new SSLFactory(SSLFactory.Mode.SERVER, conf); + sslFactory.init(); } - bootstrap = new ServerBootstrap(); + ShuffleChannelHandlerContext handlerContext = createHandlerContext(); + ServerBootstrap bootstrap = new ServerBootstrap(); bootstrap.group(bossGroup, workerGroup) .channel(NioServerSocketChannel.class) .option(ChannelOption.SO_BACKLOG, conf.getInt(SHUFFLE_LISTEN_QUEUE_SIZE, DEFAULT_SHUFFLE_LISTEN_QUEUE_SIZE)) .childOption(ChannelOption.SO_KEEPALIVE, true) - .childHandler(pipelineFact); + .childHandler(new ShuffleChannelInitializer( + handlerContext, + sslFactory) + ); port = conf.getInt(SHUFFLE_PORT_CONFIG_KEY, DEFAULT_SHUFFLE_PORT); - ch = bootstrap.bind(new InetSocketAddress(port)).sync().channel(); - accepted.add(ch); + Channel ch = bootstrap.bind(new InetSocketAddress(port)).sync().channel(); port = ((InetSocketAddress)ch.localAddress()).getPort(); + allChannels.add(ch); conf.set(SHUFFLE_PORT_CONFIG_KEY, Integer.toString(port)); - pipelineFact.SHUFFLE.setPort(port); + handlerContext.setPort(port); LOG.info(getName() + " listening on port " + port); super.serviceStart(); - - sslFileBufferSize = conf.getInt(SUFFLE_SSL_FILE_BUFFER_SIZE_KEY, - DEFAULT_SUFFLE_SSL_FILE_BUFFER_SIZE); - connectionKeepAliveEnabled = - conf.getBoolean(SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED, - DEFAULT_SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED); - connectionKeepAliveTimeOut = - Math.max(1, conf.getInt(SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT, - DEFAULT_SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT)); - mapOutputMetaInfoCacheSize = - Math.max(1, conf.getInt(SHUFFLE_MAPOUTPUT_META_INFO_CACHE_SIZE, - DEFAULT_SHUFFLE_MAPOUTPUT_META_INFO_CACHE_SIZE)); } @Override protected void serviceStop() throws Exception { - closeChannels(accepted); + 
allChannels.close().awaitUninterruptibly(10, TimeUnit.SECONDS); - if (pipelineFact != null) { - pipelineFact.destroy(); + if (sslFactory != null) { + sslFactory.destroy(); } if (stateDb != null) { stateDb.close(); } ms.unregisterSource(ShuffleMetrics.class.getSimpleName()); + + if (bossGroup != null) { + bossGroup.shutdownGracefully(); + } + + if (workerGroup != null) { + workerGroup.shutdownGracefully(); + } + super.serviceStop(); } @@ -668,10 +475,6 @@ public synchronized ByteBuffer getMetaData() { } } - protected Shuffle getShuffle(Configuration conf) { - return new Shuffle(conf); - } - private void recoverState(Configuration conf) throws IOException { Path recoveryRoot = getRecoveryPath(); if (recoveryRoot != null) { @@ -847,11 +650,6 @@ private void removeJobShuffleInfo(JobID jobId) throws IOException { } } - @VisibleForTesting - public void setUseOutboundExceptionHandler(boolean useHandler) { - this.useOutboundExceptionHandler = useHandler; - } - static class TimeoutHandler extends IdleStateHandler { private final int connectionKeepAliveTimeOut; private boolean enabledTimeout; @@ -864,11 +662,6 @@ static class TimeoutHandler extends IdleStateHandler { this.connectionKeepAliveTimeOut = connectionKeepAliveTimeOut; } - @VisibleForTesting - public int getConnectionKeepAliveTimeOut() { - return connectionKeepAliveTimeOut; - } - void setEnabledTimeout(boolean enabledTimeout) { this.enabledTimeout = enabledTimeout; } @@ -876,607 +669,18 @@ void setEnabledTimeout(boolean enabledTimeout) { @Override public void channelIdle(ChannelHandlerContext ctx, IdleStateEvent e) { if (e.state() == IdleState.WRITER_IDLE && enabledTimeout) { - closeAsIdle(ctx.channel(), connectionKeepAliveTimeOut); - } - } - } - - class HttpPipelineFactory extends ChannelInitializer { - private static final int MAX_CONTENT_LENGTH = 1 << 16; - - final Shuffle SHUFFLE; - private SSLFactory sslFactory; - - HttpPipelineFactory(Configuration conf) throws Exception { - SHUFFLE = getShuffle(conf); - if (conf.getBoolean(MRConfig.SHUFFLE_SSL_ENABLED_KEY, - MRConfig.SHUFFLE_SSL_ENABLED_DEFAULT)) { - LOG.info("Encrypted shuffle is enabled."); - sslFactory = new SSLFactory(SSLFactory.Mode.SERVER, conf); - sslFactory.init(); - } - } - - public Shuffle getSHUFFLE() { - return SHUFFLE; - } - - public void destroy() { - if (sslFactory != null) { - sslFactory.destroy(); - } - } - - @Override protected void initChannel(SocketChannel ch) throws Exception { - ChannelPipeline pipeline = ch.pipeline(); - if (sslFactory != null) { - pipeline.addLast("ssl", new SslHandler(sslFactory.createSSLEngine())); - } - pipeline.addLast("decoder", new HttpRequestDecoder()); - pipeline.addLast("aggregator", new HttpObjectAggregator(MAX_CONTENT_LENGTH)); - pipeline.addLast(ENCODER_HANDLER_NAME, useOutboundLogger ? 
- new LoggingHttpResponseEncoder(false) : new HttpResponseEncoder()); - pipeline.addLast("chunking", new ChunkedWriteHandler()); - pipeline.addLast("shuffle", SHUFFLE); - if (useOutboundExceptionHandler) { - //https://stackoverflow.com/questions/50612403/catch-all-exception-handling-for-outbound-channelhandler - pipeline.addLast("outboundExceptionHandler", new ChannelOutboundHandlerAdapter() { - @Override - public void write(ChannelHandlerContext ctx, Object msg, - ChannelPromise promise) throws Exception { - promise.addListener(ChannelFutureListener.FIRE_EXCEPTION_ON_FAILURE); - super.write(ctx, msg, promise); - } - }); - } - pipeline.addLast(TIMEOUT_HANDLER, new TimeoutHandler(connectionKeepAliveTimeOut)); - // TODO factor security manager into pipeline - // TODO factor out encode/decode to permit binary shuffle - // TODO factor out decode of index to permit alt. models - } - } - - @ChannelHandler.Sharable - class Shuffle extends ChannelInboundHandlerAdapter { - private final IndexCache indexCache; - private final LoadingCache pathCache; - - private int port; - - Shuffle(Configuration conf) { - this.port = conf.getInt(SHUFFLE_PORT_CONFIG_KEY, DEFAULT_SHUFFLE_PORT); - this.indexCache = new IndexCache(new JobConf(conf)); - this.pathCache = CacheBuilder.newBuilder() - .expireAfterAccess(conf.getInt(EXPIRE_AFTER_ACCESS_MINUTES, - DEFAULT_EXPIRE_AFTER_ACCESS_MINUTES), TimeUnit.MINUTES) - .softValues() - .concurrencyLevel(conf.getInt(CONCURRENCY_LEVEL, - DEFAULT_CONCURRENCY_LEVEL)) - .removalListener((RemovalListener) notification -> - LOG.debug("PathCache Eviction: {}, Reason={}", - notification.getKey(), notification.getCause())) - .maximumWeight(conf.getInt(MAX_WEIGHT, DEFAULT_MAX_WEIGHT)) - .weigher((key, value) -> key.jobId.length() + key.user.length() + - key.attemptId.length()+ value.indexPath.toString().length() + - value.dataPath.toString().length()) - .build(new CacheLoader() { - @Override - public AttemptPathInfo load(AttemptPathIdentifier key) throws - Exception { - String base = getBaseLocation(key.jobId, key.user); - String attemptBase = base + key.attemptId; - Path indexFileName = getAuxiliaryLocalPathHandler() - .getLocalPathForRead(attemptBase + "/" + INDEX_FILE_NAME); - Path mapOutputFileName = getAuxiliaryLocalPathHandler() - .getLocalPathForRead(attemptBase + "/" + DATA_FILE_NAME); - LOG.debug("Loaded : {} via loader", key); - return new AttemptPathInfo(indexFileName, mapOutputFileName); - } - }); - } - - public void setPort(int port) { - this.port = port; - } - - private List splitMaps(List mapq) { - if (null == mapq) { - return null; - } - final List ret = new ArrayList(); - for (String s : mapq) { - Collections.addAll(ret, s.split(",")); - } - return ret; - } - - @Override - public void channelActive(ChannelHandlerContext ctx) - throws Exception { - NettyChannelHelper.channelActive(ctx.channel()); - int numConnections = activeConnections.incrementAndGet(); - if ((maxShuffleConnections > 0) && (numConnections > maxShuffleConnections)) { - LOG.info(String.format("Current number of shuffle connections (%d) is " + - "greater than the max allowed shuffle connections (%d)", - accepted.size(), maxShuffleConnections)); - - Map headers = new HashMap<>(1); - // notify fetchers to backoff for a while before closing the connection - // if the shuffle connection limit is hit. Fetchers are expected to - // handle this notification gracefully, that is, not treating this as a - // fetch failure. 
- headers.put(RETRY_AFTER_HEADER, String.valueOf(FETCH_RETRY_DELAY)); - sendError(ctx, "", TOO_MANY_REQ_STATUS, headers); - } else { - super.channelActive(ctx); - accepted.add(ctx.channel()); - LOG.debug("Added channel: {}, channel id: {}. Accepted number of connections={}", - ctx.channel(), ctx.channel().id(), activeConnections.get()); - } - } - - @Override - public void channelInactive(ChannelHandlerContext ctx) throws Exception { - NettyChannelHelper.channelInactive(ctx.channel()); - super.channelInactive(ctx); - int noOfConnections = activeConnections.decrementAndGet(); - LOG.debug("New value of Accepted number of connections={}", noOfConnections); - } - - @Override - public void channelRead(ChannelHandlerContext ctx, Object msg) - throws Exception { - Channel channel = ctx.channel(); - LOG.trace("Executing channelRead, channel id: {}", channel.id()); - HttpRequest request = (HttpRequest) msg; - LOG.debug("Received HTTP request: {}, channel id: {}", request, channel.id()); - if (request.method() != GET) { - sendError(ctx, METHOD_NOT_ALLOWED); - return; - } - // Check whether the shuffle version is compatible - String shuffleVersion = ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION; - String httpHeaderName = ShuffleHeader.DEFAULT_HTTP_HEADER_NAME; - if (request.headers() != null) { - shuffleVersion = request.headers().get(ShuffleHeader.HTTP_HEADER_VERSION); - httpHeaderName = request.headers().get(ShuffleHeader.HTTP_HEADER_NAME); - LOG.debug("Received from request header: ShuffleVersion={} header name={}, channel id: {}", - shuffleVersion, httpHeaderName, channel.id()); - } - if (request.headers() == null || - !ShuffleHeader.DEFAULT_HTTP_HEADER_NAME.equals(httpHeaderName) || - !ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION.equals(shuffleVersion)) { - sendError(ctx, "Incompatible shuffle request version", BAD_REQUEST); - } - final Map> q = - new QueryStringDecoder(request.uri()).parameters(); - final List keepAliveList = q.get("keepAlive"); - boolean keepAliveParam = false; - if (keepAliveList != null && keepAliveList.size() == 1) { - keepAliveParam = Boolean.valueOf(keepAliveList.get(0)); - if (LOG.isDebugEnabled()) { - LOG.debug("KeepAliveParam: {} : {}, channel id: {}", - keepAliveList, keepAliveParam, channel.id()); - } - } - final List mapIds = splitMaps(q.get("map")); - final List reduceQ = q.get("reduce"); - final List jobQ = q.get("job"); - if (LOG.isDebugEnabled()) { - LOG.debug("RECV: " + request.uri() + - "\n mapId: " + mapIds + - "\n reduceId: " + reduceQ + - "\n jobId: " + jobQ + - "\n keepAlive: " + keepAliveParam + - "\n channel id: " + channel.id()); - } - - if (mapIds == null || reduceQ == null || jobQ == null) { - sendError(ctx, "Required param job, map and reduce", BAD_REQUEST); - return; - } - if (reduceQ.size() != 1 || jobQ.size() != 1) { - sendError(ctx, "Too many job/reduce parameters", BAD_REQUEST); - return; - } - - int reduceId; - String jobId; - try { - reduceId = Integer.parseInt(reduceQ.get(0)); - jobId = jobQ.get(0); - } catch (NumberFormatException e) { - sendError(ctx, "Bad reduce parameter", BAD_REQUEST); - return; - } catch (IllegalArgumentException e) { - sendError(ctx, "Bad job parameter", BAD_REQUEST); - return; - } - final String reqUri = request.uri(); - if (null == reqUri) { - // TODO? add upstream? 
- sendError(ctx, FORBIDDEN); - return; - } - HttpResponse response = new DefaultHttpResponse(HTTP_1_1, OK); - try { - verifyRequest(jobId, ctx, request, response, - new URL("http", "", this.port, reqUri)); - } catch (IOException e) { - LOG.warn("Shuffle failure ", e); - sendError(ctx, e.getMessage(), UNAUTHORIZED); - return; - } - - Map mapOutputInfoMap = - new HashMap(); - ChannelPipeline pipeline = channel.pipeline(); - TimeoutHandler timeoutHandler = - (TimeoutHandler)pipeline.get(TIMEOUT_HANDLER); - timeoutHandler.setEnabledTimeout(false); - String user = userRsrc.get(jobId); - - try { - populateHeaders(mapIds, jobId, user, reduceId, request, - response, keepAliveParam, mapOutputInfoMap); - } catch(IOException e) { - //HADOOP-15327 - // Need to send an instance of LastHttpContent to define HTTP - // message boundaries. - //Sending a HTTP 200 OK + HTTP 500 later (sendError) - // is quite a non-standard way of crafting HTTP responses, - // but we need to keep backward compatibility. - // See more details in jira. - writeToChannelAndAddLastHttpContent(channel, response); - LOG.error("Shuffle error while populating headers. Channel id: " + channel.id(), e); - sendError(ctx, getErrorMessage(e), INTERNAL_SERVER_ERROR); - return; - } - writeToChannel(channel, response).addListener((ChannelFutureListener) future -> { - if (future.isSuccess()) { - LOG.debug("Written HTTP response object successfully. Channel id: {}", channel.id()); - } else { - LOG.error("Error while writing HTTP response object: {}. " + - "Cause: {}, channel id: {}", response, future.cause(), channel.id()); - } - }); - //Initialize one ReduceContext object per channelRead call - boolean keepAlive = keepAliveParam || connectionKeepAliveEnabled; - ReduceContext reduceContext = new ReduceContext(mapIds, reduceId, ctx, - user, mapOutputInfoMap, jobId, keepAlive); - for (int i = 0; i < Math.min(maxSessionOpenFiles, mapIds.size()); i++) { - ChannelFuture nextMap = sendMap(reduceContext); - if(nextMap == null) { - return; - } - } - } - - /** - * Calls sendMapOutput for the mapId pointed by ReduceContext.mapsToSend - * and increments it. This method is first called by messageReceived() - * maxSessionOpenFiles times and then on the completion of every - * sendMapOutput operation. This limits the number of open files on a node, - * which can get really large(exhausting file descriptors on the NM) if all - * sendMapOutputs are called in one go, as was done previous to this change. - * @param reduceContext used to call sendMapOutput with correct params. - * @return the ChannelFuture of the sendMapOutput, can be null. 
- */ - public ChannelFuture sendMap(ReduceContext reduceContext) { - LOG.trace("Executing sendMap"); - ChannelFuture nextMap = null; - if (reduceContext.getMapsToSend().get() < - reduceContext.getMapIds().size()) { - int nextIndex = reduceContext.getMapsToSend().getAndIncrement(); - String mapId = reduceContext.getMapIds().get(nextIndex); - - try { - MapOutputInfo info = reduceContext.getInfoMap().get(mapId); - if (info == null) { - info = getMapOutputInfo(mapId, reduceContext.getReduceId(), - reduceContext.getJobId(), reduceContext.getUser()); - } - LOG.trace("Calling sendMapOutput"); - nextMap = sendMapOutput( - reduceContext.getCtx(), - reduceContext.getCtx().channel(), - reduceContext.getUser(), mapId, - reduceContext.getReduceId(), info); - if (nextMap == null) { - //This can only happen if spill file was not found - sendError(reduceContext.getCtx(), NOT_FOUND); - LOG.trace("Returning nextMap: null"); - return null; - } - nextMap.addListener(new ReduceMapFileCount(reduceContext)); - } catch (IOException e) { - if (e instanceof DiskChecker.DiskErrorException) { - LOG.error("Shuffle error: " + e); - } else { - LOG.error("Shuffle error: ", e); - } - String errorMessage = getErrorMessage(e); - sendError(reduceContext.getCtx(), errorMessage, - INTERNAL_SERVER_ERROR); - return null; - } - } - return nextMap; - } - - private String getErrorMessage(Throwable t) { - StringBuffer sb = new StringBuffer(t.getMessage()); - while (t.getCause() != null) { - sb.append(t.getCause().getMessage()); - t = t.getCause(); - } - return sb.toString(); - } - - private String getBaseLocation(String jobId, String user) { - final JobID jobID = JobID.forName(jobId); - final ApplicationId appID = - ApplicationId.newInstance(Long.parseLong(jobID.getJtIdentifier()), - jobID.getId()); - final String baseStr = - ContainerLocalizer.USERCACHE + "/" + user + "/" - + ContainerLocalizer.APPCACHE + "/" - + appID.toString() + "/output" + "/"; - return baseStr; - } - - protected MapOutputInfo getMapOutputInfo(String mapId, int reduce, - String jobId, String user) throws IOException { - AttemptPathInfo pathInfo; - try { - AttemptPathIdentifier identifier = new AttemptPathIdentifier( - jobId, user, mapId); - pathInfo = pathCache.get(identifier); - if (LOG.isDebugEnabled()) { - LOG.debug("Retrieved pathInfo for " + identifier + - " check for corresponding loaded messages to determine whether" + - " it was loaded or cached"); - } - } catch (ExecutionException e) { - if (e.getCause() instanceof IOException) { - throw (IOException) e.getCause(); - } else { - throw new RuntimeException(e.getCause()); - } - } - - IndexRecord info = indexCache.getIndexInformation(mapId, reduce, pathInfo.indexPath, user); - - if (LOG.isDebugEnabled()) { - LOG.debug("getMapOutputInfo: jobId=" + jobId + ", mapId=" + mapId + - ",dataFile=" + pathInfo.dataPath + ", indexFile=" + - pathInfo.indexPath); - } - - MapOutputInfo outputInfo = new MapOutputInfo(pathInfo.dataPath, info); - return outputInfo; - } - - protected void populateHeaders(List mapIds, String jobId, - String user, int reduce, HttpRequest request, HttpResponse response, - boolean keepAliveParam, Map mapOutputInfoMap) - throws IOException { - - long contentLength = 0; - for (String mapId : mapIds) { - MapOutputInfo outputInfo = getMapOutputInfo(mapId, reduce, jobId, user); - if (mapOutputInfoMap.size() < mapOutputMetaInfoCacheSize) { - mapOutputInfoMap.put(mapId, outputInfo); - } - - ShuffleHeader header = - new ShuffleHeader(mapId, outputInfo.indexRecord.partLength, - 
outputInfo.indexRecord.rawLength, reduce); - DataOutputBuffer dob = new DataOutputBuffer(); - header.write(dob); - contentLength += outputInfo.indexRecord.partLength; - contentLength += dob.getLength(); - } - - // Now set the response headers. - setResponseHeaders(response, keepAliveParam, contentLength); - - // this audit log is disabled by default, - // to turn it on please enable this audit log - // on log4j.properties by uncommenting the setting - if (AUDITLOG.isDebugEnabled()) { - StringBuilder sb = new StringBuilder("shuffle for "); - sb.append(jobId).append(" reducer ").append(reduce); - sb.append(" length ").append(contentLength); - if (AUDITLOG.isTraceEnabled()) { - // For trace level logging, append the list of mappers - sb.append(" mappers: ").append(mapIds); - AUDITLOG.trace(sb.toString()); - } else { - AUDITLOG.debug(sb.toString()); - } - } - } - - protected void setResponseHeaders(HttpResponse response, - boolean keepAliveParam, long contentLength) { - if (!connectionKeepAliveEnabled && !keepAliveParam) { - response.headers().set(HttpHeader.CONNECTION.asString(), CONNECTION_CLOSE); - } else { - response.headers().set(HttpHeader.CONTENT_LENGTH.asString(), - String.valueOf(contentLength)); - response.headers().set(HttpHeader.CONNECTION.asString(), - HttpHeader.KEEP_ALIVE.asString()); - response.headers().set(HttpHeader.KEEP_ALIVE.asString(), - "timeout=" + connectionKeepAliveTimeOut); - LOG.info("Content Length in shuffle : " + contentLength); - } - } - - class MapOutputInfo { - final Path mapOutputFileName; - final IndexRecord indexRecord; - - MapOutputInfo(Path mapOutputFileName, IndexRecord indexRecord) { - this.mapOutputFileName = mapOutputFileName; - this.indexRecord = indexRecord; - } - } - - protected void verifyRequest(String appid, ChannelHandlerContext ctx, - HttpRequest request, HttpResponse response, URL requestUri) - throws IOException { - SecretKey tokenSecret = secretManager.retrieveTokenSecret(appid); - if (null == tokenSecret) { - LOG.info("Request for unknown token {}, channel id: {}", appid, ctx.channel().id()); - throw new IOException("Could not find jobid"); - } - // encrypting URL - String encryptedURL = SecureShuffleUtils.buildMsgFrom(requestUri); - // hash from the fetcher - String urlHashStr = - request.headers().get(SecureShuffleUtils.HTTP_HEADER_URL_HASH); - if (urlHashStr == null) { - LOG.info("Missing header hash for {}, channel id: {}", appid, ctx.channel().id()); - throw new IOException("fetcher cannot be authenticated"); - } - if (LOG.isDebugEnabled()) { - int len = urlHashStr.length(); - LOG.debug("Verifying request. encryptedURL:{}, hash:{}, channel id: " + - "{}", encryptedURL, - urlHashStr.substring(len - len / 2, len - 1), ctx.channel().id()); - } - // verify - throws exception - SecureShuffleUtils.verifyReply(urlHashStr, encryptedURL, tokenSecret); - // verification passed - encode the reply - String reply = SecureShuffleUtils.generateHash(urlHashStr.getBytes(Charsets.UTF_8), - tokenSecret); - response.headers().set( - SecureShuffleUtils.HTTP_HEADER_REPLY_URL_HASH, reply); - // Put shuffle version into http header - response.headers().set(ShuffleHeader.HTTP_HEADER_NAME, - ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); - response.headers().set(ShuffleHeader.HTTP_HEADER_VERSION, - ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); - if (LOG.isDebugEnabled()) { - int len = reply.length(); - LOG.debug("Fetcher request verified. 
" + - "encryptedURL: {}, reply: {}, channel id: {}", - encryptedURL, reply.substring(len - len / 2, len - 1), - ctx.channel().id()); - } - } - - protected ChannelFuture sendMapOutput(ChannelHandlerContext ctx, Channel ch, - String user, String mapId, int reduce, MapOutputInfo mapOutputInfo) - throws IOException { - final IndexRecord info = mapOutputInfo.indexRecord; - final ShuffleHeader header = new ShuffleHeader(mapId, info.partLength, info.rawLength, - reduce); - final DataOutputBuffer dob = new DataOutputBuffer(); - header.write(dob); - writeToChannel(ch, wrappedBuffer(dob.getData(), 0, dob.getLength())); - final File spillfile = - new File(mapOutputInfo.mapOutputFileName.toString()); - RandomAccessFile spill; - try { - spill = SecureIOUtils.openForRandomRead(spillfile, "r", user, null); - } catch (FileNotFoundException e) { - LOG.info("{} not found. Channel id: {}", spillfile, ctx.channel().id()); - return null; - } - ChannelFuture writeFuture; - if (ch.pipeline().get(SslHandler.class) == null) { - final FadvisedFileRegion partition = new FadvisedFileRegion(spill, - info.startOffset, info.partLength, manageOsCache, readaheadLength, - readaheadPool, spillfile.getAbsolutePath(), - shuffleBufferSize, shuffleTransferToAllowed); - writeFuture = writeToChannel(ch, partition); - writeFuture.addListener(new ChannelFutureListener() { - // TODO error handling; distinguish IO/connection failures, - // attribute to appropriate spill output - @Override - public void operationComplete(ChannelFuture future) { - if (future.isSuccess()) { - partition.transferSuccessful(); - } - partition.deallocate(); - } - }); - } else { - // HTTPS cannot be done with zero copy. - final FadvisedChunkedFile chunk = new FadvisedChunkedFile(spill, - info.startOffset, info.partLength, sslFileBufferSize, - manageOsCache, readaheadLength, readaheadPool, - spillfile.getAbsolutePath()); - writeFuture = writeToChannel(ch, chunk); - } - metrics.shuffleConnections.incr(); - metrics.shuffleOutputBytes.incr(info.partLength); // optimistic - return writeFuture; - } - - protected void sendError(ChannelHandlerContext ctx, - HttpResponseStatus status) { - sendError(ctx, "", status); - } - - protected void sendError(ChannelHandlerContext ctx, String message, - HttpResponseStatus status) { - sendError(ctx, message, status, Collections.emptyMap()); - } - - protected void sendError(ChannelHandlerContext ctx, String msg, - HttpResponseStatus status, Map headers) { - FullHttpResponse response = new DefaultFullHttpResponse(HTTP_1_1, status, - Unpooled.copiedBuffer(msg, CharsetUtil.UTF_8)); - response.headers().set(CONTENT_TYPE, "text/plain; charset=UTF-8"); - // Put shuffle version into http header - response.headers().set(ShuffleHeader.HTTP_HEADER_NAME, - ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); - response.headers().set(ShuffleHeader.HTTP_HEADER_VERSION, - ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); - for (Map.Entry header : headers.entrySet()) { - response.headers().set(header.getKey(), header.getValue()); - } - - // Close the connection as soon as the error message is sent. 
- writeToChannelAndClose(ctx.channel(), response); - } - - @Override - public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) - throws Exception { - Channel ch = ctx.channel(); - if (cause instanceof TooLongFrameException) { - LOG.trace("TooLongFrameException, channel id: {}", ch.id()); - sendError(ctx, BAD_REQUEST); - return; - } else if (cause instanceof IOException) { - if (cause instanceof ClosedChannelException) { - LOG.debug("Ignoring closed channel error, channel id: " + ch.id(), cause); - return; - } - String message = String.valueOf(cause.getMessage()); - if (IGNORABLE_ERROR_MESSAGE.matcher(message).matches()) { - LOG.debug("Ignoring client socket close, channel id: " + ch.id(), cause); - return; - } - } - - LOG.error("Shuffle error. Channel id: " + ch.id(), cause); - if (ch.isActive()) { - sendError(ctx, INTERNAL_SERVER_ERROR); + LOG.debug("Closing channel as writer was idle for {} seconds", connectionKeepAliveTimeOut); + ctx.channel().close(); } } } + @SuppressWarnings("checkstyle:VisibilityModifier") static class AttemptPathInfo { // TODO Change this over to just store local dir indices, instead of the // entire path. Far more efficient. - private final Path indexPath; - private final Path dataPath; + public final Path indexPath; + public final Path dataPath; AttemptPathInfo(Path indexPath, Path dataPath) { this.indexPath = indexPath; @@ -1484,10 +688,11 @@ static class AttemptPathInfo { } } + @SuppressWarnings("checkstyle:VisibilityModifier") static class AttemptPathIdentifier { - private final String jobId; - private final String user; - private final String attemptId; + public final String jobId; + public final String user; + public final String attemptId; AttemptPathIdentifier(String jobId, String user, String attemptId) { this.jobId = jobId; @@ -1531,4 +736,14 @@ public String toString() { '}'; } } + + private static String getBaseLocation(String jobId, String user) { + final JobID jobID = JobID.forName(jobId); + final ApplicationId appID = + ApplicationId.newInstance(Long.parseLong(jobID.getJtIdentifier()), + jobID.getId()); + return ContainerLocalizer.USERCACHE + "/" + user + "/" + + ContainerLocalizer.APPCACHE + "/" + + appID + "/output" + "/"; + } } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleChannelHandler.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleChannelHandler.java new file mode 100644 index 0000000000000..7fedc7bb2dc09 --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleChannelHandler.java @@ -0,0 +1,562 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.mapred; + +import io.netty.buffer.ByteBuf; +import io.netty.buffer.ByteBufAllocator; +import io.netty.buffer.ByteBufUtil; +import io.netty.buffer.Unpooled; +import io.netty.channel.ChannelHandlerContext; +import io.netty.channel.ChannelInboundHandlerAdapter; +import io.netty.channel.FileRegion; +import io.netty.channel.embedded.EmbeddedChannel; +import io.netty.channel.group.DefaultChannelGroup; +import io.netty.handler.codec.MessageToMessageEncoder; +import io.netty.handler.codec.http.DefaultFullHttpRequest; +import io.netty.handler.codec.http.DefaultHttpResponse; +import io.netty.handler.codec.http.FullHttpRequest; +import io.netty.handler.codec.http.HttpHeaders; +import io.netty.handler.codec.http.HttpMethod; +import io.netty.handler.codec.http.HttpObjectAggregator; +import io.netty.handler.codec.http.HttpResponseDecoder; +import io.netty.handler.codec.http.HttpResponseStatus; +import io.netty.handler.codec.http.HttpServerCodec; +import io.netty.handler.codec.http.HttpUtil; +import io.netty.handler.codec.http.HttpVersion; +import io.netty.handler.codec.http.LastHttpContent; +import io.netty.handler.ssl.SslContext; +import io.netty.handler.ssl.SslContextBuilder; +import io.netty.handler.ssl.SslHandler; +import io.netty.handler.stream.ChunkedWriteHandler; +import io.netty.util.concurrent.GlobalEventExecutor; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.nio.channels.Channels; +import java.nio.channels.WritableByteChannel; +import java.nio.charset.StandardCharsets; +import java.security.cert.X509Certificate; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; + +import javax.crypto.SecretKey; +import javax.net.ssl.SSLContext; +import javax.net.ssl.SSLEngine; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.security.SecureShuffleUtils; +import org.apache.hadoop.mapreduce.security.token.JobTokenIdentifier; +import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager; +import org.apache.hadoop.mapreduce.task.reduce.ShuffleHeader; +import org.apache.hadoop.metrics2.MetricsSystem; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.security.token.SecretManager; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.thirdparty.com.google.common.base.Charsets; +import org.eclipse.jetty.http.HttpHeader; +import org.junit.Test; +import org.slf4j.LoggerFactory; + +import static io.netty.handler.codec.http.HttpHeaderNames.CONTENT_LENGTH; +import static io.netty.handler.codec.http.HttpHeaderNames.CONTENT_TYPE; +import static io.netty.handler.codec.http.HttpVersion.HTTP_1_1; +import static org.apache.hadoop.mapred.ShuffleChannelHandler.shuffleHeaderToBytes; +import static org.apache.hadoop.mapred.ShuffleChannelInitializer.MAX_CONTENT_LENGTH; +import static org.apache.hadoop.mapred.ShuffleHandler.CONNECTION_CLOSE; +import static org.apache.hadoop.mapred.ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED; +import static 
org.apache.hadoop.mapred.ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT;
+import static org.apache.hadoop.mapred.ShuffleHandler.TIMEOUT_HANDLER;
+import static org.apache.hadoop.mapreduce.security.SecureShuffleUtils.HTTP_HEADER_URL_HASH;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+public class TestShuffleChannelHandler extends TestShuffleHandlerBase {
+  private static final org.slf4j.Logger LOG =
+      LoggerFactory.getLogger(TestShuffleChannelHandler.class);
+
+  @Test
+  public void testGetMapsFileRegion() throws IOException {
+    final ShuffleTest t = createShuffleTest();
+    final EmbeddedChannel shuffle = t.createShuffleHandlerChannelFileRegion();
+    t.testGetAllAttemptsForReduce0NoKeepAlive(shuffle.outboundMessages(), shuffle);
+  }
+
+  @Test
+  public void testGetMapsChunkedFileSSl() throws Exception {
+    final ShuffleTest t = createShuffleTest();
+    final LinkedList<Object> unencryptedMessages = new LinkedList<>();
+    final EmbeddedChannel shuffle = t.createShuffleHandlerSSL(unencryptedMessages);
+    t.testGetAllAttemptsForReduce0NoKeepAlive(unencryptedMessages, shuffle);
+  }
+
+  @Test
+  public void testKeepAlive() throws Exception {
+    // TODO: problems with keep-alive
+    // current behaviour:
+    // a) mapreduce.shuffle.connection-keep-alive.enable=false
+    //    + client request with &keepAlive=true
+    //    ==> connection is kept
+    // b) mapreduce.shuffle.connection-keep-alive.enable=true
+    //    ==> connection is kept
+    //
+    // a) seems like a bug
+    // b) might be ok, because it's the default in HTTP/1.1
+    Configuration conf = new Configuration();
+    conf.set(SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED, "false");
+    conf.set(SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT, "15");
+    final ShuffleTest t = createShuffleTest(conf);
+    final EmbeddedChannel shuffle = t.createShuffleHandlerChannelFileRegion();
+    t.testKeepAlive(shuffle.outboundMessages(), shuffle);
+  }
+
+  @Test
+  public void testKeepAliveSSL() throws Exception {
+    Configuration conf = new Configuration();
+    conf.set(SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED, "false");
+    conf.set(SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT, "15");
+    final ShuffleTest t = createShuffleTest(conf);
+    final LinkedList<Object> unencryptedMessages = new LinkedList<>();
+    final EmbeddedChannel shuffle = t.createShuffleHandlerSSL(unencryptedMessages);
+    t.testKeepAlive(unencryptedMessages, shuffle);
+  }
+
+  @Test
+  public void testKeepAliveTimeout() throws InterruptedException, IOException {
+    Configuration conf = new Configuration();
+    conf.set(SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED, "true");
+    conf.set(SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT, "1");
+    final ShuffleTest t = createShuffleTest(conf);
+    final EmbeddedChannel shuffle = t.createShuffleHandlerChannelFileRegion();
+
+    FullHttpRequest req = t.createRequest(getUri(TEST_JOB_ID, 0,
+        Collections.singletonList(TEST_ATTEMPT_1), true));
+    shuffle.writeInbound(req);
+    t.assertResponse(shuffle.outboundMessages(),
+        t.getExpectedHttpResponse(req, true, 46),
+        t.getAttemptData(new Attempt(TEST_ATTEMPT_1, TEST_DATA_A))
+    );
+    assertTrue("keep-alive", shuffle.isActive());
+
+    TimeUnit.SECONDS.sleep(3);
+    shuffle.runScheduledPendingTasks();
+
+    assertFalse("closed", shuffle.isActive());
+  }
+
+  @Test
+  public void testIncompatibleShuffleVersion() {
+    Configuration conf = new Configuration();
+    conf.set(SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED, "true");
+    final ShuffleTest t = createShuffleTest(conf);
+    final EmbeddedChannel shuffle =
t.createShuffleHandlerChannelFileRegion(); + FullHttpRequest req = t.createRequest(getUri(TEST_JOB_ID, 0, + Collections.singletonList(TEST_ATTEMPT_1), true)); + req.headers().set(ShuffleHeader.HTTP_HEADER_NAME, "invalid"); + shuffle.writeInbound(req); + + final EmbeddedChannel decoder = t.createHttpResponseChannel(); + for (Object obj : shuffle.outboundMessages()) { + decoder.writeInbound(obj); + } + DefaultHttpResponse actual = decoder.readInbound(); + assertFalse(actual.headers().get(CONTENT_LENGTH).isEmpty()); + actual.headers().set(CONTENT_LENGTH, 0); + + assertEquals(getExpectedHttpResponse(HttpResponseStatus.BAD_REQUEST).toString(), + actual.toString()); + + assertFalse("closed", shuffle.isActive()); // known-issue + } + + @Test + public void testInvalidMapNoIndexFile() { + final ShuffleTest t = createShuffleTest(); + final EmbeddedChannel shuffle = t.createShuffleHandlerChannelFileRegion(); + FullHttpRequest req = t.createRequest(getUri(TEST_JOB_ID, 0, + Arrays.asList(TEST_ATTEMPT_1, "non-existing"), true)); + shuffle.writeInbound(req); + + final EmbeddedChannel decoder = t.createHttpResponseChannel(); + for (Object obj : shuffle.outboundMessages()) { + decoder.writeInbound(obj); + } + + DefaultHttpResponse actual = decoder.readInbound(); + assertFalse(actual.headers().get(CONTENT_LENGTH).isEmpty()); + actual.headers().set(CONTENT_LENGTH, 0); + + assertEquals(getExpectedHttpResponse(HttpResponseStatus.INTERNAL_SERVER_ERROR).toString(), + actual.toString()); + + assertFalse("closed", shuffle.isActive()); + } + + @Test + public void testInvalidMapNoDataFile() { + final ShuffleTest t = createShuffleTest(); + final EmbeddedChannel shuffle = t.createShuffleHandlerChannelFileRegion(); + + String dataFile = getDataFile(tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_2); + assertTrue("should delete", new File(dataFile).delete()); + + FullHttpRequest req = t.createRequest(getUri(TEST_JOB_ID, 0, + Arrays.asList(TEST_ATTEMPT_1, TEST_ATTEMPT_2), false)); + shuffle.writeInbound(req); + + final EmbeddedChannel decoder = t.createHttpResponseChannel(); + for (Object obj : shuffle.outboundMessages()) { + decoder.writeInbound(obj); + } + + DefaultHttpResponse actual = decoder.readInbound(); + assertFalse(actual.headers().get(CONTENT_LENGTH).isEmpty()); + actual.headers().set(CONTENT_LENGTH, 0); + + assertEquals(getExpectedHttpResponse(HttpResponseStatus.INTERNAL_SERVER_ERROR).toString(), + actual.toString()); + + assertFalse("closed", shuffle.isActive()); + } + + private DefaultHttpResponse getExpectedHttpResponse(HttpResponseStatus status) { + DefaultHttpResponse response = new DefaultHttpResponse(HTTP_1_1, status); + response.headers().set(CONTENT_TYPE, "text/plain; charset=UTF-8"); + response.headers().set(ShuffleHeader.HTTP_HEADER_NAME, + ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); + response.headers().set(ShuffleHeader.HTTP_HEADER_VERSION, + ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); + response.headers().set(CONTENT_LENGTH, 0); + return response; + } + + private ShuffleTest createShuffleTest() { + return createShuffleTest(new Configuration()); + } + + private ShuffleTest createShuffleTest(Configuration conf) { + return new ShuffleTest(conf); + } + + private File getResourceFile(String resourceName) { + ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); + return new File(Objects.requireNonNull(classLoader.getResource(resourceName)).getFile()); + } + + @SuppressWarnings("checkstyle:VisibilityModifier") + static class Attempt { + final String id; + final String content; + + 
Attempt(String attempt, String content) { + this.id = attempt; + this.content = content; + } + } + + private class ShuffleTest { + private final ShuffleChannelHandlerContext ctx; + private final SecretKey shuffleSecretKey; + + ShuffleTest(Configuration conf) { + JobConf jobConf = new JobConf(conf); + MetricsSystem ms = DefaultMetricsSystem.instance(); + this.ctx = new ShuffleChannelHandlerContext(conf, + new ConcurrentHashMap<>(), + new JobTokenSecretManager(), + createLoadingCache(), + new IndexCache(jobConf), + ms.register(new ShuffleHandler.ShuffleMetrics()), + new DefaultChannelGroup(GlobalEventExecutor.INSTANCE) + ); + + JobTokenIdentifier tokenId = new JobTokenIdentifier(new Text(TEST_JOB_ID)); + Token token = new Token<>(tokenId, ctx.secretManager); + shuffleSecretKey = JobTokenSecretManager.createSecretKey(token.getPassword()); + + ctx.userRsrc.put(TEST_JOB_ID, TEST_USER); + ctx.secretManager.addTokenForJob(TEST_JOB_ID, token); + } + + public FullHttpRequest createRequest(String uri) { + FullHttpRequest request = + new DefaultFullHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.GET, uri); + request.headers().set(ShuffleHeader.HTTP_HEADER_NAME, + ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); + request.headers().set(ShuffleHeader.HTTP_HEADER_VERSION, + ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); + request.headers().set(ShuffleHeader.HTTP_HEADER_VERSION, + ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); + try { + String msgToEncode = SecureShuffleUtils.buildMsgFrom(new URL("http", "", ctx.port, uri)); + request.headers().set(HTTP_HEADER_URL_HASH, + SecureShuffleUtils.hashFromString(msgToEncode, shuffleSecretKey)); + } catch (IOException e) { + e.printStackTrace(); + fail("Could not create URL hash for test request"); + } + + return request; + } + + public DefaultHttpResponse getExpectedHttpResponse( + FullHttpRequest request, boolean keepAlive, long contentLength) { + DefaultHttpResponse response = + new DefaultHttpResponse(HttpVersion.HTTP_1_1, HttpResponseStatus.OK); + HttpHeaders headers = response.headers(); + try { + SecretKey tokenSecret = ctx.secretManager.retrieveTokenSecret(TEST_JOB_ID); + headers.set(SecureShuffleUtils.HTTP_HEADER_REPLY_URL_HASH, + SecureShuffleUtils.generateHash( + request.headers().get(HTTP_HEADER_URL_HASH).getBytes(Charsets.UTF_8), + tokenSecret)); + } catch (SecretManager.InvalidToken e) { + fail("Could not generate reply hash"); + } + headers.set(ShuffleHeader.HTTP_HEADER_NAME, ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); + headers.set(ShuffleHeader.HTTP_HEADER_VERSION, ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); + if (keepAlive) { + headers.set(HttpHeader.CONNECTION.asString(), HttpHeader.KEEP_ALIVE.asString()); + headers.set(HttpHeader.KEEP_ALIVE.asString(), "timeout=" + ctx.connectionKeepAliveTimeOut); + } else { + response.headers().set(HttpHeader.CONNECTION.asString(), CONNECTION_CLOSE); + } + HttpUtil.setContentLength(response, contentLength); + return response; + } + + private void testGetAllAttemptsForReduce0NoKeepAlive( + java.util.Queue outboundMessages, EmbeddedChannel shuffle) throws IOException { + final FullHttpRequest request = createRequest( + getUri(TEST_JOB_ID, 0, + Arrays.asList(TEST_ATTEMPT_1, TEST_ATTEMPT_2, TEST_ATTEMPT_3), false)); + shuffle.writeInbound(request); + assertResponse(outboundMessages, + getExpectedHttpResponse(request, false, 138), + getAllAttemptsForReduce0() + ); + assertFalse("no keep-alive", shuffle.isActive()); + } + + private void testKeepAlive(java.util.Queue messages, + EmbeddedChannel shuffle) throws IOException { + final 
FullHttpRequest req1 = createRequest( + getUri(TEST_JOB_ID, 0, Collections.singletonList(TEST_ATTEMPT_1), true)); + shuffle.writeInbound(req1); + assertResponse(messages, + getExpectedHttpResponse(req1, true, 46), + getAttemptData(new Attempt(TEST_ATTEMPT_1, TEST_DATA_A)) + ); + assertTrue("keep-alive", shuffle.isActive()); + messages.clear(); + + final FullHttpRequest req2 = createRequest( + getUri(TEST_JOB_ID, 0, Collections.singletonList(TEST_ATTEMPT_2), true)); + shuffle.writeInbound(req2); + assertResponse(messages, + getExpectedHttpResponse(req2, true, 46), + getAttemptData(new Attempt(TEST_ATTEMPT_2, TEST_DATA_B)) + ); + assertTrue("keep-alive", shuffle.isActive()); + messages.clear(); + + final FullHttpRequest req3 = createRequest( + getUri(TEST_JOB_ID, 0, Collections.singletonList(TEST_ATTEMPT_3), false)); + shuffle.writeInbound(req3); + assertResponse(messages, + getExpectedHttpResponse(req3, false, 46), + getAttemptData(new Attempt(TEST_ATTEMPT_3, TEST_DATA_C)) + ); + assertFalse("no keep-alive", shuffle.isActive()); + } + + private ArrayList getAllAttemptsForReduce0() throws IOException { + return getAttemptData( + new Attempt(TEST_ATTEMPT_1, TEST_DATA_A), + new Attempt(TEST_ATTEMPT_2, TEST_DATA_B), + new Attempt(TEST_ATTEMPT_3, TEST_DATA_C) + ); + } + + private ArrayList getAttemptData(Attempt... attempts) throws IOException { + ArrayList data = new ArrayList<>(); + for (Attempt attempt : attempts) { + data.add(shuffleHeaderToBytes(new ShuffleHeader(attempt.id, attempt.content.length(), + attempt.content.length() * 2L, 0))); + data.add(Unpooled.copiedBuffer(attempt.content.getBytes(StandardCharsets.UTF_8))); + } + return data; + } + + private void assertResponse(java.util.Queue outboundMessages, + DefaultHttpResponse response, + List content) { + final EmbeddedChannel decodeChannel = createHttpResponseChannel(); + + content.add(LastHttpContent.EMPTY_LAST_CONTENT.content()); + + int i = 0; + for (Object outboundMessage : outboundMessages) { + ByteBuf actualBytes = ((ByteBuf) outboundMessage); + String actualHexdump = ByteBufUtil.prettyHexDump(actualBytes); + LOG.info("\n{}", actualHexdump); + + decodeChannel.writeInbound(actualBytes); + Object obj = decodeChannel.readInbound(); + LOG.info("Decoded object: {}", obj); + + if (i == 0) { + DefaultHttpResponse resp = (DefaultHttpResponse) obj; + assertEquals(response.toString(), resp.toString()); + } + if (i > 0 && i <= content.size()) { + assertEquals("data should match", + ByteBufUtil.prettyHexDump(content.get(i - 1)), actualHexdump); + } + + i++; + } + + // This check is done after to have better debug logs on failure. 
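+      // outboundMessages holds one buffer with the encoded response headers followed by one buffer per expected content entry (including the empty last-content marker added above), hence content.size() + 1.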
+ assertEquals("all data should match", content.size() + 1, outboundMessages.size()); + } + + public EmbeddedChannel createShuffleHandlerChannelFileRegion() { + final EmbeddedChannel channel = createShuffleHandlerChannel(); + + channel.pipeline().addFirst( + new MessageToMessageEncoder() { + @Override + protected void encode( + ChannelHandlerContext cCtx, FileRegion msg, List out) throws Exception { + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + WritableByteChannel wbc = Channels.newChannel(stream); + msg.transferTo(wbc, msg.position()); + out.add(Unpooled.wrappedBuffer(stream.toByteArray())); + } + } + ); + + return channel; + } + + public EmbeddedChannel createSSLClient() throws Exception { + final EmbeddedChannel channel = createShuffleHandlerChannel(); + + SSLContext sc = SSLContext.getInstance("SSL"); + + final TrustManager trm = new X509TrustManager() { + public X509Certificate[] getAcceptedIssuers() { + return null; + } + + public void checkClientTrusted(X509Certificate[] certs, String authType) { + } + + public void checkServerTrusted(X509Certificate[] certs, String authType) { + } + }; + + sc.init(null, new TrustManager[]{trm}, null); + + final SSLEngine sslEngine = sc.createSSLEngine(); + sslEngine.setUseClientMode(true); + channel.pipeline().addFirst("ssl", new SslHandler(sslEngine)); + + return channel; + } + + public EmbeddedChannel createShuffleHandlerSSL(java.util.Queue unencryptedMessages) + throws Exception { + final EmbeddedChannel channel = createShuffleHandlerChannel(); + // SelfSignedCertificate was generated manually with: + // openssl req -x509 -newkey rsa:4096 -keyout key.pem \ + // -out cert.pem -sha256 -days 3650 -nodes -subj '/CN=localhost' + // Because: + // SelfSignedCertificate ssc = new SelfSignedCertificate(); + // Throws: Failed to generate a self-signed X.509 certificate using Bouncy Castle + final SslContext sslCtx = SslContextBuilder + .forServer(getResourceFile("cert.pem"), getResourceFile("key.pem")) + .build(); + final SslHandler sslHandler = sslCtx.newHandler(ByteBufAllocator.DEFAULT); + channel.pipeline().addFirst("ssl", sslHandler); + + channel.pipeline().addAfter("ssl", "unencrypted", new MessageToMessageEncoder() { + @Override + protected void encode(ChannelHandlerContext cCtx, ByteBuf msg, List out) { + unencryptedMessages.add(msg.copy()); + out.add(msg.retain()); + } + }); + + channel.pipeline().addLast(new ChannelInboundHandlerAdapter() { + @Override + public void userEventTriggered(ChannelHandlerContext cCtx, Object evt) { + LOG.info("EVENT: {}", evt); + } + }); + + // SSLHandshake must be done, otherwise messages are buffered + final EmbeddedChannel client = createSSLClient(); + for (Object obj : client.outboundMessages()) { + channel.writeInbound(obj); + } + client.outboundMessages().clear(); + for (Object obj : channel.outboundMessages()) { + client.writeInbound(obj); + } + channel.outboundMessages().clear(); + for (Object obj : client.outboundMessages()) { + channel.writeInbound(obj); + } + client.outboundMessages().clear(); + + return channel; + } + + public EmbeddedChannel createShuffleHandlerChannel() { + final EmbeddedChannel channel = new EmbeddedChannel(); + channel.pipeline().addLast("http", new HttpServerCodec()); + channel.pipeline().addLast("aggregator", new HttpObjectAggregator(MAX_CONTENT_LENGTH)); + channel.pipeline().addLast("chunking", new ChunkedWriteHandler()); + channel.pipeline().addLast("shuffle", new ShuffleChannelHandler(ctx)); + channel.pipeline().addLast(TIMEOUT_HANDLER, + new 
ShuffleHandler.TimeoutHandler(ctx.connectionKeepAliveTimeOut)); + return channel; + } + + public EmbeddedChannel createHttpResponseChannel() { + return new EmbeddedChannel( + new HttpResponseDecoder() + ); + } + } +} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandler.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandler.java index bf859e4482ea3..a7d2f9ba2d45d 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandler.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandler.java @@ -17,29 +17,17 @@ */ package org.apache.hadoop.mapred; -import io.netty.channel.ChannelFutureListener; -import io.netty.channel.DefaultFileRegion; + import org.apache.hadoop.thirdparty.com.google.common.collect.Maps; -import io.netty.channel.AbstractChannel; -import io.netty.channel.Channel; + import io.netty.channel.ChannelFuture; -import io.netty.channel.ChannelHandlerContext; -import io.netty.channel.ChannelPipeline; -import io.netty.channel.socket.SocketChannel; -import io.netty.handler.codec.http.HttpMethod; -import io.netty.handler.codec.http.HttpRequest; -import io.netty.handler.codec.http.HttpResponse; -import io.netty.handler.codec.http.HttpResponseEncoder; import io.netty.handler.codec.http.HttpResponseStatus; -import io.netty.handler.timeout.IdleStateEvent; -import org.apache.hadoop.test.GenericTestUtils; -import static io.netty.buffer.Unpooled.wrappedBuffer; -import static java.util.stream.Collectors.toList; +import static org.apache.hadoop.mapred.ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY; +import static org.apache.hadoop.mapreduce.security.SecureShuffleUtils.HTTP_HEADER_URL_HASH; import static org.apache.hadoop.test.MetricsAsserts.assertCounter; import static org.apache.hadoop.test.MetricsAsserts.assertGauge; import static org.apache.hadoop.test.MetricsAsserts.getMetrics; -import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertEquals; @@ -49,46 +37,31 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -import java.io.ByteArrayOutputStream; +import java.io.BufferedReader; import java.io.DataInputStream; -import java.io.EOFException; import java.io.File; import java.io.FileInputStream; -import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; +import java.io.InputStreamReader; import java.net.HttpURLConnection; -import java.net.InetSocketAddress; -import java.net.Proxy; -import java.net.Socket; +import java.net.MalformedURLException; import java.net.URL; -import java.net.SocketAddress; -import java.net.URLConnection; import java.nio.ByteBuffer; -import java.nio.channels.ClosedChannelException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.TimeUnit; -import java.util.function.Consumer; -import java.util.zip.CheckedOutputStream; -import java.util.zip.Checksum; + +import javax.crypto.SecretKey; import 
org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.nativeio.NativeIO; -import org.apache.hadoop.mapreduce.TypeConverter; import org.apache.hadoop.mapreduce.security.SecureShuffleUtils; import org.apache.hadoop.mapreduce.security.token.JobTokenIdentifier; import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager; @@ -100,761 +73,22 @@ import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.service.ServiceStateException; -import org.apache.hadoop.util.DiskChecker; -import org.apache.hadoop.util.PureJavaCrc32; import org.apache.hadoop.util.Sets; -import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.api.records.ApplicationId; -import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.api.ApplicationInitializationContext; import org.apache.hadoop.yarn.server.api.ApplicationTerminationContext; -import org.apache.hadoop.yarn.server.api.AuxiliaryLocalPathHandler; -import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer; import org.apache.hadoop.yarn.server.records.Version; -import org.hamcrest.CoreMatchers; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Rule; import org.junit.Test; -import org.junit.rules.TestName; -import org.mockito.Mockito; -import org.eclipse.jetty.http.HttpHeader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class TestShuffleHandler { - static final long MiB = 1024 * 1024; +public class TestShuffleHandler extends TestShuffleHandlerBase { + static final long MIB = 1024 * 1024; private static final Logger LOG = LoggerFactory.getLogger(TestShuffleHandler.class); - private static final File ABS_LOG_DIR = GenericTestUtils.getTestDir( - TestShuffleHandler.class.getSimpleName() + "LocDir"); - private static final long ATTEMPT_ID = 12345L; - private static final long ATTEMPT_ID_2 = 12346L; - private static final HttpResponseStatus OK_STATUS = new HttpResponseStatus(200, "OK"); - - - //Control test execution properties with these flags - private static final boolean DEBUG_MODE = false; - //WARNING: If this is set to true and proxy server is not running, tests will fail! 
- private static final boolean USE_PROXY = false; - private static final int HEADER_WRITE_COUNT = 100000; - private static final int ARBITRARY_NEGATIVE_TIMEOUT_SECONDS = -100; - private static TestExecution TEST_EXECUTION; - - private static class TestExecution { - private static final int DEFAULT_KEEP_ALIVE_TIMEOUT_SECONDS = 1; - private static final int DEBUG_KEEP_ALIVE_SECONDS = 1000; - private static final int DEFAULT_PORT = 0; //random port - private static final int FIXED_PORT = 8088; - private static final String PROXY_HOST = "127.0.0.1"; - private static final int PROXY_PORT = 8888; - private static final int CONNECTION_DEBUG_TIMEOUT = 1000000; - private final boolean debugMode; - private final boolean useProxy; - - TestExecution(boolean debugMode, boolean useProxy) { - this.debugMode = debugMode; - this.useProxy = useProxy; - } - - int getKeepAliveTimeout() { - if (debugMode) { - return DEBUG_KEEP_ALIVE_SECONDS; - } - return DEFAULT_KEEP_ALIVE_TIMEOUT_SECONDS; - } - - HttpURLConnection openConnection(URL url) throws IOException { - HttpURLConnection conn; - if (useProxy) { - Proxy proxy - = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(PROXY_HOST, PROXY_PORT)); - conn = (HttpURLConnection) url.openConnection(proxy); - } else { - conn = (HttpURLConnection) url.openConnection(); - } - return conn; - } - - int shuffleHandlerPort() { - if (debugMode) { - return FIXED_PORT; - } else { - return DEFAULT_PORT; - } - } - - void parameterizeConnection(URLConnection conn) { - if (DEBUG_MODE) { - conn.setReadTimeout(CONNECTION_DEBUG_TIMEOUT); - conn.setConnectTimeout(CONNECTION_DEBUG_TIMEOUT); - } - } - } - - private static class ResponseConfig { - private final int headerWriteCount; - private final int mapOutputCount; - private final int contentLengthOfOneMapOutput; - private long headerSize; - public long contentLengthOfResponse; - - ResponseConfig(int headerWriteCount, int mapOutputCount, - int contentLengthOfOneMapOutput) { - if (mapOutputCount <= 0 && contentLengthOfOneMapOutput > 0) { - throw new IllegalStateException("mapOutputCount should be at least 1"); - } - this.headerWriteCount = headerWriteCount; - this.mapOutputCount = mapOutputCount; - this.contentLengthOfOneMapOutput = contentLengthOfOneMapOutput; - } - - private void setHeaderSize(long headerSize) { - this.headerSize = headerSize; - long contentLengthOfAllHeaders = headerWriteCount * headerSize; - this.contentLengthOfResponse = computeContentLengthOfResponse(contentLengthOfAllHeaders); - LOG.debug("Content-length of all headers: {}", contentLengthOfAllHeaders); - LOG.debug("Content-length of one MapOutput: {}", contentLengthOfOneMapOutput); - LOG.debug("Content-length of final HTTP response: {}", contentLengthOfResponse); - } - - private long computeContentLengthOfResponse(long contentLengthOfAllHeaders) { - int mapOutputCountMultiplier = mapOutputCount; - if (mapOutputCount == 0) { - mapOutputCountMultiplier = 1; - } - return (contentLengthOfAllHeaders + contentLengthOfOneMapOutput) * mapOutputCountMultiplier; - } - } - - private enum ShuffleUrlType { - SIMPLE, WITH_KEEPALIVE, WITH_KEEPALIVE_MULTIPLE_MAP_IDS, WITH_KEEPALIVE_NO_MAP_IDS - } - - private static class InputStreamReadResult { - final String asString; - int totalBytesRead; - - InputStreamReadResult(byte[] bytes, int totalBytesRead) { - this.asString = new String(bytes, StandardCharsets.UTF_8); - this.totalBytesRead = totalBytesRead; - } - } - - private static abstract class AdditionalMapOutputSenderOperations { - public abstract ChannelFuture 
perform(ChannelHandlerContext ctx, Channel ch) throws IOException; - } - - private class ShuffleHandlerForKeepAliveTests extends ShuffleHandler { - final LastSocketAddress lastSocketAddress = new LastSocketAddress(); - final ArrayList failures = new ArrayList<>(); - final ShuffleHeaderProvider shuffleHeaderProvider; - final HeaderPopulator headerPopulator; - MapOutputSender mapOutputSender; - private Consumer channelIdleCallback; - private CustomTimeoutHandler customTimeoutHandler; - private boolean failImmediatelyOnErrors = false; - private boolean closeChannelOnError = true; - private ResponseConfig responseConfig; - - ShuffleHandlerForKeepAliveTests(long attemptId, ResponseConfig responseConfig, - Consumer channelIdleCallback) throws IOException { - this(attemptId, responseConfig); - this.channelIdleCallback = channelIdleCallback; - } - - ShuffleHandlerForKeepAliveTests(long attemptId, ResponseConfig responseConfig) - throws IOException { - this.responseConfig = responseConfig; - this.shuffleHeaderProvider = new ShuffleHeaderProvider(attemptId); - this.responseConfig.setHeaderSize(shuffleHeaderProvider.getShuffleHeaderSize()); - this.headerPopulator = new HeaderPopulator(this, responseConfig, shuffleHeaderProvider, true); - this.mapOutputSender = new MapOutputSender(responseConfig, lastSocketAddress, - shuffleHeaderProvider); - setUseOutboundExceptionHandler(true); - } - - public void setFailImmediatelyOnErrors(boolean failImmediatelyOnErrors) { - this.failImmediatelyOnErrors = failImmediatelyOnErrors; - } - - public void setCloseChannelOnError(boolean closeChannelOnError) { - this.closeChannelOnError = closeChannelOnError; - } - - @Override - protected Shuffle getShuffle(final Configuration conf) { - // replace the shuffle handler with one stubbed for testing - return new Shuffle(conf) { - @Override - protected MapOutputInfo getMapOutputInfo(String mapId, int reduce, - String jobId, String user) { - return null; - } - @Override - protected void verifyRequest(String appid, ChannelHandlerContext ctx, - HttpRequest request, HttpResponse response, URL requestUri) { - } - - @Override - protected void populateHeaders(List mapIds, String jobId, - String user, int reduce, HttpRequest request, - HttpResponse response, boolean keepAliveParam, - Map infoMap) throws IOException { - long contentLength = headerPopulator.populateHeaders( - keepAliveParam); - super.setResponseHeaders(response, keepAliveParam, contentLength); - } - - @Override - protected ChannelFuture sendMapOutput(ChannelHandlerContext ctx, - Channel ch, String user, String mapId, int reduce, - MapOutputInfo info) throws IOException { - return mapOutputSender.send(ctx, ch); - } - - @Override - public void channelActive(ChannelHandlerContext ctx) throws Exception { - ctx.pipeline().replace(HttpResponseEncoder.class, ENCODER_HANDLER_NAME, - new LoggingHttpResponseEncoder(false)); - replaceTimeoutHandlerWithCustom(ctx); - LOG.debug("Modified pipeline: {}", ctx.pipeline()); - super.channelActive(ctx); - } - - private void replaceTimeoutHandlerWithCustom(ChannelHandlerContext ctx) { - TimeoutHandler oldTimeoutHandler = - (TimeoutHandler)ctx.pipeline().get(TIMEOUT_HANDLER); - int timeoutValue = - oldTimeoutHandler.getConnectionKeepAliveTimeOut(); - customTimeoutHandler = new CustomTimeoutHandler(timeoutValue, channelIdleCallback); - ctx.pipeline().replace(TIMEOUT_HANDLER, TIMEOUT_HANDLER, customTimeoutHandler); - } - - @Override - protected void sendError(ChannelHandlerContext ctx, - HttpResponseStatus status) { - String message = "Error 
while processing request. Status: " + status; - handleError(ctx, message); - if (failImmediatelyOnErrors) { - stop(); - } - } - - @Override - protected void sendError(ChannelHandlerContext ctx, String message, - HttpResponseStatus status) { - String errMessage = String.format("Error while processing request. " + - "Status: " + - "%s, message: %s", status, message); - handleError(ctx, errMessage); - if (failImmediatelyOnErrors) { - stop(); - } - } - }; - } - - private void handleError(ChannelHandlerContext ctx, String message) { - LOG.error(message); - failures.add(new Error(message)); - if (closeChannelOnError) { - LOG.warn("sendError: Closing channel"); - ctx.channel().close(); - } - } - - private class CustomTimeoutHandler extends TimeoutHandler { - private boolean channelIdle = false; - private final Consumer channelIdleCallback; - - CustomTimeoutHandler(int connectionKeepAliveTimeOut, - Consumer channelIdleCallback) { - super(connectionKeepAliveTimeOut); - this.channelIdleCallback = channelIdleCallback; - } - - @Override - public void channelIdle(ChannelHandlerContext ctx, IdleStateEvent e) { - LOG.debug("Channel idle"); - this.channelIdle = true; - if (channelIdleCallback != null) { - LOG.debug("Calling channel idle callback.."); - channelIdleCallback.accept(e); - } - super.channelIdle(ctx, e); - } - } - } - - private static class MapOutputSender { - private final ResponseConfig responseConfig; - private final LastSocketAddress lastSocketAddress; - private final ShuffleHeaderProvider shuffleHeaderProvider; - private AdditionalMapOutputSenderOperations additionalMapOutputSenderOperations; - - MapOutputSender(ResponseConfig responseConfig, LastSocketAddress lastSocketAddress, - ShuffleHeaderProvider shuffleHeaderProvider) { - this.responseConfig = responseConfig; - this.lastSocketAddress = lastSocketAddress; - this.shuffleHeaderProvider = shuffleHeaderProvider; - } - - public ChannelFuture send(ChannelHandlerContext ctx, Channel ch) throws IOException { - LOG.debug("In MapOutputSender#send"); - lastSocketAddress.setAddress(ch.remoteAddress()); - ShuffleHeader header = shuffleHeaderProvider.createNewShuffleHeader(); - ChannelFuture future = writeHeaderNTimes(ch, header, responseConfig.headerWriteCount); - // This is the last operation - // It's safe to increment ShuffleHeader counter for better identification - shuffleHeaderProvider.incrementCounter(); - if (additionalMapOutputSenderOperations != null) { - return additionalMapOutputSenderOperations.perform(ctx, ch); - } - return future; - } - - private ChannelFuture writeHeaderNTimes(Channel ch, ShuffleHeader header, int iterations) - throws IOException { - DataOutputBuffer dob = new DataOutputBuffer(); - for (int i = 0; i < iterations; ++i) { - header.write(dob); - } - LOG.debug("MapOutputSender#writeHeaderNTimes WriteAndFlush big chunk of data, " + - "outputBufferSize: " + dob.size()); - return ch.writeAndFlush(wrappedBuffer(dob.getData(), 0, dob.getLength())); - } - } - - private static class ShuffleHeaderProvider { - private final long attemptId; - private int attemptCounter = 0; - private int cachedSize = Integer.MIN_VALUE; - - ShuffleHeaderProvider(long attemptId) { - this.attemptId = attemptId; - } - - ShuffleHeader createNewShuffleHeader() { - return new ShuffleHeader(String.format("attempt_%s_1_m_1_0%s", attemptId, attemptCounter), - 5678, 5678, 1); - } - - void incrementCounter() { - attemptCounter++; - } - - private int getShuffleHeaderSize() throws IOException { - if (cachedSize != Integer.MIN_VALUE) { - return cachedSize; - 
} - DataOutputBuffer dob = new DataOutputBuffer(); - ShuffleHeader header = createNewShuffleHeader(); - header.write(dob); - cachedSize = dob.size(); - return cachedSize; - } - } - - private static class HeaderPopulator { - private final ShuffleHandler shuffleHandler; - private final boolean disableKeepAliveConfig; - private final ShuffleHeaderProvider shuffleHeaderProvider; - private final ResponseConfig responseConfig; - - HeaderPopulator(ShuffleHandler shuffleHandler, - ResponseConfig responseConfig, - ShuffleHeaderProvider shuffleHeaderProvider, - boolean disableKeepAliveConfig) { - this.shuffleHandler = shuffleHandler; - this.responseConfig = responseConfig; - this.disableKeepAliveConfig = disableKeepAliveConfig; - this.shuffleHeaderProvider = shuffleHeaderProvider; - } - - public long populateHeaders(boolean keepAliveParam) throws IOException { - // Send some dummy data (populate content length details) - DataOutputBuffer dob = new DataOutputBuffer(); - for (int i = 0; i < responseConfig.headerWriteCount; ++i) { - ShuffleHeader header = - shuffleHeaderProvider.createNewShuffleHeader(); - header.write(dob); - } - // for testing purpose; - // disable connectionKeepAliveEnabled if keepAliveParam is available - if (keepAliveParam && disableKeepAliveConfig) { - shuffleHandler.connectionKeepAliveEnabled = false; - } - return responseConfig.contentLengthOfResponse; - } - } - - private static final class HttpConnectionData { - private final Map> headers; - private HttpURLConnection conn; - private final int payloadLength; - private final SocketAddress socket; - private int responseCode = -1; - - private HttpConnectionData(HttpURLConnection conn, int payloadLength, - SocketAddress socket) { - this.headers = conn.getHeaderFields(); - this.conn = conn; - this.payloadLength = payloadLength; - this.socket = socket; - try { - this.responseCode = conn.getResponseCode(); - } catch (IOException e) { - fail("Failed to read response code from connection: " + conn); - } - } - - static HttpConnectionData create(HttpURLConnection conn, int payloadLength, - SocketAddress socket) { - return new HttpConnectionData(conn, payloadLength, socket); - } - } - - private static final class HttpConnectionAssert { - private final HttpConnectionData connData; - - private HttpConnectionAssert(HttpConnectionData connData) { - this.connData = connData; - } - - static HttpConnectionAssert create(HttpConnectionData connData) { - return new HttpConnectionAssert(connData); - } - - public static void assertKeepAliveConnectionsAreSame( - HttpConnectionHelper httpConnectionHelper) { - assertTrue("At least two connection data " + - "is required to perform this assertion", - httpConnectionHelper.connectionData.size() >= 2); - SocketAddress firstAddress = httpConnectionHelper.getConnectionData(0).socket; - SocketAddress secondAddress = httpConnectionHelper.getConnectionData(1).socket; - Assert.assertNotNull("Initial shuffle address should not be null", - firstAddress); - Assert.assertNotNull("Keep-Alive shuffle address should not be null", - secondAddress); - assertEquals("Initial shuffle address and keep-alive shuffle " - + "address should be the same", firstAddress, secondAddress); - } - - public HttpConnectionAssert expectKeepAliveWithTimeout(long timeout) { - assertEquals(HttpURLConnection.HTTP_OK, connData.responseCode); - assertHeaderValue(HttpHeader.CONNECTION, HttpHeader.KEEP_ALIVE.asString()); - assertHeaderValue(HttpHeader.KEEP_ALIVE, "timeout=" + timeout); - return this; - } - - public HttpConnectionAssert 
expectBadRequest(long timeout) { - assertEquals(HttpURLConnection.HTTP_BAD_REQUEST, connData.responseCode); - assertHeaderValue(HttpHeader.CONNECTION, HttpHeader.KEEP_ALIVE.asString()); - assertHeaderValue(HttpHeader.KEEP_ALIVE, "timeout=" + timeout); - return this; - } - - public HttpConnectionAssert expectResponseContentLength(long size) { - assertEquals(size, connData.payloadLength); - return this; - } - - private void assertHeaderValue(HttpHeader header, String expectedValue) { - List headerList = connData.headers.get(header.asString()); - Assert.assertNotNull("Got null header value for header: " + header, headerList); - Assert.assertFalse("Got empty header value for header: " + header, headerList.isEmpty()); - assertEquals("Unexpected size of header list for header: " + header, 1, - headerList.size()); - assertEquals(expectedValue, headerList.get(0)); - } - } - - private static class HttpConnectionHelper { - private final LastSocketAddress lastSocketAddress; - List connectionData = new ArrayList<>(); - - HttpConnectionHelper(LastSocketAddress lastSocketAddress) { - this.lastSocketAddress = lastSocketAddress; - } - - public void connectToUrls(String[] urls, ResponseConfig responseConfig) throws IOException { - connectToUrlsInternal(urls, responseConfig, HttpURLConnection.HTTP_OK); - } - - public void connectToUrls(String[] urls, ResponseConfig responseConfig, int expectedHttpStatus) - throws IOException { - connectToUrlsInternal(urls, responseConfig, expectedHttpStatus); - } - - private void connectToUrlsInternal(String[] urls, ResponseConfig responseConfig, - int expectedHttpStatus) throws IOException { - int requests = urls.length; - int expectedConnections = urls.length; - LOG.debug("Will connect to URLs: {}", Arrays.toString(urls)); - for (int reqIdx = 0; reqIdx < requests; reqIdx++) { - String urlString = urls[reqIdx]; - LOG.debug("Connecting to URL: {}", urlString); - URL url = new URL(urlString); - HttpURLConnection conn = TEST_EXECUTION.openConnection(url); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_NAME, - ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_VERSION, - ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); - TEST_EXECUTION.parameterizeConnection(conn); - conn.connect(); - if (expectedHttpStatus == HttpURLConnection.HTTP_BAD_REQUEST) { - //Catch exception as error are caught with overridden sendError method - //Caught errors will be validated later. - try { - DataInputStream input = new DataInputStream(conn.getInputStream()); - } catch (Exception e) { - expectedConnections--; - continue; - } - } - DataInputStream input = new DataInputStream(conn.getInputStream()); - LOG.debug("Opened DataInputStream for connection: {}/{}", (reqIdx + 1), requests); - ShuffleHeader header = new ShuffleHeader(); - header.readFields(input); - InputStreamReadResult result = readDataFromInputStream(input); - result.totalBytesRead += responseConfig.headerSize; - int expectedContentLength = - Integer.parseInt(conn.getHeaderField(HttpHeader.CONTENT_LENGTH.asString())); - - if (result.totalBytesRead != expectedContentLength) { - throw new IOException(String.format("Premature EOF InputStream. " + - "Expected content-length: %s, " + - "Actual content-length: %s", expectedContentLength, result.totalBytesRead)); - } - connectionData.add(HttpConnectionData - .create(conn, result.totalBytesRead, lastSocketAddress.getSocketAddres())); - input.close(); - LOG.debug("Finished all interactions with URL: {}. 
Progress: {}/{}", url, (reqIdx + 1), - requests); - } - assertEquals(expectedConnections, connectionData.size()); - } - - void validate(Consumer connDataValidator) { - for (int i = 0; i < connectionData.size(); i++) { - LOG.debug("Validating connection data #{}", (i + 1)); - HttpConnectionData connData = connectionData.get(i); - connDataValidator.accept(connData); - } - } - - HttpConnectionData getConnectionData(int i) { - return connectionData.get(i); - } - - private static InputStreamReadResult readDataFromInputStream( - InputStream input) throws IOException { - ByteArrayOutputStream dataStream = new ByteArrayOutputStream(); - byte[] buffer = new byte[1024]; - int bytesRead; - int totalBytesRead = 0; - while ((bytesRead = input.read(buffer)) != -1) { - dataStream.write(buffer, 0, bytesRead); - totalBytesRead += bytesRead; - } - LOG.debug("Read total bytes: " + totalBytesRead); - dataStream.flush(); - return new InputStreamReadResult(dataStream.toByteArray(), totalBytesRead); - } - } - - class ShuffleHandlerForTests extends ShuffleHandler { - public final ArrayList failures = new ArrayList<>(); - - ShuffleHandlerForTests() { - setUseOutboundExceptionHandler(true); - } - - ShuffleHandlerForTests(MetricsSystem ms) { - super(ms); - setUseOutboundExceptionHandler(true); - } - - @Override - protected Shuffle getShuffle(final Configuration conf) { - return new Shuffle(conf) { - @Override - public void exceptionCaught(ChannelHandlerContext ctx, - Throwable cause) throws Exception { - LOG.debug("ExceptionCaught"); - failures.add(cause); - super.exceptionCaught(ctx, cause); - } - }; - } - } - - class MockShuffleHandler extends org.apache.hadoop.mapred.ShuffleHandler { - final ArrayList failures = new ArrayList<>(); - - private final AuxiliaryLocalPathHandler pathHandler = - new TestAuxiliaryLocalPathHandler(); - - MockShuffleHandler() { - setUseOutboundExceptionHandler(true); - } - - MockShuffleHandler(MetricsSystem ms) { - super(ms); - setUseOutboundExceptionHandler(true); - } - @Override - protected Shuffle getShuffle(final Configuration conf) { - return new Shuffle(conf) { - @Override - protected void verifyRequest(String appid, ChannelHandlerContext ctx, - HttpRequest request, HttpResponse response, URL requestUri) - throws IOException { - } - @Override - protected MapOutputInfo getMapOutputInfo(String mapId, int reduce, - String jobId, String user) { - // Do nothing. - return null; - } - @Override - protected void populateHeaders(List mapIds, String jobId, - String user, int reduce, HttpRequest request, - HttpResponse response, boolean keepAliveParam, - Map infoMap) { - // Do nothing. 
- } - @Override - protected ChannelFuture sendMapOutput(ChannelHandlerContext ctx, - Channel ch, String user, String mapId, int reduce, - MapOutputInfo info) throws IOException { - - ShuffleHeader header = - new ShuffleHeader("attempt_12345_1_m_1_0", 5678, 5678, 1); - DataOutputBuffer dob = new DataOutputBuffer(); - header.write(dob); - ch.writeAndFlush(wrappedBuffer(dob.getData(), 0, dob.getLength())); - dob = new DataOutputBuffer(); - for (int i = 0; i < 100; ++i) { - header.write(dob); - } - return ch.writeAndFlush(wrappedBuffer(dob.getData(), 0, dob.getLength())); - } - - @Override - public void exceptionCaught(ChannelHandlerContext ctx, - Throwable cause) throws Exception { - LOG.debug("ExceptionCaught"); - failures.add(cause); - super.exceptionCaught(ctx, cause); - } - }; - } - - @Override - public AuxiliaryLocalPathHandler getAuxiliaryLocalPathHandler() { - return pathHandler; - } - } - - private class TestAuxiliaryLocalPathHandler - implements AuxiliaryLocalPathHandler { - @Override - public Path getLocalPathForRead(String path) { - return new Path(ABS_LOG_DIR.getAbsolutePath(), path); - } - - @Override - public Path getLocalPathForWrite(String path) { - return new Path(ABS_LOG_DIR.getAbsolutePath()); - } - - @Override - public Path getLocalPathForWrite(String path, long size) { - return new Path(ABS_LOG_DIR.getAbsolutePath()); - } - - @Override - public Iterable getAllLocalPathsForRead(String path) { - ArrayList paths = new ArrayList<>(); - paths.add(new Path(ABS_LOG_DIR.getAbsolutePath())); - return paths; - } - } - - private static class MockShuffleHandler2 extends - org.apache.hadoop.mapred.ShuffleHandler { - final ArrayList failures = new ArrayList<>(1); - boolean socketKeepAlive = false; - - MockShuffleHandler2() { - setUseOutboundExceptionHandler(true); - } - - MockShuffleHandler2(MetricsSystem ms) { - super(ms); - setUseOutboundExceptionHandler(true); - } - - @Override - protected Shuffle getShuffle(final Configuration conf) { - return new Shuffle(conf) { - @Override - protected void verifyRequest(String appid, ChannelHandlerContext ctx, - HttpRequest request, HttpResponse response, URL requestUri) { - SocketChannel channel = (SocketChannel)(ctx.channel()); - socketKeepAlive = channel.config().isKeepAlive(); - } - - @Override - public void exceptionCaught(ChannelHandlerContext ctx, - Throwable cause) throws Exception { - LOG.debug("ExceptionCaught"); - failures.add(cause); - super.exceptionCaught(ctx, cause); - } - }; - } - - protected boolean isSocketKeepAlive() { - return socketKeepAlive; - } - } - - @Rule - public TestName name = new TestName(); - - @Before - public void setup() { - TEST_EXECUTION = new TestExecution(DEBUG_MODE, USE_PROXY); - } - - @After - public void tearDown() { - int port = TEST_EXECUTION.shuffleHandlerPort(); - if (isPortUsed(port)) { - String msg = String.format("Port is being used: %d. 
" + - "Current testcase name: %s", - port, name.getMethodName()); - throw new IllegalStateException(msg); - } - } - - private static boolean isPortUsed(int port) { - if (port == 0) { - //Don't check if port is 0 - return false; - } - try (Socket ignored = new Socket("localhost", port)) { - return true; - } catch (IOException e) { - LOG.error("Port: {}, port check result: {}", port, e.getMessage()); - return false; - } - } + private static final HttpResponseStatus OK_STATUS = new HttpResponseStatus(200, "OK"); + private static final ApplicationId TEST_APP_ID = ApplicationId.newInstance(1111111111111L, 1); /** * Test the validation of ShuffleHandler's meta-data's serialization and @@ -862,8 +96,8 @@ private static boolean isPortUsed(int port) { * * @throws Exception exception */ - @Test (timeout = 10000) - public void testSerializeMeta() throws Exception { + @Test(timeout = 10000) + public void testSerializeMeta() throws Exception { assertEquals(1, ShuffleHandler.deserializeMetaData( ShuffleHandler.serializeMetaData(1))); assertEquals(-1, ShuffleHandler.deserializeMetaData( @@ -877,24 +111,24 @@ public void testSerializeMeta() throws Exception { * * @throws Exception exception */ - @Test (timeout = 10000) + @Test(timeout = 10000) public void testShuffleMetrics() throws Exception { MetricsSystem ms = new MetricsSystemImpl(); - ShuffleHandler sh = new ShuffleHandlerForTests(ms); + ShuffleHandler sh = new ShuffleHandler(ms); ChannelFuture cf = mock(ChannelFuture.class); when(cf.isSuccess()).thenReturn(true).thenReturn(false); sh.metrics.shuffleConnections.incr(); - sh.metrics.shuffleOutputBytes.incr(MiB); + sh.metrics.shuffleOutputBytes.incr(MIB); sh.metrics.shuffleConnections.incr(); - sh.metrics.shuffleOutputBytes.incr(2*MiB); + sh.metrics.shuffleOutputBytes.incr(2 * MIB); - checkShuffleMetrics(ms, 3*MiB, 0, 0, 2); + checkShuffleMetrics(ms, 3 * MIB, 0, 0, 2); sh.metrics.operationComplete(cf); sh.metrics.operationComplete(cf); - checkShuffleMetrics(ms, 3*MiB, 1, 1, 0); + checkShuffleMetrics(ms, 3 * MIB, 1, 1, 0); sh.stop(); } @@ -909,444 +143,32 @@ static void checkShuffleMetrics(MetricsSystem ms, long bytes, int failed, assertGauge("ShuffleConnections", connections, rb); } - /** - * Verify client prematurely closing a connection. - * - * @throws Exception exception. 
- */ - @Test (timeout = 10000) - public void testClientClosesConnection() throws Exception { - Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); - ShuffleHandlerForTests shuffleHandler = new ShuffleHandlerForTests() { - - @Override - protected Shuffle getShuffle(Configuration conf) { - // replace the shuffle handler with one stubbed for testing - return new Shuffle(conf) { - @Override - protected MapOutputInfo getMapOutputInfo(String mapId, int reduce, - String jobId, String user) { - return null; - } - @Override - protected void populateHeaders(List mapIds, String jobId, - String user, int reduce, HttpRequest request, - HttpResponse response, boolean keepAliveParam, - Map infoMap) { - // Only set response headers and skip everything else - // send some dummy value for content-length - super.setResponseHeaders(response, keepAliveParam, 100); - } - @Override - protected void verifyRequest(String appid, ChannelHandlerContext ctx, - HttpRequest request, HttpResponse response, URL requestUri) { - } - @Override - protected ChannelFuture sendMapOutput(ChannelHandlerContext ctx, - Channel ch, String user, String mapId, int reduce, - MapOutputInfo info) - throws IOException { - ShuffleHeader header = - new ShuffleHeader("attempt_12345_1_m_1_0", 5678, 5678, 1); - DataOutputBuffer dob = new DataOutputBuffer(); - header.write(dob); - ch.writeAndFlush(wrappedBuffer(dob.getData(), 0, dob.getLength())); - dob = new DataOutputBuffer(); - for (int i = 0; i < 100000; ++i) { - header.write(dob); - } - return ch.writeAndFlush(wrappedBuffer(dob.getData(), 0, dob.getLength())); - } - @Override - protected void sendError(ChannelHandlerContext ctx, - HttpResponseStatus status) { - if (failures.size() == 0) { - failures.add(new Error()); - ctx.channel().close(); - } - } - @Override - protected void sendError(ChannelHandlerContext ctx, String message, - HttpResponseStatus status) { - if (failures.size() == 0) { - failures.add(new Error()); - ctx.channel().close(); - } - } - }; - } - }; - shuffleHandler.init(conf); - shuffleHandler.start(); - - // simulate a reducer that closes early by reading a single shuffle header - // then closing the connection - URL url = new URL("http://127.0.0.1:" - + shuffleHandler.getConfig().get(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY) - + "/mapOutput?job=job_12345_1&reduce=1&map=attempt_12345_1_m_1_0"); - HttpURLConnection conn = TEST_EXECUTION.openConnection(url); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_NAME, - ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_VERSION, - ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); - conn.connect(); - DataInputStream input = new DataInputStream(conn.getInputStream()); - assertEquals(HttpURLConnection.HTTP_OK, conn.getResponseCode()); - assertEquals("close", - conn.getHeaderField(HttpHeader.CONNECTION.asString())); - ShuffleHeader header = new ShuffleHeader(); - header.readFields(input); - input.close(); - - assertEquals("sendError called when client closed connection", 0, - shuffleHandler.failures.size()); - assertEquals("Should have no caught exceptions", Collections.emptyList(), - shuffleHandler.failures); - - shuffleHandler.stop(); - } - - static class LastSocketAddress { - SocketAddress lastAddress; - void setAddress(SocketAddress lastAddress) { - this.lastAddress = lastAddress; - } - SocketAddress getSocketAddres() { - return lastAddress; - } - } - - @Test(timeout = 10000) - public void 
testKeepAliveInitiallyEnabled() throws Exception { - Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); - conf.setBoolean(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED, true); - conf.setInt(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT, - TEST_EXECUTION.getKeepAliveTimeout()); - ResponseConfig responseConfig = new ResponseConfig(HEADER_WRITE_COUNT, 0, 0); - ShuffleHandlerForKeepAliveTests shuffleHandler = new ShuffleHandlerForKeepAliveTests( - ATTEMPT_ID, responseConfig); - testKeepAliveWithHttpOk(conf, shuffleHandler, ShuffleUrlType.SIMPLE, - ShuffleUrlType.WITH_KEEPALIVE); - } - - @Test(timeout = 1000000) - public void testKeepAliveInitiallyEnabledTwoKeepAliveUrls() throws Exception { - Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); - conf.setBoolean(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED, true); - conf.setInt(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT, - TEST_EXECUTION.getKeepAliveTimeout()); - ResponseConfig responseConfig = new ResponseConfig(HEADER_WRITE_COUNT, 0, 0); - ShuffleHandlerForKeepAliveTests shuffleHandler = new ShuffleHandlerForKeepAliveTests( - ATTEMPT_ID, responseConfig); - testKeepAliveWithHttpOk(conf, shuffleHandler, ShuffleUrlType.WITH_KEEPALIVE, - ShuffleUrlType.WITH_KEEPALIVE); - } - - //TODO snemeth implement keepalive test that used properly mocked ShuffleHandler - @Test(timeout = 10000) - public void testKeepAliveInitiallyDisabled() throws Exception { - Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); - conf.setBoolean(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED, false); - conf.setInt(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT, - TEST_EXECUTION.getKeepAliveTimeout()); - ResponseConfig responseConfig = new ResponseConfig(HEADER_WRITE_COUNT, 0, 0); - ShuffleHandlerForKeepAliveTests shuffleHandler = new ShuffleHandlerForKeepAliveTests( - ATTEMPT_ID, responseConfig); - testKeepAliveWithHttpOk(conf, shuffleHandler, ShuffleUrlType.WITH_KEEPALIVE, - ShuffleUrlType.WITH_KEEPALIVE); - } - - @Test(timeout = 10000) - public void testKeepAliveMultipleMapAttemptIds() throws Exception { - final int mapOutputContentLength = 11; - final int mapOutputCount = 2; - - Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); - conf.setBoolean(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED, true); - conf.setInt(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT, - TEST_EXECUTION.getKeepAliveTimeout()); - ResponseConfig responseConfig = new ResponseConfig(HEADER_WRITE_COUNT, - mapOutputCount, mapOutputContentLength); - ShuffleHandlerForKeepAliveTests shuffleHandler = new ShuffleHandlerForKeepAliveTests( - ATTEMPT_ID, responseConfig); - shuffleHandler.mapOutputSender.additionalMapOutputSenderOperations = - new AdditionalMapOutputSenderOperations() { - @Override - public ChannelFuture perform(ChannelHandlerContext ctx, Channel ch) throws IOException { - File tmpFile = File.createTempFile("test", ".tmp"); - Files.write(tmpFile.toPath(), - "dummytestcontent123456".getBytes(StandardCharsets.UTF_8)); - final DefaultFileRegion partition = new DefaultFileRegion(tmpFile, 0, - mapOutputContentLength); - LOG.debug("Writing response partition: {}, channel: {}", - partition, ch.id()); - return 
ch.writeAndFlush(partition) - .addListener((ChannelFutureListener) future -> - LOG.debug("Finished Writing response partition: {}, channel: " + - "{}", partition, ch.id())); - } - }; - testKeepAliveWithHttpOk(conf, shuffleHandler, - ShuffleUrlType.WITH_KEEPALIVE_MULTIPLE_MAP_IDS, - ShuffleUrlType.WITH_KEEPALIVE_MULTIPLE_MAP_IDS); - } - - @Test(timeout = 10000) - public void testKeepAliveWithoutMapAttemptIds() throws Exception { - Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); - conf.setBoolean(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED, true); - conf.setInt(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT, - TEST_EXECUTION.getKeepAliveTimeout()); - ResponseConfig responseConfig = new ResponseConfig(HEADER_WRITE_COUNT, 0, 0); - ShuffleHandlerForKeepAliveTests shuffleHandler = new ShuffleHandlerForKeepAliveTests( - ATTEMPT_ID, responseConfig); - shuffleHandler.setFailImmediatelyOnErrors(true); - //Closing channels caused Netty to open another channel - // so 1 request was handled with 2 separate channels, - // ultimately generating 2 * HTTP 400 errors. - // We'd like to avoid this so disabling closing the channel here. - shuffleHandler.setCloseChannelOnError(false); - testKeepAliveWithHttpBadRequest(conf, shuffleHandler, ShuffleUrlType.WITH_KEEPALIVE_NO_MAP_IDS); - } - - private void testKeepAliveWithHttpOk( - Configuration conf, - ShuffleHandlerForKeepAliveTests shuffleHandler, - ShuffleUrlType... shuffleUrlTypes) throws IOException { - testKeepAliveWithHttpStatus(conf, shuffleHandler, shuffleUrlTypes, HttpURLConnection.HTTP_OK); - } - - private void testKeepAliveWithHttpBadRequest( - Configuration conf, - ShuffleHandlerForKeepAliveTests shuffleHandler, - ShuffleUrlType... 
shuffleUrlTypes) throws IOException { - testKeepAliveWithHttpStatus(conf, shuffleHandler, shuffleUrlTypes, - HttpURLConnection.HTTP_BAD_REQUEST); - } - - private void testKeepAliveWithHttpStatus(Configuration conf, - ShuffleHandlerForKeepAliveTests shuffleHandler, - ShuffleUrlType[] shuffleUrlTypes, - int expectedHttpStatus) throws IOException { - if (expectedHttpStatus != HttpURLConnection.HTTP_BAD_REQUEST) { - assertTrue("Expected at least two shuffle URL types ", - shuffleUrlTypes.length >= 2); - } - shuffleHandler.init(conf); - shuffleHandler.start(); - - String[] urls = new String[shuffleUrlTypes.length]; - for (int i = 0; i < shuffleUrlTypes.length; i++) { - ShuffleUrlType url = shuffleUrlTypes[i]; - if (url == ShuffleUrlType.SIMPLE) { - urls[i] = getShuffleUrl(shuffleHandler, ATTEMPT_ID, ATTEMPT_ID); - } else if (url == ShuffleUrlType.WITH_KEEPALIVE) { - urls[i] = getShuffleUrlWithKeepAlive(shuffleHandler, ATTEMPT_ID, ATTEMPT_ID); - } else if (url == ShuffleUrlType.WITH_KEEPALIVE_MULTIPLE_MAP_IDS) { - urls[i] = getShuffleUrlWithKeepAlive(shuffleHandler, ATTEMPT_ID, ATTEMPT_ID, ATTEMPT_ID_2); - } else if (url == ShuffleUrlType.WITH_KEEPALIVE_NO_MAP_IDS) { - urls[i] = getShuffleUrlWithKeepAlive(shuffleHandler, ATTEMPT_ID); - } - } - HttpConnectionHelper connHelper; - try { - connHelper = new HttpConnectionHelper(shuffleHandler.lastSocketAddress); - connHelper.connectToUrls(urls, shuffleHandler.responseConfig, expectedHttpStatus); - if (expectedHttpStatus == HttpURLConnection.HTTP_BAD_REQUEST) { - assertEquals(1, shuffleHandler.failures.size()); - } - } finally { - shuffleHandler.stop(); - } - - //Verify expectations - int configuredTimeout = TEST_EXECUTION.getKeepAliveTimeout(); - int expectedTimeout = configuredTimeout < 0 ? 1 : configuredTimeout; - - connHelper.validate(connData -> { - HttpConnectionAssert.create(connData) - .expectKeepAliveWithTimeout(expectedTimeout) - .expectResponseContentLength(shuffleHandler.responseConfig.contentLengthOfResponse); - }); - if (expectedHttpStatus == HttpURLConnection.HTTP_OK) { - HttpConnectionAssert.assertKeepAliveConnectionsAreSame(connHelper); - assertEquals("Unexpected ShuffleHandler failure", Collections.emptyList(), - shuffleHandler.failures); - } - } - - @Test(timeout = 10000) - public void testSocketKeepAlive() throws Exception { - Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); - conf.setBoolean(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED, true); - // try setting to negative keep alive timeout. 
- conf.setInt(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT, - ARBITRARY_NEGATIVE_TIMEOUT_SECONDS); - HttpURLConnection conn = null; - MockShuffleHandler2 shuffleHandler = new MockShuffleHandler2(); - AuxiliaryLocalPathHandler pathHandler = - mock(AuxiliaryLocalPathHandler.class); - when(pathHandler.getLocalPathForRead(anyString())).thenThrow( - new DiskChecker.DiskErrorException("Test")); - shuffleHandler.setAuxiliaryLocalPathHandler(pathHandler); - try { - shuffleHandler.init(conf); - shuffleHandler.start(); - - String shuffleBaseURL = "http://127.0.0.1:" - + shuffleHandler.getConfig().get( - ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY); - URL url = - new URL(shuffleBaseURL + "/mapOutput?job=job_12345_1&reduce=1&" - + "map=attempt_12345_1_m_1_0"); - conn = TEST_EXECUTION.openConnection(url); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_NAME, - ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_VERSION, - ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); - conn.connect(); - int rc = conn.getResponseCode(); - conn.getInputStream(); - assertEquals(HttpURLConnection.HTTP_OK, rc); - assertTrue("socket should be set KEEP_ALIVE", - shuffleHandler.isSocketKeepAlive()); - } finally { - if (conn != null) { - conn.disconnect(); - } - shuffleHandler.stop(); - } - assertEquals("Should have no caught exceptions", - Collections.emptyList(), shuffleHandler.failures); - } - - /** - * Simulate a reducer that sends an invalid shuffle-header - sometimes a wrong - * header_name and sometimes a wrong version. - * - * @throws Exception exception - */ - @Test (timeout = 10000) - public void testIncompatibleShuffleVersion() throws Exception { - final int failureNum = 3; - Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); - ShuffleHandler shuffleHandler = new ShuffleHandlerForTests(); - shuffleHandler.init(conf); - shuffleHandler.start(); - - // simulate a reducer that closes early by reading a single shuffle header - // then closing the connection - URL url = new URL("http://127.0.0.1:" - + shuffleHandler.getConfig().get(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY) - + "/mapOutput?job=job_12345_1&reduce=1&map=attempt_12345_1_m_1_0"); - for (int i = 0; i < failureNum; ++i) { - HttpURLConnection conn = TEST_EXECUTION.openConnection(url); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_NAME, - i == 0 ? "mapreduce" : "other"); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_VERSION, - i == 1 ? "1.0.0" : "1.0.1"); - conn.connect(); - assertEquals( - HttpURLConnection.HTTP_BAD_REQUEST, conn.getResponseCode()); - } - - shuffleHandler.stop(); - shuffleHandler.close(); - } - /** * Validate the limit on number of shuffle connections. 
- * + * * @throws Exception exception */ - @Test (timeout = 10000) + @Test(timeout = 10000) public void testMaxConnections() throws Exception { - final ArrayList failures = new ArrayList<>(); final int maxAllowedConnections = 3; final int notAcceptedConnections = 1; final int connAttempts = maxAllowedConnections + notAcceptedConnections; - + Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); conf.setInt(ShuffleHandler.MAX_SHUFFLE_CONNECTIONS, maxAllowedConnections); - ShuffleHandler shuffleHandler = new ShuffleHandler() { - @Override - protected Shuffle getShuffle(Configuration conf) { - // replace the shuffle handler with one stubbed for testing - return new Shuffle(conf) { - @Override - protected MapOutputInfo getMapOutputInfo(String mapId, int reduce, - String jobId, String user) { - // Do nothing. - return null; - } - @Override - protected void populateHeaders(List mapIds, String jobId, - String user, int reduce, HttpRequest request, - HttpResponse response, boolean keepAliveParam, - Map infoMap) { - // Do nothing. - } - @Override - protected void verifyRequest(String appid, ChannelHandlerContext ctx, - HttpRequest request, HttpResponse response, URL requestUri) { - // Do nothing. - } - @Override - protected ChannelFuture sendMapOutput(ChannelHandlerContext ctx, - Channel ch, String user, String mapId, int reduce, - MapOutputInfo info) - throws IOException { - // send a shuffle header and a lot of data down the channel - // to trigger a broken pipe - ShuffleHeader header = - new ShuffleHeader("dummy_header", 5678, 5678, 1); - DataOutputBuffer dob = new DataOutputBuffer(); - header.write(dob); - ch.writeAndFlush(wrappedBuffer(dob.getData(), 0, dob.getLength())); - dob = new DataOutputBuffer(); - for (int i=0; i<100000; ++i) { - header.write(dob); - } - return ch.writeAndFlush(wrappedBuffer(dob.getData(), 0, dob.getLength())); - } - - @Override - public void exceptionCaught(ChannelHandlerContext ctx, - Throwable cause) throws Exception { - LOG.debug("ExceptionCaught"); - failures.add(cause); - super.exceptionCaught(ctx, cause); - } - }; - } - }; - shuffleHandler.setUseOutboundExceptionHandler(true); + ShuffleHandlerMock shuffleHandler = new ShuffleHandlerMock(); shuffleHandler.init(conf); shuffleHandler.start(); + final String port = shuffleHandler.getConfig().get(SHUFFLE_PORT_CONFIG_KEY); + final SecretKey secretKey = shuffleHandler.addTestApp(); // setup connections HttpURLConnection[] conns = new HttpURLConnection[connAttempts]; for (int i = 0; i < connAttempts; i++) { - String urlString = "http://127.0.0.1:" - + shuffleHandler.getConfig().get(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY) - + "/mapOutput?job=job_12345_1&reduce=1&map=attempt_12345_1_m_" - + i + "_0"; - URL url = new URL(urlString); - conns[i] = TEST_EXECUTION.openConnection(url); - conns[i].setRequestProperty(ShuffleHeader.HTTP_HEADER_NAME, - ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); - conns[i].setRequestProperty(ShuffleHeader.HTTP_HEADER_VERSION, - ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); + conns[i] = createRequest( + geURL(port, TEST_JOB_ID, 0, Collections.singletonList(TEST_ATTEMPT_1), true), + secretKey); } // Try to open numerous connections @@ -1378,7 +200,7 @@ public void exceptionCaught(ChannelHandlerContext ctx, HttpURLConnection.HTTP_OK, ShuffleHandler.TOO_MANY_REQ_STATUS.code()), mapOfConnections.keySet()); - + List successfulConnections = mapOfConnections.get(HttpURLConnection.HTTP_OK); assertEquals(String.format("Expected 
exactly %d requests " + @@ -1402,307 +224,196 @@ public void exceptionCaught(ChannelHandlerContext ctx, assertTrue("The backoff value cannot be negative.", backoff > 0); shuffleHandler.stop(); + } + + /** + * Validate the limit on number of shuffle connections. + * + * @throws Exception exception + */ + @Test(timeout = 10000) + public void testKeepAlive() throws Exception { + Configuration conf = new Configuration(); + ShuffleHandlerMock shuffleHandler = new ShuffleHandlerMock(); + shuffleHandler.init(conf); + shuffleHandler.start(); + final String port = shuffleHandler.getConfig().get(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY); + final SecretKey secretKey = shuffleHandler.addTestApp(); + + HttpURLConnection conn1 = createRequest( + geURL(port, TEST_JOB_ID, 0, Collections.singletonList(TEST_ATTEMPT_1), true), + secretKey); + conn1.connect(); + verifyContent(conn1, TEST_DATA_A); + + HttpURLConnection conn2 = createRequest( + geURL(port, TEST_JOB_ID, 0, Collections.singletonList(TEST_ATTEMPT_2), true), + secretKey); + conn2.connect(); + verifyContent(conn2, TEST_DATA_B); + + HttpURLConnection conn3 = createRequest( + geURL(port, TEST_JOB_ID, 0, Collections.singletonList(TEST_ATTEMPT_3), false), + secretKey); + conn3.connect(); + verifyContent(conn3, TEST_DATA_C); + + shuffleHandler.stop(); - //It's okay to get a ClosedChannelException. - //All other kinds of exceptions means something went wrong - assertEquals("Should have no caught exceptions", - Collections.emptyList(), failures.stream() - .filter(f -> !(f instanceof ClosedChannelException)) - .collect(toList())); + List actual = matchLogs("connections=\\d+"); + assertEquals("only one connection was used", + Arrays.asList("connections=1", "connections=0"), actual); } /** * Validate the ownership of the map-output files being pulled in. The * local-file-system owner of the file should match the user component in the * - * @throws Exception exception + * @throws IOException exception */ @Test(timeout = 100000) public void testMapFileAccess() throws IOException { - final ArrayList failures = new ArrayList<>(); // This will run only in NativeIO is enabled as SecureIOUtils need it assumeTrue(NativeIO.isAvailable()); Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); - conf.setInt(ShuffleHandler.MAX_SHUFFLE_CONNECTIONS, 3); - conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, - "kerberos"); + conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "kerberos"); UserGroupInformation.setConfiguration(conf); - conf.set(YarnConfiguration.NM_LOCAL_DIRS, ABS_LOG_DIR.getAbsolutePath()); - ApplicationId appId = ApplicationId.newInstance(12345, 1); - LOG.info(appId.toString()); - String appAttemptId = "attempt_12345_1_m_1_0"; - String user = "randomUser"; - String reducerId = "0"; - List fileMap = new ArrayList<>(); - createShuffleHandlerFiles(ABS_LOG_DIR, user, appId.toString(), appAttemptId, - conf, fileMap); - ShuffleHandler shuffleHandler = new ShuffleHandler() { - @Override - protected Shuffle getShuffle(Configuration conf) { - // replace the shuffle handler with one stubbed for testing - return new Shuffle(conf) { - - @Override - protected void verifyRequest(String appid, ChannelHandlerContext ctx, - HttpRequest request, HttpResponse response, URL requestUri) { - // Do nothing. 
- } - @Override - public void exceptionCaught(ChannelHandlerContext ctx, - Throwable cause) throws Exception { - LOG.debug("ExceptionCaught"); - failures.add(cause); - super.exceptionCaught(ctx, cause); - } - - @Override - public void channelActive(ChannelHandlerContext ctx) throws Exception { - ctx.pipeline().replace(HttpResponseEncoder.class, - "loggingResponseEncoder", - new LoggingHttpResponseEncoder(false)); - LOG.debug("Modified pipeline: {}", ctx.pipeline()); - super.channelActive(ctx); - } - }; - } - }; - AuxiliaryLocalPathHandler pathHandler = new TestAuxiliaryLocalPathHandler(); - shuffleHandler.setUseOutboundExceptionHandler(true); - shuffleHandler.setAuxiliaryLocalPathHandler(pathHandler); + ShuffleHandlerMock shuffleHandler = new ShuffleHandlerMock(); shuffleHandler.init(conf); try { shuffleHandler.start(); - DataOutputBuffer outputBuffer = new DataOutputBuffer(); - outputBuffer.reset(); - Token jt = - new Token<>("identifier".getBytes(), - "password".getBytes(), new Text(user), new Text("shuffleService")); - jt.write(outputBuffer); - shuffleHandler - .initializeApplication(new ApplicationInitializationContext(user, - appId, ByteBuffer.wrap(outputBuffer.getData(), 0, - outputBuffer.getLength()))); - URL url = - new URL( - "http://127.0.0.1:" - + shuffleHandler.getConfig().get( - ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY) - + "/mapOutput?job=job_12345_0001&reduce=" + reducerId - + "&map=attempt_12345_1_m_1_0"); - HttpURLConnection conn = TEST_EXECUTION.openConnection(url); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_NAME, - ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_VERSION, - ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); + final String port = shuffleHandler.getConfig().get(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY); + final SecretKey secretKey = shuffleHandler.addTestApp(); + + HttpURLConnection conn = createRequest( + geURL(port, TEST_JOB_ID, 0, Collections.singletonList(TEST_ATTEMPT_1), false), + secretKey); conn.connect(); - DataInputStream is = new DataInputStream(conn.getInputStream()); - InputStreamReadResult result = HttpConnectionHelper.readDataFromInputStream(is); - String receivedString = result.asString; + BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream())); + StringBuilder builder = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + System.out.println(inputLine); + builder.append(inputLine); + } + String receivedString = builder.toString(); //Retrieve file owner name - FileInputStream fis = new FileInputStream(fileMap.get(0)); - String owner = NativeIO.POSIX.getFstat(fis.getFD()).getOwner(); - fis.close(); + String indexFilePath = getIndexFile(tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_1); + String owner; + try (FileInputStream fis = new FileInputStream(indexFilePath)) { + owner = NativeIO.POSIX.getFstat(fis.getFD()).getOwner(); + } String message = - "Owner '" + owner + "' for path " + fileMap.get(0).getAbsolutePath() - + " did not match expected owner '" + user + "'"; + "Owner '" + owner + "' for path " + indexFilePath + + " did not match expected owner '" + TEST_USER + "'"; assertTrue(String.format("Received string '%s' should contain " + - "message '%s'", receivedString, message), + "message '%s'", receivedString, message), receivedString.contains(message)); assertEquals(HttpURLConnection.HTTP_OK, conn.getResponseCode()); LOG.info("received: " + receivedString); assertNotEquals("", receivedString); } finally { shuffleHandler.stop(); 
- FileUtil.fullyDelete(ABS_LOG_DIR); - } - - assertEquals("Should have no caught exceptions", - Collections.emptyList(), failures); - } - - private static void createShuffleHandlerFiles(File logDir, String user, - String appId, String appAttemptId, Configuration conf, - List fileMap) throws IOException { - String attemptDir = - StringUtils.join(Path.SEPARATOR, - new String[] {logDir.getAbsolutePath(), - ContainerLocalizer.USERCACHE, user, - ContainerLocalizer.APPCACHE, appId, "output", appAttemptId }); - File appAttemptDir = new File(attemptDir); - appAttemptDir.mkdirs(); - System.out.println(appAttemptDir.getAbsolutePath()); - File indexFile = new File(appAttemptDir, "file.out.index"); - fileMap.add(indexFile); - createIndexFile(indexFile, conf); - File mapOutputFile = new File(appAttemptDir, "file.out"); - fileMap.add(mapOutputFile); - createMapOutputFile(mapOutputFile, conf); - } - - private static void createMapOutputFile(File mapOutputFile, Configuration conf) - throws IOException { - FileOutputStream out = new FileOutputStream(mapOutputFile); - out.write("Creating new dummy map output file. Used only for testing" - .getBytes()); - out.flush(); - out.close(); - } - - private static void createIndexFile(File indexFile, Configuration conf) - throws IOException { - if (indexFile.exists()) { - System.out.println("Deleting existing file"); - indexFile.delete(); } - indexFile.createNewFile(); - FSDataOutputStream output = FileSystem.getLocal(conf).getRaw().append( - new Path(indexFile.getAbsolutePath())); - Checksum crc = new PureJavaCrc32(); - crc.reset(); - CheckedOutputStream chk = new CheckedOutputStream(output, crc); - String msg = "Writing new index file. This file will be used only " + - "for the testing."; - chk.write(Arrays.copyOf(msg.getBytes(), - MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH)); - output.writeLong(chk.getChecksum().getValue()); - output.close(); } @Test public void testRecovery() throws IOException { - final String user = "someuser"; - final ApplicationId appId = ApplicationId.newInstance(12345, 1); - final JobID jobId = JobID.downgrade(TypeConverter.fromYarn(appId)); final File tmpDir = new File(System.getProperty("test.build.data", System.getProperty("java.io.tmpdir")), TestShuffleHandler.class.getName()); - ShuffleHandler shuffle = new ShuffleHandlerForTests(); - AuxiliaryLocalPathHandler pathHandler = new TestAuxiliaryLocalPathHandler(); - shuffle.setAuxiliaryLocalPathHandler(pathHandler); + ShuffleHandlerMock shuffle = new ShuffleHandlerMock(); Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); conf.setInt(ShuffleHandler.MAX_SHUFFLE_CONNECTIONS, 3); - conf.set(YarnConfiguration.NM_LOCAL_DIRS, - ABS_LOG_DIR.getAbsolutePath()); // emulate aux services startup with recovery enabled shuffle.setRecoveryPath(new Path(tmpDir.toString())); - tmpDir.mkdirs(); + assertTrue(tmpDir.mkdirs()); try { shuffle.init(conf); shuffle.start(); - - // set up a shuffle token for an application - DataOutputBuffer outputBuffer = new DataOutputBuffer(); - outputBuffer.reset(); - Token jt = new Token<>( - "identifier".getBytes(), "password".getBytes(), new Text(user), - new Text("shuffleService")); - jt.write(outputBuffer); - shuffle.initializeApplication(new ApplicationInitializationContext(user, - appId, ByteBuffer.wrap(outputBuffer.getData(), 0, - outputBuffer.getLength()))); + final String port = shuffle.getConfig().get(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY); + final SecretKey secretKey = 
shuffle.addTestApp(); // verify we are authorized to shuffle - int rc = getShuffleResponseCode(shuffle, jt); + int rc = getShuffleResponseCode(port, secretKey); assertEquals(HttpURLConnection.HTTP_OK, rc); // emulate shuffle handler restart shuffle.close(); - shuffle = new ShuffleHandlerForTests(); - shuffle.setAuxiliaryLocalPathHandler(pathHandler); + shuffle = new ShuffleHandlerMock(); shuffle.setRecoveryPath(new Path(tmpDir.toString())); shuffle.init(conf); shuffle.start(); // verify we are still authorized to shuffle to the old application - rc = getShuffleResponseCode(shuffle, jt); + rc = getShuffleResponseCode(port, secretKey); assertEquals(HttpURLConnection.HTTP_OK, rc); // shutdown app and verify access is lost - shuffle.stopApplication(new ApplicationTerminationContext(appId)); - rc = getShuffleResponseCode(shuffle, jt); + shuffle.stopApplication(new ApplicationTerminationContext(TEST_APP_ID)); + rc = getShuffleResponseCode(port, secretKey); assertEquals(HttpURLConnection.HTTP_UNAUTHORIZED, rc); // emulate shuffle handler restart shuffle.close(); - shuffle = new ShuffleHandlerForTests(); + shuffle = new ShuffleHandlerMock(); shuffle.setRecoveryPath(new Path(tmpDir.toString())); shuffle.init(conf); shuffle.start(); // verify we still don't have access - rc = getShuffleResponseCode(shuffle, jt); + rc = getShuffleResponseCode(port, secretKey); assertEquals(HttpURLConnection.HTTP_UNAUTHORIZED, rc); } finally { - if (shuffle != null) { - shuffle.close(); - } + shuffle.close(); FileUtil.fullyDelete(tmpDir); } } - + @Test public void testRecoveryFromOtherVersions() throws IOException { - final String user = "someuser"; - final ApplicationId appId = ApplicationId.newInstance(12345, 1); final File tmpDir = new File(System.getProperty("test.build.data", System.getProperty("java.io.tmpdir")), TestShuffleHandler.class.getName()); Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); conf.setInt(ShuffleHandler.MAX_SHUFFLE_CONNECTIONS, 3); - ShuffleHandler shuffle = new ShuffleHandlerForTests(); - AuxiliaryLocalPathHandler pathHandler = new TestAuxiliaryLocalPathHandler(); - shuffle.setAuxiliaryLocalPathHandler(pathHandler); - conf.set(YarnConfiguration.NM_LOCAL_DIRS, ABS_LOG_DIR.getAbsolutePath()); + ShuffleHandlerMock shuffle = new ShuffleHandlerMock(); // emulate aux services startup with recovery enabled shuffle.setRecoveryPath(new Path(tmpDir.toString())); - tmpDir.mkdirs(); + assertTrue(tmpDir.mkdirs()); try { shuffle.init(conf); shuffle.start(); - - // set up a shuffle token for an application - DataOutputBuffer outputBuffer = new DataOutputBuffer(); - outputBuffer.reset(); - Token jt = new Token<>( - "identifier".getBytes(), "password".getBytes(), new Text(user), - new Text("shuffleService")); - jt.write(outputBuffer); - shuffle.initializeApplication(new ApplicationInitializationContext(user, - appId, ByteBuffer.wrap(outputBuffer.getData(), 0, - outputBuffer.getLength()))); + final String port = shuffle.getConfig().get(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY); + final SecretKey secretKey = shuffle.addTestApp(); // verify we are authorized to shuffle - int rc = getShuffleResponseCode(shuffle, jt); + int rc = getShuffleResponseCode(port, secretKey); assertEquals(HttpURLConnection.HTTP_OK, rc); // emulate shuffle handler restart shuffle.close(); - shuffle = new ShuffleHandlerForTests(); - shuffle.setAuxiliaryLocalPathHandler(pathHandler); + shuffle = new ShuffleHandlerMock(); shuffle.setRecoveryPath(new 
Path(tmpDir.toString())); shuffle.init(conf); shuffle.start(); // verify we are still authorized to shuffle to the old application - rc = getShuffleResponseCode(shuffle, jt); + rc = getShuffleResponseCode(port, secretKey); assertEquals(HttpURLConnection.HTTP_OK, rc); Version version = Version.newInstance(1, 0); assertEquals(version, shuffle.getCurrentVersion()); - + // emulate shuffle handler restart with compatible version Version version11 = Version.newInstance(1, 1); // update version info before close shuffle shuffle.storeVersion(version11); assertEquals(version11, shuffle.loadVersion()); shuffle.close(); - shuffle = new ShuffleHandlerForTests(); - shuffle.setAuxiliaryLocalPathHandler(pathHandler); + shuffle = new ShuffleHandlerMock(); shuffle.setRecoveryPath(new Path(tmpDir.toString())); shuffle.init(conf); shuffle.start(); @@ -1710,309 +421,99 @@ public void testRecoveryFromOtherVersions() throws IOException { // successfully. assertEquals(version, shuffle.loadVersion()); // verify we are still authorized to shuffle to the old application - rc = getShuffleResponseCode(shuffle, jt); + rc = getShuffleResponseCode(port, secretKey); assertEquals(HttpURLConnection.HTTP_OK, rc); - + // emulate shuffle handler restart with incompatible version Version version21 = Version.newInstance(2, 1); shuffle.storeVersion(version21); assertEquals(version21, shuffle.loadVersion()); shuffle.close(); - shuffle = new ShuffleHandlerForTests(); - shuffle.setAuxiliaryLocalPathHandler(pathHandler); + shuffle = new ShuffleHandlerMock(); shuffle.setRecoveryPath(new Path(tmpDir.toString())); shuffle.init(conf); - + try { shuffle.start(); fail("Incompatible version, should expect fail here."); } catch (ServiceStateException e) { assertTrue("Exception message mismatch", e.getMessage().contains("Incompatible version for state DB schema:")); - } - - } finally { - if (shuffle != null) { - shuffle.close(); } + + } finally { + shuffle.close(); FileUtil.fullyDelete(tmpDir); } } - private static int getShuffleResponseCode(ShuffleHandler shuffle, - Token jt) throws IOException { - URL url = new URL("http://127.0.0.1:" - + shuffle.getConfig().get(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY) - + "/mapOutput?job=job_12345_0001&reduce=0&map=attempt_12345_1_m_1_0"); - HttpURLConnection conn = TEST_EXECUTION.openConnection(url); - String encHash = SecureShuffleUtils.hashFromString( - SecureShuffleUtils.buildMsgFrom(url), - JobTokenSecretManager.createSecretKey(jt.getPassword())); - conn.addRequestProperty( - SecureShuffleUtils.HTTP_HEADER_URL_HASH, encHash); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_NAME, - ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_VERSION, - ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); + private static void verifyContent(HttpURLConnection conn, + String expectedContent) throws IOException { + DataInputStream input = new DataInputStream(conn.getInputStream()); + ShuffleHeader header = new ShuffleHeader(); + header.readFields(input); + byte[] data = new byte[expectedContent.length()]; + assertEquals(expectedContent.length(), input.read(data)); + assertEquals(expectedContent, new String(data)); + } + + private static int getShuffleResponseCode(String port, SecretKey key) throws IOException { + HttpURLConnection conn = createRequest( + geURL(port, TEST_JOB_ID, 0, Collections.singletonList(TEST_ATTEMPT_1), false), + key); conn.connect(); int rc = conn.getResponseCode(); conn.disconnect(); return rc; } - @Test(timeout = 100000) - public void 
testGetMapOutputInfo() throws Exception { - final ArrayList failures = new ArrayList<>(1); - Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); - conf.setInt(ShuffleHandler.MAX_SHUFFLE_CONNECTIONS, 3); - conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, - "simple"); - UserGroupInformation.setConfiguration(conf); - conf.set(YarnConfiguration.NM_LOCAL_DIRS, ABS_LOG_DIR.getAbsolutePath()); - ApplicationId appId = ApplicationId.newInstance(12345, 1); - String appAttemptId = "attempt_12345_1_m_1_0"; - String user = "randomUser"; - String reducerId = "0"; - List fileMap = new ArrayList<>(); - createShuffleHandlerFiles(ABS_LOG_DIR, user, appId.toString(), appAttemptId, - conf, fileMap); - AuxiliaryLocalPathHandler pathHandler = new TestAuxiliaryLocalPathHandler(); - ShuffleHandler shuffleHandler = new ShuffleHandler() { - @Override - protected Shuffle getShuffle(Configuration conf) { - // replace the shuffle handler with one stubbed for testing - return new Shuffle(conf) { - @Override - protected void populateHeaders(List mapIds, - String outputBaseStr, String user, int reduce, - HttpRequest request, HttpResponse response, - boolean keepAliveParam, Map infoMap) - throws IOException { - // Only set response headers and skip everything else - // send some dummy value for content-length - super.setResponseHeaders(response, keepAliveParam, 100); - } - @Override - protected void verifyRequest(String appid, - ChannelHandlerContext ctx, HttpRequest request, - HttpResponse response, URL requestUri) { - // Do nothing. - } - @Override - protected void sendError(ChannelHandlerContext ctx, String message, - HttpResponseStatus status) { - if (failures.size() == 0) { - failures.add(new Error(message)); - ctx.channel().close(); - } - } - @Override - protected ChannelFuture sendMapOutput(ChannelHandlerContext ctx, - Channel ch, String user, String mapId, int reduce, - MapOutputInfo info) throws IOException { - // send a shuffle header - ShuffleHeader header = - new ShuffleHeader("attempt_12345_1_m_1_0", 5678, 5678, 1); - DataOutputBuffer dob = new DataOutputBuffer(); - header.write(dob); - return ch.writeAndFlush(wrappedBuffer(dob.getData(), 0, dob.getLength())); - } - }; - } - }; - shuffleHandler.setUseOutboundExceptionHandler(true); - shuffleHandler.setAuxiliaryLocalPathHandler(pathHandler); - shuffleHandler.init(conf); - try { - shuffleHandler.start(); - DataOutputBuffer outputBuffer = new DataOutputBuffer(); - outputBuffer.reset(); - Token jt = - new Token<>("identifier".getBytes(), - "password".getBytes(), new Text(user), new Text("shuffleService")); - jt.write(outputBuffer); - shuffleHandler - .initializeApplication(new ApplicationInitializationContext(user, - appId, ByteBuffer.wrap(outputBuffer.getData(), 0, - outputBuffer.getLength()))); - URL url = - new URL( - "http://127.0.0.1:" - + shuffleHandler.getConfig().get( - ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY) - + "/mapOutput?job=job_12345_0001&reduce=" + reducerId - + "&map=attempt_12345_1_m_1_0"); - HttpURLConnection conn = TEST_EXECUTION.openConnection(url); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_NAME, - ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); - conn.setRequestProperty(ShuffleHeader.HTTP_HEADER_VERSION, - ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); - conn.connect(); - try { - DataInputStream is = new DataInputStream(conn.getInputStream()); - ShuffleHeader header = new ShuffleHeader(); - header.readFields(is); - is.close(); - } catch 
(EOFException e) { - // ignore - } - assertEquals("sendError called due to shuffle error", - 0, failures.size()); - } finally { - shuffleHandler.stop(); - FileUtil.fullyDelete(ABS_LOG_DIR); - } + private static URL geURL(String port, String jobId, int reduce, List maps, + boolean keepAlive) throws MalformedURLException { + return new URL(getURLString(port, getUri(jobId, reduce, maps, keepAlive))); } - @Test(timeout = 4000) - public void testSendMapCount() throws Exception { - final List listenerList = - new ArrayList<>(); - int connectionKeepAliveTimeOut = 5; //arbitrary value - final ChannelHandlerContext mockCtx = - mock(ChannelHandlerContext.class); - final Channel mockCh = mock(AbstractChannel.class); - final ChannelPipeline mockPipeline = mock(ChannelPipeline.class); - - // Mock HttpRequest and ChannelFuture - final HttpRequest mockHttpRequest = createMockHttpRequest(); - final ChannelFuture mockFuture = createMockChannelFuture(mockCh, - listenerList); - final ShuffleHandler.TimeoutHandler timerHandler = - new ShuffleHandler.TimeoutHandler(connectionKeepAliveTimeOut); - - // Mock Netty Channel Context and Channel behavior - Mockito.doReturn(mockCh).when(mockCtx).channel(); - when(mockCh.pipeline()).thenReturn(mockPipeline); - when(mockPipeline.get( - Mockito.any(String.class))).thenReturn(timerHandler); - when(mockCtx.channel()).thenReturn(mockCh); - Mockito.doReturn(mockFuture).when(mockCh).writeAndFlush(Mockito.any(Object.class)); - - final MockShuffleHandler sh = new MockShuffleHandler(); - Configuration conf = new Configuration(); - sh.init(conf); - sh.start(); - int maxOpenFiles =conf.getInt(ShuffleHandler.SHUFFLE_MAX_SESSION_OPEN_FILES, - ShuffleHandler.DEFAULT_SHUFFLE_MAX_SESSION_OPEN_FILES); - sh.getShuffle(conf).channelRead(mockCtx, mockHttpRequest); - assertTrue("Number of Open files should not exceed the configured " + - "value!-Not Expected", - listenerList.size() <= maxOpenFiles); - while(!listenerList.isEmpty()) { - listenerList.remove(0).operationComplete(mockFuture); - assertTrue("Number of Open files should not exceed the configured " + - "value!-Not Expected", - listenerList.size() <= maxOpenFiles); - } - sh.close(); - sh.stop(); - - assertEquals("Should have no caught exceptions", - Collections.emptyList(), sh.failures); - } - - @Test(timeout = 10000) - public void testIdleStateHandlingSpecifiedTimeout() throws Exception { - int timeoutSeconds = 4; - int expectedTimeoutSeconds = timeoutSeconds; - testHandlingIdleState(timeoutSeconds, expectedTimeoutSeconds); + private static String getURLString(String port, String uri) { + return String.format("http://127.0.0.1:%s%s", port, uri); } - @Test(timeout = 10000) - public void testIdleStateHandlingNegativeTimeoutDefaultsTo1Second() throws Exception { - int expectedTimeoutSeconds = 1; //expected by production code - testHandlingIdleState(ARBITRARY_NEGATIVE_TIMEOUT_SECONDS, expectedTimeoutSeconds); + private static HttpURLConnection createRequest(URL url, SecretKey secretKey) throws IOException { + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + connection.setRequestProperty(ShuffleHeader.HTTP_HEADER_NAME, + ShuffleHeader.DEFAULT_HTTP_HEADER_NAME); + connection.setRequestProperty(ShuffleHeader.HTTP_HEADER_VERSION, + ShuffleHeader.DEFAULT_HTTP_HEADER_VERSION); + String msgToEncode = SecureShuffleUtils.buildMsgFrom(url); + connection.setRequestProperty(HTTP_HEADER_URL_HASH, + SecureShuffleUtils.hashFromString(msgToEncode, secretKey)); + return connection; } - private String 
getShuffleUrlWithKeepAlive(ShuffleHandler shuffleHandler, long jobId, - long... attemptIds) { - String url = getShuffleUrl(shuffleHandler, jobId, attemptIds); - return url + "&keepAlive=true"; - } + class ShuffleHandlerMock extends ShuffleHandler { - private String getShuffleUrl(ShuffleHandler shuffleHandler, long jobId, long... attemptIds) { - String port = shuffleHandler.getConfig().get(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY); - String shuffleBaseURL = "http://127.0.0.1:" + port; + public SecretKey addTestApp() throws IOException { + DataOutputBuffer outputBuffer = new DataOutputBuffer(); + outputBuffer.reset(); + Token jt = new Token<>( + "identifier".getBytes(), "password".getBytes(), new Text(TEST_USER), + new Text("shuffleService")); + jt.write(outputBuffer); + initializeApplication(new ApplicationInitializationContext(TEST_USER, TEST_APP_ID, + ByteBuffer.wrap(outputBuffer.getData(), 0, + outputBuffer.getLength()))); - StringBuilder mapAttemptIds = new StringBuilder(); - for (int i = 0; i < attemptIds.length; i++) { - if (i == 0) { - mapAttemptIds.append("&map="); - } else { - mapAttemptIds.append(","); - } - mapAttemptIds.append(String.format("attempt_%s_1_m_1_0", attemptIds[i])); + return JobTokenSecretManager.createSecretKey(jt.getPassword()); } - String location = String.format("/mapOutput" + - "?job=job_%s_1" + - "&reduce=1" + - "%s", jobId, mapAttemptIds); - return shuffleBaseURL + location; - } - - private void testHandlingIdleState(int configuredTimeoutSeconds, int expectedTimeoutSeconds) - throws IOException, - InterruptedException { - Configuration conf = new Configuration(); - conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, TEST_EXECUTION.shuffleHandlerPort()); - conf.setBoolean(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_ENABLED, true); - conf.setInt(ShuffleHandler.SHUFFLE_CONNECTION_KEEP_ALIVE_TIME_OUT, configuredTimeoutSeconds); - - final CountDownLatch countdownLatch = new CountDownLatch(1); - ResponseConfig responseConfig = new ResponseConfig(HEADER_WRITE_COUNT, 0, 0); - ShuffleHandlerForKeepAliveTests shuffleHandler = new ShuffleHandlerForKeepAliveTests( - ATTEMPT_ID, responseConfig, - event -> countdownLatch.countDown()); - shuffleHandler.init(conf); - shuffleHandler.start(); - - String shuffleUrl = getShuffleUrl(shuffleHandler, ATTEMPT_ID, ATTEMPT_ID); - String[] urls = new String[] {shuffleUrl}; - HttpConnectionHelper httpConnectionHelper = new HttpConnectionHelper( - shuffleHandler.lastSocketAddress); - long beforeConnectionTimestamp = System.currentTimeMillis(); - httpConnectionHelper.connectToUrls(urls, shuffleHandler.responseConfig); - countdownLatch.await(); - long channelClosedTimestamp = System.currentTimeMillis(); - long secondsPassed = - TimeUnit.SECONDS.convert(channelClosedTimestamp - beforeConnectionTimestamp, - TimeUnit.MILLISECONDS); - assertTrue(String.format("Expected at least %s seconds of timeout. 
" + - "Actual timeout seconds: %s", expectedTimeoutSeconds, secondsPassed), - secondsPassed >= expectedTimeoutSeconds); - shuffleHandler.stop(); - } - - public ChannelFuture createMockChannelFuture(Channel mockCh, - final List listenerList) { - final ChannelFuture mockFuture = mock(ChannelFuture.class); - when(mockFuture.channel()).thenReturn(mockCh); - Mockito.doReturn(true).when(mockFuture).isSuccess(); - Mockito.doAnswer(invocation -> { - //Add ReduceMapFileCount listener to a list - if (invocation.getArguments()[0].getClass() == ShuffleHandler.ReduceMapFileCount.class) { - listenerList.add((ShuffleHandler.ReduceMapFileCount) - invocation.getArguments()[0]); - } - return null; - }).when(mockFuture).addListener(Mockito.any( - ShuffleHandler.ReduceMapFileCount.class)); - return mockFuture; - } - - public HttpRequest createMockHttpRequest() { - HttpRequest mockHttpRequest = mock(HttpRequest.class); - Mockito.doReturn(HttpMethod.GET).when(mockHttpRequest).method(); - Mockito.doAnswer(invocation -> { - String uri = "/mapOutput?job=job_12345_1&reduce=1"; - for (int i = 0; i < 100; i++) { - uri = uri.concat("&map=attempt_12345_1_m_" + i + "_0"); - } - return uri; - }).when(mockHttpRequest).uri(); - return mockHttpRequest; - } -} + @Override + protected ShuffleChannelHandlerContext createHandlerContext() { + return new ShuffleChannelHandlerContext(getConfig(), + userRsrc, + secretManager, + createLoadingCache(), + new IndexCache(new JobConf(getConfig())), + ms.register(new ShuffleHandler.ShuffleMetrics()), + allChannels + ); + } + } +} \ No newline at end of file diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandlerBase.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandlerBase.java new file mode 100644 index 0000000000000..1bce443381d47 --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandlerBase.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.mapred; + +import io.netty.util.ResourceLeakDetector; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import javax.annotation.Nonnull; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.thirdparty.com.google.common.cache.CacheBuilder; +import org.apache.hadoop.thirdparty.com.google.common.cache.CacheLoader; +import org.apache.hadoop.thirdparty.com.google.common.cache.LoadingCache; +import org.apache.hadoop.thirdparty.com.google.common.cache.RemovalListener; +import org.junit.After; +import org.junit.Before; + +import static io.netty.util.ResourceLeakDetector.Level.PARANOID; +import static org.apache.hadoop.io.MapFile.DATA_FILE_NAME; +import static org.apache.hadoop.io.MapFile.INDEX_FILE_NAME; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class TestShuffleHandlerBase { + public static final String TEST_ATTEMPT_1 = "attempt_1111111111111_0001_m_000001_0"; + public static final String TEST_ATTEMPT_2 = "attempt_1111111111111_0002_m_000002_0"; + public static final String TEST_ATTEMPT_3 = "attempt_1111111111111_0003_m_000003_0"; + public static final String TEST_JOB_ID = "job_1111111111111_0001"; + public static final String TEST_USER = "testUser"; + public static final String TEST_DATA_A = "aaaaa"; + public static final String TEST_DATA_B = "bbbbb"; + public static final String TEST_DATA_C = "ccccc"; + + private final PrintStream standardOut = System.out; + private final ByteArrayOutputStream outputStreamCaptor = new ByteArrayOutputStream(); + @SuppressWarnings("checkstyle:VisibilityModifier") + protected java.nio.file.Path tempDir; + + @Before + public void setup() throws IOException { + tempDir = Files.createTempDirectory("test-shuffle-channel-handler"); + tempDir.toFile().deleteOnExit(); + + generateMapOutput(tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_1, + Arrays.asList(TEST_DATA_A, TEST_DATA_B, TEST_DATA_C)); + generateMapOutput(tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_2, + Arrays.asList(TEST_DATA_B, TEST_DATA_A, TEST_DATA_C)); + generateMapOutput(tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_3, + Arrays.asList(TEST_DATA_C, TEST_DATA_B, TEST_DATA_A)); + + outputStreamCaptor.reset(); + ResourceLeakDetector.setLevel(PARANOID); + System.setOut(new PrintStream(outputStreamCaptor)); + } + + @After + public void teardown() { + System.setOut(standardOut); + System.out.print(outputStreamCaptor); + // For this to work ch.qos.logback.classic is needed for some reason + assertFalse(outputStreamCaptor.toString() + .contains("LEAK: ByteBuf.release() was not called before")); + } + + public List matchLogs(String pattern) { + String logs = outputStreamCaptor.toString(); + Matcher m = Pattern.compile(pattern).matcher(logs); + List allMatches = new ArrayList<>(); + while (m.find()) { + allMatches.add(m.group()); + } + return allMatches; + } + + public static void generateMapOutput(String tempDir, String attempt, List maps) + throws IOException { + SpillRecord record = new SpillRecord(maps.size()); + + assertTrue(new File(getBasePath(tempDir, attempt)).mkdirs()); + try (PrintWriter writer = new PrintWriter(getDataFile(tempDir, 
attempt), "UTF-8")) { + long startOffset = 0; + int partition = 0; + for (String map : maps) { + record.putIndex(new IndexRecord( + startOffset, + map.length() * 2L, // doesn't matter in this test + map.length()), + partition); + startOffset += map.length() + 1; + partition++; + writer.write(map); + } + record.writeToFile(new Path(getIndexFile(tempDir, attempt)), + new JobConf(new Configuration())); + } + } + + public static String getIndexFile(String tempDir, String attempt) { + return String.format("%s/%s", getBasePath(tempDir, attempt), INDEX_FILE_NAME); + } + + public static String getDataFile(String tempDir, String attempt) { + return String.format("%s/%s", getBasePath(tempDir, attempt), DATA_FILE_NAME); + } + + private static String getBasePath(String tempDir, String attempt) { + return String.format("%s/%s/%s/%s", tempDir, TEST_JOB_ID, TEST_USER, attempt); + } + + public static String getUri(String jobId, int reduce, List maps, boolean keepAlive) { + return String.format("/mapOutput?job=%s&reduce=%d&map=%s%s", + jobId, reduce, String.join(",", maps), + keepAlive ? "&keepAlive=true" : ""); + } + + public LoadingCache createLoadingCache() { + return CacheBuilder.newBuilder().expireAfterAccess( + 5, + TimeUnit.MINUTES).softValues().concurrencyLevel(16). + removalListener( + (RemovalListener) notification -> { + } + ).maximumWeight(10 * 1024 * 1024).weigher( + (key, value) -> key.jobId.length() + key.user.length() + + key.attemptId.length() + + value.indexPath.toString().length() + + value.dataPath.toString().length() + ).build(new CacheLoader() { + @Override + public ShuffleHandler.AttemptPathInfo load( + @Nonnull ShuffleHandler.AttemptPathIdentifier key) { + String base = String.format("%s/%s/%s/", tempDir, key.jobId, key.user); + String attemptBase = base + key.attemptId; + Path indexFileName = new Path(attemptBase + "/" + INDEX_FILE_NAME); + Path mapOutputFileName = new Path(attemptBase + "/" + DATA_FILE_NAME); + return new ShuffleHandler.AttemptPathInfo(indexFileName, mapOutputFileName); + } + }); + } +} diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/resources/cert.pem b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/resources/cert.pem new file mode 100644 index 0000000000000..ec32a67152a0c --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/resources/cert.pem @@ -0,0 +1,27 @@ +-----BEGIN CERTIFICATE----- +MIIEpDCCAowCCQDDMEtH5Wp0qTANBgkqhkiG9w0BAQsFADAUMRIwEAYDVQQDDAls +b2NhbGhvc3QwHhcNMjMwMTE2MTI0NjQ4WhcNMzMwMTEzMTI0NjQ4WjAUMRIwEAYD +VQQDDAlsb2NhbGhvc3QwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDO +FiF+sfoJYHPMPx4jaU11mCupytAFJzz9igaiaKAZCjVHBVWC31KDxHmRdKD066DO +clOJORNOe8Oe4aB5Lbu6wgKtlHEtKmqAU2WrYAEl0oXrZKEL0Xgs1KTTChbVSJ/I +m1WwmEthriQSul0WaEncNpS5NV4PORhiGu0plw+SWSJBFsbl29K6oHE1ClgVjm8j +iu4Y1NAilOPcjmhCmwRq5eq5H0mJ5LWxfvjLIJ9cPpMLG9eVLQkOIE9I01DJ37WM +OvljUMpmhxWDq2oZEmeyCJUFSUh1IlcUM1hTmRUzU/Vcf7EhpAYZxphvSIvDQkAw +cmnn0LQZmORCMP0HurR1o3NnzAVf/ahfpXwvA/BuCsEcW1Le+WATtxa2EvRCnEPa +I76W35FY69t/WYZNIzPgo9eYD7iDBbqxuBH+GlDuwWU6mjEc0nL11uGtcRPrXzKa +QhRMqAtwNW5I5S5HgPLbMiu/n+PpX6+S431eLHFHJ6WUvcwOIK4ZqLH4/Piks1fV +0Svdo47Jymlt6dOvYm85tFsWkYcmldO6aQilRuGBYdXJ06xDyH7EaD0Z2PmPjhl9 +zkt3gpaXxBn0gsJIn++qZ26pXFxVewlJi0m84Xd3x10h9MvpP8AZMhFkLWXR2nqw +eCfell4jzGNXBDLEcspv6HmuTvP7+gqgRCuFLrjOiQIDAQABMA0GCSqGSIb3DQEB +CwUAA4ICAQAexU5VTmT5VAqau0TGTGEgStGPWoliV4b+d8AcdJvOd1qmetgFhJ+X 
+TSutcFPdascOys0+tUV2GZwew3P8yTQyd35LDUC4OjGt2kISDplGAtTdDD19u5/R +hQf7VxJImOxsg2lPPRv3RXMbNF64reP9YsM9osWgJOFzrDf9FkP2HByslU2v7ler +sWQVu+Ei7r3/ZMOI7hHnN8MLqcj+BJwEHCTa8HPmr0Ic3lJ86vUVR4QQE5LgNvSu +oSOZlALsMNVx2rxmirhC6guLwPh7HylDFMzyVedCzmqwQ0R8SSU6SmJvXNLeBFLw +F5mZRh1jabiqsMTGnmMQ97GPs0q78M2sw3TjI+nexCcYZ3jQfR+1eFSg4DlSd55x +BMVfT2kYThzxOw3brtygXjl6gGr8v5M6PzOvbLyDtEa3iDp7Mslw2tJ5OmxxJV9g +QVvBQL1L2nySFk0ij2bIjD7fdpF/EpxrNf4IATOAf5YvxELUeXnyuqJZBtgC8b3I +qXHJIpGM7N4jdwhe0sMVH0OWlqzsL14QZCE6YdvXBEksJ/HBVUie6afYAZrUwUP1 +gtcq9uFpPteg9PsBLZ7hGfNt2278EXhPBtlIpeiPE8X19Lr3bCmCO1PbWNCTkweb +tGfwnH46DmWYUqYrofnKso1mq56yEbbuDy7a2FeHJ2d+18Fh97WnUw== +-----END CERTIFICATE----- diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/resources/key.pem b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/resources/key.pem new file mode 100644 index 0000000000000..e064e5e8d0379 --- /dev/null +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/resources/key.pem @@ -0,0 +1,52 @@ +-----BEGIN PRIVATE KEY----- +MIIJRAIBADANBgkqhkiG9w0BAQEFAASCCS4wggkqAgEAAoICAQDOFiF+sfoJYHPM +Px4jaU11mCupytAFJzz9igaiaKAZCjVHBVWC31KDxHmRdKD066DOclOJORNOe8Oe +4aB5Lbu6wgKtlHEtKmqAU2WrYAEl0oXrZKEL0Xgs1KTTChbVSJ/Im1WwmEthriQS +ul0WaEncNpS5NV4PORhiGu0plw+SWSJBFsbl29K6oHE1ClgVjm8jiu4Y1NAilOPc +jmhCmwRq5eq5H0mJ5LWxfvjLIJ9cPpMLG9eVLQkOIE9I01DJ37WMOvljUMpmhxWD +q2oZEmeyCJUFSUh1IlcUM1hTmRUzU/Vcf7EhpAYZxphvSIvDQkAwcmnn0LQZmORC +MP0HurR1o3NnzAVf/ahfpXwvA/BuCsEcW1Le+WATtxa2EvRCnEPaI76W35FY69t/ +WYZNIzPgo9eYD7iDBbqxuBH+GlDuwWU6mjEc0nL11uGtcRPrXzKaQhRMqAtwNW5I +5S5HgPLbMiu/n+PpX6+S431eLHFHJ6WUvcwOIK4ZqLH4/Piks1fV0Svdo47Jymlt +6dOvYm85tFsWkYcmldO6aQilRuGBYdXJ06xDyH7EaD0Z2PmPjhl9zkt3gpaXxBn0 +gsJIn++qZ26pXFxVewlJi0m84Xd3x10h9MvpP8AZMhFkLWXR2nqweCfell4jzGNX +BDLEcspv6HmuTvP7+gqgRCuFLrjOiQIDAQABAoICAQDAe6UfK2YIugCN5OnmUyUY +z18AwD/YgFSTzgXyTNwzZvhp9A5xJNpx3eFZvN/Uwfs4t0lUom1o4WnNjJkQdWmg +vjI4I6wtbi942evcy9dmlyGjwSI14phm7tlfj03SOXmbqZG4VhYaDsb8gvoMwq0x +s/zmm3TVrRMcFmAqd0ABBaVbu8VbzRweWVpDGv04bQda4BkQMjyQABZu2seAZj8T +BNldvF44H9igBqKjPj35rywxtPh/CUgq3HyQ3WXYl0x+xFpHq57Pch3jFAgNkMYv +X5qoDFFTrhY89NPriNBnV2SU12L+s69LBdU8Izr+zXMcjNBjxudf/RA8znqWbIi8 +pbwXOwBUD4XP3coAzipVOJfeXb5OAkq+wjHnKb4YXJ5mNFb7LcRy6MJVqyxPNJGh +UlfGxsJ441K/9e+aoDEyB0xbjeZ+2yP021P2eObwj46M5sxP2BFSe8E1DUpQ5+ZX +kKipKDZETLc2e4G37Hziw2Wa5X0AAbKgSh1a5AMd0GUbrmJQzO0dok1ujJNu+zpn +K0qAG0c/HD+egIKPEc03+81fLzXKxGHOxrTnHPInWLeGjxEL3oM2Tl5QkYSjm8qg +uIY5up5K//R+fDy45/XRACPOo+yf2RTFnIjfIhxJaB2M7BrBUpWvX1xLJQfDS3Gb +4Rfo2Qlgh/adrNkr2m0NHQKCAQEA8KZK7eugKV/Gk5L0j5E59qytlVZTUoDWdbAq +vMnAgU6BGiTy/Av4jPCH5HDYD5ZX5nqD+GVkXSh2fiT8eSpgtIuBEdeiHHZXwCcb +IK7vKxSePQrs0aA53uk7LY0LGPMRhDheYTItTE+6pRp2HswDgRBw+1fm6Yt1ps32 +oqM7bOUSg6eCKISmAP8UV9ac1l6ZHLdhTIzrVnOc/YqIW4ruNbwpSK1fI7uTWH4i +5JqfPtTa7anJrt080vVUi6cS22G8QhlW3q6fo1GrH8QM4gInRF/4MwkAAP8p1787 +KlFHXxS0bWnJZhjKvh7naeVJi5EaMCWJ1gKF/OcvQYONrA6zdwKCAQEA2ztvxaPy +j4Pq2wpYWlHueCPPn5yMDQQqCLlcG50HzPbquSdZe7o0MEWqV8MaXB6fw1xLwCC4 +i5+HnL72KaBu6DVIhMYDmPzhiw4GbCASfR4v/biQ+047KfnQiHPUEhUCxGvHhcDR +Y3Zhzax6mC79Mfz2gunEx2ZI1MURn/sO+3tQtx+Gzsoj/W4LHpeEQGKsUhcIN48v +XAmeWqVwwmr0twQygnOQyvgZxtiunnIADVxJJh4AQLWGagDiMjaWJ4fZ7q8aBMLY +SFBlDqzf5xssraUAiaawsaRL0jliy0y8WXwSJHb4WKebH2QQcUq22c2y8IbKMcsz +AjLHf1nG0oEN/wKCAQEAypfkutnEEzDbVz+/feIQPzfuRqvPJ8TpR1jREfBzjSiP +DXiPy1s0m0qfzNSnQLAcPh9kXMjXLooy/02Z81OW6EgNl/zGMFn80/rYHrLHDKil +8tPwvSW7sor9VALKle2EEKD367G3augwRHC7gn/Ub2JtC1xcPL84g/4fJZpwG+PZ 
+q1ZpAD10F6Cm+q/lh59KHV/QnQaB1V0tjFGFLDQRCNHom5PBZa6zhCtkqrn1WIsP +6EcpUHpWi28YBx3XhTOJrsfwVzYBlRfbDboZ8mdHsYttw2FPTIeELWqDn8OfZ09h ++j6126sBe/8+aOsr+EBnIKNpn+6t6FSkdu4OZgxWTwKCAQEAxjRXWjVeUBgxFfWl +aYsXcXDW/nP3PrFcG2UR/DkdW1aFYf5MbYssMdRaLFxNEanyQRrJtZsfncQORE11 +mq7cdn3t4XRwvjkq5KA6eDkK3imTs+zQzxOsc8fSm/s0aWCrjs/upGNuK2FVDTD5 +6WraKP6OFE+rQ6ebAxpkU+IUapLTp6wOIhkpLq/1x6OuwtMy/kiqeiiu5aQgkc1v +Q6aVNn3J+Jzo9EgYbelq/f8NQwcDbz3Cdr5nFqFT35fway7sflm6yUErbz2YEAuF +ppiv7RH3iXu11fU3Q4n0Yt8ujiyY7nTNFMH7ggbiwrqM1B+fvsvuM9SFemBUczPE +iH57GwKCAQAdLm1mSeUPn3qEXy/ui7M7GPK43r1l2fn6UJhEGckm4YJ2DOlWisNW +2ilyzfdlYF1Cq22iKxi3/mZdNojKKL7yFCTwx2evHsSIt2vcyD25sFVh5u9O/xFa +1Zk3Pzq6XpaAfZCY4OizJb5zraWYWVNAP1DI4qT0Kg6LvNWZ5G9Dh+tptTmB9E05 +5GiBWD3OfWH5AMQ2UmprEivbaqN8Gm/W6m6Hraf+LbP4aFORwElNAZTymeNcW5O5 +ha2XU2TAINmhgPm1IZEGiSah+A+s2uW4Ox4nQJfksy+rtJOPRcnK4aIhURhzwJv/ +8JszrQ2Tq9fN/cO50CDeipqAtKkcWNjE +-----END PRIVATE KEY----- diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/resources/log4j.properties b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/resources/log4j.properties index b7d8ad36efc26..471993fd5900c 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/resources/log4j.properties +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/resources/log4j.properties @@ -17,5 +17,5 @@ log4j.threshold=ALL log4j.appender.stdout=org.apache.log4j.ConsoleAppender log4j.appender.stdout.layout=org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p [%t] %c{2} (%F:%M(%L)) - %m%n -log4j.logger.io.netty=INFO -log4j.logger.org.apache.hadoop.mapred=INFO \ No newline at end of file +log4j.logger.io.netty=TRACE +log4j.logger.org.apache.hadoop.mapred=TRACE \ No newline at end of file From 170ee4b00e0ff1d309de0aacbc98508dbf30fc79 Mon Sep 17 00:00:00 2001 From: Prabhjyot Singh Date: Wed, 20 Nov 2024 10:28:09 -0500 Subject: [PATCH 16/40] ODP-2638: MAPREDUCE-7433: Remove unused mapred/LoggingHttpResponseEncoder.java (#5388) (#48) (cherry picked from commit e4b5314991142e4a11774340c54fa4a4b7df99bd) (cherry picked from commit df1cf3e60ef38d6a38c6def8bb507dcc35a1ea37) Co-authored-by: Tamas Domok --- .../mapred/LoggingHttpResponseEncoder.java | 106 ------------------ 1 file changed, 106 deletions(-) delete mode 100644 hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/LoggingHttpResponseEncoder.java diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/LoggingHttpResponseEncoder.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/LoggingHttpResponseEncoder.java deleted file mode 100644 index c7b98ce166ca6..0000000000000 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/LoggingHttpResponseEncoder.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.mapred; - -import io.netty.buffer.ByteBuf; -import io.netty.channel.ChannelHandlerContext; -import io.netty.channel.ChannelPromise; -import io.netty.handler.codec.http.HttpHeaders; -import io.netty.handler.codec.http.HttpResponse; -import io.netty.handler.codec.http.HttpResponseEncoder; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.List; - -class LoggingHttpResponseEncoder extends HttpResponseEncoder { - private static final Logger LOG = LoggerFactory.getLogger(LoggingHttpResponseEncoder.class); - private final boolean logStacktraceOfEncodingMethods; - - LoggingHttpResponseEncoder(boolean logStacktraceOfEncodingMethods) { - this.logStacktraceOfEncodingMethods = logStacktraceOfEncodingMethods; - } - - @Override - public boolean acceptOutboundMessage(Object msg) throws Exception { - printExecutingMethod(); - LOG.info("OUTBOUND MESSAGE: " + msg); - return super.acceptOutboundMessage(msg); - } - - @Override - protected void encodeInitialLine(ByteBuf buf, HttpResponse response) throws Exception { - LOG.debug("Executing method: {}, response: {}", - getExecutingMethodName(), response); - logStacktraceIfRequired(); - super.encodeInitialLine(buf, response); - } - - @Override - protected void encode(ChannelHandlerContext ctx, Object msg, - List out) throws Exception { - LOG.debug("Encoding to channel {}: {}", ctx.channel(), msg); - printExecutingMethod(); - logStacktraceIfRequired(); - super.encode(ctx, msg, out); - } - - @Override - protected void encodeHeaders(HttpHeaders headers, ByteBuf buf) { - printExecutingMethod(); - super.encodeHeaders(headers, buf); - } - - @Override - public void write(ChannelHandlerContext ctx, Object msg, ChannelPromise - promise) throws Exception { - LOG.debug("Writing to channel {}: {}", ctx.channel(), msg); - printExecutingMethod(); - super.write(ctx, msg, promise); - } - - private void logStacktraceIfRequired() { - if (logStacktraceOfEncodingMethods) { - LOG.debug("Stacktrace: ", new Throwable()); - } - } - - private void printExecutingMethod() { - String methodName = getExecutingMethodName(1); - LOG.debug("Executing method: {}", methodName); - } - - private String getExecutingMethodName() { - return getExecutingMethodName(0); - } - - private String getExecutingMethodName(int additionalSkipFrames) { - try { - StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace(); - // Array items (indices): - // 0: java.lang.Thread.getStackTrace(...) - // 1: TestShuffleHandler$LoggingHttpResponseEncoder.getExecutingMethodName(...) - int skipFrames = 2 + additionalSkipFrames; - String methodName = stackTrace[skipFrames].getMethodName(); - String className = this.getClass().getSimpleName(); - return className + "#" + methodName; - } catch (Throwable t) { - LOG.error("Error while getting execution method name", t); - return "unknown"; - } - } -} From f9636ef64a98243943ab9536a118e9e2d49e7df3 Mon Sep 17 00:00:00 2001 From: Prabhjyot Singh Date: Wed, 20 Nov 2024 10:28:24 -0500 Subject: [PATCH 17/40] ODP-2637: MAPREDUCE-7434: Fix ShuffleHandler tests. 
Contributed by Tamas Domok (#49) (cherry picked from commit 8f6be3678d1113e3e7f5477c357fc81f62d460b8) (cherry picked from commit 14a608b6e240cd3660f1f6647b03fcd668c0dbc8) Co-authored-by: Szilard Nemeth --- .../mapred/TestShuffleChannelHandler.java | 2 +- .../hadoop/mapred/TestShuffleHandler.java | 44 +++++++++++++------ .../hadoop/mapred/TestShuffleHandlerBase.java | 29 ++++++------ 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleChannelHandler.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleChannelHandler.java index 7fedc7bb2dc09..66fa3de94f89f 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleChannelHandler.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleChannelHandler.java @@ -225,7 +225,7 @@ public void testInvalidMapNoDataFile() { final ShuffleTest t = createShuffleTest(); final EmbeddedChannel shuffle = t.createShuffleHandlerChannelFileRegion(); - String dataFile = getDataFile(tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_2); + String dataFile = getDataFile(TEST_USER, tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_2); assertTrue("should delete", new File(dataFile).delete()); FullHttpRequest req = t.createRequest(getUri(TEST_JOB_ID, 0, diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandler.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandler.java index a7d2f9ba2d45d..48a4089d0faf2 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandler.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandler.java @@ -29,6 +29,7 @@ import static org.apache.hadoop.test.MetricsAsserts.assertGauge; import static org.apache.hadoop.test.MetricsAsserts.getMetrics; import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; @@ -42,6 +43,7 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.MalformedURLException; @@ -160,7 +162,7 @@ public void testMaxConnections() throws Exception { shuffleHandler.init(conf); shuffleHandler.start(); final String port = shuffleHandler.getConfig().get(SHUFFLE_PORT_CONFIG_KEY); - final SecretKey secretKey = shuffleHandler.addTestApp(); + final SecretKey secretKey = shuffleHandler.addTestApp(TEST_USER); // setup connections HttpURLConnection[] conns = new HttpURLConnection[connAttempts]; @@ -238,7 +240,7 @@ public void testKeepAlive() throws Exception { shuffleHandler.init(conf); shuffleHandler.start(); final String port = shuffleHandler.getConfig().get(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY); - final SecretKey secretKey = shuffleHandler.addTestApp(); + final SecretKey secretKey = 
shuffleHandler.addTestApp(TEST_USER); HttpURLConnection conn1 = createRequest( geURL(port, TEST_JOB_ID, 0, Collections.singletonList(TEST_ATTEMPT_1), true), @@ -279,18 +281,34 @@ public void testMapFileAccess() throws IOException { conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "kerberos"); UserGroupInformation.setConfiguration(conf); + final String randomUser = "randomUser"; + final String attempt = "attempt_1111111111111_0004_m_000004_0"; + generateMapOutput(randomUser, tempDir.toAbsolutePath().toString(), attempt, + Arrays.asList(TEST_DATA_C, TEST_DATA_B, TEST_DATA_A)); + ShuffleHandlerMock shuffleHandler = new ShuffleHandlerMock(); shuffleHandler.init(conf); try { shuffleHandler.start(); final String port = shuffleHandler.getConfig().get(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY); - final SecretKey secretKey = shuffleHandler.addTestApp(); + final SecretKey secretKey = shuffleHandler.addTestApp(randomUser); HttpURLConnection conn = createRequest( - geURL(port, TEST_JOB_ID, 0, Collections.singletonList(TEST_ATTEMPT_1), false), + geURL(port, TEST_JOB_ID, 0, Collections.singletonList(attempt), false), secretKey); conn.connect(); - BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream())); + + InputStream is = null; + try { + is = conn.getInputStream(); + } catch (IOException ioe) { + if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) { + is = conn.getErrorStream(); + } + } + + assertNotNull(is); + BufferedReader in = new BufferedReader(new InputStreamReader(is)); StringBuilder builder = new StringBuilder(); String inputLine; while ((inputLine = in.readLine()) != null) { @@ -300,7 +318,7 @@ public void testMapFileAccess() throws IOException { String receivedString = builder.toString(); //Retrieve file owner name - String indexFilePath = getIndexFile(tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_1); + String indexFilePath = getIndexFile(randomUser, tempDir.toAbsolutePath().toString(), attempt); String owner; try (FileInputStream fis = new FileInputStream(indexFilePath)) { owner = NativeIO.POSIX.getFstat(fis.getFD()).getOwner(); @@ -308,11 +326,11 @@ public void testMapFileAccess() throws IOException { String message = "Owner '" + owner + "' for path " + indexFilePath - + " did not match expected owner '" + TEST_USER + "'"; + + " did not match expected owner '" + randomUser + "'"; assertTrue(String.format("Received string '%s' should contain " + "message '%s'", receivedString, message), receivedString.contains(message)); - assertEquals(HttpURLConnection.HTTP_OK, conn.getResponseCode()); + assertEquals(HttpURLConnection.HTTP_INTERNAL_ERROR, conn.getResponseCode()); LOG.info("received: " + receivedString); assertNotEquals("", receivedString); } finally { @@ -335,7 +353,7 @@ public void testRecovery() throws IOException { shuffle.init(conf); shuffle.start(); final String port = shuffle.getConfig().get(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY); - final SecretKey secretKey = shuffle.addTestApp(); + final SecretKey secretKey = shuffle.addTestApp(TEST_USER); // verify we are authorized to shuffle int rc = getShuffleResponseCode(port, secretKey); @@ -388,7 +406,7 @@ public void testRecoveryFromOtherVersions() throws IOException { shuffle.init(conf); shuffle.start(); final String port = shuffle.getConfig().get(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY); - final SecretKey secretKey = shuffle.addTestApp(); + final SecretKey secretKey = shuffle.addTestApp(TEST_USER); // verify we are authorized to shuffle int rc = getShuffleResponseCode(port, 
secretKey); @@ -490,14 +508,14 @@ private static HttpURLConnection createRequest(URL url, SecretKey secretKey) thr class ShuffleHandlerMock extends ShuffleHandler { - public SecretKey addTestApp() throws IOException { + public SecretKey addTestApp(String user) throws IOException { DataOutputBuffer outputBuffer = new DataOutputBuffer(); outputBuffer.reset(); Token jt = new Token<>( - "identifier".getBytes(), "password".getBytes(), new Text(TEST_USER), + "identifier".getBytes(), "password".getBytes(), new Text(user), new Text("shuffleService")); jt.write(outputBuffer); - initializeApplication(new ApplicationInitializationContext(TEST_USER, TEST_APP_ID, + initializeApplication(new ApplicationInitializationContext(user, TEST_APP_ID, ByteBuffer.wrap(outputBuffer.getData(), 0, outputBuffer.getLength()))); diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandlerBase.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandlerBase.java index 1bce443381d47..406f286623006 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandlerBase.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/test/java/org/apache/hadoop/mapred/TestShuffleHandlerBase.java @@ -55,7 +55,7 @@ public class TestShuffleHandlerBase { public static final String TEST_ATTEMPT_2 = "attempt_1111111111111_0002_m_000002_0"; public static final String TEST_ATTEMPT_3 = "attempt_1111111111111_0003_m_000003_0"; public static final String TEST_JOB_ID = "job_1111111111111_0001"; - public static final String TEST_USER = "testUser"; + public static final String TEST_USER = System.getProperty("user.name"); public static final String TEST_DATA_A = "aaaaa"; public static final String TEST_DATA_B = "bbbbb"; public static final String TEST_DATA_C = "ccccc"; @@ -70,11 +70,11 @@ public void setup() throws IOException { tempDir = Files.createTempDirectory("test-shuffle-channel-handler"); tempDir.toFile().deleteOnExit(); - generateMapOutput(tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_1, + generateMapOutput(TEST_USER, tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_1, Arrays.asList(TEST_DATA_A, TEST_DATA_B, TEST_DATA_C)); - generateMapOutput(tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_2, + generateMapOutput(TEST_USER, tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_2, Arrays.asList(TEST_DATA_B, TEST_DATA_A, TEST_DATA_C)); - generateMapOutput(tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_3, + generateMapOutput(TEST_USER, tempDir.toAbsolutePath().toString(), TEST_ATTEMPT_3, Arrays.asList(TEST_DATA_C, TEST_DATA_B, TEST_DATA_A)); outputStreamCaptor.reset(); @@ -101,12 +101,13 @@ public List matchLogs(String pattern) { return allMatches; } - public static void generateMapOutput(String tempDir, String attempt, List maps) + public static void generateMapOutput(String user, String tempDir, + String attempt, List maps) throws IOException { SpillRecord record = new SpillRecord(maps.size()); - assertTrue(new File(getBasePath(tempDir, attempt)).mkdirs()); - try (PrintWriter writer = new PrintWriter(getDataFile(tempDir, attempt), "UTF-8")) { + assertTrue(new File(getBasePath(user, tempDir, attempt)).mkdirs()); + try (PrintWriter writer = new PrintWriter(getDataFile(user, tempDir, attempt), "UTF-8")) { long startOffset = 0; int partition = 
0; for (String map : maps) { @@ -119,21 +120,21 @@ public static void generateMapOutput(String tempDir, String attempt, List maps, boolean keepAlive) { From 17308337ca1d245b1eb9826cb5f758d36c201096 Mon Sep 17 00:00:00 2001 From: Prabhjyot Singh Date: Wed, 20 Nov 2024 10:29:59 -0500 Subject: [PATCH 18/40] ODP-2639: MAPREDUCE-7441: Race condition in closing FadvisedFileRegion (#50) (cherry picked from commit 1fddf35f9a916a77caba55de093840d8200bf88a) Co-authored-by: manishsinghmowall --- .../hadoop/mapred/FadvisedFileRegion.java | 102 ++++++++++-------- 1 file changed, 58 insertions(+), 44 deletions(-) diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/FadvisedFileRegion.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/FadvisedFileRegion.java index 9290a282e3917..184b58e6c76b1 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/FadvisedFileRegion.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/src/main/java/org/apache/hadoop/mapred/FadvisedFileRegion.java @@ -41,6 +41,7 @@ public class FadvisedFileRegion extends DefaultFileRegion { private static final Logger LOG = LoggerFactory.getLogger(FadvisedFileRegion.class); + private final Object closeLock = new Object(); private final boolean manageOsCache; private final int readaheadLength; private final ReadaheadPool readaheadPool; @@ -51,12 +52,12 @@ public class FadvisedFileRegion extends DefaultFileRegion { private final int shuffleBufferSize; private final boolean shuffleTransferToAllowed; private final FileChannel fileChannel; - - private ReadaheadRequest readaheadRequest; + + private volatile ReadaheadRequest readaheadRequest; public FadvisedFileRegion(RandomAccessFile file, long position, long count, boolean manageOsCache, int readaheadLength, ReadaheadPool readaheadPool, - String identifier, int shuffleBufferSize, + String identifier, int shuffleBufferSize, boolean shuffleTransferToAllowed) throws IOException { super(file.getChannel(), position, count); this.manageOsCache = manageOsCache; @@ -73,97 +74,110 @@ public FadvisedFileRegion(RandomAccessFile file, long position, long count, @Override public long transferTo(WritableByteChannel target, long position) - throws IOException { - if (readaheadPool != null && readaheadLength > 0) { - readaheadRequest = readaheadPool.readaheadStream(identifier, fd, - position() + position, readaheadLength, - position() + count(), readaheadRequest); + throws IOException { + synchronized (closeLock) { + if (fd.valid()) { + if (readaheadPool != null && readaheadLength > 0) { + readaheadRequest = readaheadPool.readaheadStream(identifier, fd, + position() + position, readaheadLength, + position() + count(), readaheadRequest); + } + + if(this.shuffleTransferToAllowed) { + return super.transferTo(target, position); + } else { + return customShuffleTransfer(target, position); + } + } else { + return 0L; + } } - - if(this.shuffleTransferToAllowed) { - return super.transferTo(target, position); - } else { - return customShuffleTransfer(target, position); - } + } /** - * This method transfers data using local buffer. It transfers data from - * a disk to a local buffer in memory, and then it transfers data from the + * This method transfers data using local buffer. 
It transfers data from + * a disk to a local buffer in memory, and then it transfers data from the * buffer to the target. This is used only if transferTo is disallowed in - * the configuration file. super.TransferTo does not perform well on Windows - * due to a small IO request generated. customShuffleTransfer can control - * the size of the IO requests by changing the size of the intermediate + * the configuration file. super.TransferTo does not perform well on Windows + * due to a small IO request generated. customShuffleTransfer can control + * the size of the IO requests by changing the size of the intermediate * buffer. */ @VisibleForTesting long customShuffleTransfer(WritableByteChannel target, long position) - throws IOException { + throws IOException { long actualCount = this.count - position; if (actualCount < 0 || position < 0) { throw new IllegalArgumentException( - "position out of range: " + position + - " (expected: 0 - " + (this.count - 1) + ')'); + "position out of range: " + position + + " (expected: 0 - " + (this.count - 1) + ')'); } if (actualCount == 0) { return 0L; } - + long trans = actualCount; int readSize; ByteBuffer byteBuffer = ByteBuffer.allocate( - Math.min( - this.shuffleBufferSize, - trans > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) trans)); - + Math.min( + this.shuffleBufferSize, + trans > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) trans)); + while(trans > 0L && - (readSize = fileChannel.read(byteBuffer, this.position+position)) > 0) { + (readSize = fileChannel.read(byteBuffer, this.position+position)) > 0) { //adjust counters and buffer limit if(readSize < trans) { trans -= readSize; position += readSize; byteBuffer.flip(); } else { - //We can read more than we need if the actualCount is not multiple + //We can read more than we need if the actualCount is not multiple //of the byteBuffer size and file is big enough. In that case we cannot //use flip method but we need to set buffer limit manually to trans. byteBuffer.limit((int)trans); byteBuffer.position(0); - position += trans; + position += trans; trans = 0; } - + //write data to the target while(byteBuffer.hasRemaining()) { target.write(byteBuffer); } - + byteBuffer.clear(); } - + return actualCount - trans; } - + @Override protected void deallocate() { - if (readaheadRequest != null) { - readaheadRequest.cancel(); + synchronized (closeLock) { + if (readaheadRequest != null) { + readaheadRequest.cancel(); + readaheadRequest = null; + } + super.deallocate(); } - super.deallocate(); } - + /** * Call when the transfer completes successfully so we can advise the OS that * we don't need the region to be cached anymore. 
*/ public void transferSuccessful() { - if (manageOsCache && count() > 0) { - try { - NativeIO.POSIX.getCacheManipulator().posixFadviseIfPossible(identifier, - fd, position(), count(), POSIX_FADV_DONTNEED); - } catch (Throwable t) { - LOG.warn("Failed to manage OS cache for " + identifier, t); + synchronized (closeLock) { + if (fd.valid() && manageOsCache && count() > 0) { + try { + NativeIO.POSIX.getCacheManipulator().posixFadviseIfPossible(identifier, + fd, position(), count(), POSIX_FADV_DONTNEED); + } catch (Throwable t) { + LOG.warn("Failed to manage OS cache for " + identifier + + " fd " + fd, t); + } } } } From 6c76302fa82d9f8616912167c04b8e43b58567ff Mon Sep 17 00:00:00 2001 From: Prabhjyot Singh Date: Wed, 20 Nov 2024 10:33:57 -0500 Subject: [PATCH 19/40] ODP-2640: HADOOP-16647: Support OpenSSL 1.1.1 LTS - patch-0 (cherry picked from commit d6b90a7a8018019805ab08c753b3d72556e5ddbc) (cherry picked from commit 7e2516ff5584d97a02758d5e662f746ceb2f8309) --- .../org/apache/hadoop/crypto/OpensslCipher.c | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/crypto/OpensslCipher.c b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/crypto/OpensslCipher.c index abff7ea5f17ff..94253d9cd0d5b 100644 --- a/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/crypto/OpensslCipher.c +++ b/hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/crypto/OpensslCipher.c @@ -27,7 +27,7 @@ #ifdef UNIX static EVP_CIPHER_CTX * (*dlsym_EVP_CIPHER_CTX_new)(void); static void (*dlsym_EVP_CIPHER_CTX_free)(EVP_CIPHER_CTX *); -#if OPENSSL_API_COMPAT < 0x10100000L && OPENSSL_VERSION_NUMBER >= 0x10100000L +#if OPENSSL_VERSION_NUMBER >= 0x10100000L static int (*dlsym_EVP_CIPHER_CTX_reset)(EVP_CIPHER_CTX *); #else static int (*dlsym_EVP_CIPHER_CTX_cleanup)(EVP_CIPHER_CTX *); @@ -127,7 +127,7 @@ JNIEXPORT void JNICALL Java_org_apache_hadoop_crypto_OpensslCipher_initIDs "EVP_CIPHER_CTX_new"); LOAD_DYNAMIC_SYMBOL(dlsym_EVP_CIPHER_CTX_free, env, openssl, \ "EVP_CIPHER_CTX_free"); -#if OPENSSL_API_COMPAT < 0x10100000L && OPENSSL_VERSION_NUMBER >= 0x10100000L +#if OPENSSL_VERSION_NUMBER >= 0x10100000L LOAD_DYNAMIC_SYMBOL(dlsym_EVP_CIPHER_CTX_reset, env, openssl, \ "EVP_CIPHER_CTX_reset"); #else @@ -161,7 +161,7 @@ JNIEXPORT void JNICALL Java_org_apache_hadoop_crypto_OpensslCipher_initIDs LOAD_DYNAMIC_SYMBOL(__dlsym_EVP_CIPHER_CTX_free, dlsym_EVP_CIPHER_CTX_free, \ env, openssl, "EVP_CIPHER_CTX_free"); LOAD_DYNAMIC_SYMBOL(__dlsym_EVP_CIPHER_CTX_cleanup, \ - dlsym_EVP_CIPHER_CTX_cleanup, env, + dlsym_EVP_CIPHER_CTX_cleanup, env, openssl, "EVP_CIPHER_CTX_cleanup"); LOAD_DYNAMIC_SYMBOL(__dlsym_EVP_CIPHER_CTX_init, dlsym_EVP_CIPHER_CTX_init, \ env, openssl, "EVP_CIPHER_CTX_init"); @@ -240,7 +240,7 @@ static EVP_CIPHER * getEvpCipher(int alg, int keyLen) } JNIEXPORT jlong JNICALL Java_org_apache_hadoop_crypto_OpensslCipher_init - (JNIEnv *env, jobject object, jlong ctx, jint mode, jint alg, jint padding, + (JNIEnv *env, jobject object, jlong ctx, jint mode, jint alg, jint padding, jbyteArray key, jbyteArray iv) { int jKeyLen = (*env)->GetArrayLength(env, key); @@ -253,7 +253,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_hadoop_crypto_OpensslCipher_init THROW(env, "java/lang/IllegalArgumentException", "Invalid iv length."); return (jlong)0; } - + EVP_CIPHER_CTX *context = CONTEXT(ctx); if (context == 0) { // Create and initialize a EVP_CIPHER_CTX @@ -263,7 
+263,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_hadoop_crypto_OpensslCipher_init return (jlong)0; } } - + jbyte *jKey = (*env)->GetByteArrayElements(env, key, NULL); if (jKey == NULL) { THROW(env, "java/lang/InternalError", "Cannot get bytes array for key."); @@ -275,13 +275,13 @@ JNIEXPORT jlong JNICALL Java_org_apache_hadoop_crypto_OpensslCipher_init THROW(env, "java/lang/InternalError", "Cannot get bytes array for iv."); return (jlong)0; } - + int rc = dlsym_EVP_CipherInit_ex(context, getEvpCipher(alg, jKeyLen), \ NULL, (unsigned char *)jKey, (unsigned char *)jIv, mode == ENCRYPT_MODE); (*env)->ReleaseByteArrayElements(env, key, jKey, 0); (*env)->ReleaseByteArrayElements(env, iv, jIv, 0); if (rc == 0) { -#if OPENSSL_API_COMPAT < 0x10100000L && OPENSSL_VERSION_NUMBER >= 0x10100000L +#if OPENSSL_VERSION_NUMBER >= 0x10100000L dlsym_EVP_CIPHER_CTX_reset(context); #else dlsym_EVP_CIPHER_CTX_cleanup(context); @@ -348,7 +348,7 @@ JNIEXPORT jint JNICALL Java_org_apache_hadoop_crypto_OpensslCipher_update int output_len = 0; if (!dlsym_EVP_CipherUpdate(context, output_bytes, &output_len, \ input_bytes, input_len)) { -#if OPENSSL_API_COMPAT < 0x10100000L && OPENSSL_VERSION_NUMBER >= 0x10100000L +#if OPENSSL_VERSION_NUMBER >= 0x10100000L dlsym_EVP_CIPHER_CTX_reset(context); #else dlsym_EVP_CIPHER_CTX_cleanup(context); @@ -394,7 +394,7 @@ JNIEXPORT jint JNICALL Java_org_apache_hadoop_crypto_OpensslCipher_doFinal int output_len = 0; if (!dlsym_EVP_CipherFinal_ex(context, output_bytes, &output_len)) { -#if OPENSSL_API_COMPAT < 0x10100000L && OPENSSL_VERSION_NUMBER >= 0x10100000L +#if OPENSSL_VERSION_NUMBER >= 0x10100000L dlsym_EVP_CIPHER_CTX_reset(context); #else dlsym_EVP_CIPHER_CTX_cleanup(context); @@ -418,7 +418,7 @@ JNIEXPORT jstring JNICALL Java_org_apache_hadoop_crypto_OpensslCipher_getLibrary (JNIEnv *env, jclass clazz) { #ifdef UNIX -#if OPENSSL_API_COMPAT < 0x10100000L && OPENSSL_VERSION_NUMBER >= 0x10100000L +#if OPENSSL_VERSION_NUMBER >= 0x10100000L if (dlsym_EVP_CIPHER_CTX_reset) { Dl_info dl_info; if(dladdr( From 8b873823e5239e1021c1c77303f62dfd30b47576 Mon Sep 17 00:00:00 2001 From: Shubham Sharma Date: Thu, 21 Nov 2024 18:16:59 +0530 Subject: [PATCH 20/40] ODP-1434 Fixing case sensitivity issue with jackson upgrade on registering yarn-service --- .../hadoop/yarn/service/api/records/PlacementScope.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/api/records/PlacementScope.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/api/records/PlacementScope.java index 01b1d5dedf31b..bd6e9a9f6f004 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/api/records/PlacementScope.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/api/records/PlacementScope.java @@ -22,6 +22,7 @@ import org.apache.hadoop.yarn.api.resource.PlacementConstraints; import com.fasterxml.jackson.annotation.JsonValue; +import com.fasterxml.jackson.annotation.JsonCreator; import io.swagger.annotations.ApiModel; @@ -45,6 +46,13 @@ public String getValue() { return value; } + @JsonCreator + public static PlacementScope fromString(String key) { + 
return key == null + ? null + : PlacementScope.valueOf(key.toUpperCase()); + } + @Override @JsonValue public String toString() { From fde040a87515c653338746ad78c9e45b7909695b Mon Sep 17 00:00:00 2001 From: manishsinghmowall Date: Thu, 5 Dec 2024 07:15:37 +0100 Subject: [PATCH 21/40] ODP-3.2.3.3-3 version update. --- hadoop-assemblies/pom.xml | 4 ++-- hadoop-build-tools/pom.xml | 2 +- hadoop-client-modules/hadoop-client-api/pom.xml | 4 ++-- .../hadoop-client-check-invariants/pom.xml | 4 ++-- .../hadoop-client-check-test-invariants/pom.xml | 4 ++-- .../hadoop-client-integration-tests/pom.xml | 4 ++-- .../hadoop-client-minicluster/pom.xml | 4 ++-- hadoop-client-modules/hadoop-client-runtime/pom.xml | 4 ++-- hadoop-client-modules/hadoop-client/pom.xml | 4 ++-- hadoop-client-modules/pom.xml | 2 +- .../hadoop-cloud-storage/pom.xml | 4 ++-- hadoop-cloud-storage-project/pom.xml | 4 ++-- hadoop-common-project/hadoop-annotations/pom.xml | 4 ++-- hadoop-common-project/hadoop-auth-examples/pom.xml | 4 ++-- hadoop-common-project/hadoop-auth/pom.xml | 4 ++-- hadoop-common-project/hadoop-common/pom.xml | 4 ++-- hadoop-common-project/hadoop-kms/pom.xml | 4 ++-- hadoop-common-project/hadoop-minikdc/pom.xml | 4 ++-- hadoop-common-project/hadoop-nfs/pom.xml | 4 ++-- hadoop-common-project/pom.xml | 4 ++-- hadoop-dist/pom.xml | 4 ++-- hadoop-hdfs-project/hadoop-hdfs-client/pom.xml | 4 ++-- hadoop-hdfs-project/hadoop-hdfs-httpfs/pom.xml | 4 ++-- hadoop-hdfs-project/hadoop-hdfs-native-client/pom.xml | 4 ++-- hadoop-hdfs-project/hadoop-hdfs-nfs/pom.xml | 4 ++-- hadoop-hdfs-project/hadoop-hdfs-rbf/pom.xml | 4 ++-- hadoop-hdfs-project/hadoop-hdfs/pom.xml | 4 ++-- hadoop-hdfs-project/pom.xml | 4 ++-- .../hadoop-mapreduce-client-app/pom.xml | 4 ++-- .../hadoop-mapreduce-client-common/pom.xml | 4 ++-- .../hadoop-mapreduce-client-core/pom.xml | 4 ++-- .../hadoop-mapreduce-client-hs-plugins/pom.xml | 4 ++-- .../hadoop-mapreduce-client-hs/pom.xml | 4 ++-- .../hadoop-mapreduce-client-jobclient/pom.xml | 4 ++-- .../hadoop-mapreduce-client-nativetask/pom.xml | 4 ++-- .../hadoop-mapreduce-client-shuffle/pom.xml | 4 ++-- .../hadoop-mapreduce-client-uploader/pom.xml | 4 ++-- .../hadoop-mapreduce-client/pom.xml | 4 ++-- .../hadoop-mapreduce-examples/pom.xml | 4 ++-- hadoop-mapreduce-project/pom.xml | 4 ++-- hadoop-maven-plugins/pom.xml | 2 +- hadoop-minicluster/pom.xml | 4 ++-- hadoop-project-dist/pom.xml | 4 ++-- hadoop-project/pom.xml | 4 ++-- hadoop-tools/hadoop-aliyun/pom.xml | 2 +- hadoop-tools/hadoop-archive-logs/pom.xml | 4 ++-- hadoop-tools/hadoop-archives/pom.xml | 4 ++-- hadoop-tools/hadoop-aws/pom.xml | 4 ++-- hadoop-tools/hadoop-azure-datalake/pom.xml | 2 +- hadoop-tools/hadoop-azure/pom.xml | 2 +- hadoop-tools/hadoop-datajoin/pom.xml | 4 ++-- hadoop-tools/hadoop-distcp/pom.xml | 4 ++-- hadoop-tools/hadoop-extras/pom.xml | 4 ++-- hadoop-tools/hadoop-fs2img/pom.xml | 4 ++-- hadoop-tools/hadoop-gridmix/pom.xml | 4 ++-- hadoop-tools/hadoop-kafka/pom.xml | 4 ++-- hadoop-tools/hadoop-openstack/pom.xml | 4 ++-- hadoop-tools/hadoop-pipes/pom.xml | 4 ++-- hadoop-tools/hadoop-resourceestimator/pom.xml | 2 +- hadoop-tools/hadoop-rumen/pom.xml | 4 ++-- hadoop-tools/hadoop-sls/pom.xml | 4 ++-- hadoop-tools/hadoop-streaming/pom.xml | 4 ++-- hadoop-tools/hadoop-tools-dist/pom.xml | 4 ++-- hadoop-tools/pom.xml | 4 ++-- .../hadoop-yarn/hadoop-yarn-api/pom.xml | 4 ++-- .../hadoop-yarn-applications-distributedshell/pom.xml | 4 ++-- .../pom.xml | 4 ++-- .../hadoop-yarn-services-api/pom.xml | 2 +- .../hadoop-yarn-services-core/pom.xml 
| 2 +- .../hadoop-yarn-services/pom.xml | 2 +- .../hadoop-yarn-submarine/pom.xml | 10 +++++----- .../hadoop-yarn/hadoop-yarn-applications/pom.xml | 4 ++-- .../hadoop-yarn/hadoop-yarn-client/pom.xml | 4 ++-- .../hadoop-yarn/hadoop-yarn-common/pom.xml | 4 ++-- .../hadoop-yarn/hadoop-yarn-registry/pom.xml | 4 ++-- .../pom.xml | 4 ++-- .../hadoop-yarn-server-common/pom.xml | 4 ++-- .../hadoop-yarn-server-nodemanager/pom.xml | 4 ++-- .../hadoop-yarn-server-resourcemanager/pom.xml | 4 ++-- .../hadoop-yarn-server-router/pom.xml | 4 ++-- .../hadoop-yarn-server-sharedcachemanager/pom.xml | 4 ++-- .../hadoop-yarn-server-tests/pom.xml | 4 ++-- .../hadoop-yarn-server-timeline-pluginstorage/pom.xml | 4 ++-- .../pom.xml | 4 ++-- .../pom.xml | 2 +- .../pom.xml | 4 ++-- .../pom.xml | 4 ++-- .../pom.xml | 4 ++-- .../pom.xml | 4 ++-- .../hadoop-yarn-server-timelineservice-hbase/pom.xml | 4 ++-- .../hadoop-yarn-server-timelineservice/pom.xml | 4 ++-- .../hadoop-yarn-server-web-proxy/pom.xml | 4 ++-- .../hadoop-yarn/hadoop-yarn-server/pom.xml | 4 ++-- .../hadoop-yarn/hadoop-yarn-site/pom.xml | 4 ++-- hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/pom.xml | 4 ++-- hadoop-yarn-project/hadoop-yarn/pom.xml | 4 ++-- hadoop-yarn-project/pom.xml | 4 ++-- pom.xml | 2 +- 98 files changed, 187 insertions(+), 187 deletions(-) diff --git a/hadoop-assemblies/pom.xml b/hadoop-assemblies/pom.xml index c6b10ee78a086..6d7f53a4beaef 100644 --- a/hadoop-assemblies/pom.xml +++ b/hadoop-assemblies/pom.xml @@ -23,11 +23,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../hadoop-project hadoop-assemblies - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Assemblies Apache Hadoop Assemblies diff --git a/hadoop-build-tools/pom.xml b/hadoop-build-tools/pom.xml index 819fa682c2421..88a5290b45c24 100644 --- a/hadoop-build-tools/pom.xml +++ b/hadoop-build-tools/pom.xml @@ -18,7 +18,7 @@ hadoop-main org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-build-tools diff --git a/hadoop-client-modules/hadoop-client-api/pom.xml b/hadoop-client-modules/hadoop-client-api/pom.xml index dd8844e0d97ca..34d2d326f0cce 100644 --- a/hadoop-client-modules/hadoop-client-api/pom.xml +++ b/hadoop-client-modules/hadoop-client-api/pom.xml @@ -18,11 +18,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-client-api - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 jar Apache Hadoop Client diff --git a/hadoop-client-modules/hadoop-client-check-invariants/pom.xml b/hadoop-client-modules/hadoop-client-check-invariants/pom.xml index 1aa0167c0c038..db041fd990063 100644 --- a/hadoop-client-modules/hadoop-client-check-invariants/pom.xml +++ b/hadoop-client-modules/hadoop-client-check-invariants/pom.xml @@ -18,11 +18,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-client-check-invariants - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 pom diff --git a/hadoop-client-modules/hadoop-client-check-test-invariants/pom.xml b/hadoop-client-modules/hadoop-client-check-test-invariants/pom.xml index 5306a266552f0..1f4ff2f96299b 100644 --- a/hadoop-client-modules/hadoop-client-check-test-invariants/pom.xml +++ b/hadoop-client-modules/hadoop-client-check-test-invariants/pom.xml @@ -18,11 +18,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-client-check-test-invariants - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 pom diff --git a/hadoop-client-modules/hadoop-client-integration-tests/pom.xml 
b/hadoop-client-modules/hadoop-client-integration-tests/pom.xml index 7c97e9c6d8d94..4ec945b2fe956 100644 --- a/hadoop-client-modules/hadoop-client-integration-tests/pom.xml +++ b/hadoop-client-modules/hadoop-client-integration-tests/pom.xml @@ -18,11 +18,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-client-integration-tests - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Checks that we can use the generated artifacts Apache Hadoop Client Packaging Integration Tests diff --git a/hadoop-client-modules/hadoop-client-minicluster/pom.xml b/hadoop-client-modules/hadoop-client-minicluster/pom.xml index e2caad8451d6e..d393f41629fca 100644 --- a/hadoop-client-modules/hadoop-client-minicluster/pom.xml +++ b/hadoop-client-modules/hadoop-client-minicluster/pom.xml @@ -18,11 +18,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-client-minicluster - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 jar Apache Hadoop Minicluster for Clients diff --git a/hadoop-client-modules/hadoop-client-runtime/pom.xml b/hadoop-client-modules/hadoop-client-runtime/pom.xml index 8240de69f7775..5d511f26bf0e8 100644 --- a/hadoop-client-modules/hadoop-client-runtime/pom.xml +++ b/hadoop-client-modules/hadoop-client-runtime/pom.xml @@ -18,11 +18,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-client-runtime - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 jar Apache Hadoop Client diff --git a/hadoop-client-modules/hadoop-client/pom.xml b/hadoop-client-modules/hadoop-client/pom.xml index 4aaa8671bae92..330301bee9f6f 100644 --- a/hadoop-client-modules/hadoop-client/pom.xml +++ b/hadoop-client-modules/hadoop-client/pom.xml @@ -18,11 +18,11 @@ org.apache.hadoop hadoop-project-dist - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project-dist hadoop-client - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Client aggregation pom with dependencies exposed Apache Hadoop Client Aggregator diff --git a/hadoop-client-modules/pom.xml b/hadoop-client-modules/pom.xml index 3f7c1f2fe20d2..a56dc68093182 100644 --- a/hadoop-client-modules/pom.xml +++ b/hadoop-client-modules/pom.xml @@ -18,7 +18,7 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../hadoop-project hadoop-client-modules diff --git a/hadoop-cloud-storage-project/hadoop-cloud-storage/pom.xml b/hadoop-cloud-storage-project/hadoop-cloud-storage/pom.xml index 60a9206ad68c8..371da335e48d2 100644 --- a/hadoop-cloud-storage-project/hadoop-cloud-storage/pom.xml +++ b/hadoop-cloud-storage-project/hadoop-cloud-storage/pom.xml @@ -18,11 +18,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-cloud-storage - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 jar Apache Hadoop Cloud Storage diff --git a/hadoop-cloud-storage-project/pom.xml b/hadoop-cloud-storage-project/pom.xml index a9d97626836eb..fdd5e35e26abd 100644 --- a/hadoop-cloud-storage-project/pom.xml +++ b/hadoop-cloud-storage-project/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../hadoop-project hadoop-cloud-storage-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Cloud Storage Project Apache Hadoop Cloud Storage Project pom diff --git a/hadoop-common-project/hadoop-annotations/pom.xml b/hadoop-common-project/hadoop-annotations/pom.xml index 59ebb9a599eb0..f74ab9d7fc6ea 100644 --- a/hadoop-common-project/hadoop-annotations/pom.xml +++ b/hadoop-common-project/hadoop-annotations/pom.xml @@ 
-20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-annotations - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Annotations Apache Hadoop Annotations jar diff --git a/hadoop-common-project/hadoop-auth-examples/pom.xml b/hadoop-common-project/hadoop-auth-examples/pom.xml index af8f6eb8fe17d..1dbe278e20afe 100644 --- a/hadoop-common-project/hadoop-auth-examples/pom.xml +++ b/hadoop-common-project/hadoop-auth-examples/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-auth-examples - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 war Apache Hadoop Auth Examples diff --git a/hadoop-common-project/hadoop-auth/pom.xml b/hadoop-common-project/hadoop-auth/pom.xml index fea0102fdce6c..96fe5a14459e3 100644 --- a/hadoop-common-project/hadoop-auth/pom.xml +++ b/hadoop-common-project/hadoop-auth/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-auth - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 jar Apache Hadoop Auth diff --git a/hadoop-common-project/hadoop-common/pom.xml b/hadoop-common-project/hadoop-common/pom.xml index daef58f3a96bf..fec010ce66f4c 100644 --- a/hadoop-common-project/hadoop-common/pom.xml +++ b/hadoop-common-project/hadoop-common/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project-dist - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project-dist hadoop-common - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Common Apache Hadoop Common jar diff --git a/hadoop-common-project/hadoop-kms/pom.xml b/hadoop-common-project/hadoop-kms/pom.xml index 3424ba68ce628..957cdf34aafc0 100644 --- a/hadoop-common-project/hadoop-kms/pom.xml +++ b/hadoop-common-project/hadoop-kms/pom.xml @@ -22,11 +22,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-kms - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 jar Apache Hadoop KMS diff --git a/hadoop-common-project/hadoop-minikdc/pom.xml b/hadoop-common-project/hadoop-minikdc/pom.xml index d642dd4541ce9..095f9a5935d1c 100644 --- a/hadoop-common-project/hadoop-minikdc/pom.xml +++ b/hadoop-common-project/hadoop-minikdc/pom.xml @@ -18,12 +18,12 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project 4.0.0 hadoop-minikdc - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop MiniKDC Apache Hadoop MiniKDC jar diff --git a/hadoop-common-project/hadoop-nfs/pom.xml b/hadoop-common-project/hadoop-nfs/pom.xml index fb42c25af39b6..ae3132331c1f7 100644 --- a/hadoop-common-project/hadoop-nfs/pom.xml +++ b/hadoop-common-project/hadoop-nfs/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-nfs - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 jar Apache Hadoop NFS diff --git a/hadoop-common-project/pom.xml b/hadoop-common-project/pom.xml index 985a1796f4119..e511762f7ac34 100644 --- a/hadoop-common-project/pom.xml +++ b/hadoop-common-project/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../hadoop-project hadoop-common-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Common Project Apache Hadoop Common Project pom diff --git a/hadoop-dist/pom.xml b/hadoop-dist/pom.xml index 37197b8adb61a..3f1c9a2eb2b16 100644 --- a/hadoop-dist/pom.xml +++ b/hadoop-dist/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../hadoop-project hadoop-dist - 
3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Distribution Apache Hadoop Distribution jar diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/pom.xml b/hadoop-hdfs-project/hadoop-hdfs-client/pom.xml index d08dab395a34e..680557d35bbad 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/pom.xml +++ b/hadoop-hdfs-project/hadoop-hdfs-client/pom.xml @@ -20,11 +20,11 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd"> org.apache.hadoop hadoop-project-dist - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project-dist hadoop-hdfs-client - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop HDFS Client Apache Hadoop HDFS Client jar diff --git a/hadoop-hdfs-project/hadoop-hdfs-httpfs/pom.xml b/hadoop-hdfs-project/hadoop-hdfs-httpfs/pom.xml index b1ff5a00be52b..78f432d93f2ed 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-httpfs/pom.xml +++ b/hadoop-hdfs-project/hadoop-hdfs-httpfs/pom.xml @@ -22,11 +22,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-hdfs-httpfs - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 jar Apache Hadoop HttpFS diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/pom.xml b/hadoop-hdfs-project/hadoop-hdfs-native-client/pom.xml index 1526c99beebf5..0e911d0ca13ec 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/pom.xml +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/pom.xml @@ -20,11 +20,11 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd"> org.apache.hadoop hadoop-project-dist - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project-dist hadoop-hdfs-native-client - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop HDFS Native Client Apache Hadoop HDFS Native Client jar diff --git a/hadoop-hdfs-project/hadoop-hdfs-nfs/pom.xml b/hadoop-hdfs-project/hadoop-hdfs-nfs/pom.xml index fe09c81680787..5c34ae9cbf6be 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-nfs/pom.xml +++ b/hadoop-hdfs-project/hadoop-hdfs-nfs/pom.xml @@ -20,11 +20,11 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd"> org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-hdfs-nfs - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop HDFS-NFS Apache Hadoop HDFS-NFS jar diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/pom.xml b/hadoop-hdfs-project/hadoop-hdfs-rbf/pom.xml index f8b6b95a898cf..7379b5d4715df 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/pom.xml +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/pom.xml @@ -20,11 +20,11 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd"> org.apache.hadoop hadoop-project-dist - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project-dist hadoop-hdfs-rbf - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop HDFS-RBF Apache Hadoop HDFS-RBF jar diff --git a/hadoop-hdfs-project/hadoop-hdfs/pom.xml b/hadoop-hdfs-project/hadoop-hdfs/pom.xml index 7f767a3069f98..c06ff3326be62 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/pom.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/pom.xml @@ -20,11 +20,11 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd"> org.apache.hadoop hadoop-project-dist - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project-dist hadoop-hdfs - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop HDFS Apache Hadoop HDFS jar diff --git a/hadoop-hdfs-project/pom.xml b/hadoop-hdfs-project/pom.xml index 3ba7d4c3234d3..3248cd37873f8 100644 --- a/hadoop-hdfs-project/pom.xml +++ b/hadoop-hdfs-project/pom.xml @@ -20,11 +20,11 @@ https://maven.apache.org/xsd/maven-4.0.0.xsd"> org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../hadoop-project hadoop-hdfs-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 
Apache Hadoop HDFS Project Apache Hadoop HDFS Project pom diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/pom.xml index dcf96c9b45ef5..baed2a1fecdc1 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/pom.xml @@ -19,11 +19,11 @@ hadoop-mapreduce-client org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-mapreduce-client-app - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop MapReduce App diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/pom.xml index 0af56e1ae2b6b..783096f1b5772 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/pom.xml @@ -19,11 +19,11 @@ hadoop-mapreduce-client org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-mapreduce-client-common - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop MapReduce Common diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml index d1230ff765f68..47781bb0e0c37 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/pom.xml @@ -19,11 +19,11 @@ hadoop-mapreduce-client org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-mapreduce-client-core - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop MapReduce Core diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs-plugins/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs-plugins/pom.xml index 8c6802e9d2986..1f9e322acbe75 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs-plugins/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs-plugins/pom.xml @@ -19,11 +19,11 @@ hadoop-mapreduce-client org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-mapreduce-client-hs-plugins - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop MapReduce HistoryServer Plugins diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml index 905a66b564b97..f7f327159a1aa 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/pom.xml @@ -19,11 +19,11 @@ hadoop-mapreduce-client org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-mapreduce-client-hs - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop MapReduce HistoryServer diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/pom.xml index 4a86fd847020b..460dd7dc9a8e7 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/pom.xml +++ 
b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-jobclient/pom.xml @@ -19,11 +19,11 @@ hadoop-mapreduce-client org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-mapreduce-client-jobclient - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop MapReduce JobClient diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/pom.xml index 580a77d822349..691a8234df4f5 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/pom.xml @@ -19,11 +19,11 @@ hadoop-mapreduce-client org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-mapreduce-client-nativetask - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop MapReduce NativeTask diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/pom.xml index cf428a7202a38..90a7ea3201396 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-shuffle/pom.xml @@ -19,11 +19,11 @@ hadoop-mapreduce-client org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-mapreduce-client-shuffle - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop MapReduce Shuffle diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-uploader/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-uploader/pom.xml index 0c9ec62fd28c7..528de17e528d0 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-uploader/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-uploader/pom.xml @@ -18,11 +18,11 @@ hadoop-mapreduce-client org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-mapreduce-client-uploader - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop MapReduce Uploader diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/pom.xml index 0e12bfe6dd3f8..fb2d210df2df4 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-mapreduce-client - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop MapReduce Client pom diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-examples/pom.xml b/hadoop-mapreduce-project/hadoop-mapreduce-examples/pom.xml index e5c23476653da..6d2fa846f36e5 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-examples/pom.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-examples/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-mapreduce-examples - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop MapReduce Examples Apache Hadoop MapReduce Examples jar diff --git a/hadoop-mapreduce-project/pom.xml b/hadoop-mapreduce-project/pom.xml index 5e854a78bc5c0..76022456ef795 100644 --- a/hadoop-mapreduce-project/pom.xml +++ b/hadoop-mapreduce-project/pom.xml @@ -18,11 +18,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 
../hadoop-project hadoop-mapreduce - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 pom Apache Hadoop MapReduce https://hadoop.apache.org/ diff --git a/hadoop-maven-plugins/pom.xml b/hadoop-maven-plugins/pom.xml index 491a0bf9c49a0..71942eefe11ca 100644 --- a/hadoop-maven-plugins/pom.xml +++ b/hadoop-maven-plugins/pom.xml @@ -19,7 +19,7 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../hadoop-project hadoop-maven-plugins diff --git a/hadoop-minicluster/pom.xml b/hadoop-minicluster/pom.xml index e39e65e93a08f..a4f040d534c33 100644 --- a/hadoop-minicluster/pom.xml +++ b/hadoop-minicluster/pom.xml @@ -18,11 +18,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../hadoop-project hadoop-minicluster - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 jar Apache Hadoop Mini-Cluster diff --git a/hadoop-project-dist/pom.xml b/hadoop-project-dist/pom.xml index 1bccda2889786..64f3377e9810f 100644 --- a/hadoop-project-dist/pom.xml +++ b/hadoop-project-dist/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../hadoop-project hadoop-project-dist - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Project Dist POM Apache Hadoop Project Dist POM pom diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index 26787bf6081b0..17b2f3f5c0502 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -20,10 +20,10 @@ org.apache.hadoop hadoop-main - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Project POM Apache Hadoop Project POM pom diff --git a/hadoop-tools/hadoop-aliyun/pom.xml b/hadoop-tools/hadoop-aliyun/pom.xml index 9c149b7a3beab..785176caf5818 100644 --- a/hadoop-tools/hadoop-aliyun/pom.xml +++ b/hadoop-tools/hadoop-aliyun/pom.xml @@ -18,7 +18,7 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-aliyun diff --git a/hadoop-tools/hadoop-archive-logs/pom.xml b/hadoop-tools/hadoop-archive-logs/pom.xml index 2ceb6eba6e38d..73dc7164f27c0 100644 --- a/hadoop-tools/hadoop-archive-logs/pom.xml +++ b/hadoop-tools/hadoop-archive-logs/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-archive-logs - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Archive Logs Apache Hadoop Archive Logs jar diff --git a/hadoop-tools/hadoop-archives/pom.xml b/hadoop-tools/hadoop-archives/pom.xml index ab7d1184ac858..f42c3bcdbb36a 100644 --- a/hadoop-tools/hadoop-archives/pom.xml +++ b/hadoop-tools/hadoop-archives/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-archives - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Archives Apache Hadoop Archives jar diff --git a/hadoop-tools/hadoop-aws/pom.xml b/hadoop-tools/hadoop-aws/pom.xml index 9b6b705448dfc..8c12e74a6247c 100644 --- a/hadoop-tools/hadoop-aws/pom.xml +++ b/hadoop-tools/hadoop-aws/pom.xml @@ -19,11 +19,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-aws - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Amazon Web Services support This module contains code to support integration with Amazon Web Services. 
diff --git a/hadoop-tools/hadoop-azure-datalake/pom.xml b/hadoop-tools/hadoop-azure-datalake/pom.xml index 260f9a8048751..697489a702ec6 100644 --- a/hadoop-tools/hadoop-azure-datalake/pom.xml +++ b/hadoop-tools/hadoop-azure-datalake/pom.xml @@ -19,7 +19,7 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-azure-datalake diff --git a/hadoop-tools/hadoop-azure/pom.xml b/hadoop-tools/hadoop-azure/pom.xml index b6dbf6a192582..a9a6b7ee18358 100644 --- a/hadoop-tools/hadoop-azure/pom.xml +++ b/hadoop-tools/hadoop-azure/pom.xml @@ -19,7 +19,7 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-azure diff --git a/hadoop-tools/hadoop-datajoin/pom.xml b/hadoop-tools/hadoop-datajoin/pom.xml index 2b620d92a656e..fce0b7ec948b1 100644 --- a/hadoop-tools/hadoop-datajoin/pom.xml +++ b/hadoop-tools/hadoop-datajoin/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-datajoin - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Data Join Apache Hadoop Data Join jar diff --git a/hadoop-tools/hadoop-distcp/pom.xml b/hadoop-tools/hadoop-distcp/pom.xml index 0b9e56db30513..9ea6b333035a7 100644 --- a/hadoop-tools/hadoop-distcp/pom.xml +++ b/hadoop-tools/hadoop-distcp/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-distcp - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Distributed Copy Apache Hadoop Distributed Copy jar diff --git a/hadoop-tools/hadoop-extras/pom.xml b/hadoop-tools/hadoop-extras/pom.xml index ef4a858b6045b..1324c362ecc76 100644 --- a/hadoop-tools/hadoop-extras/pom.xml +++ b/hadoop-tools/hadoop-extras/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-extras - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Extras Apache Hadoop Extras jar diff --git a/hadoop-tools/hadoop-fs2img/pom.xml b/hadoop-tools/hadoop-fs2img/pom.xml index aa291d0b2f64f..2edd0d0204344 100644 --- a/hadoop-tools/hadoop-fs2img/pom.xml +++ b/hadoop-tools/hadoop-fs2img/pom.xml @@ -17,12 +17,12 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project org.apache.hadoop hadoop-fs2img - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Image Generation Tool Apache Hadoop Image Generation Tool jar diff --git a/hadoop-tools/hadoop-gridmix/pom.xml b/hadoop-tools/hadoop-gridmix/pom.xml index 9022fcaaf3393..8c7364abf9ca3 100644 --- a/hadoop-tools/hadoop-gridmix/pom.xml +++ b/hadoop-tools/hadoop-gridmix/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-gridmix - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Gridmix Apache Hadoop Gridmix jar diff --git a/hadoop-tools/hadoop-kafka/pom.xml b/hadoop-tools/hadoop-kafka/pom.xml index 8b27910d4faea..da7b8382607e2 100644 --- a/hadoop-tools/hadoop-kafka/pom.xml +++ b/hadoop-tools/hadoop-kafka/pom.xml @@ -19,11 +19,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-kafka - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Kafka Library support This module contains code to support integration with Kafka. 
diff --git a/hadoop-tools/hadoop-openstack/pom.xml b/hadoop-tools/hadoop-openstack/pom.xml index c4d5d4884cb42..ae896baf5b14d 100644 --- a/hadoop-tools/hadoop-openstack/pom.xml +++ b/hadoop-tools/hadoop-openstack/pom.xml @@ -19,11 +19,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-openstack - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop OpenStack support This module contains code to support integration with OpenStack. diff --git a/hadoop-tools/hadoop-pipes/pom.xml b/hadoop-tools/hadoop-pipes/pom.xml index 294c7ab4ec327..2da6f3f3056f7 100644 --- a/hadoop-tools/hadoop-pipes/pom.xml +++ b/hadoop-tools/hadoop-pipes/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-pipes - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Pipes Apache Hadoop Pipes pom diff --git a/hadoop-tools/hadoop-resourceestimator/pom.xml b/hadoop-tools/hadoop-resourceestimator/pom.xml index 5fb4aaf05a89b..cd6530d7e0e46 100644 --- a/hadoop-tools/hadoop-resourceestimator/pom.xml +++ b/hadoop-tools/hadoop-resourceestimator/pom.xml @@ -25,7 +25,7 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-resourceestimator diff --git a/hadoop-tools/hadoop-rumen/pom.xml b/hadoop-tools/hadoop-rumen/pom.xml index fb7afbfd34584..6a1240f5d5fac 100644 --- a/hadoop-tools/hadoop-rumen/pom.xml +++ b/hadoop-tools/hadoop-rumen/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-rumen - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Rumen Apache Hadoop Rumen jar diff --git a/hadoop-tools/hadoop-sls/pom.xml b/hadoop-tools/hadoop-sls/pom.xml index 5e91843aa76d1..cdc266288a153 100644 --- a/hadoop-tools/hadoop-sls/pom.xml +++ b/hadoop-tools/hadoop-sls/pom.xml @@ -19,11 +19,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-sls - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Scheduler Load Simulator Apache Hadoop Scheduler Load Simulator jar diff --git a/hadoop-tools/hadoop-streaming/pom.xml b/hadoop-tools/hadoop-streaming/pom.xml index d59d946fff7b4..9c4131429bac5 100644 --- a/hadoop-tools/hadoop-streaming/pom.xml +++ b/hadoop-tools/hadoop-streaming/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-streaming - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop MapReduce Streaming Apache Hadoop MapReduce Streaming jar diff --git a/hadoop-tools/hadoop-tools-dist/pom.xml b/hadoop-tools/hadoop-tools-dist/pom.xml index 9c1873ebe4ea2..c8a432644db2c 100644 --- a/hadoop-tools/hadoop-tools-dist/pom.xml +++ b/hadoop-tools/hadoop-tools-dist/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project-dist - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project-dist hadoop-tools-dist - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Tools Dist Apache Hadoop Tools Dist jar diff --git a/hadoop-tools/pom.xml b/hadoop-tools/pom.xml index 1ab291fefc5b9..d9ccc28cc247c 100644 --- a/hadoop-tools/pom.xml +++ b/hadoop-tools/pom.xml @@ -20,11 +20,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../hadoop-project hadoop-tools - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Tools Apache Hadoop Tools pom diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/pom.xml index 577cdd7016bf2..aea90043c5d2e 100644 --- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/pom.xml @@ -19,11 +19,11 @@ hadoop-yarn org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-api - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN API diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/pom.xml index 15e293a2248b9..5530082b826ab 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/pom.xml @@ -19,11 +19,11 @@ hadoop-yarn-applications org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-applications-distributedshell - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN DistributedShell diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-unmanaged-am-launcher/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-unmanaged-am-launcher/pom.xml index 4865106d43c72..4d47a324d7221 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-unmanaged-am-launcher/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-unmanaged-am-launcher/pom.xml @@ -19,11 +19,11 @@ hadoop-yarn-applications org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-applications-unmanaged-am-launcher - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN Unmanaged Am Launcher diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-api/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-api/pom.xml index 9a6821febaae9..cbbeb4f6081a9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-api/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-api/pom.xml @@ -19,7 +19,7 @@ org.apache.hadoop hadoop-yarn-services - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 hadoop-yarn-services-api Apache Hadoop YARN Services API diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/pom.xml index 7bbd1b803ce62..4fd8cc1fb887d 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/pom.xml @@ -19,7 +19,7 @@ org.apache.hadoop hadoop-yarn-services - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 hadoop-yarn-services-core jar diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/pom.xml index f3d9a4e2807f5..253eb0071b399 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/pom.xml @@ -19,7 +19,7 @@ hadoop-yarn-applications org.apache.hadoop - 3.2.3.3.2.3.3-2 + 
3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-services diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/pom.xml index 45747207d1fbb..ac1957b3072c8 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/pom.xml @@ -19,11 +19,11 @@ hadoop-yarn-applications org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-submarine - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Yet Another Learning Platform @@ -161,17 +161,17 @@ org.apache.hadoop hadoop-fs2img - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 org.apache.hadoop hadoop-yarn-services-api - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 org.apache.hadoop hadoop-yarn-services-core - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml index 527698fe5512b..8504237334234 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml @@ -19,11 +19,11 @@ hadoop-yarn org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-applications - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN Applications pom diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/pom.xml index c3063e7ec0efc..1e1a10af23533 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/pom.xml @@ -17,10 +17,10 @@ hadoop-yarn org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 hadoop-yarn-client - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN Client diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/pom.xml index 54c986c6899b0..3eccc75b08cf4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/pom.xml @@ -19,11 +19,11 @@ hadoop-yarn org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-common - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN Common diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-registry/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-registry/pom.xml index c5d50af3d02ff..a8d0012a374eb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-registry/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-registry/pom.xml @@ -19,11 +19,11 @@ hadoop-yarn org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-registry - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN Registry diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-applicationhistoryservice/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-applicationhistoryservice/pom.xml index f56d196ae4bf6..f147561022370 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-applicationhistoryservice/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-applicationhistoryservice/pom.xml @@ -22,11 +22,11 @@ hadoop-yarn-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-applicationhistoryservice - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN 
ApplicationHistoryService diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/pom.xml index 1e5ed36fa1d07..60f4008a342ed 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/pom.xml @@ -19,11 +19,11 @@ hadoop-yarn-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-common - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN Server Common diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/pom.xml index 234a09aa213b1..aaac903f0dea6 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/pom.xml @@ -19,11 +19,11 @@ hadoop-yarn-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-nodemanager - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN NodeManager diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml index f9e6534224c81..3da124fbf038c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml @@ -19,11 +19,11 @@ hadoop-yarn-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-resourcemanager - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN ResourceManager diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/pom.xml index c01e34197f5eb..19cecd52ada31 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/pom.xml @@ -19,12 +19,12 @@ hadoop-yarn-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 org.apache.hadoop hadoop-yarn-server-router - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN Router diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-sharedcachemanager/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-sharedcachemanager/pom.xml index bb8e8ddbe28d3..cd711a2683705 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-sharedcachemanager/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-sharedcachemanager/pom.xml @@ -17,10 +17,10 @@ hadoop-yarn-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 hadoop-yarn-server-sharedcachemanager - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN SharedCacheManager diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/pom.xml index 460e544d07f21..9760db16404c2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/pom.xml +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/pom.xml @@ -19,10 +19,10 @@ hadoop-yarn-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 hadoop-yarn-server-tests - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN Server Tests diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timeline-pluginstorage/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timeline-pluginstorage/pom.xml index 36818c7918c49..60623999ca66a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timeline-pluginstorage/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timeline-pluginstorage/pom.xml @@ -22,11 +22,11 @@ hadoop-yarn-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-timeline-pluginstorage - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN Timeline Plugin Storage diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase-tests/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase-tests/pom.xml index 67fc81e773052..659ea4a1d2f49 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase-tests/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase-tests/pom.xml @@ -22,11 +22,11 @@ hadoop-yarn-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-timelineservice-hbase-tests - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN TimelineService HBase tests diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-client/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-client/pom.xml index 11918dc0e09bb..50816361b243f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-client/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-client/pom.xml @@ -22,7 +22,7 @@ hadoop-yarn-server-timelineservice-hbase org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-timelineservice-hbase-client diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-common/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-common/pom.xml index 282b29a676844..47853735fffce 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-common/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-common/pom.xml @@ -22,13 +22,13 @@ hadoop-yarn-server-timelineservice-hbase org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-timelineservice-hbase-common Apache Hadoop YARN TimelineService HBase Common - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server/hadoop-yarn-server-timelineservice-hbase-server-1/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server/hadoop-yarn-server-timelineservice-hbase-server-1/pom.xml index a8e2d70ead3b9..6f910bb68b22a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server/hadoop-yarn-server-timelineservice-hbase-server-1/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server/hadoop-yarn-server-timelineservice-hbase-server-1/pom.xml @@ -22,13 +22,13 @@ hadoop-yarn-server-timelineservice-hbase-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-timelineservice-hbase-server-1 Apache Hadoop YARN TimelineService HBase Server 1.2 - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server/hadoop-yarn-server-timelineservice-hbase-server-2/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server/hadoop-yarn-server-timelineservice-hbase-server-2/pom.xml index be5a3ce9e5729..cec36ba4cc38c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server/hadoop-yarn-server-timelineservice-hbase-server-2/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server/hadoop-yarn-server-timelineservice-hbase-server-2/pom.xml @@ -22,13 +22,13 @@ hadoop-yarn-server-timelineservice-hbase-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-timelineservice-hbase-server-2 Apache Hadoop YARN TimelineService HBase Server 2.0 - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server/pom.xml index c71c09e5f1f6c..19e9f94aec35c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/hadoop-yarn-server-timelineservice-hbase-server/pom.xml @@ -22,12 +22,12 @@ hadoop-yarn-server-timelineservice-hbase org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-timelineservice-hbase-server - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN TimelineService HBase Servers pom diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/pom.xml index 26d84139de524..6dfe177459672 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/pom.xml +++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice-hbase/pom.xml @@ -22,12 +22,12 @@ hadoop-yarn-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-timelineservice-hbase - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN TimelineService HBase Backend pom diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/pom.xml index 67443f1dd0bbd..728b3716dc63c 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/pom.xml @@ -22,11 +22,11 @@ hadoop-yarn-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-timelineservice - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN Timeline Service diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-web-proxy/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-web-proxy/pom.xml index f4215871fc7f4..049eb076d8ef5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-web-proxy/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-web-proxy/pom.xml @@ -19,11 +19,11 @@ hadoop-yarn-server org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server-web-proxy - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN Web Proxy diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/pom.xml index 2a38b44ebdfea..888b70c991d01 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/pom.xml @@ -19,11 +19,11 @@ hadoop-yarn org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-server - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN Server pom diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/pom.xml index 16b31a414fac5..c44355a1c652e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/pom.xml @@ -19,11 +19,11 @@ hadoop-yarn org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-site - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN Site pom diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/pom.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/pom.xml index 2f45d8ef5f029..5ab7d52109403 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-ui/pom.xml @@ -20,11 +20,11 @@ hadoop-yarn org.apache.hadoop - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 4.0.0 hadoop-yarn-ui - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop YARN UI ${packagingType} diff --git a/hadoop-yarn-project/hadoop-yarn/pom.xml b/hadoop-yarn-project/hadoop-yarn/pom.xml index 52cddfdef1f2d..31c08c10f6186 100644 --- a/hadoop-yarn-project/hadoop-yarn/pom.xml +++ b/hadoop-yarn-project/hadoop-yarn/pom.xml @@ -16,11 +16,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../../hadoop-project hadoop-yarn - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 pom Apache Hadoop YARN diff --git a/hadoop-yarn-project/pom.xml b/hadoop-yarn-project/pom.xml index edc3beab8e95f..f4d96f734260a 100644 --- 
a/hadoop-yarn-project/pom.xml +++ b/hadoop-yarn-project/pom.xml @@ -18,11 +18,11 @@ org.apache.hadoop hadoop-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 ../hadoop-project hadoop-yarn-project - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 pom Apache Hadoop YARN Project https://hadoop.apache.org/yarn/ diff --git a/pom.xml b/pom.xml index 8c2bf39ccc78e..f86164bd6e723 100644 --- a/pom.xml +++ b/pom.xml @@ -18,7 +18,7 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/x 4.0.0 org.apache.hadoop hadoop-main - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 Apache Hadoop Main Apache Hadoop Main pom From dd69d2e9e0c08b985801fecb929a93dfffe6d80a Mon Sep 17 00:00:00 2001 From: Deepak Damri Date: Thu, 5 Dec 2024 18:39:36 +0530 Subject: [PATCH 22/40] fixup! ODP-3.2.3.3-3 version update. --- hadoop-project/pom.xml | 2 +- pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index 17b2f3f5c0502..c3837cd8b5620 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -95,7 +95,7 @@ ${hadoop-thirdparty-shaded-prefix}.protobuf ${hadoop-thirdparty-shaded-prefix}.com.google.common - 3.5.10.3.2.3.3-2 + 3.5.10.3.2.3.3-3 5.2.0 3.0.5 2.1.7 diff --git a/pom.xml b/pom.xml index f86164bd6e723..6a5ae4fc72c13 100644 --- a/pom.xml +++ b/pom.xml @@ -89,7 +89,7 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/x - 3.2.3.3.2.3.3-2 + 3.2.3.3.2.3.3-3 apache.snapshots.https Apache Development Snapshot Repository From b36e55c291ffd4296b0b5b5f00248c16edc9122d Mon Sep 17 00:00:00 2001 From: harshith gandhe Date: Tue, 10 Dec 2024 14:18:39 +0530 Subject: [PATCH 23/40] ODP-2768 : Refactor shebangs for python scripts to ambari-python-wrap --- dev-support/bin/checkcompatibility.py | 2 +- .../git_jira_fix_version_check.py | 2 +- .../client/cli/TestRunJobCliParsing.java | 20 +++++++++---------- .../yarnservice/TestYarnServiceRunJobCli.java | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dev-support/bin/checkcompatibility.py b/dev-support/bin/checkcompatibility.py index e8c0e26a712db..04e789f247e04 100755 --- a/dev-support/bin/checkcompatibility.py +++ b/dev-support/bin/checkcompatibility.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env ambari-python-wrap # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file diff --git a/dev-support/git-jira-validation/git_jira_fix_version_check.py b/dev-support/git-jira-validation/git_jira_fix_version_check.py index c2e12a13aae22..b74463711ac3e 100644 --- a/dev-support/git-jira-validation/git_jira_fix_version_check.py +++ b/dev-support/git-jira-validation/git_jira_fix_version_check.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env ambari-python-wrap ############################################################################ # # Licensed to the Apache Software Foundation (ASF) under one diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/TestRunJobCliParsing.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/TestRunJobCliParsing.java index 184d53d7a0116..335afe469e4d9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/TestRunJobCliParsing.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/TestRunJobCliParsing.java @@ -90,9 +90,9 @@ public void testBasicRunJobForDistributedTraining() throws Exception { new String[] { "--name", "my-job", "--docker_image", "tf-docker:1.1.0", "--input_path", "hdfs://input", "--checkpoint_path", "hdfs://output", "--num_workers", "3", "--num_ps", "2", "--worker_launch_cmd", - "python run-job.py", "--worker_resources", "memory=2048M,vcores=2", + "ambari-python-wrap run-job.py", "--worker_resources", "memory=2048M,vcores=2", "--ps_resources", "memory=4G,vcores=4", "--tensorboard", "true", - "--ps_launch_cmd", "python run-ps.py", "--keytab", "/keytab/path", + "--ps_launch_cmd", "ambari-python-wrap run-ps.py", "--keytab", "/keytab/path", "--principal", "user/_HOST@domain.com", "--distribute_keytab", "--verbose" }); @@ -101,11 +101,11 @@ public void testBasicRunJobForDistributedTraining() throws Exception { Assert.assertEquals(jobRunParameters.getInputPath(), "hdfs://input"); Assert.assertEquals(jobRunParameters.getCheckpointPath(), "hdfs://output"); Assert.assertEquals(jobRunParameters.getNumPS(), 2); - Assert.assertEquals(jobRunParameters.getPSLaunchCmd(), "python run-ps.py"); + Assert.assertEquals(jobRunParameters.getPSLaunchCmd(), "ambari-python-wrap run-ps.py"); Assert.assertEquals(Resources.createResource(4096, 4), jobRunParameters.getPsResource()); Assert.assertEquals(jobRunParameters.getWorkerLaunchCmd(), - "python run-job.py"); + "ambari-python-wrap run-job.py"); Assert.assertEquals(Resources.createResource(2048, 2), jobRunParameters.getWorkerResource()); Assert.assertEquals(jobRunParameters.getDockerImageName(), @@ -126,7 +126,7 @@ public void testBasicRunJobForSingleNodeTraining() throws Exception { runJobCli.run( new String[] { "--name", "my-job", "--docker_image", "tf-docker:1.1.0", "--input_path", "hdfs://input", "--checkpoint_path", "hdfs://output", - "--num_workers", "1", "--worker_launch_cmd", "python run-job.py", + "--num_workers", "1", "--worker_launch_cmd", "ambari-python-wrap run-job.py", "--worker_resources", "memory=4g,vcores=2", "--tensorboard", "true", "--verbose", "--wait_job_finish" }); @@ -136,7 +136,7 @@ public void testBasicRunJobForSingleNodeTraining() throws Exception { Assert.assertEquals(jobRunParameters.getCheckpointPath(), "hdfs://output"); Assert.assertEquals(jobRunParameters.getNumWorkers(), 
1); Assert.assertEquals(jobRunParameters.getWorkerLaunchCmd(), - "python run-job.py"); + "ambari-python-wrap run-job.py"); Assert.assertEquals(Resources.createResource(4096, 2), jobRunParameters.getWorkerResource()); Assert.assertTrue(SubmarineLogs.isVerbose()); @@ -152,7 +152,7 @@ public void testNoInputPathOptionSpecified() throws Exception { runJobCli.run( new String[]{"--name", "my-job", "--docker_image", "tf-docker:1.1.0", "--checkpoint_path", "hdfs://output", - "--num_workers", "1", "--worker_launch_cmd", "python run-job.py", + "--num_workers", "1", "--worker_launch_cmd", "ambari-python-wrap run-job.py", "--worker_resources", "memory=4g,vcores=2", "--tensorboard", "true", "--verbose", "--wait_job_finish"}); } catch (ParseException e) { @@ -190,14 +190,14 @@ public void testLaunchCommandPatternReplace() throws Exception { new String[] { "--name", "my-job", "--docker_image", "tf-docker:1.1.0", "--input_path", "hdfs://input", "--checkpoint_path", "hdfs://output", "--num_workers", "3", "--num_ps", "2", "--worker_launch_cmd", - "python run-job.py --input=%input_path% --model_dir=%checkpoint_path% --export_dir=%saved_model_path%/savedmodel", + "ambari-python-wrap run-job.py --input=%input_path% --model_dir=%checkpoint_path% --export_dir=%saved_model_path%/savedmodel", "--worker_resources", "memory=2048,vcores=2", "--ps_resources", "memory=4096,vcores=4", "--tensorboard", "true", "--ps_launch_cmd", - "python run-ps.py --input=%input_path% --model_dir=%checkpoint_path%/model", + "ambari-python-wrap run-ps.py --input=%input_path% --model_dir=%checkpoint_path%/model", "--verbose" }); Assert.assertEquals( - "python run-job.py --input=hdfs://input --model_dir=hdfs://output " + "ambari-python-wrap run-job.py --input=hdfs://input --model_dir=hdfs://output " + "--export_dir=hdfs://output/savedmodel", runJobCli.getRunJobParameters().getWorkerLaunchCmd()); Assert.assertEquals( diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/yarnservice/TestYarnServiceRunJobCli.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/yarnservice/TestYarnServiceRunJobCli.java index f3d140975febd..ff3405e4dcf97 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/yarnservice/TestYarnServiceRunJobCli.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/test/java/org/apache/hadoop/yarn/submarine/client/cli/yarnservice/TestYarnServiceRunJobCli.java @@ -152,7 +152,7 @@ public void testBasicRunJobForDistributedTraining() throws Exception { new String[]{"--name", "my-job", "--docker_image", "tf-docker:1.1.0", "--input_path", "s3://input", "--checkpoint_path", "s3://output", "--num_workers", "3", "--num_ps", "2", "--worker_launch_cmd", - "python run-job.py", "--worker_resources", "memory=2048M,vcores=2", + "ambari-python-wrap run-job.py", "--worker_resources", "memory=2048M,vcores=2", "--ps_resources", "memory=4096M,vcores=4", "--ps_docker_image", "ps.image", "--worker_docker_image", "worker.image", "--ps_launch_cmd", "python run-ps.py", "--verbose"}); From 4f924c262aa239abd83ccc0b99508ba2507ed12b Mon Sep 17 00:00:00 2001 From: Sourabh Dilraj Date: Tue, 17 Dec 2024 17:58:19 +0530 Subject: [PATCH 24/40] ODP-2831 : Use CMAKE_CXX_STANDARD instead of -std=c++11 directly --- 
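The TestRunJobCliParsing cases above continue to exercise Submarine's launch-command placeholder expansion (%input_path%, %checkpoint_path%, %saved_model_path%) with the new ambari-python-wrap commands. The following is only a minimal sketch of that kind of substitution, for illustration; the class and method names are made up and this is not Submarine's actual implementation.

import java.util.HashMap;
import java.util.Map;

public class LaunchCmdExpansionSketch {
  // Expand %name% placeholders in a launch command; purely illustrative.
  static String expand(String template, Map<String, String> vars) {
    String result = template;
    for (Map.Entry<String, String> e : vars.entrySet()) {
      result = result.replace("%" + e.getKey() + "%", e.getValue());
    }
    return result;
  }

  public static void main(String[] args) {
    Map<String, String> vars = new HashMap<>();
    vars.put("input_path", "hdfs://input");
    vars.put("checkpoint_path", "hdfs://output");
    vars.put("saved_model_path", "hdfs://output");
    String cmd = expand(
        "ambari-python-wrap run-job.py --input=%input_path%"
            + " --model_dir=%checkpoint_path%"
            + " --export_dir=%saved_model_path%/savedmodel", vars);
    // Prints the same command the pattern-replace test above expects.
    System.out.println(cmd);
  }
}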
.../hadoop-mapreduce-client-nativetask/src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/CMakeLists.txt b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/CMakeLists.txt index ae3b9c6029e57..4c32838afb0b4 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/CMakeLists.txt +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/CMakeLists.txt @@ -27,6 +27,7 @@ set(GTEST_SRC_DIR ${CMAKE_SOURCE_DIR}/../../../../hadoop-common-project/hadoop-c # Add extra compiler and linker flags. # -Wno-sign-compare hadoop_add_compiler_flags("-DNDEBUG -DSIMPLE_MEMCPY -fno-strict-aliasing -fsigned-char") +set(CMAKE_CXX_STANDARD 11) # Source location. set(SRC main/native) From 175aabda9e1261cb1a279d3f1aec47a62836ae7d Mon Sep 17 00:00:00 2001 From: Takanobu Asanuma Date: Thu, 11 Oct 2018 10:21:51 +0530 Subject: [PATCH 25/40] HADOOP-15785. [JDK10] Javadoc build fails on JDK 10 in hadoop-common. Contributed by Dinesh Chitlangia. --- .../org/apache/hadoop/conf/Configurable.java | 10 +++- .../org/apache/hadoop/conf/Configuration.java | 45 ++++++++-------- .../hadoop/conf/ConfigurationWithLogging.java | 14 ++--- .../org/apache/hadoop/crypto/CryptoCodec.java | 7 +-- .../hadoop/crypto/CryptoInputStream.java | 4 +- .../hadoop/crypto/CryptoOutputStream.java | 4 +- .../org/apache/hadoop/crypto/Decryptor.java | 14 ++--- .../org/apache/hadoop/crypto/Encryptor.java | 14 ++--- .../apache/hadoop/crypto/OpensslCipher.java | 18 +++---- .../crypto/key/JavaKeyStoreProvider.java | 15 +++--- .../apache/hadoop/crypto/key/KeyProvider.java | 6 +-- .../key/KeyProviderCryptoExtension.java | 9 ++-- .../KeyProviderDelegationTokenExtension.java | 2 +- .../crypto/key/kms/KMSClientProvider.java | 6 +-- .../hadoop/crypto/key/kms/ValueQueue.java | 2 +- .../crypto/random/OpensslSecureRandom.java | 10 ++-- .../apache/hadoop/fs/AbstractFileSystem.java | 45 +++++++++------- .../hadoop/fs/BufferedFSInputStream.java | 4 +- .../apache/hadoop/fs/ByteBufferReadable.java | 8 +-- .../apache/hadoop/fs/ChecksumFileSystem.java | 2 +- .../java/org/apache/hadoop/fs/ChecksumFs.java | 2 +- .../hadoop/fs/CommonConfigurationKeys.java | 2 +- .../java/org/apache/hadoop/fs/CreateFlag.java | 2 +- .../org/apache/hadoop/fs/FSInputChecker.java | 2 +- .../org/apache/hadoop/fs/FileContext.java | 51 ++++++++++--------- .../java/org/apache/hadoop/fs/FileSystem.java | 11 ++-- .../java/org/apache/hadoop/fs/FileUtil.java | 4 +- .../org/apache/hadoop/fs/HarFileSystem.java | 2 +- .../fs/HasEnhancedByteBufferAccess.java | 15 +++--- .../apache/hadoop/fs/LocalDirAllocator.java | 3 +- .../org/apache/hadoop/fs/LocalFileSystem.java | 2 +- .../java/org/apache/hadoop/fs/Options.java | 2 +- .../java/org/apache/hadoop/fs/QuotaUsage.java | 8 +-- .../apache/hadoop/fs/ftp/FTPFileSystem.java | 6 +-- .../apache/hadoop/fs/ftp/FtpConfigKeys.java | 1 + .../hadoop/fs/local/LocalConfigKeys.java | 1 + .../hadoop/fs/permission/AclStatus.java | 2 +- .../apache/hadoop/fs/permission/AclUtil.java | 10 ++-- .../fs/permission/ScopedAclEntries.java | 14 ++--- .../org/apache/hadoop/fs/shell/Command.java | 24 ++++----- .../apache/hadoop/fs/shell/CommandFormat.java | 2 +- .../org/apache/hadoop/fs/viewfs/ViewFs.java | 12 ++--- .../hadoop/ha/ActiveStandbyElector.java | 32 ++++++------ .../java/org/apache/hadoop/ha/HAAdmin.java | 2 +- .../java/org/apache/hadoop/ha/NodeFencer.java | 2 +- 
.../apache/hadoop/ha/SshFenceByTcpPort.java | 2 +- .../org/apache/hadoop/http/HttpServer2.java | 10 ++-- .../org/apache/hadoop/io/EnumSetWritable.java | 2 +- .../java/org/apache/hadoop/io/IOUtils.java | 2 +- .../org/apache/hadoop/io/ReadaheadPool.java | 2 +- .../org/apache/hadoop/io/SecureIOUtils.java | 4 +- .../org/apache/hadoop/io/SequenceFile.java | 16 +++--- .../java/org/apache/hadoop/io/Writable.java | 4 +- .../apache/hadoop/io/WritableComparable.java | 7 +-- .../org/apache/hadoop/io/WritableUtils.java | 6 ++- .../io/compress/CompressionCodecFactory.java | 8 +-- .../apache/hadoop/io/compress/Lz4Codec.java | 4 +- .../hadoop/io/compress/SnappyCodec.java | 2 +- .../io/compress/bzip2/Bzip2Compressor.java | 2 +- .../io/compress/bzip2/Bzip2Decompressor.java | 6 +-- .../io/compress/bzip2/Bzip2Factory.java | 4 +- .../io/compress/bzip2/CBZip2InputStream.java | 26 +++++----- .../io/compress/bzip2/CBZip2OutputStream.java | 7 +-- .../zlib/BuiltInGzipDecompressor.java | 8 +-- .../io/compress/zlib/ZlibCompressor.java | 2 +- .../io/compress/zlib/ZlibDecompressor.java | 6 +-- .../hadoop/io/compress/zlib/ZlibFactory.java | 4 +- .../hadoop/io/erasurecode/CodecUtil.java | 2 +- .../rawcoder/util/GaloisField.java | 2 +- .../apache/hadoop/io/file/tfile/TFile.java | 8 +-- .../apache/hadoop/io/file/tfile/Utils.java | 48 +++++++++-------- .../apache/hadoop/io/retry/RetryProxy.java | 4 +- .../hadoop/io/serializer/Deserializer.java | 2 +- .../hadoop/io/serializer/Serializer.java | 2 +- .../org/apache/hadoop/ipc/CallerContext.java | 2 +- .../org/apache/hadoop/ipc/ClientCache.java | 6 +-- .../apache/hadoop/ipc/DecayRpcScheduler.java | 3 +- .../org/apache/hadoop/ipc/RefreshHandler.java | 1 - .../apache/hadoop/ipc/RemoteException.java | 2 +- .../java/org/apache/hadoop/ipc/Server.java | 2 +- .../org/apache/hadoop/jmx/JMXJsonServlet.java | 6 +-- .../hadoop/log/LogThrottlingHelper.java | 8 +-- .../metrics2/lib/MutableRollingAverages.java | 2 +- .../apache/hadoop/metrics2/package-info.java | 3 ++ .../metrics2/sink/RollingFileSystemSink.java | 2 +- .../hadoop/metrics2/sink/StatsDSink.java | 4 +- .../apache/hadoop/metrics2/util/MBeans.java | 10 ++-- .../net/AbstractDNSToSwitchMapping.java | 8 +-- .../main/java/org/apache/hadoop/net/DNS.java | 2 +- .../apache/hadoop/net/DNSToSwitchMapping.java | 2 +- .../java/org/apache/hadoop/net/NetUtils.java | 14 ++--- .../apache/hadoop/net/NetworkTopology.java | 12 ++--- .../net/NetworkTopologyWithNodeGroup.java | 4 +- .../apache/hadoop/net/ScriptBasedMapping.java | 8 +-- .../net/ScriptBasedMappingWithDependency.java | 7 ++- .../apache/hadoop/net/SocketOutputStream.java | 6 +-- .../AuthenticationFilterInitializer.java | 6 +-- .../security/IdMappingServiceProvider.java | 5 +- .../apache/hadoop/security/SaslRpcClient.java | 10 ++-- .../apache/hadoop/security/SecurityUtil.java | 3 +- .../hadoop/security/UserGroupInformation.java | 2 +- .../security/alias/JavaKeyStoreProvider.java | 6 +-- .../alias/LocalJavaKeyStoreProvider.java | 3 +- .../hadoop/security/authorize/ProxyUsers.java | 4 +- .../ssl/FileBasedKeyStoresFactory.java | 4 +- .../hadoop/security/ssl/SSLFactory.java | 4 +- .../security/ssl/SSLHostnameVerifier.java | 20 ++++---- .../web/DelegationTokenAuthenticatedURL.java | 14 ++--- .../DelegationTokenAuthenticationFilter.java | 6 +-- .../DelegationTokenAuthenticationHandler.java | 4 +- .../web/DelegationTokenManager.java | 2 +- ...sDelegationTokenAuthenticationHandler.java | 2 +- .../KerberosDelegationTokenAuthenticator.java | 2 +- ...eDelegationTokenAuthenticationHandler.java | 3 
+- ...oDelegationTokenAuthenticationHandler.java | 2 +- .../PseudoDelegationTokenAuthenticator.java | 2 +- .../hadoop/service/ServiceOperations.java | 4 +- .../service/launcher/ServiceLauncher.java | 2 +- .../hadoop/service/launcher/package-info.java | 2 +- .../org/apache/hadoop/util/ClassUtil.java | 1 - .../apache/hadoop/util/ComparableVersion.java | 2 +- .../org/apache/hadoop/util/FindClass.java | 4 +- .../hadoop/util/GenericOptionsParser.java | 21 ++++---- .../hadoop/util/HttpExceptionUtils.java | 6 +-- .../apache/hadoop/util/JsonSerialization.java | 2 +- .../apache/hadoop/util/LightWeightCache.java | 10 ++-- .../org/apache/hadoop/util/LineReader.java | 3 -- .../hadoop/util/ShutdownHookManager.java | 2 +- .../hadoop/util/ShutdownThreadsHelper.java | 2 - .../org/apache/hadoop/util/StringUtils.java | 6 +-- .../java/org/apache/hadoop/util/Tool.java | 2 +- .../java/org/apache/hadoop/util/ZKUtil.java | 4 +- .../hadoop/util/bloom/DynamicBloomFilter.java | 5 +- .../hadoop/util/concurrent/AsyncGet.java | 2 +- .../apache/hadoop/util/hash/JenkinsHash.java | 4 +- .../apache/hadoop/cli/util/CLICommand.java | 6 +-- .../apache/hadoop/cli/util/CLICommandFS.java | 4 +- .../hadoop/cli/util/CLICommandTypes.java | 6 +-- .../rawcoder/RawErasureCoderBenchmark.java | 4 +- .../io/retry/TestDefaultRetryPolicy.java | 4 +- .../org/apache/hadoop/net/StaticMapping.java | 4 +- .../hadoop/tracing/SetSpanReceiver.java | 2 +- 142 files changed, 510 insertions(+), 477 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/Configurable.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/Configurable.java index d847f29ed8509..5816039bc6cbe 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/Configurable.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/Configurable.java @@ -26,9 +26,15 @@ @InterfaceStability.Stable public interface Configurable { - /** Set the configuration to be used by this object. */ + /** + * Set the configuration to be used by this object. + * @param conf configuration to be used + */ void setConf(Configuration conf); - /** Return the configuration used by this object. */ + /** + * Return the configuration used by this object. + * @return Configuration + */ Configuration getConf(); } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/Configuration.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/Configuration.java index 215627fd03d98..9bdd28a0ae9dd 100755 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/Configuration.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/Configuration.java @@ -115,7 +115,7 @@ /** * Provides access to configuration parameters. * - *
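The Configuration hunks above amend the javadoc of its typed accessors (setEnum/getEnum, getTimeDuration, getPassword). A small usage sketch of those accessors follows; the key names are made up for illustration and default credential-provider settings are assumed.

import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;

public class ConfigurationAccessorsSketch {
  enum Compression { NONE, GZIP, SNAPPY }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // setEnum/getEnum store and parse an enum by its toString() form.
    conf.setEnum("example.codec", Compression.SNAPPY);
    Compression codec = conf.getEnum("example.codec", Compression.NONE);

    // getTimeDuration converts the stored value to the requested unit;
    // the default is returned as given when the key is unset.
    long heartbeatMs = conf.getTimeDuration(
        "example.heartbeat.interval", 3000, TimeUnit.MILLISECONDS);

    // getPassword consults configured CredentialProviders first and then
    // conditionally falls back to the plain configuration value.
    char[] secret = conf.getPassword("example.store.password");

    System.out.println(codec + " " + heartbeatMs + " " + (secret == null));
  }
}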

Resources

+ *

Resources

* *

Configurations are specified by resources. A resource contains a set of * name/value pairs as XML data. Each resource is named by either a @@ -141,12 +141,12 @@ * Once a resource declares a value final, no subsequently-loaded * resource can alter that value. * For example, one might define a final parameter with: - *

+ * 

  *  <property>
  *    <name>dfs.hosts.include</name>
  *    <value>/etc/hadoop/conf/hosts.include</value>
  *    <final>true</final>
- *  </property>
+ * </property>
* * Administrators typically define parameters as final in * core-site.xml for values that user applications may not alter. @@ -164,7 +164,7 @@ * *

For example, if a configuration resource contains the following property * definitions: - *

+ * 

  *  <property>
  *    <name>basedir</name>
  *    <value>/user/${user.name}</value>
@@ -179,7 +179,7 @@
  *    <name>otherdir</name>
  *    <value>${env.BASE_DIR}/other</value>
  *  </property>
- *  
+ *
* *

When conf.get("tempdir") is called, then ${basedir} * will be resolved to another property in this Configuration, while @@ -203,7 +203,7 @@ * can define there own custom tags in hadoop.tags.custom property. * *

For example, we can tag existing property as: - *

+ * 

  *  <property>
  *    <name>dfs.replication</name>
  *    <value>3</value>
@@ -215,7 +215,7 @@
  *    <value>3</value>
  *    <tag>HDFS,SECURITY</tag>
  *  </property>
- * 
+ *
*

Properties marked with tags can be retrieved with conf * .getAllPropertiesByTag("HDFS") or conf.getAllPropertiesByTags * (Arrays.asList("YARN","SECURITY")).

@@ -581,9 +581,9 @@ public static void addDeprecations(DeprecationDelta[] deltas) { * If you have multiple deprecation entries to add, it is more efficient to * use #addDeprecations(DeprecationDelta[] deltas) instead. * - * @param key - * @param newKeys - * @param customMessage + * @param key to be deprecated + * @param newKeys list of keys that take up the values of deprecated key + * @param customMessage depcrication message * @deprecated use {@link #addDeprecation(String key, String newKey, String customMessage)} instead */ @@ -605,9 +605,9 @@ public static void addDeprecation(String key, String[] newKeys, * If you have multiple deprecation entries to add, it is more efficient to * use #addDeprecations(DeprecationDelta[] deltas) instead. * - * @param key - * @param newKey - * @param customMessage + * @param key to be deprecated + * @param newKey key that take up the values of deprecated key + * @param customMessage deprecation message */ public static void addDeprecation(String key, String newKey, String customMessage) { @@ -1404,6 +1404,7 @@ void logDeprecationOnce(String name, String source) { /** * Unset a previously set property. + * @param name the property name */ public synchronized void unset(String name) { String[] names = null; @@ -1693,6 +1694,7 @@ public void setBooleanIfUnset(String name, boolean value) { * is equivalent to set(<name>, value.toString()). * @param name property name * @param value new value + * @param enumeration type */ public > void setEnum(String name, T value) { set(name, value.toString()); @@ -1703,8 +1705,10 @@ public > void setEnum(String name, T value) { * Note that the returned value is trimmed by this method. * @param name Property name * @param defaultValue Value returned if no mapping exists + * @param enumeration type * @throws IllegalArgumentException If mapping is illegal for the type * provided + * @return enumeration type */ public > T getEnum(String name, T defaultValue) { final String val = getTrimmed(name); @@ -1784,6 +1788,7 @@ public void setTimeDuration(String name, long value, TimeUnit unit) { * @param unit Unit to convert the stored property, if it exists. * @throws NumberFormatException If the property stripped of its unit is not * a number + * @return time duration in given time unit */ public long getTimeDuration(String name, long defaultValue, TimeUnit unit) { return getTimeDuration(name, defaultValue, unit, unit); @@ -2315,6 +2320,7 @@ public void setStrings(String name, String... values) { * the CredentialProvider API and conditionally fallsback to config. * @param name property name * @return password + * @throws IOException when error in fetching password */ public char[] getPassword(String name) throws IOException { char[] pass = null; @@ -2374,7 +2380,7 @@ private CredentialEntry getCredentialEntry(CredentialProvider provider, * alias. * @param name alias of the provisioned credential * @return password or null if not found - * @throws IOException + * @throws IOException when error in fetching password */ public char[] getPasswordFromCredentialProviders(String name) throws IOException { @@ -3483,25 +3489,23 @@ public void writeXml(@Nullable Writer out) throws IOException { /** * Write out the non-default properties in this configuration to the * given {@link Writer}. - * + *
    *
  • * When property name is not empty and the property exists in the * configuration, this method writes the property and its attributes * to the {@link Writer}. *
  • - *

    * *

  • * When property name is null or empty, this method writes all the * configuration properties and their attributes to the {@link Writer}. *
  • - *

    * *

  • * When property name is not empty but the property doesn't exist in * the configuration, this method throws an {@link IllegalArgumentException}. *
  • - *

    + *

* @param out the writer to write to. */ public void writeXml(@Nullable String propertyName, Writer out) @@ -3611,7 +3615,7 @@ private synchronized void appendXMLProperty(Document doc, Element conf, /** * Writes properties and their attributes (final and resource) * to the given {@link Writer}. - * + *
    *
  • * When propertyName is not empty, and the property exists * in the configuration, the format of the output would be, @@ -3651,6 +3655,7 @@ private synchronized void appendXMLProperty(Document doc, Element conf, * found in the configuration, this method will throw an * {@link IllegalArgumentException}. *
  • + *
*

* @param config the configuration * @param propertyName property name @@ -3849,7 +3854,7 @@ public void write(DataOutput out) throws IOException { /** * get keys matching the the regex * @param regex - * @return Map with matching keys + * @return {@literal Map} with matching keys */ public Map getValByRegex(String regex) { Pattern p = Pattern.compile(regex); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/ConfigurationWithLogging.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/ConfigurationWithLogging.java index 8a5e05462f8a5..68c51725e17fe 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/ConfigurationWithLogging.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/conf/ConfigurationWithLogging.java @@ -41,7 +41,7 @@ public ConfigurationWithLogging(Configuration conf) { } /** - * @see Configuration#get(String). + * See {@link Configuration#get(String)}. */ @Override public String get(String name) { @@ -51,7 +51,7 @@ public String get(String name) { } /** - * @see Configuration#get(String, String). + * See {@link Configuration#get(String, String)}. */ @Override public String get(String name, String defaultValue) { @@ -62,7 +62,7 @@ public String get(String name, String defaultValue) { } /** - * @see Configuration#getBoolean(String, boolean). + * See {@link Configuration#getBoolean(String, boolean)}. */ @Override public boolean getBoolean(String name, boolean defaultValue) { @@ -72,7 +72,7 @@ public boolean getBoolean(String name, boolean defaultValue) { } /** - * @see Configuration#getFloat(String, float). + * See {@link Configuration#getFloat(String, float)}. */ @Override public float getFloat(String name, float defaultValue) { @@ -82,7 +82,7 @@ public float getFloat(String name, float defaultValue) { } /** - * @see Configuration#getInt(String, int). + * See {@link Configuration#getInt(String, int)}. */ @Override public int getInt(String name, int defaultValue) { @@ -92,7 +92,7 @@ public int getInt(String name, int defaultValue) { } /** - * @see Configuration#getLong(String, long). + * See {@link Configuration#getLong(String, long)}. */ @Override public long getLong(String name, long defaultValue) { @@ -102,7 +102,7 @@ public long getLong(String name, long defaultValue) { } /** - * @see Configuration#set(String, String, String). + * See {@link Configuration#set(String, String, String)}. */ @Override public void set(String name, String value, String source) { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoCodec.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoCodec.java index d9c16bbc7a41e..bcf4a65ec24d4 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoCodec.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoCodec.java @@ -158,14 +158,15 @@ private static List> getCodecClasses( * For example a {@link javax.crypto.Cipher} will maintain its encryption * context internally when we do encryption/decryption using the * Cipher#update interface. - *

+ *

* Encryption/Decryption is not always on the entire file. For example, * in Hadoop, a node may only decrypt a portion of a file (i.e. a split). * In these situations, the counter is derived from the file position. - *

+ *

* The IV can be calculated by combining the initial IV and the counter with * a lossless operation (concatenation, addition, or XOR). - * @see http://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_.28CTR.29 + * See http://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_ + * .28CTR.29 * * @param initIV initial IV * @param counter counter for input stream position diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java index 2603ae342a2ad..5b706da3fef78 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoInputStream.java @@ -54,10 +54,10 @@ * required in order to ensure that the plain text and cipher text have a 1:1 * mapping. The decryption is buffer based. The key points of the decryption * are (1) calculating the counter and (2) padding through stream position: - *

+ *

* counter = base + pos/(algorithm blocksize); * padding = pos%(algorithm blocksize); - *
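A sketch of the counter/padding arithmetic described above, assuming the 16-byte AES block size; the helper class and method names are illustrative and are not part of the codec itself.

// Illustrative arithmetic only; the AES/CTR codec keeps this bookkeeping
// internally rather than exposing helpers like these.
public class CtrPositionSketch {
  static final int BLOCK_SIZE = 16;   // AES block size in bytes (assumption)

  // Number of whole cipher blocks before the stream position.
  static long counterFor(long base, long pos) {
    return base + pos / BLOCK_SIZE;
  }

  // Offset of the stream position inside its cipher block.
  static int paddingFor(long pos) {
    return (int) (pos % BLOCK_SIZE);
  }

  public static void main(String[] args) {
    long pos = 1000;                  // arbitrary file offset
    System.out.println(counterFor(0, pos) + " / " + paddingFor(pos)); // 62 / 8
  }
}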

+ *

* The underlying stream offset is maintained as state. */ @InterfaceAudience.Private diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoOutputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoOutputStream.java index 2f347c5816b2b..8d11043937612 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoOutputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoOutputStream.java @@ -36,10 +36,10 @@ * required in order to ensure that the plain text and cipher text have a 1:1 * mapping. The encryption is buffer based. The key points of the encryption are * (1) calculating counter and (2) padding through stream position. - *

+ *

* counter = base + pos/(algorithm blocksize); * padding = pos%(algorithm blocksize); - *

+ *

* The underlying stream offset is maintained as state. * * Note that while some of this class' methods are synchronized, this is just to diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/Decryptor.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/Decryptor.java index 9958415ebd237..7556f18d6dee2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/Decryptor.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/Decryptor.java @@ -38,7 +38,7 @@ public interface Decryptor { /** * Indicate whether the decryption context is reset. - *

+ *

* Certain modes, like CTR, require a different IV depending on the * position in the stream. Generally, the decryptor maintains any necessary * context for calculating the IV and counter so that no reinit is necessary @@ -49,22 +49,22 @@ public interface Decryptor { /** * This presents a direct interface decrypting with direct ByteBuffers. - *

+ *

* This function does not always decrypt the entire buffer and may potentially * need to be called multiple times to process an entire buffer. The object * may hold the decryption context internally. - *

+ *

* Some implementations may require sufficient space in the destination * buffer to decrypt the entire input buffer. - *

+ *

* Upon return, inBuffer.position() will be advanced by the number of bytes * read and outBuffer.position() by bytes written. Implementations should * not modify inBuffer.limit() and outBuffer.limit(). - *

+ *

* @param inBuffer a direct {@link ByteBuffer} to read from. inBuffer may - * not be null and inBuffer.remaining() must be > 0 + * not be null and inBuffer.remaining() must be {@literal >} 0 * @param outBuffer a direct {@link ByteBuffer} to write to. outBuffer may - * not be null and outBuffer.remaining() must be > 0 + * not be null and outBuffer.remaining() must be {@literal >} 0 * @throws IOException if decryption fails */ public void decrypt(ByteBuffer inBuffer, ByteBuffer outBuffer) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/Encryptor.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/Encryptor.java index 6dc3cfbe38f07..faeb176bf9de3 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/Encryptor.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/Encryptor.java @@ -37,7 +37,7 @@ public interface Encryptor { /** * Indicate whether the encryption context is reset. - *

+ *

* Certain modes, like CTR, require a different IV depending on the * position in the stream. Generally, the encryptor maintains any necessary * context for calculating the IV and counter so that no reinit is necessary @@ -48,22 +48,22 @@ public interface Encryptor { /** * This presents a direct interface encrypting with direct ByteBuffers. - *

+ *

* This function does not always encrypt the entire buffer and may potentially * need to be called multiple times to process an entire buffer. The object * may hold the encryption context internally. - *

+ *

* Some implementations may require sufficient space in the destination * buffer to encrypt the entire input buffer. - *

+ *

* Upon return, inBuffer.position() will be advanced by the number of bytes * read and outBuffer.position() by bytes written. Implementations should * not modify inBuffer.limit() and outBuffer.limit(). - *

+ *

* @param inBuffer a direct {@link ByteBuffer} to read from. inBuffer may - * not be null and inBuffer.remaining() must be > 0 + * not be null and inBuffer.remaining() must be > 0 * @param outBuffer a direct {@link ByteBuffer} to write to. outBuffer may - * not be null and outBuffer.remaining() must be > 0 + * not be null and outBuffer.remaining() must be > 0 * @throws IOException if encryption fails */ public void encrypt(ByteBuffer inBuffer, ByteBuffer outBuffer) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/OpensslCipher.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/OpensslCipher.java index 133a9f9110216..0a2ba52e555e5 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/OpensslCipher.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/OpensslCipher.java @@ -107,12 +107,12 @@ private OpensslCipher(long context, int alg, int padding) { } /** - * Return an OpensslCipher object that implements the specified + * Return an OpensslCipher object that implements the specified * transformation. * * @param transformation the name of the transformation, e.g., * AES/CTR/NoPadding. - * @return OpensslCipher an OpensslCipher object + * @return OpensslCipher an OpensslCipher object * @throws NoSuchAlgorithmException if transformation is null, * empty, in an invalid format, or if Openssl doesn't implement the * specified algorithm. @@ -181,18 +181,18 @@ public void init(int mode, byte[] key, byte[] iv) { /** * Continues a multiple-part encryption or decryption operation. The data * is encrypted or decrypted, depending on how this cipher was initialized. - *

+ *

* * All input.remaining() bytes starting at * input.position() are processed. The result is stored in * the output buffer. - *

+ *

* * Upon return, the input buffer's position will be equal to its limit; * its limit will not have changed. The output buffer's position will have * advanced by n, when n is the value returned by this method; the output * buffer's limit will not have changed. - *

+ *

* * If output.remaining() bytes are insufficient to hold the * result, a ShortBufferException is thrown. @@ -218,21 +218,21 @@ public int update(ByteBuffer input, ByteBuffer output) /** * Finishes a multiple-part operation. The data is encrypted or decrypted, * depending on how this cipher was initialized. - *

+ *

* * The result is stored in the output buffer. Upon return, the output buffer's * position will have advanced by n, where n is the value returned by this * method; the output buffer's limit will not have changed. - *

+ *

* * If output.remaining() bytes are insufficient to hold the result, * a ShortBufferException is thrown. - *

+ *

* * Upon finishing, this method resets this cipher object to the state it was * in when previously initialized. That is, the object is available to encrypt * or decrypt more data. - *

+ *

* * If any exception is thrown, this cipher object need to be reset before it * can be used again. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/JavaKeyStoreProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/JavaKeyStoreProvider.java index 5beda0d2d2eb8..7951af56bc8f9 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/JavaKeyStoreProvider.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/JavaKeyStoreProvider.java @@ -62,23 +62,24 @@ /** * KeyProvider based on Java's KeyStore file format. The file may be stored in * any Hadoop FileSystem using the following name mangling: - * jks://hdfs@nn1.example.com/my/keys.jks -> hdfs://nn1.example.com/my/keys.jks - * jks://file/home/owen/keys.jks -> file:///home/owen/keys.jks - *

+ * jks://hdfs@nn1.example.com/my/keys.jks {@literal ->} + * hdfs://nn1.example.com/my/keys.jks + * jks://file/home/owen/keys.jks {@literal ->} file:///home/owen/keys.jks + *

* If the HADOOP_KEYSTORE_PASSWORD environment variable is set, * its value is used as the password for the keystore. - *

+ *

* If the HADOOP_KEYSTORE_PASSWORD environment variable is not set, * the password for the keystore is read from file specified in the * {@link #KEYSTORE_PASSWORD_FILE_KEY} configuration property. The password file * is looked up in Hadoop's configuration directory via the classpath. - *

+ *

* NOTE: Make sure the password in the password file does not have an * ENTER at the end, else it won't be valid for the Java KeyStore. - *

+ *

* If the environment variable, nor the property are not set, the password used * is 'none'. - *

+ *

* It is expected for encrypted InputFormats and OutputFormats to copy the keys * from the original provider into the job's Credentials object, which is * accessed via the UserProvider. Therefore, this provider won't be used by diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProvider.java index 9985efa90b991..a8c283ab649cc 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProvider.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProvider.java @@ -50,7 +50,7 @@ * abstraction to separate key storage from users of encryption. It * is intended to support getting or storing keys in a variety of ways, * including third party bindings. - *

+ *

* KeyProvider implementations must be thread safe. */ @InterfaceAudience.Public @@ -550,7 +550,7 @@ protected byte[] generateKey(int size, String algorithm) /** * Create a new key generating the material for it. * The given key must not already exist. - *
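A small usage sketch of the KeyProvider calls whose javadoc is touched here (createKey generating its own material, rollNewVersion, flush). The jks:// path and key name are placeholders, and the provider-path configuration key is assumed to be KeyProviderFactory.KEY_PROVIDER_PATH as in hadoop-common.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.crypto.key.KeyProvider;
import org.apache.hadoop.crypto.key.KeyProviderFactory;

public class KeyProviderUsageSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // jks://file/tmp/example-keys.jks is mangled to file:///tmp/example-keys.jks.
    conf.set(KeyProviderFactory.KEY_PROVIDER_PATH,
        "jks://file/tmp/example-keys.jks");          // placeholder store
    KeyProvider provider = KeyProviderFactory.getProviders(conf).get(0);

    // createKey(String, Options) generates the key material itself.
    KeyProvider.KeyVersion v0 =
        provider.createKey("example-key", KeyProvider.options(conf));

    // rollNewVersion(String) likewise generates material for the new version.
    KeyProvider.KeyVersion v1 = provider.rollNewVersion("example-key");

    provider.flush();
    System.out.println(v0.getVersionName() + " -> " + v1.getVersionName());
  }
}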

+ *

* This implementation generates the key material and calls the * {@link #createKey(String, byte[], Options)} method. * @@ -594,7 +594,7 @@ public void close() throws IOException { /** * Roll a new version of the given key generating the material for it. - *

+ *

* This implementation generates the key material and calls the * {@link #rollNewVersion(String, byte[])} method. * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProviderCryptoExtension.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProviderCryptoExtension.java index 3ee3bd756e253..00d7a7dfce0f7 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProviderCryptoExtension.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProviderCryptoExtension.java @@ -149,7 +149,7 @@ public KeyVersion getEncryptedKeyVersion() { * Derive the initialization vector (IV) for the encryption key from the IV * of the encrypted key. This derived IV is used with the encryption key to * decrypt the encrypted key. - *

+ *

* The alternative to this is using the same IV for both the encryption key * and the encrypted key. Even a simple symmetric transformation like this * improves security by avoiding IV re-use. IVs will also be fairly unique @@ -195,7 +195,7 @@ public void warmUpEncryptedKeys(String... keyNames) * The generated key material is of the same * length as the KeyVersion material of the latest key version * of the key and is encrypted using the same cipher. - *

+ *

* NOTE: The generated key is not stored by the KeyProvider * * @param encryptionKeyName @@ -498,7 +498,7 @@ public void warmUpEncryptedKeys(String... keyNames) * and initialization vector. The generated key material is of the same * length as the KeyVersion material and is encrypted using the * same cipher. - *

+ *

* NOTE: The generated key is not stored by the KeyProvider * * @param encryptionKeyName The latest KeyVersion of this key's material will @@ -576,7 +576,6 @@ public void drain(String keyName) { * NOTE: The generated key is not stored by the KeyProvider * * @param ekvs List containing the EncryptedKeyVersion's - * @return The re-encrypted EncryptedKeyVersion's, in the same order. * @throws IOException If any EncryptedKeyVersion could not be re-encrypted * @throws GeneralSecurityException If any EncryptedKeyVersion could not be * re-encrypted because of a cryptographic issue. @@ -589,7 +588,7 @@ public void reencryptEncryptedKeys(List ekvs) /** * Creates a KeyProviderCryptoExtension using a given * {@link KeyProvider}. - *

+ *

* If the given KeyProvider implements the * {@link CryptoExtension} interface the KeyProvider itself * will provide the extension functionality. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProviderDelegationTokenExtension.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProviderDelegationTokenExtension.java index 92853ab11752f..05d99ed0810fc 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProviderDelegationTokenExtension.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/KeyProviderDelegationTokenExtension.java @@ -124,7 +124,7 @@ public Token getDelegationToken(final String renewer) throws IOException { /** * Creates a KeyProviderDelegationTokenExtension using a given * {@link KeyProvider}. - *

+ *

* If the given KeyProvider implements the * {@link DelegationTokenExtension} interface the KeyProvider * itself will provide the extension functionality, otherwise a default diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/kms/KMSClientProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/kms/KMSClientProvider.java index f0eaef10c17d2..71ed4557b357b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/kms/KMSClientProvider.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/kms/KMSClientProvider.java @@ -263,12 +263,12 @@ public static class Factory extends KeyProviderFactory { /** * This provider expects URIs in the following form : - * kms://@/ + * {@literal kms://@/} * * where : * - PROTO = http or https - * - AUTHORITY = [:] - * - HOSTS = [;] + * - AUTHORITY = {@literal [:]} + * - HOSTS = {@literal [;]} * - HOSTNAME = string * - PORT = integer * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/kms/ValueQueue.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/kms/ValueQueue.java index fa0010215de78..7d26acbf21a03 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/kms/ValueQueue.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/key/kms/ValueQueue.java @@ -344,7 +344,7 @@ public int getSize(String keyName) { * SyncGenerationPolicy specified by the user. * @param keyName String key name * @param num Minimum number of values to return. - * @return List values returned + * @return {@literal List} values returned * @throws IOException * @throws ExecutionException */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/random/OpensslSecureRandom.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/random/OpensslSecureRandom.java index 1219bf9cc2c7d..1863f5ec2035f 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/random/OpensslSecureRandom.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/random/OpensslSecureRandom.java @@ -30,16 +30,16 @@ /** * OpenSSL secure random using JNI. * This implementation is thread-safe. - *

+ *

* * If using an Intel chipset with RDRAND, the high-performance hardware * random number generator will be used and it's much faster than * {@link java.security.SecureRandom}. If RDRAND is unavailable, default * OpenSSL secure random generator will be used. It's still faster * and can generate strong random bytes. - *

- * @see https://wiki.openssl.org/index.php/Random_Numbers - * @see http://en.wikipedia.org/wiki/RdRand + *

+ * See https://wiki.openssl.org/index.php/Random_Numbers + * See http://en.wikipedia.org/wiki/RdRand */ @InterfaceAudience.Private public class OpensslSecureRandom extends Random { @@ -97,7 +97,7 @@ public void setSeed(long seed) { * random bits (right justified, with leading zeros). * * @param numBits number of random bits to be generated, where - * 0 <= numBits <= 32. + * 0 {@literal <=} numBits {@literal <=} 32. * * @return int an int containing the user-specified number * of random bits (right justified, with leading zeros). diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java index 9b0bab11afb9c..cd7068025753e 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java @@ -338,7 +338,7 @@ private URI getUri(URI uri, String supportedScheme, * The default port of this file system. * * @return default port of this file system's Uri scheme - * A uri with a port of -1 => default port; + * A uri with a port of -1 => default port; */ public abstract int getUriDefaultPort(); @@ -480,9 +480,11 @@ public FsServerDefaults getServerDefaults(final Path f) throws IOException { * through any internal symlinks or mount point * @param p path to be resolved * @return fully qualified path - * @throws FileNotFoundException, AccessControlException, IOException - * UnresolvedLinkException if symbolic link on path cannot be resolved - * internally + * @throws FileNotFoundException + * @throws AccessControlException + * @throws IOException + * @throws UnresolvedLinkException if symbolic link on path cannot be + * resolved internally */ public Path resolvePath(final Path p) throws FileNotFoundException, UnresolvedLinkException, AccessControlException, IOException { @@ -1037,7 +1039,7 @@ public List> getDelegationTokens(String renewer) throws IOException { * changes. (Modifications are merged into the current ACL.) * * @param path Path to modify - * @param aclSpec List describing modifications + * @param aclSpec List{@literal } describing modifications * @throws IOException if an ACL could not be modified */ public void modifyAclEntries(Path path, List aclSpec) @@ -1051,7 +1053,7 @@ public void modifyAclEntries(Path path, List aclSpec) * retained. * * @param path Path to modify - * @param aclSpec List describing entries to remove + * @param aclSpec List{@literal } describing entries to remove * @throws IOException if an ACL could not be modified */ public void removeAclEntries(Path path, List aclSpec) @@ -1091,8 +1093,9 @@ public void removeAcl(Path path) * entries. * * @param path Path to modify - * @param aclSpec List describing modifications, must include entries - * for user, group, and others for compatibility with permission bits. + * @param aclSpec List{@literal } describing modifications, must + * include entries for user, group, and others for compatibility with + * permission bits. * @throws IOException if an ACL could not be modified */ public void setAcl(Path path, List aclSpec) throws IOException { @@ -1104,7 +1107,7 @@ public void setAcl(Path path, List aclSpec) throws IOException { * Gets the ACLs of files and directories. 
* * @param path Path to get - * @return RemoteIterator which returns each AclStatus + * @return RemoteIterator{@literal } which returns each AclStatus * @throws IOException if an ACL could not be read */ public AclStatus getAclStatus(Path path) throws IOException { @@ -1116,7 +1119,7 @@ public AclStatus getAclStatus(Path path) throws IOException { * Set an xattr of a file or directory. * The name must be prefixed with the namespace followed by ".". For example, * "user.attr". - *

+ *

* Refer to the HDFS extended attributes user documentation for details. * * @param path Path to modify @@ -1134,7 +1137,7 @@ public void setXAttr(Path path, String name, byte[] value) * Set an xattr of a file or directory. * The name must be prefixed with the namespace followed by ".". For example, * "user.attr". - *

+ *

* Refer to the HDFS extended attributes user documentation for details. * * @param path Path to modify @@ -1153,7 +1156,7 @@ public void setXAttr(Path path, String name, byte[] value, * Get an xattr for a file or directory. * The name must be prefixed with the namespace followed by ".". For example, * "user.attr". - *

+ *

* Refer to the HDFS extended attributes user documentation for details. * * @param path Path to get extended attribute @@ -1170,11 +1173,13 @@ public byte[] getXAttr(Path path, String name) throws IOException { * Get all of the xattrs for a file or directory. * Only those xattrs for which the logged-in user has permissions to view * are returned. - *

+ *

* Refer to the HDFS extended attributes user documentation for details. * * @param path Path to get extended attributes - * @return Map describing the XAttrs of the file or directory + * + * @return {@literal Map} describing the XAttrs of the file + * or directory * @throws IOException */ public Map getXAttrs(Path path) throws IOException { @@ -1186,12 +1191,13 @@ public Map getXAttrs(Path path) throws IOException { * Get all of the xattrs for a file or directory. * Only those xattrs for which the logged-in user has permissions to view * are returned. - *

+ *

* Refer to the HDFS extended attributes user documentation for details. * * @param path Path to get extended attributes * @param names XAttr names. - * @return Map describing the XAttrs of the file or directory + * @return {@literal Map} describing the XAttrs of the file + * or directory * @throws IOException */ public Map getXAttrs(Path path, List names) @@ -1204,11 +1210,12 @@ public Map getXAttrs(Path path, List names) * Get all of the xattr names for a file or directory. * Only the xattr names for which the logged-in user has permissions to view * are returned. - *

+ *

* Refer to the HDFS extended attributes user documentation for details. * * @param path Path to get extended attributes - * @return Map describing the XAttrs of the file or directory + * @return {@literal Map} describing the XAttrs of the file + * or directory * @throws IOException */ public List listXAttrs(Path path) @@ -1221,7 +1228,7 @@ public List listXAttrs(Path path) * Remove an xattr of a file or directory. * The name must be prefixed with the namespace followed by ".". For example, * "user.attr". - *

+ *

* Refer to the HDFS extended attributes user documentation for details. * * @param path Path to remove extended attribute diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java index 2eb8b959b2eaf..973b136bb3ab2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java @@ -27,7 +27,7 @@ /** - * A class that optimizes reading from FSInputStream by buffering + * A class that optimizes reading from FSInputStream by buffering. */ @InterfaceAudience.Private @@ -44,7 +44,7 @@ public class BufferedFSInputStream extends BufferedInputStream * * @param in the underlying input stream. * @param size the buffer size. - * @exception IllegalArgumentException if size <= 0. + * @exception IllegalArgumentException if size {@literal <=} 0. */ public BufferedFSInputStream(FSInputStream in, int size) { super(in, size); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ByteBufferReadable.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ByteBufferReadable.java index 20f7224c22cd9..926b554f42ce7 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ByteBufferReadable.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ByteBufferReadable.java @@ -32,18 +32,18 @@ public interface ByteBufferReadable { /** * Reads up to buf.remaining() bytes into buf. Callers should use * buf.limit(..) to control the size of the desired read. - *

+ *

* After a successful call, buf.position() will be advanced by the number * of bytes read and buf.limit() should be unchanged. - *

+ *

* In the case of an exception, the values of buf.position() and buf.limit() * are undefined, and callers should be prepared to recover from this * eventuality. - *

+ *

* Many implementations will throw {@link UnsupportedOperationException}, so * callers that are not confident in support for this method from the * underlying filesystem should be prepared to handle that exception. - *
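A small usage sketch of the read(ByteBuffer) contract described here, including the fallback the text recommends for streams that do not support byte-buffer reads; the file name and buffer size are invented.

    import java.nio.ByteBuffer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ByteBufferReadSketch {
      public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        ByteBuffer buf = ByteBuffer.allocate(4096);       // arbitrary size

        try (FSDataInputStream in = fs.open(new Path("/tmp/data.bin"))) {
          try {
            // Returns the number of bytes read (or -1 at EOF); buf.position()
            // advances by that amount and buf.limit() is left unchanged.
            int n = in.read(buf);
            System.out.println("read " + n + " bytes into the buffer");
          } catch (UnsupportedOperationException e) {
            // The wrapped stream does not implement ByteBufferReadable,
            // so fall back to the plain byte[] read path.
            byte[] fallback = new byte[buf.remaining()];
            int n = in.read(fallback, 0, fallback.length);
            System.out.println("fallback read of " + n + " bytes");
          }
        }
      }
    }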

+ *

* Implementations should treat 0-length requests as legitimate, and must not * signal an error upon their receipt. * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java index 88c30e2f99135..d5401308adc93 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java @@ -41,7 +41,7 @@ * Abstract Checksumed FileSystem. * It provide a basic implementation of a Checksumed FileSystem, * which creates a checksum file for each raw file. - * It generates & verifies checksums at the client side. + * It generates & verifies checksums at the client side. * *****************************************************************/ @InterfaceAudience.Public diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFs.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFs.java index aed9db3362415..bc1122c56a2bd 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFs.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFs.java @@ -42,7 +42,7 @@ * Abstract Checksumed Fs. * It provide a basic implementation of a Checksumed Fs, * which creates a checksum file for each raw file. - * It generates & verifies checksums at the client side. + * It generates & verifies checksums at the client side. */ @InterfaceAudience.Private @InterfaceStability.Evolving /*Evolving for a release,to be changed to Stable */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java index 75749499ff72d..9d9475727d863 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CommonConfigurationKeys.java @@ -310,7 +310,7 @@ public class CommonConfigurationKeys extends CommonConfigurationKeysPublic { "dr.who"; /** - * User->groups static mapping to override the groups lookup + * User{@literal ->}groups static mapping to override the groups lookup */ public static final String HADOOP_USER_GROUP_STATIC_OVERRIDES = "hadoop.user.group.static.mapping.overrides"; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CreateFlag.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CreateFlag.java index c3e088b66d86c..58b5f704bb831 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CreateFlag.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CreateFlag.java @@ -29,7 +29,7 @@ * CreateFlag specifies the file create semantic. Users can combine flags like:
* * EnumSet.of(CreateFlag.CREATE, CreateFlag.APPEND) - * + * *

* * Use the CreateFlag as follows: diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSInputChecker.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSInputChecker.java index 4f06e26fcf330..de66eab713ab6 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSInputChecker.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSInputChecker.java @@ -101,7 +101,7 @@ protected FSInputChecker( Path file, int numOfRetries, * Implementors should simply pass through to the underlying data stream. * or * (b) needChecksum() will return true: - * - len >= maxChunkSize + * - len {@literal >=} maxChunkSize * - checksum.length is a multiple of CHECKSUM_SIZE * Implementors should read an integer number of data chunks into * buf. The amount read should be bounded by len or by diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java index 2dd6ef3b1aab4..05fbc34730d0f 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java @@ -767,7 +767,7 @@ public FSDataOutputStream build() throws IOException { * Make(create) a directory and all the non-existent parents. * * @param dir - the dir to make - * @param permission - permissions is set permission&~umask + * @param permission - permissions is set permission{@literal &~}umask * @param createParent - if true then missing parent dirs are created if false * then parent must exist * @@ -981,7 +981,6 @@ public Boolean next(final AbstractFileSystem fs, final Path p) /** * Renames Path src to Path dst *

    - *
  • Fails if src is a file and dst is a directory. *
  • Fails if src is a directory and dst is a file. *
  • Fails if the parent of dst does not exist or is a file. @@ -1003,7 +1002,7 @@ public Boolean next(final AbstractFileSystem fs, final Path p) * * @throws AccessControlException If access is denied * @throws FileAlreadyExistsException If dst already exists and - * options has {@link Options.Rename#OVERWRITE} + * options has {@link Options.Rename#OVERWRITE} * option false. * @throws FileNotFoundException If src does not exist * @throws ParentNotDirectoryException If parent of dst is not a @@ -1262,7 +1261,7 @@ public void msync() throws IOException, UnsupportedOperationException { * checks to perform. If the requested permissions are granted, then the * method returns normally. If access is denied, then the method throws an * {@link AccessControlException}. - *
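The access-check behaviour described in this hunk can be exercised roughly as follows; the path is invented, and the check is advisory in the sense that permissions can change between the call and any later open.

    import org.apache.hadoop.fs.FileContext;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.permission.FsAction;
    import org.apache.hadoop.security.AccessControlException;

    public class AccessCheckSketch {
      public static void main(String[] args) throws Exception {
        FileContext fc = FileContext.getFileContext();
        Path path = new Path("/tmp/report.csv");          // hypothetical path

        try {
          // Returns normally when the caller holds READ permission,
          // otherwise an AccessControlException is thrown.
          fc.access(path, FsAction.READ);
          System.out.println("read access granted");
        } catch (AccessControlException e) {
          System.out.println("read access denied: " + e.getMessage());
        }
      }
    }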

    + *

    * The default implementation of this method calls {@link #getFileStatus(Path)} * and checks the returned permissions against the requested permissions. * Note that the getFileStatus call will be subject to authorization checks. @@ -1509,9 +1508,9 @@ public FsStatus next(final AbstractFileSystem fs, final Path p) *

        * Given a path referring to a symlink of form:
        * 
    -   *   <---X---> 
    +   *   {@literal <---}X{@literal --->}
        *   fs://host/A/B/link 
    -   *   <-----Y----->
    +   *   {@literal <-----}Y{@literal ----->}
        * 
        * In this path X is the scheme and authority that identify the file system,
        * and Y is the path leading up to the final path component "link". If Y is
    @@ -1548,7 +1547,7 @@ public FsStatus next(final AbstractFileSystem fs, final Path p)
        *
        *
        * @throws AccessControlException If access is denied
    -   * @throws FileAlreadyExistsException If file linkcode> already exists
    +   * @throws FileAlreadyExistsException If file link already exists
        * @throws FileNotFoundException If target does not exist
        * @throws ParentNotDirectoryException If parent of link is not a
        *           directory.
    @@ -2050,7 +2049,6 @@ public LocatedFileStatus next() throws IOException {
          * 
    *
    *
    - *

    *

    ? *
    Matches any single character. * @@ -2412,7 +2410,8 @@ public List> getDelegationTokens( * changes. (Modifications are merged into the current ACL.) * * @param path Path to modify - * @param aclSpec List describing modifications + * @param aclSpec List{@literal <}AclEntry{@literal >} describing + * modifications * @throws IOException if an ACL could not be modified */ public void modifyAclEntries(final Path path, final List aclSpec) @@ -2433,7 +2432,8 @@ public Void next(final AbstractFileSystem fs, final Path p) * retained. * * @param path Path to modify - * @param aclSpec List describing entries to remove + * @param aclSpec List{@literal <}AclEntry{@literal >} describing entries + * to remove * @throws IOException if an ACL could not be modified */ public void removeAclEntries(final Path path, final List aclSpec) @@ -2493,8 +2493,9 @@ public Void next(final AbstractFileSystem fs, final Path p) * entries. * * @param path Path to modify - * @param aclSpec List describing modifications, must include entries - * for user, group, and others for compatibility with permission bits. + * @param aclSpec List{@literal <}AclEntry{@literal >} describing + * modifications, must include entries for user, group, and others for + * compatibility with permission bits. * @throws IOException if an ACL could not be modified */ public void setAcl(Path path, final List aclSpec) @@ -2514,7 +2515,8 @@ public Void next(final AbstractFileSystem fs, final Path p) * Gets the ACLs of files and directories. * * @param path Path to get - * @return RemoteIterator which returns each AclStatus + * @return RemoteIterator{@literal <}AclStatus{@literal >} which returns + * each AclStatus * @throws IOException if an ACL could not be read */ public AclStatus getAclStatus(Path path) throws IOException { @@ -2532,7 +2534,7 @@ public AclStatus next(final AbstractFileSystem fs, final Path p) * Set an xattr of a file or directory. * The name must be prefixed with the namespace followed by ".". For example, * "user.attr". - *

    + *

    * Refer to the HDFS extended attributes user documentation for details. * * @param path Path to modify @@ -2550,7 +2552,7 @@ public void setXAttr(Path path, String name, byte[] value) * Set an xattr of a file or directory. * The name must be prefixed with the namespace followed by ".". For example, * "user.attr". - *

    + *

    * Refer to the HDFS extended attributes user documentation for details. * * @param path Path to modify @@ -2576,7 +2578,7 @@ public Void next(final AbstractFileSystem fs, final Path p) * Get an xattr for a file or directory. * The name must be prefixed with the namespace followed by ".". For example, * "user.attr". - *

    + *

    * Refer to the HDFS extended attributes user documentation for details. * * @param path Path to get extended attribute @@ -2599,11 +2601,12 @@ public byte[] next(final AbstractFileSystem fs, final Path p) * Get all of the xattrs for a file or directory. * Only those xattrs for which the logged-in user has permissions to view * are returned. - *

    + *

    * Refer to the HDFS extended attributes user documentation for details. * * @param path Path to get extended attributes - * @return Map describing the XAttrs of the file or directory + * @return Map{@literal <}String, byte[]{@literal >} describing the XAttrs + * of the file or directory * @throws IOException */ public Map getXAttrs(Path path) throws IOException { @@ -2621,12 +2624,13 @@ public Map next(final AbstractFileSystem fs, final Path p) * Get all of the xattrs for a file or directory. * Only those xattrs for which the logged-in user has permissions to view * are returned. - *

    + *

    * Refer to the HDFS extended attributes user documentation for details. * * @param path Path to get extended attributes * @param names XAttr names. - * @return Map describing the XAttrs of the file or directory + * @return Map{@literal <}String, byte[]{@literal >} describing the XAttrs + * of the file or directory * @throws IOException */ public Map getXAttrs(Path path, final List names) @@ -2645,7 +2649,7 @@ public Map next(final AbstractFileSystem fs, final Path p) * Remove an xattr of a file or directory. * The name must be prefixed with the namespace followed by ".". For example, * "user.attr". - *
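A minimal sketch of the xattr calls documented in this region; the path, attribute name and value are invented, and the "user." prefix follows the namespace convention the text describes.

    import java.nio.charset.StandardCharsets;
    import java.util.List;
    import java.util.Map;

    import org.apache.hadoop.fs.FileContext;
    import org.apache.hadoop.fs.Path;

    public class XAttrSketch {
      public static void main(String[] args) throws Exception {
        FileContext fc = FileContext.getFileContext();
        Path path = new Path("/tmp/tagged-file");         // hypothetical path

        // Names must carry a namespace prefix such as "user.".
        fc.setXAttr(path, "user.origin",
            "ingest-job-42".getBytes(StandardCharsets.UTF_8));

        byte[] value = fc.getXAttr(path, "user.origin");
        System.out.println("user.origin = "
            + new String(value, StandardCharsets.UTF_8));

        // Only the names and values the caller may see are returned.
        List<String> names = fc.listXAttrs(path);
        Map<String, byte[]> all = fc.getXAttrs(path);
        System.out.println(names.size() + " names, " + all.size() + " values");
      }
    }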

    + *

    * Refer to the HDFS extended attributes user documentation for details. * * @param path Path to remove extended attribute @@ -2668,11 +2672,12 @@ public Void next(final AbstractFileSystem fs, final Path p) * Get all of the xattr names for a file or directory. * Only those xattr names which the logged-in user has permissions to view * are returned. - *

    + *

    * Refer to the HDFS extended attributes user documentation for details. * * @param path Path to get extended attributes - * @return List of the XAttr names of the file or directory + * @return List{@literal <}String{@literal >} of the XAttr names of the + * file or directory * @throws IOException */ public List listXAttrs(Path path) throws IOException { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java index ff5091c776b93..78608ffc9b73c 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java @@ -639,7 +639,7 @@ public DelegationTokenIssuer[] getAdditionalTokenIssuers() * Create a file with the provided permission. * * The permission of the file is set to be the provided permission as in - * setPermission, not permission&~umask + * setPermission, not permission{@literal &~}umask * * The HDFS implementation is implemented using two RPCs. * It is understood that it is inefficient, @@ -664,7 +664,7 @@ public static FSDataOutputStream create(FileSystem fs, /** * Create a directory with the provided permission. * The permission of the directory is set to be the provided permission as in - * setPermission, not permission&~umask + * setPermission, not permission{@literal &~}umask * * @see #create(FileSystem, Path, FsPermission) * @@ -745,7 +745,7 @@ protected void checkPath(Path path) { *

        *   if f == null :
        *     result = null
    -   *   elif f.getLen() <= start:
    +   *   elif f.getLen() {@literal <=} start:
        *     result = []
        *   else result = [ locations(FS, b) for b in blocks(FS, p, s, s+l)]
        * 
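The pseudocode above maps onto a call like the following sketch; the path is invented and the output format is arbitrary.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.BlockLocation;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class BlockLocationSketch {
      public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        Path path = new Path("/tmp/large-input.seq");     // hypothetical path

        FileStatus stat = fs.getFileStatus(path);
        // Ask for every block of the file; per the pseudocode, a start offset
        // at or beyond the file length would yield an empty array instead.
        BlockLocation[] blocks =
            fs.getFileBlockLocations(stat, 0, stat.getLen());
        for (BlockLocation block : blocks) {
          System.out.println(block.getOffset() + "+" + block.getLength()
              + " on " + String.join(",", block.getHosts()));
        }
      }
    }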
    @@ -2000,7 +2000,6 @@ public FileStatus[] listStatus(Path[] files, PathFilter filter) *
    *
    *
    - *

    *

    ? *
    Matches any single character. * @@ -2908,7 +2907,7 @@ public void deleteSnapshot(Path path, String snapshotName) * changes. (Modifications are merged into the current ACL.) * * @param path Path to modify - * @param aclSpec List describing modifications + * @param aclSpec List<AclEntry> describing modifications * @throws IOException if an ACL could not be modified * @throws UnsupportedOperationException if the operation is unsupported * (default outcome). @@ -3101,7 +3100,7 @@ public Map getXAttrs(Path path, List names) * Refer to the HDFS extended attributes user documentation for details. * * @param path Path to get extended attributes - * @return List of the XAttr names of the file or directory + * @return List{@literal } of the XAttr names of the file or directory * @throws IOException IO failure * @throws UnsupportedOperationException if the operation is unsupported * (default outcome). diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileUtil.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileUtil.java index 480495a9d5b19..f278a1a644de6 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileUtil.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileUtil.java @@ -1499,8 +1499,8 @@ public static String[] createJarWithClassPath(String inputClassPath, Path pwd, * @param inputClassPath String input classpath to bundle into the jar manifest * @param pwd Path to working directory to save jar * @param targetDir path to where the jar execution will have its working dir - * @param callerEnv Map caller's environment variables to use - * for expansion + * @param callerEnv Map {@literal <}String, String{@literal >} caller's + * environment variables to use for expansion * @return String[] with absolute path to new jar in position 0 and * unexpanded wild card entry path in position 1 * @throws IOException if there is an I/O error while writing the jar file diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java index 25033059646fa..7e12d0a11e953 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java @@ -86,7 +86,7 @@ public HarFileSystem() { /** * Return the protocol scheme for the FileSystem. - *

    + *

    * * @return har */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HasEnhancedByteBufferAccess.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HasEnhancedByteBufferAccess.java index 982a0efef86eb..8ceba7bddd8a2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HasEnhancedByteBufferAccess.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HasEnhancedByteBufferAccess.java @@ -52,18 +52,19 @@ public interface HasEnhancedByteBufferAccess { * @return * We will always return an empty buffer if maxLength was 0, * whether or not we are at EOF. - * If maxLength > 0, we will return null if the stream has - * reached EOF. + * If maxLength > 0, we will return null if the stream + * has reached EOF. * Otherwise, we will return a ByteBuffer containing at least one * byte. You must free this ByteBuffer when you are done with it * by calling releaseBuffer on it. The buffer will continue to be * readable until it is released in this manner. However, the * input stream's close method may warn about unclosed buffers. - * @throws - * IOException: if there was an error reading. - * UnsupportedOperationException: if factory was null, and we - * needed an external byte buffer. UnsupportedOperationException - * will never be thrown unless the factory argument is null. + * @throws IOException if there was an error reading. + * @throws UnsupportedOperationException if factory was null, + * and we needed an external byte buffer. + * @throws UnsupportedOperationException will never be thrown + * unless the factory argument is null. + * */ public ByteBuffer read(ByteBufferPool factory, int maxLength, EnumSet opts) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/LocalDirAllocator.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/LocalDirAllocator.java index a4b158a85ab06..5f266a7b82555 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/LocalDirAllocator.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/LocalDirAllocator.java @@ -241,9 +241,8 @@ public static void removeContext(String contextCfgItemName) { * @param pathStr the requested file (this will be searched) * @param conf the Configuration object * @return true if files exist. false otherwise - * @throws IOException */ - public boolean ifExists(String pathStr,Configuration conf) { + public boolean ifExists(String pathStr, Configuration conf) { AllocatorPerContext context = obtainContext(contextCfgItemName); return context.ifExists(pathStr, conf); } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/LocalFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/LocalFileSystem.java index 538ccdfcc620e..c41190a7b360b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/LocalFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/LocalFileSystem.java @@ -54,7 +54,7 @@ public void initialize(URI name, Configuration conf) throws IOException { /** * Return the protocol scheme for the FileSystem. - *

    + *

    * * @return file */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Options.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Options.java index 5e932864c8805..75bc12df8fdcf 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Options.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Options.java @@ -290,7 +290,7 @@ public static ChecksumOpt createDisabled() { * @param defaultOpt Default checksum option * @param userOpt User-specified checksum option. Ignored if null. * @param userBytesPerChecksum User-specified bytesPerChecksum - * Ignored if < 0. + * Ignored if {@literal <} 0. */ public static ChecksumOpt processChecksumOpt(ChecksumOpt defaultOpt, ChecksumOpt userOpt, int userBytesPerChecksum) { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/QuotaUsage.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/QuotaUsage.java index 9ad6a2862337e..3472362dc4792 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/QuotaUsage.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/QuotaUsage.java @@ -229,8 +229,8 @@ public int hashCode() { /** * Output format: - * <----12----> <----15----> <----15----> <----15----> <-------18-------> - * QUOTA REMAINING_QUATA SPACE_QUOTA SPACE_QUOTA_REM FILE_NAME + * |----12----| |----15----| |----15----| |----15----| |-------18-------| + * QUOTA REMAINING_QUOTA SPACE_QUOTA SPACE_QUOTA_REM FILE_NAME */ protected static final String QUOTA_STRING_FORMAT = "%12s %15s "; protected static final String SPACE_QUOTA_STRING_FORMAT = "%15s %15s "; @@ -244,9 +244,9 @@ public int hashCode() { /** * Output format: - * <----12----> <------15-----> <------15-----> <------15-----> + * |----12----| |------15-----| |------15-----| |------15-----| * QUOTA REM_QUOTA SPACE_QUOTA REM_SPACE_QUOTA - * <----12----> <----12----> <-------18-------> + * |----12----| |----12----| |-------18-------| * DIR_COUNT FILE_COUNT CONTENT_SIZE */ private static final String STORAGE_TYPE_SUMMARY_FORMAT = "%13s %17s "; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ftp/FTPFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ftp/FTPFileSystem.java index 644cf4e50eae1..676c207e00dc4 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ftp/FTPFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ftp/FTPFileSystem.java @@ -76,7 +76,7 @@ public class FTPFileSystem extends FileSystem { /** * Return the protocol scheme for the FileSystem. - *

    + *

    * * @return ftp */ @@ -162,7 +162,7 @@ private FTPClient connect() throws IOException { /** * Set FTP's transfer mode based on configuration. Valid values are * STREAM_TRANSFER_MODE, BLOCK_TRANSFER_MODE and COMPRESSED_TRANSFER_MODE. - *

    + *

    * Defaults to BLOCK_TRANSFER_MODE. * * @param conf @@ -195,7 +195,7 @@ int getTransferMode(Configuration conf) { * Set the FTPClient's data connection mode based on configuration. Valid * values are ACTIVE_LOCAL_DATA_CONNECTION_MODE, * PASSIVE_LOCAL_DATA_CONNECTION_MODE and PASSIVE_REMOTE_DATA_CONNECTION_MODE. - *

    + *

    * Defaults to ACTIVE_LOCAL_DATA_CONNECTION_MODE. * * @param client diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ftp/FtpConfigKeys.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ftp/FtpConfigKeys.java index e59efa5b2bc56..b522102e540a4 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ftp/FtpConfigKeys.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ftp/FtpConfigKeys.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.ChecksumFileSystem; import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.fs.FsServerDefaults; import org.apache.hadoop.util.DataChecksum; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/local/LocalConfigKeys.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/local/LocalConfigKeys.java index 0b9e74553cd79..e93858ff1e63b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/local/LocalConfigKeys.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/local/LocalConfigKeys.java @@ -22,6 +22,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.ChecksumFileSystem; import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.fs.FsServerDefaults; import org.apache.hadoop.util.DataChecksum; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/permission/AclStatus.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/permission/AclStatus.java index 131aa1994350c..385fed21d4194 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/permission/AclStatus.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/permission/AclStatus.java @@ -69,7 +69,7 @@ public boolean isStickyBit() { /** * Returns the list of all ACL entries, ordered by their natural ordering. * - * @return List unmodifiable ordered list of all ACL entries + * @return List<AclEntry> unmodifiable ordered list of all ACL entries */ public List getEntries() { return entries; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/permission/AclUtil.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/permission/AclUtil.java index 2811a89f24dc5..42492520dceaa 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/permission/AclUtil.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/permission/AclUtil.java @@ -36,8 +36,8 @@ public final class AclUtil { * Given permissions and extended ACL entries, returns the full logical ACL. * * @param perm FsPermission containing permissions - * @param entries List containing extended ACL entries - * @return List containing full logical ACL + * @param entries List<AclEntry> containing extended ACL entries + * @return List<AclEntry> containing full logical ACL */ public static List getAclFromPermAndEntries(FsPermission perm, List entries) { @@ -93,8 +93,8 @@ public static List getAclFromPermAndEntries(FsPermission perm, * Translates the given permission bits to the equivalent minimal ACL. 
* * @param perm FsPermission to translate - * @return List containing exactly 3 entries representing the owner, - * group and other permissions + * @return List<AclEntry> containing exactly 3 entries representing the + * owner, group and other permissions */ public static List getMinimalAcl(FsPermission perm) { return Lists.newArrayList( @@ -119,7 +119,7 @@ public static List getMinimalAcl(FsPermission perm) { * Checks if the given entries represent a minimal ACL (contains exactly 3 * entries). * - * @param entries List entries to check + * @param entries List<AclEntry> entries to check * @return boolean true if the entries represent a minimal ACL */ public static boolean isMinimalAcl(List entries) { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/permission/ScopedAclEntries.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/permission/ScopedAclEntries.java index a16f439365261..a67cafe78b128 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/permission/ScopedAclEntries.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/permission/ScopedAclEntries.java @@ -42,7 +42,7 @@ public final class ScopedAclEntries { * list is already sorted such that all access entries precede all default * entries. * - * @param aclEntries List to separate + * @param aclEntries List<AclEntry> to separate */ public ScopedAclEntries(List aclEntries) { int pivot = calculatePivotOnDefaultEntries(aclEntries); @@ -59,8 +59,8 @@ public ScopedAclEntries(List aclEntries) { /** * Returns access entries. * - * @return List containing just access entries, or an empty list if - * there are no access entries + * @return List<AclEntry> containing just access entries, or an empty + * list if there are no access entries */ public List getAccessEntries() { return accessEntries; @@ -69,8 +69,8 @@ public List getAccessEntries() { /** * Returns default entries. * - * @return List containing just default entries, or an empty list if - * there are no default entries + * @return List<AclEntry> containing just default entries, or an empty + * list if there are no default entries */ public List getDefaultEntries() { return defaultEntries; @@ -78,8 +78,8 @@ public List getDefaultEntries() { /** * Returns the pivot point in the list between the access entries and the - * default entries. This is the index of the first element in the list that is - * a default entry. + * default entries. This is the index of the first element in the list that + * is a default entry. * * @param aclBuilder ArrayList containing entries to build * @return int pivot point, or -1 if list contains no default entries diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Command.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Command.java index a4746cf76cc7f..3eef2787e7e74 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Command.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Command.java @@ -148,16 +148,16 @@ protected CommandFactory getCommandFactory() { * expand arguments, and then process each argument. *

        * run
    -   * |-> {@link #processOptions(LinkedList)}
    -   * \-> {@link #processRawArguments(LinkedList)}
    -   *      |-> {@link #expandArguments(LinkedList)}
    -   *      |   \-> {@link #expandArgument(String)}*
    -   *      \-> {@link #processArguments(LinkedList)}
    -   *          |-> {@link #processArgument(PathData)}*
    -   *          |   |-> {@link #processPathArgument(PathData)}
    -   *          |   \-> {@link #processPaths(PathData, PathData...)}
    -   *          |        \-> {@link #processPath(PathData)}*
    -   *          \-> {@link #processNonexistentPath(PathData)}
    +   * |{@literal ->} {@link #processOptions(LinkedList)}
    +   * \{@literal ->} {@link #processRawArguments(LinkedList)}
    +   *      |{@literal ->} {@link #expandArguments(LinkedList)}
    +   *      |   \{@literal ->} {@link #expandArgument(String)}*
    +   *      \{@literal ->} {@link #processArguments(LinkedList)}
    +   *          |{@literal ->} {@link #processArgument(PathData)}*
    +   *          |   |{@literal ->} {@link #processPathArgument(PathData)}
    +   *          |   \{@literal ->} {@link #processPaths(PathData, PathData...)}
    +   *          |        \{@literal ->} {@link #processPath(PathData)}*
    +   *          \{@literal ->} {@link #processNonexistentPath(PathData)}
        * 
    * Most commands will chose to implement just * {@link #processOptions(LinkedList)} and {@link #processPath(PathData)} @@ -292,8 +292,8 @@ protected void processArgument(PathData item) throws IOException { /** * This is the last chance to modify an argument before going into the * (possibly) recursive {@link #processPaths(PathData, PathData...)} - * -> {@link #processPath(PathData)} loop. Ex. ls and du use this to - * expand out directories. + * {@literal ->} {@link #processPath(PathData)} loop. Ex. ls and du use + * this to expand out directories. * @param item a {@link PathData} representing a path which exists * @throws IOException if anything goes wrong... */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CommandFormat.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CommandFormat.java index bf30b22e1fbe5..4dd20d108428e 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CommandFormat.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CommandFormat.java @@ -162,7 +162,7 @@ public String getOptValue(String option) { /** Returns all the options that are set * - * @return Set of the enabled options + * @return Set{@literal <}String{@literal >} of the enabled options */ public Set getOpts() { Set optSet = new HashSet(); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/viewfs/ViewFs.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/viewfs/ViewFs.java index d4fd397aaba43..7eb49252a7f06 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/viewfs/ViewFs.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/viewfs/ViewFs.java @@ -85,16 +85,16 @@ * one or more individual file systems (a localFs or Hdfs, S3fs, etc). * For example one could have a mount table that provides links such as *
      - *
    • /user -> hdfs://nnContainingUserDir/user - *
    • /project/foo -> hdfs://nnProject1/projects/foo - *
    • /project/bar -> hdfs://nnProject2/projects/bar - *
    • /tmp -> hdfs://nnTmp/privateTmpForUserXXX + *
    • /user {@literal ->} hdfs://nnContainingUserDir/user + *
    • /project/foo {@literal ->} hdfs://nnProject1/projects/foo + *
    • /project/bar {@literal ->} hdfs://nnProject2/projects/bar + *
    • /tmp {@literal ->} hdfs://nnTmp/privateTmpForUserXXX *
    * * ViewFs is specified with the following URI: viewfs:/// *

    * To use viewfs one would typically set the default file system in the - * config (i.e. fs.defaultFS < = viewfs:///) along with the + * config (i.e. fs.defaultFS {@literal <} = viewfs:///) along with the * mount table config variables as described below. * *
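The mount-table setup sketched in this javadoc could be expressed in code roughly as below; the link targets reuse the placeholder namenode names from the text, and the mount-table name is assumed to be the default one.

    import java.net.URI;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ViewFsSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Links for the default mount table; the hdfs:// authorities are the
        // placeholder names used in the javadoc above.
        conf.set("fs.viewfs.mounttable.default.link./user",
            "hdfs://nnContainingUserDir/user");
        conf.set("fs.viewfs.mounttable.default.link./tmp",
            "hdfs://nnTmp/privateTmpForUserXXX");
        conf.set("fs.defaultFS", "viewfs:///");

        // Paths are now resolved through the mount table.
        FileSystem viewFs = FileSystem.get(new URI("viewfs:///"), conf);
        System.out.println(viewFs.makeQualified(new Path("/user")));
      }
    }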

    @@ -142,7 +142,7 @@ * (because they do not fit on one) then one could specify a mount * entry such as following merges two dirs: *

      - *
    • /user -> hdfs://nnUser1/user,hdfs://nnUser2/user + *
    • /user {@literal ->} hdfs://nnUser1/user,hdfs://nnUser2/user *
    * Such a mergeLink can be specified with the following config var where "," * is used as the separator for each of links to be merged: diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java index d099ca71ac7ac..12de2ef91c413 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java @@ -54,10 +54,10 @@ * Apache Zookeeper. Using Zookeeper as a coordination service, leader election * can be performed by atomically creating an ephemeral lock file (znode) on * Zookeeper. The service instance that successfully creates the znode becomes - * active and the rest become standbys.
    + * active and the rest become standbys.
    * This election mechanism is only efficient for small number of election * candidates (order of 10's) because contention on single znode by a large - * number of candidates can result in Zookeeper overload.
    + * number of candidates can result in Zookeeper overload.
    * The elector does not guarantee fencing (protection of shared resources) among * service instances. After it has notified an instance about becoming a leader, * then that instance must ensure that it meets the service consistency @@ -70,10 +70,10 @@ public class ActiveStandbyElector implements StatCallback, StringCallback { /** - * Callback interface to interact with the ActiveStandbyElector object.
    + * Callback interface to interact with the ActiveStandbyElector object.
    * The application will be notified with a callback only on state changes * (i.e. there will never be successive calls to becomeActive without an - * intermediate call to enterNeutralMode).
    + * intermediate call to enterNeutralMode).
    * The callbacks will be running on Zookeeper client library threads. The * application should return from these callbacks quickly so as not to impede * Zookeeper client library performance and notifications. The app will @@ -105,7 +105,7 @@ public interface ActiveStandbyElectorCallback { * interface. The service may choose to ignore this or stop doing state * changing operations. Upon reconnection, the elector verifies the leader * status and calls back on the becomeActive and becomeStandby app - * interfaces.
    + * interfaces.
    * Zookeeper disconnects can happen due to network issues or loss of * Zookeeper quorum. Thus enterNeutralMode can be used to guard against * split-brain issues. In such situations it might be prudent to call @@ -178,12 +178,12 @@ enum State { private ZooKeeper monitorLockNodeClient; /** - * Create a new ActiveStandbyElector object
    + * Create a new ActiveStandbyElector object
    * The elector is created by providing to it the Zookeeper configuration, the * parent znode under which to create the znode and a reference to the - * callback interface.
    + * callback interface.
    * The parent znode name must be the same for all service instances and - * different across services.
    + * different across services.
    * After the leader has been lost, a new leader will be elected after the * session timeout expires. Hence, the app must set this parameter based on * its needs for failure response time. The session timeout must be greater @@ -217,12 +217,12 @@ public ActiveStandbyElector(String zookeeperHostPorts, } /** - * Create a new ActiveStandbyElector object
    + * Create a new ActiveStandbyElector object
    * The elector is created by providing to it the Zookeeper configuration, the * parent znode under which to create the znode and a reference to the - * callback interface.
    + * callback interface.
    * The parent znode name must be the same for all service instances and - * different across services.
    + * different across services.
    * After the leader has been lost, a new leader will be elected after the * session timeout expires. Hence, the app must set this parameter based on * its needs for failure response time. The session timeout must be greater @@ -278,9 +278,9 @@ public ActiveStandbyElector(String zookeeperHostPorts, /** * To participate in election, the app will call joinElection. The result will * be notified by a callback on either the becomeActive or becomeStandby app - * interfaces.
    + * interfaces.
    * After this the elector will automatically monitor the leader status and - * perform re-election if necessary
    + * perform re-election if necessary
    * The app could potentially start off in standby mode and ignore the * becomeStandby call. * @@ -397,11 +397,11 @@ public Void run() throws KeeperException, InterruptedException { /** * Any service instance can drop out of the election by calling quitElection. - *
    + *
    * This will lose any leader status, if held, and stop monitoring of the lock - * node.
    + * node.
    * If the instance wants to participate in election again, then it needs to - * call joinElection().
    + * call joinElection().
    * This allows service instances to take themselves out of rotation for known * impending unavailable states (e.g. long GC pause or software upgrade). * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java index 7038efa36520f..0950ea7e01c57 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/HAAdmin.java @@ -324,7 +324,7 @@ private int getServiceState(final CommandLine cmd) /** * Return the serviceId as is, we are assuming it was - * given as a service address of form . + * given as a service address of form {@literal <}host:ipcport{@literal >}. */ protected String getServiceAddr(String serviceId) { return serviceId; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java index c8a7e60aa9115..b0cead56ac0e7 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/NodeFencer.java @@ -44,7 +44,7 @@ * com.example.foo.MyMethod * The class provided must implement the {@link FenceMethod} interface. * The fencing methods that ship with Hadoop may also be referred to - * by shortened names:

    + * by shortened names:
    *

      *
    • shell(/path/to/some/script.sh args...)
    • *
    • sshfence(...) (see {@link SshFenceByTcpPort}) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/SshFenceByTcpPort.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/SshFenceByTcpPort.java index 9ae113b0ea60d..e0c2f4d9e7b77 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/SshFenceByTcpPort.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/SshFenceByTcpPort.java @@ -52,7 +52,7 @@ * with ssh. *

      * In order to achieve passwordless SSH, the operator must also configure - * dfs.ha.fencing.ssh.private-key-files to point to an + * dfs.ha.fencing.ssh.private-key-files to point to an * SSH key that has passphrase-less access to the given username and host. */ public class SshFenceByTcpPort extends Configured diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/http/HttpServer2.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/http/HttpServer2.java index 705f9980ffbbb..496ecdb347d69 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/http/HttpServer2.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/http/HttpServer2.java @@ -105,9 +105,9 @@ /** * Create a Jetty embedded server to answer http requests. The primary goal is * to serve up status information for the server. There are three contexts: - * "/logs/" -> points to the log directory "/static/" -> points to common static - * files (src/webapps/static) "/" -> the jsp server code from - * (src/webapps/) + * "/logs/" {@literal ->} points to the log directory "/static/" {@literal ->} + * points to common static files (src/webapps/static) "/" {@literal ->} the + * jsp server code from (src/webapps/{@literal <}name{@literal >}) * * This class is a fork of the old HttpServer. HttpServer exists for * compatibility reasons. See HBASE-10336 for more details. @@ -1395,10 +1395,10 @@ public String toString() { /** * Checks the user has privileges to access to instrumentation servlets. - *

      + *

      * If hadoop.security.instrumentation.requires.admin is set to FALSE * (default value) it always returns TRUE. - *

      + *

      * If hadoop.security.instrumentation.requires.admin is set to TRUE * it will check that if the current user is in the admin ACLS. If the user is * in the admin ACLs it returns TRUE, otherwise it returns FALSE. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/EnumSetWritable.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/EnumSetWritable.java index dc430cc29c39e..be86159519b87 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/EnumSetWritable.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/EnumSetWritable.java @@ -83,7 +83,7 @@ public EnumSetWritable(EnumSet value) { /** * reset the EnumSetWritable with specified - * value and elementType. If the value argument + * value and elementType. If the value argument * is null or its size is zero, the elementType argument must not be * null. If the argument value's size is bigger than zero, the * argument elementType is not be used. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/IOUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/IOUtils.java index 3708a3b4d1c24..5bbfba39b76e2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/IOUtils.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/IOUtils.java @@ -370,7 +370,7 @@ public static void writeFully(FileChannel fc, ByteBuffer buf, } /** - * Return the complete list of files in a directory as strings.

      + * Return the complete list of files in a directory as strings.

      * * This is better than File#listDir because it does not ignore IOExceptions. * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/ReadaheadPool.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/ReadaheadPool.java index d5d22d96e4291..2bfbc70fc901b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/ReadaheadPool.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/ReadaheadPool.java @@ -91,7 +91,7 @@ private ReadaheadPool() { * @param readaheadLength the configured length to read ahead * @param maxOffsetToRead the maximum offset that will be readahead * (useful if, for example, only some segment of the file is - * requested by the user). Pass {@link Long.MAX_VALUE} to allow + * requested by the user). Pass {@link Long#MAX_VALUE} to allow * readahead to the end of the file. * @param lastReadahead the result returned by the previous invocation * of this function on this file descriptor, or null if this is diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SecureIOUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SecureIOUtils.java index 252ee4c08e3ec..9d3c3c1ceeaa7 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SecureIOUtils.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SecureIOUtils.java @@ -37,7 +37,7 @@ /** * This class provides secure APIs for opening and creating files on the local * disk. The main issue this class tries to handle is that of symlink traversal. - *
      + *
      * An example of such an attack is: *

        *
      1. Malicious user removes his task's syslog file, and puts a link to the @@ -50,7 +50,7 @@ *
      * A similar attack is possible involving task log truncation, but in that case * due to an insecure write to a file. - *
      + *
      */ public class SecureIOUtils { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SequenceFile.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SequenceFile.java index f42848b00cd34..9afa621892bf7 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SequenceFile.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SequenceFile.java @@ -79,7 +79,7 @@ * values. *
    • *
    • - * BlockCompressWriter : Block-compressed files, both keys & + * BlockCompressWriter : Block-compressed files, both keys & * values are collected in 'blocks' * separately and compressed. The size of * the 'block' is configurable. @@ -94,13 +94,13 @@ *

      The {@link SequenceFile.Reader} acts as the bridge and can read any of the * above SequenceFile formats.

      * - *

      SequenceFile Formats

      + *

      SequenceFile Formats

      * *

      Essentially there are 3 different formats for SequenceFiles * depending on the CompressionType specified. All of them share a * common header described below. * - *

      + * *
        *
      • * version - 3 bytes of magic header SEQ, followed by 1 byte of actual @@ -133,7 +133,7 @@ *
      • *
      * - *
      Uncompressed SequenceFile Format
      + *
      Uncompressed SequenceFile Format
      *
        *
      • * Header @@ -152,7 +152,7 @@ *
      • *
      * - *
      Record-Compressed SequenceFile Format
      + *
      Record-Compressed SequenceFile Format
      *
        *
      • * Header @@ -171,7 +171,7 @@ *
      • *
      * - *
      Block-Compressed SequenceFile Format
      + *
      Block-Compressed SequenceFile Format
      *
        *
      • * Header @@ -1935,8 +1935,8 @@ private void initialize(Path filename, FSDataInputStream in, * @param fs The file system used to open the file. * @param file The file being read. * @param bufferSize The buffer size used to read the file. - * @param length The length being read if it is >= 0. Otherwise, - * the length is not available. + * @param length The length being read if it is {@literal >=} 0. + * Otherwise, the length is not available. * @return The opened stream. * @throws IOException */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/Writable.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/Writable.java index f0fe6fb830a45..b94de6c3c72bd 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/Writable.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/Writable.java @@ -37,7 +37,7 @@ * and returns the instance.

        * *

        Example:

        - *

        + * 
          *     public class MyWritable implements Writable {
          *       // Some data
          *       private int counter;
        @@ -62,7 +62,7 @@
          *         return w;
          *       }
          *     }
        - * 

        + *
        */ @InterfaceAudience.Public @InterfaceStability.Stable diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/WritableComparable.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/WritableComparable.java index b030481231e35..c1208aa1173e0 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/WritableComparable.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/WritableComparable.java @@ -36,8 +36,9 @@ * satisfy this property.

        * *

        Example:

        - *

        - *     public class MyWritableComparable implements WritableComparable {
        + * 
        + *     public class MyWritableComparable implements
        + *      WritableComparable{@literal } {
          *       // Some data
          *       private int counter;
          *       private long timestamp;
        @@ -66,7 +67,7 @@
          *         return result
          *       }
          *     }
        - * 

        + *
        */ @InterfaceAudience.Public @InterfaceStability.Stable diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/WritableUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/WritableUtils.java index e58e0e1c3fa9f..2062fb6fe3705 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/WritableUtils.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/WritableUtils.java @@ -236,7 +236,8 @@ public static void cloneInto(Writable dst, Writable src) throws IOException { /** * Serializes an integer to a binary stream with zero-compressed encoding. - * For -112 <= i <= 127, only one byte is used with the actual value. + * For -112 {@literal <=} i {@literal <=} 127, only one byte is used with the + * actual value. * For other values of i, the first byte value indicates whether the * integer is positive or negative, and the number of bytes that follow. * If the first byte value v is between -113 and -116, the following integer @@ -255,7 +256,8 @@ public static void writeVInt(DataOutput stream, int i) throws IOException { /** * Serializes a long to a binary stream with zero-compressed encoding. - * For -112 <= i <= 127, only one byte is used with the actual value. + * For -112 {@literal <=} i {@literal <=} 127, only one byte is used with the + * actual value. * For other values of i, the first byte value indicates whether the * long is positive or negative, and the number of bytes that follow. * If the first byte value v is between -113 and -120, the following long diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CompressionCodecFactory.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CompressionCodecFactory.java index 3701f2026af78..e24812058e0a7 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CompressionCodecFactory.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CompressionCodecFactory.java @@ -227,9 +227,9 @@ public CompressionCodec getCodecByClassName(String classname) { /** * Find the relevant compression codec for the codec's canonical class name * or by codec alias. - *
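The WritableUtils hunk folded into the line above documents the zero-compressed VInt/VLong encoding; a small round trip (values picked arbitrarily) shows it in action.

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;

    import org.apache.hadoop.io.WritableUtils;

    public class VIntSketch {
      public static void main(String[] args) throws Exception {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);

        // 100 falls in the single-byte range (-112..127); 1000000 is written
        // as a marker byte followed by the minimal number of value bytes.
        WritableUtils.writeVInt(out, 100);
        WritableUtils.writeVLong(out, 1000000L);
        out.flush();
        System.out.println("encoded size = " + bytes.size() + " bytes");

        DataInputStream in =
            new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
        System.out.println(WritableUtils.readVInt(in));   // 100
        System.out.println(WritableUtils.readVLong(in));  // 1000000
      }
    }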

        + *

        * Codec aliases are case insensitive. - *

        + *

        * The code alias is the short class name (without the package name). * If the short class name ends with 'Codec', then there are two aliases for * the codec, the complete short class name and the short class name without @@ -255,9 +255,9 @@ public CompressionCodec getCodecByName(String codecName) { /** * Find the relevant compression codec for the codec's canonical class name * or by codec alias and returns its implemetation class. - *

        + *

        * Codec aliases are case insensitive. - *

        + *

        * The code alias is the short class name (without the package name). * If the short class name ends with 'Codec', then there are two aliases for * the codec, the complete short class name and the short class name without diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/Lz4Codec.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/Lz4Codec.java index 6b4a686e56cc1..ba6b487150501 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/Lz4Codec.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/Lz4Codec.java @@ -61,9 +61,9 @@ public Configuration getConf() { } /** - * Are the native lz4 libraries loaded & initialized? + * Are the native lz4 libraries loaded & initialized? * - * @return true if loaded & initialized, otherwise false + * @return true if loaded & initialized, otherwise false */ public static boolean isNativeCodeLoaded() { return NativeCodeLoader.isNativeCodeLoaded(); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/SnappyCodec.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/SnappyCodec.java index 2ce7fafbec4ca..686f30c9f89a2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/SnappyCodec.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/SnappyCodec.java @@ -57,7 +57,7 @@ public Configuration getConf() { } /** - * Are the native snappy libraries loaded & initialized? + * Are the native snappy libraries loaded & initialized? */ public static void checkNativeCodeLoaded() { if (!NativeCodeLoader.buildSupportsSnappy()) { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/Bzip2Compressor.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/Bzip2Compressor.java index d4a9787a4ab04..5713c56df6aef 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/Bzip2Compressor.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/Bzip2Compressor.java @@ -247,7 +247,7 @@ public synchronized long getBytesWritten() { } /** - * Returns the total number of uncompressed bytes input so far.
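    The CompressionCodecFactory hunks a few lines above describe lookup either by canonical class name or by case-insensitive alias. A hedged usage sketch (whether a given codec is actually usable depends on the native libraries available on the cluster):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.compress.CompressionCodec;
    import org.apache.hadoop.io.compress.CompressionCodecFactory;

    // Sketch only. "gzip" resolves to GzipCodec through the alias rules in
    // the javadoc: the short class name, with or without the "Codec" suffix,
    // matched case-insensitively.
    public class CodecLookup {
      public static void main(String[] args) {
        CompressionCodecFactory factory =
            new CompressionCodecFactory(new Configuration());
        CompressionCodec byAlias = factory.getCodecByName("gzip");
        CompressionCodec byClass = factory.getCodecByClassName(
            "org.apache.hadoop.io.compress.GzipCodec");
        System.out.println(byAlias.getDefaultExtension()); // ".gz"
        System.out.println(byClass.getClass().getName());
      }
    }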

        + * Returns the total number of uncompressed bytes input so far. * * @return the total (non-negative) number of uncompressed bytes input so far */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/Bzip2Decompressor.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/Bzip2Decompressor.java index 96693ad30d42b..72ba97630e206 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/Bzip2Decompressor.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/Bzip2Decompressor.java @@ -183,7 +183,7 @@ public synchronized long getBytesWritten() { } /** - * Returns the total number of compressed bytes input so far.

        + * Returns the total number of compressed bytes input so far. * * @return the total (non-negative) number of compressed bytes input so far */ @@ -195,7 +195,7 @@ public synchronized long getBytesRead() { /** * Returns the number of bytes remaining in the input buffers; normally * called when finished() is true to determine amount of post-gzip-stream - * data.

        + * data. * * @return the total (non-negative) number of unprocessed bytes in input */ @@ -206,7 +206,7 @@ public synchronized int getRemaining() { } /** - * Resets everything including the input buffers (user and direct).

        + * Resets everything including the input buffers (user and direct). */ @Override public synchronized void reset() { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/Bzip2Factory.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/Bzip2Factory.java index d24b4bf2a6f63..3af5309ca217c 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/Bzip2Factory.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/Bzip2Factory.java @@ -37,11 +37,11 @@ public class Bzip2Factory { private static boolean nativeBzip2Loaded; /** - * Check if native-bzip2 code is loaded & initialized correctly and + * Check if native-bzip2 code is loaded & initialized correctly and * can be loaded for this job. * * @param conf configuration - * @return true if native-bzip2 is loaded & initialized + * @return true if native-bzip2 is loaded & initialized * and can be loaded for this job, else false */ public static synchronized boolean isNativeBzip2Loaded(Configuration conf) { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java index bb02cf27a5a31..8426d25c2950e 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/CBZip2InputStream.java @@ -200,20 +200,18 @@ private int readAByte(InputStream inStream) throws IOException { } /** - * This method tries to find the marker (passed to it as the first parameter) - * in the stream. It can find bit patterns of length <= 63 bits. Specifically - * this method is used in CBZip2InputStream to find the end of block (EOB) - * delimiter in the stream, starting from the current position of the stream. - * If marker is found, the stream position will be at the byte containing - * the starting bit of the marker. - * - * @param marker The bit pattern to be found in the stream - * @param markerBitLength No of bits in the marker - * @return true if the marker was found otherwise false - * - * @throws IOException - * @throws IllegalArgumentException if marketBitLength is greater than 63 - */ + * This method tries to find the marker (passed to it as the first parameter) + * in the stream. It can find bit patterns of length <= 63 bits. + * Specifically this method is used in CBZip2InputStream to find the end of + * block (EOB) delimiter in the stream, starting from the current position + * of the stream. If marker is found, the stream position will be at the + * byte containing the starting bit of the marker. 
+ * @param marker The bit pattern to be found in the stream + * @param markerBitLength No of bits in the marker + * @return true if the marker was found otherwise false + * @throws IOException + * @throws IllegalArgumentException if marketBitLength is greater than 63 + */ public boolean skipToNextMarker(long marker, int markerBitLength) throws IOException, IllegalArgumentException { try { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/CBZip2OutputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/CBZip2OutputStream.java index ca4e5cd0df560..850fec77c5109 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/CBZip2OutputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/bzip2/CBZip2OutputStream.java @@ -64,7 +64,8 @@ *
    * * - * + * + * * * * @@ -614,9 +615,9 @@ public CBZip2OutputStream(final OutputStream out) throws IOException { * @throws IOException * if an I/O error occurs in the specified stream. * @throws IllegalArgumentException - * if (blockSize < 1) || (blockSize > 9). + * if {@code (blockSize < 1) || (blockSize > 9)} * @throws NullPointerException - * if out == null. + * if {@code out == null}. * * @see #MIN_BLOCKSIZE * @see #MAX_BLOCKSIZE diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/BuiltInGzipDecompressor.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/BuiltInGzipDecompressor.java index b4c66596ccd87..896d35eb1808b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/BuiltInGzipDecompressor.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/BuiltInGzipDecompressor.java @@ -404,7 +404,7 @@ private void executeTrailerState() throws IOException { /** * Returns the total number of compressed bytes input so far, including - * gzip header/trailer bytes.

    + * gzip header/trailer bytes. * * @return the total (non-negative) number of compressed bytes read so far */ @@ -420,7 +420,7 @@ public synchronized long getBytesRead() { * non-zero value unless called after {@link #setInput(byte[] b, int off, * int len)} and before {@link #decompress(byte[] b, int off, int len)}. * (That is, after {@link #decompress(byte[] b, int off, int len)} it - * always returns zero, except in finished state with concatenated data.)

    + * always returns zero, except in finished state with concatenated data.) * * @return the total (non-negative) number of unprocessed bytes in input */ @@ -441,7 +441,7 @@ public synchronized void setDictionary(byte[] b, int off, int len) { /** * Returns true if the end of the gzip substream (single "member") has been - * reached.

    + * reached. */ @Override public synchronized boolean finished() { @@ -450,7 +450,7 @@ public synchronized boolean finished() { /** * Resets everything, including the input buffer, regardless of whether the - * current gzip substream is finished.

    + * current gzip substream is finished. */ @Override public synchronized void reset() { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/ZlibCompressor.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/ZlibCompressor.java index 438c8bedad515..da8a90bb3170e 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/ZlibCompressor.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/ZlibCompressor.java @@ -435,7 +435,7 @@ public long getBytesWritten() { } /** - * Returns the total number of uncompressed bytes input so far.

    + * Returns the total number of uncompressed bytes input so far. * * @return the total (non-negative) number of uncompressed bytes input so far */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/ZlibDecompressor.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/ZlibDecompressor.java index dd550b9acba01..f642d7713035d 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/ZlibDecompressor.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/ZlibDecompressor.java @@ -243,7 +243,7 @@ public long getBytesWritten() { } /** - * Returns the total number of compressed bytes input so far.

    + * Returns the total number of compressed bytes input so far. * * @return the total (non-negative) number of compressed bytes input so far */ @@ -255,7 +255,7 @@ public long getBytesRead() { /** * Returns the number of bytes remaining in the input buffers; normally * called when finished() is true to determine amount of post-gzip-stream - * data.

    + * data. * * @return the total (non-negative) number of unprocessed bytes in input */ @@ -266,7 +266,7 @@ public int getRemaining() { } /** - * Resets everything including the input buffers (user and direct).

    + * Resets everything including the input buffers (user and direct). */ @Override public void reset() { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/ZlibFactory.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/ZlibFactory.java index 93b3b6db32ff7..07afbab7246b7 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/ZlibFactory.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/zlib/ZlibFactory.java @@ -73,11 +73,11 @@ public static void setNativeZlibLoaded(final boolean isLoaded) { ZlibFactory.nativeZlibLoaded = isLoaded; } /** - * Check if native-zlib code is loaded & initialized correctly and + * Check if native-zlib code is loaded & initialized correctly and * can be loaded for this job. * * @param conf configuration - * @return true if native-zlib is loaded & initialized + * @return true if native-zlib is loaded & initialized * and can be loaded for this job, else false */ public static boolean isNativeZlibLoaded(Configuration conf) { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/CodecUtil.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/CodecUtil.java index 8ec0e72c3e114..5ba6e9c0dd1ed 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/CodecUtil.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/CodecUtil.java @@ -36,7 +36,7 @@ import java.lang.reflect.InvocationTargetException; /** - * A codec & coder utility to help create coders conveniently. + * A codec & coder utility to help create coders conveniently. * * {@link CodecUtil} includes erasure coder configurations key and default * values such as coder class name and erasure codec option values included diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/rawcoder/util/GaloisField.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/rawcoder/util/GaloisField.java index fdb47be9c9a6b..f80fceca94c34 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/rawcoder/util/GaloisField.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/erasurecode/rawcoder/util/GaloisField.java @@ -518,7 +518,7 @@ public void remainder(ByteBuffer[] dividend, int[] divisor) { /** * Perform Gaussian elimination on the given matrix. This matrix has to be a - * fat matrix (number of rows > number of columns). + * fat matrix (number of rows > number of columns). */ public void gaussianElimination(int[][] matrix) { assert(matrix != null && matrix.length > 0 && matrix[0].length > 0 diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/file/tfile/TFile.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/file/tfile/TFile.java index c63baa550b13d..09cd2825e3cf2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/file/tfile/TFile.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/file/tfile/TFile.java @@ -1308,11 +1308,11 @@ protected Scanner(Reader reader, long offBegin, long offEnd) * @param reader * The TFile reader object. * @param beginKey - * Begin key of the scan. If null, scan from the first - * entry of the TFile. 
+ * Begin key of the scan. If null, scan from the first + * <K, V> entry of the TFile. * @param endKey - * End key of the scan. If null, scan up to the last entry - * of the TFile. + * End key of the scan. If null, scan up to the last <K, V> + * entry of the TFile. * @throws IOException */ protected Scanner(Reader reader, RawComparable beginKey, diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/file/tfile/Utils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/file/tfile/Utils.java index 8cb6e0d95ce1e..17a27f16b9a4a 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/file/tfile/Utils.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/file/tfile/Utils.java @@ -62,27 +62,33 @@ public static void writeVInt(DataOutput out, int n) throws IOException { *
  • if n in [-32, 127): encode in one byte with the actual value. * Otherwise, *
  • if n in [-20*2^8, 20*2^8): encode in two bytes: byte[0] = n/256 - 52; - * byte[1]=n&0xff. Otherwise, + * byte[1]=n&0xff. Otherwise, *
  • if n IN [-16*2^16, 16*2^16): encode in three bytes: byte[0]=n/2^16 - - * 88; byte[1]=(n>>8)&0xff; byte[2]=n&0xff. Otherwise, + * 88; byte[1]=(n>>8)&0xff; byte[2]=n&0xff. Otherwise, *
  • if n in [-8*2^24, 8*2^24): encode in four bytes: byte[0]=n/2^24 - 112; - * byte[1] = (n>>16)&0xff; byte[2] = (n>>8)&0xff; byte[3]=n&0xff. Otherwise: + * byte[1] = (n>>16)&0xff; byte[2] = (n>>8)&0xff; + * byte[3]=n&0xff. + * Otherwise: *
  • if n in [-2^31, 2^31): encode in five bytes: byte[0]=-125; byte[1] = - * (n>>24)&0xff; byte[2]=(n>>16)&0xff; byte[3]=(n>>8)&0xff; byte[4]=n&0xff; + * (n>>24)&0xff; byte[2]=(n>>16)&0xff; + * byte[3]=(n>>8)&0xff; byte[4]=n&0xff; *
  • if n in [-2^39, 2^39): encode in six bytes: byte[0]=-124; byte[1] = - * (n>>32)&0xff; byte[2]=(n>>24)&0xff; byte[3]=(n>>16)&0xff; - * byte[4]=(n>>8)&0xff; byte[5]=n&0xff + * (n>>32)&0xff; byte[2]=(n>>24)&0xff; + * byte[3]=(n>>16)&0xff; byte[4]=(n>>8)&0xff; + * byte[5]=n&0xff *
  • if n in [-2^47, 2^47): encode in seven bytes: byte[0]=-123; byte[1] = - * (n>>40)&0xff; byte[2]=(n>>32)&0xff; byte[3]=(n>>24)&0xff; - * byte[4]=(n>>16)&0xff; byte[5]=(n>>8)&0xff; byte[6]=n&0xff; + * (n>>40)&0xff; byte[2]=(n>>32)&0xff; + * byte[3]=(n>>24)&0xff; byte[4]=(n>>16)&0xff; + * byte[5]=(n>>8)&0xff; byte[6]=n&0xff; *
  • if n in [-2^55, 2^55): encode in eight bytes: byte[0]=-122; byte[1] = - * (n>>48)&0xff; byte[2] = (n>>40)&0xff; byte[3]=(n>>32)&0xff; - * byte[4]=(n>>24)&0xff; byte[5]=(n>>16)&0xff; byte[6]=(n>>8)&0xff; - * byte[7]=n&0xff; + * (n>>48)&0xff; byte[2] = (n>>40)&0xff; + * byte[3]=(n>>32)&0xff; byte[4]=(n>>24)&0xff; byte[5]= + * (n>>16)&0xff; byte[6]=(n>>8)&0xff; byte[7]=n&0xff; *
  • if n in [-2^63, 2^63): encode in nine bytes: byte[0]=-121; byte[1] = - * (n>>54)&0xff; byte[2] = (n>>48)&0xff; byte[3] = (n>>40)&0xff; - * byte[4]=(n>>32)&0xff; byte[5]=(n>>24)&0xff; byte[6]=(n>>16)&0xff; - * byte[7]=(n>>8)&0xff; byte[8]=n&0xff; + * (n>>54)&0xff; byte[2] = (n>>48)&0xff; + * byte[3] = (n>>40)&0xff; byte[4]=(n>>32)&0xff; + * byte[5]=(n>>24)&0xff; byte[6]=(n>>16)&0xff; byte[7]= + * (n>>8)&0xff; byte[8]=n&0xff; * * * @param out @@ -181,15 +187,15 @@ public static int readVInt(DataInput in) throws IOException { * Decoding the variable-length integer. Suppose the value of the first byte * is FB, and the following bytes are NB[*]. *
      - *
    • if (FB >= -32), return (long)FB; - *
    • if (FB in [-72, -33]), return (FB+52)<<8 + NB[0]&0xff; - *
    • if (FB in [-104, -73]), return (FB+88)<<16 + (NB[0]&0xff)<<8 + - * NB[1]&0xff; - *
    • if (FB in [-120, -105]), return (FB+112)<<24 + (NB[0]&0xff)<<16 + - * (NB[1]&0xff)<<8 + NB[2]&0xff; + *
    • if (FB >= -32), return (long)FB; + *
    • if (FB in [-72, -33]), return (FB+52)<<8 + NB[0]&0xff; + *
    • if (FB in [-104, -73]), return (FB+88)<<16 + + * (NB[0]&0xff)<<8 + NB[1]&0xff; + *
    • if (FB in [-120, -105]), return (FB+112)<<24 + (NB[0]&0xff) + * <<16 + (NB[1]&0xff)<<8 + NB[2]&0xff; *
    • if (FB in [-128, -121]), return interpret NB[FB+129] as a signed * big-endian integer. - * + *
    * @param in * input stream * @return the decoded long integer. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryProxy.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryProxy.java index 9875bcd185d02..7fcd5fd4b0080 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryProxy.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/retry/RetryProxy.java @@ -89,12 +89,12 @@ public static Object create(Class iface, T implementation, * * @param iface the interface that the retry will implement * @param proxyProvider provides implementation instances whose methods should be retried - * @param methodNameToPolicyMapa map of method names to retry policies + * @param methodNameToPolicyMap map of method names to retry policies * @return the retry proxy */ public static Object create(Class iface, FailoverProxyProvider proxyProvider, - Map methodNameToPolicyMap, + Map methodNameToPolicyMap, RetryPolicy defaultPolicy) { return Proxy.newProxyInstance( proxyProvider.getInterface().getClassLoader(), diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/serializer/Deserializer.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/serializer/Deserializer.java index 3b727d906b6f0..3c8dfccafa8bb 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/serializer/Deserializer.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/serializer/Deserializer.java @@ -26,7 +26,7 @@ /** *

    - * Provides a facility for deserializing objects of type from an + * Provides a facility for deserializing objects of type {@literal } from an * {@link InputStream}. *

    * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/serializer/Serializer.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/serializer/Serializer.java index 63d3738de88b7..5ada541370ee0 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/serializer/Serializer.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/serializer/Serializer.java @@ -26,7 +26,7 @@ /** *

    - * Provides a facility for serializing objects of type to an + * Provides a facility for serializing objects of type <T> to an * {@link OutputStream}. *

    * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/CallerContext.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/CallerContext.java index b156d1fe64781..0be5939e9906b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/CallerContext.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/CallerContext.java @@ -131,7 +131,7 @@ public CallerContext build() { /** * The thread local current caller context. - *
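    The Serializer/Deserializer javadoc touched above belongs to the pluggable serialization SPI. A rough usage sketch via SerializationFactory follows; Text is used only as a convenient Writable, and the round trip shown is an illustration rather than anything from the patch:

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.serializer.Deserializer;
    import org.apache.hadoop.io.serializer.SerializationFactory;
    import org.apache.hadoop.io.serializer.Serializer;

    // Sketch only. The factory selects WritableSerialization for Text.
    public class SerializerRoundTrip {
      public static void main(String[] args) throws IOException {
        SerializationFactory factory =
            new SerializationFactory(new Configuration());

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        Serializer<Text> serializer = factory.getSerializer(Text.class);
        serializer.open(bytes);
        serializer.serialize(new Text("hello"));
        serializer.close();

        Deserializer<Text> deserializer = factory.getDeserializer(Text.class);
        deserializer.open(new ByteArrayInputStream(bytes.toByteArray()));
        Text copy = deserializer.deserialize(null);
        deserializer.close();
        System.out.println(copy); // hello
      }
    }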

    + *

    * Internal class for defered singleton idiom. * https://en.wikipedia.org/wiki/Initialization_on_demand_holder_idiom */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/ClientCache.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/ClientCache.java index 8a5e324e226d4..9c02c459e5cbc 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/ClientCache.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/ClientCache.java @@ -39,7 +39,7 @@ public class ClientCache { new HashMap(); /** - * Construct & cache an IPC client with the user-provided SocketFactory + * Construct & cache an IPC client with the user-provided SocketFactory * if no cached client exists. * * @param conf Configuration @@ -68,7 +68,7 @@ public synchronized Client getClient(Configuration conf, } /** - * Construct & cache an IPC client with the default SocketFactory + * Construct & cache an IPC client with the default SocketFactory * and default valueClass if no cached client exists. * * @param conf Configuration @@ -79,7 +79,7 @@ public synchronized Client getClient(Configuration conf) { } /** - * Construct & cache an IPC client with the user-provided SocketFactory + * Construct & cache an IPC client with the user-provided SocketFactory * if no cached client exists. Default response type is ObjectWritable. * * @param conf Configuration diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/DecayRpcScheduler.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/DecayRpcScheduler.java index d1108a993564e..7d3b34b1c082f 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/DecayRpcScheduler.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/DecayRpcScheduler.java @@ -81,7 +81,8 @@ public class DecayRpcScheduler implements RpcScheduler, /** * Decay factor controls how much each count is suppressed by on each sweep. - * Valid numbers are > 0 and < 1. Decay factor works in tandem with period + * Valid numbers are > 0 and < 1. Decay factor works in tandem with + * period * to control how long the scheduler remembers an identity. */ public static final String IPC_SCHEDULER_DECAYSCHEDULER_FACTOR_KEY = diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RefreshHandler.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RefreshHandler.java index 3fe9eb71fb6c9..3622d2c47bf33 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RefreshHandler.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RefreshHandler.java @@ -28,7 +28,6 @@ public interface RefreshHandler { * Implement this method to accept refresh requests from the administrator. 
* @param identifier is the identifier you registered earlier * @param args contains a list of string args from the administrator - * @throws Exception as a shorthand for a RefreshResponse(-1, message) * @return a RefreshResponse */ RefreshResponse handleRefresh(String identifier, String[] args); diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RemoteException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RemoteException.java index 620e100603c48..36e280f39990b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RemoteException.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RemoteException.java @@ -102,7 +102,7 @@ public IOException unwrapRemoteException(Class... lookupTypes) { * a String as a parameter. * Otherwise it returns this. * - * @return Throwable + * @return Throwable */ public IOException unwrapRemoteException() { try { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java index f152368be1d2d..f508757f84381 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java @@ -3039,7 +3039,7 @@ protected Server(String bindAddress, int port, /** * Constructs a server listening on the named port and address. Parameters passed must - * be of the named class. The handlerCount determines + * be of the named class. The handlerCount determines * the number of handler threads that will be used to process calls. * If queueSizePerHandler or numReaders are not -1 they will be used instead of parameters * from configuration. Otherwise the configuration will be picked up. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/jmx/JMXJsonServlet.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/jmx/JMXJsonServlet.java index 093d0af9dfad1..c404ebedebbe1 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/jmx/JMXJsonServlet.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/jmx/JMXJsonServlet.java @@ -70,7 +70,7 @@ *

    * The optional get parameter is used to query an specific * attribute of a JMX bean. The format of the URL is - * http://.../jmx?get=MXBeanName::AttributeName + * http://.../jmx?get=MXBeanName::AttributeName *

    * For example * @@ -85,7 +85,7 @@ *

    * The return format is JSON and in the form *

    - *

    + *  
    
      *  {
      *    "beans" : [
      *      {
    @@ -94,7 +94,7 @@
      *      }
      *    ]
      *  }
    - *  
    + *
    *

    * The servlet attempts to convert the the JMXBeans into JSON. Each * bean's attributes will be converted to a JSON object member. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/log/LogThrottlingHelper.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/log/LogThrottlingHelper.java index 591c3fb8a0b95..cde180bd1e038 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/log/LogThrottlingHelper.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/log/LogThrottlingHelper.java @@ -62,10 +62,10 @@ * still maintaining overall information about how many large requests were * received. * - *

    This class can also be used to coordinate multiple logging points; see + *

    This class can also be used to coordinate multiple logging points; see * {@link #record(String, long, double...)} for more details. * - *

    This class is not thread-safe. + *

    This class is not thread-safe. */ public class LogThrottlingHelper { @@ -175,7 +175,7 @@ public LogThrottlingHelper(long minLogPeriodMs, String primaryRecorderName) { * about the values specified since the last time the caller was expected to * write to its log. * - *

    Specifying multiple values will maintain separate summary statistics + *

    Specifying multiple values will maintain separate summary statistics * about each value. For example: *

    {@code
        *   helper.record(1, 0);
    @@ -230,7 +230,7 @@ public LogAction record(double... values) {
        * iteration as "pre", yet each one is able to maintain its own summary
        * information.
        *
    -   * 

    Other behavior is the same as {@link #record(double...)}. + *

    Other behavior is the same as {@link #record(double...)}. * * @param recorderName The name of the recorder. This is used to check if the * current recorder is the primary. Other names are diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MutableRollingAverages.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MutableRollingAverages.java index 3ae9568ba552f..700fc62f74608 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MutableRollingAverages.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MutableRollingAverages.java @@ -280,7 +280,7 @@ public void close() throws IOException { } /** - * Retrieve a map of metric name -> (aggregate). + * Retrieve a map of metric name {@literal ->} (aggregate). * Filter out entries that don't have at least minSamples. * * @return a map of peer DataNode Id to the average latency to that diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/package-info.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/package-info.java index ff7cd2510a009..8fd3b33b3a253 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/package-info.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/package-info.java @@ -87,6 +87,7 @@ usually does not need to reference any class here.
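    For the LogThrottlingHelper javadoc revised above, a minimal usage sketch; the 5000 ms period and the message are assumptions made for illustration:

    import org.apache.hadoop.log.LogThrottlingHelper;
    import org.apache.hadoop.log.LogThrottlingHelper.LogAction;

    // Sketch only. At most one message is emitted per 5000 ms; calls in
    // between are recorded but suppressed.
    public class ThrottledWarning {
      private final LogThrottlingHelper throttle = new LogThrottlingHelper(5000);

      void onSlowOperation(double durationMs) {
        LogAction action = throttle.record(durationMs);
        if (action.shouldLog()) {
          System.err.println("slow operation took " + durationMs + " ms");
        }
      }
    }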

    Getting started

    Implementing metrics sources

  • Memory usage by blocksize
    + @@ -289,6 +290,7 @@ metrics system decouples the concept for context (for grouping) with the backend that can handle multiple contexts (file, gangalia etc.):

Using annotations | Using MetricsSource interface
    + @@ -310,6 +312,7 @@ backend that can handle multiple contexts (file, gangalia etc.): using the context option in the sink options like the following:

Before | After
    + diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/sink/RollingFileSystemSink.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/sink/RollingFileSystemSink.java index 92ac9529becda..1d330c74ab46a 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/sink/RollingFileSystemSink.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/sink/RollingFileSystemSink.java @@ -111,7 +111,7 @@ * unknown.

    * *

    Instead of appending to an existing file, by default the sink - * will create a new file with a suffix of ".<n>&quet;, where + * will create a new file with a suffix of ".<n>", where * n is the next lowest integer that isn't already used in a file name, * similar to the Hadoop daemon logs. NOTE: the file with the highest * sequence number is the newest file, unlike the Hadoop daemon logs.

    diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/sink/StatsDSink.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/sink/StatsDSink.java index b2be0a20f7ddf..c1dbf7ec82c99 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/sink/StatsDSink.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/sink/StatsDSink.java @@ -47,10 +47,10 @@ * a daemon that is running on the localhost and will add the * hostname to the metric (such as the * CollectD StatsD plugin). - *
    + *
    * To configure this plugin, you will need to add the following * entries to your hadoop-metrics2.properties file: - *
    + *
    *
      * *.sink.statsd.class=org.apache.hadoop.metrics2.sink.StatsDSink
      * [prefix].sink.statsd.server.host=
    diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/util/MBeans.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/util/MBeans.java
    index 916367f0439fb..1b50498bbaf5a 100644
    --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/util/MBeans.java
    +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/util/MBeans.java
    @@ -59,8 +59,9 @@ private MBeans() {
     
       /**
        * Register the MBean using our standard MBeanName format
    -   * "hadoop:service=,name="
    -   * Where the  and  are the supplied parameters.
    +   * "hadoop:service={@literal ,name=}"
    +   * Where the {@literal  and } are the supplied
    +   * parameters.
        *
        * @param serviceName
        * @param nameName
    @@ -75,8 +76,9 @@ static public ObjectName register(String serviceName, String nameName,
     
       /**
        * Register the MBean using our standard MBeanName format
    -   * "hadoop:service=,name="
    -   * Where the  and  are the supplied parameters.
    +   * "hadoop:service={@literal ,name=}"
    +   * Where the {@literal  and } are the supplied
    +   * parameters.
        *
        * @param serviceName
        * @param nameName
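    As a side note on the MBeans format reworded above, a rough registration sketch; the service/name strings and the MXBean interface are assumptions for illustration:

    import javax.management.ObjectName;

    import org.apache.hadoop.metrics2.util.MBeans;

    // Sketch only. Registers under "hadoop:service=MyService,name=MyServiceInfo".
    public class MBeanRegistration {
      public interface MyServiceInfoMXBean {
        int getActiveRequests();
      }

      public static class MyServiceInfo implements MyServiceInfoMXBean {
        @Override
        public int getActiveRequests() {
          return 0;
        }
      }

      public static void main(String[] args) {
        ObjectName name =
            MBeans.register("MyService", "MyServiceInfo", new MyServiceInfo());
        // ... later, during shutdown:
        MBeans.unregister(name);
      }
    }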
    diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/AbstractDNSToSwitchMapping.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/AbstractDNSToSwitchMapping.java
    index b2d803c95f58b..97723c4a37270 100644
    --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/AbstractDNSToSwitchMapping.java
    +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/AbstractDNSToSwitchMapping.java
    @@ -29,11 +29,11 @@
     import java.util.Set;
     
     /**
    - * This is a base class for DNS to Switch mappings. 

    It is not mandatory to + * This is a base class for DNS to Switch mappings.

    It is not mandatory to * derive {@link DNSToSwitchMapping} implementations from it, but it is strongly * recommended, as it makes it easy for the Hadoop developers to add new methods * to this base class that are automatically picked up by all implementations. - *

    + *

    * * This class does not extend the Configured * base class, and should not be changed to do so, as it causes problems @@ -81,7 +81,7 @@ public void setConf(Configuration conf) { * multi-rack. Subclasses may override this with methods that are more aware * of their topologies. * - *

    + *

    * * This method is used when parts of Hadoop need know whether to apply * single rack vs multi-rack policies, such as during block placement. @@ -140,7 +140,7 @@ protected boolean isSingleSwitchByScriptPolicy() { /** * Query for a {@link DNSToSwitchMapping} instance being on a single * switch. - *

    + *

    * This predicate simply assumes that all mappings not derived from * this class are multi-switch. * @param mapping the mapping to query diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/DNS.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/DNS.java index 2fb4d3e1b7b00..061971cb6817f 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/DNS.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/DNS.java @@ -141,7 +141,7 @@ private static LinkedHashSet getSubinterfaceInetAddrs( } /** - * Like {@link DNS#getIPs(String, boolean), but returns all + * Like {@link DNS#getIPs(String, boolean)}, but returns all * IPs associated with the given interface and its subinterfaces. */ public static String[] getIPs(String strInterface) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/DNSToSwitchMapping.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/DNSToSwitchMapping.java index 7b1b332b9b736..1e6f5f500849f 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/DNSToSwitchMapping.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/DNSToSwitchMapping.java @@ -40,7 +40,7 @@ public interface DNSToSwitchMapping { * Note the hostname/ip-address is not part of the returned path. * The network topology of the cluster would determine the number of * components in the network path. - *

    + *

    * * If a name cannot be resolved to a rack, the implementation * should return {@link NetworkTopology#DEFAULT_RACK}. This diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetUtils.java index 9ded0f4be8e2b..b16be2f85d73a 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetUtils.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetUtils.java @@ -148,8 +148,8 @@ public static SocketFactory getSocketFactoryFromProperty( /** * Util method to build socket addr from either: - * : - * ://:/ + * {@literal :} + * {@literal ://:/} */ public static InetSocketAddress createSocketAddr(String target) { return createSocketAddr(target, -1); @@ -157,9 +157,9 @@ public static InetSocketAddress createSocketAddr(String target) { /** * Util method to build socket addr from either: - * - * : - * ://:/ + * {@literal } + * {@literal :} + * {@literal ://:/} */ public static InetSocketAddress createSocketAddr(String target, int defaultPort) { @@ -949,7 +949,7 @@ public static List getIPs(String subnet, * Return a free port number. There is no guarantee it will remain free, so * it should be used immediately. * - * @returns A free port for binding a local socket + * @return A free port for binding a local socket */ public static int getFreeSocketPort() { int port = 0; @@ -970,7 +970,7 @@ public static int getFreeSocketPort() { * * @param localAddr * @param bindWildCardAddress - * @returns InetAddress + * @return InetAddress */ public static InetAddress bindToLocalAddress(InetAddress localAddr, boolean bindWildCardAddress) { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopology.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopology.java index dea8f2ac3c24a..ed22711f8288f 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopology.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopology.java @@ -114,7 +114,7 @@ public NetworkTopology() { } /** Add a leaf node - * Update node counter & rack counter if necessary + * Update node counter & rack counter if necessary * @param node node to be added; can be null * @exception IllegalArgumentException if add a node to a leave or node to be added is not a leaf @@ -856,12 +856,12 @@ private static String normalizeNetworkLocationPath(String path) { /** * Sort nodes array by network distance to reader. - *

    + *

    * In a three-level topology, a node can be either local, on the same rack, * or on a different rack from the reader. Sorting the nodes based on network * distance from the reader reduces network traffic and improves * performance. - *

    + *

    * As an additional twist, we also randomize the nodes at each network * distance. This helps with load balancing when there is data skew. * @@ -879,11 +879,11 @@ public void sortByDistance(Node reader, Node[] nodes, int activeLen) { /** * Sort nodes array by network distance to reader. - *

    using network location. This is used when the reader + *

    using network location. This is used when the reader * is not a datanode. Sorting the nodes based on network distance * from the reader reduces network traffic and improves * performance. - *

    + *

    * * @param reader Node where data will be read * @param nodes Available replicas with the requested data @@ -900,7 +900,7 @@ public void sortByDistanceUsingNetworkLocation(Node reader, Node[] nodes, /** * Sort nodes array by network distance to reader. - *

    + *

    * As an additional twist, we also randomize the nodes at each network * distance. This helps with load balancing when there is data skew. * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopologyWithNodeGroup.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopologyWithNodeGroup.java index bec0fe13064f4..487a4575af4f9 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopologyWithNodeGroup.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopologyWithNodeGroup.java @@ -168,7 +168,7 @@ public boolean isNodeGroupAware() { } /** Add a leaf node - * Update node counter & rack counter if necessary + * Update node counter & rack counter if necessary * @param node node to be added; can be null * @exception IllegalArgumentException if add a node to a leave * or node to be added is not a leaf @@ -272,7 +272,7 @@ protected int getWeight(Node reader, Node node) { /** * Sort nodes array by their distances to reader. - *

    + *

    * This is the same as {@link NetworkTopology#sortByDistance(Node, Node[], * int)} except with a four-level network topology which contains the * additional network distance of a "node group" which is between local and diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/ScriptBasedMapping.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/ScriptBasedMapping.java index 02b44a54fec91..4db8155ffed3b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/ScriptBasedMapping.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/ScriptBasedMapping.java @@ -33,13 +33,13 @@ * This class implements the {@link DNSToSwitchMapping} interface using a * script configured via the * {@link CommonConfigurationKeys#NET_TOPOLOGY_SCRIPT_FILE_NAME_KEY} option. - *

    + *

    * It contains a static class RawScriptBasedMapping that performs * the work: reading the configuration parameters, executing any defined * script, handling errors and such like. The outer * class extends {@link CachedDNSToSwitchMapping} to cache the delegated * queries. - *

    + *

    * This DNS mapper's {@link #isSingleSwitch()} predicate returns * true if and only if a script is defined. */ @@ -78,7 +78,7 @@ public class ScriptBasedMapping extends CachedDNSToSwitchMapping { /** * Create an instance with the default configuration. - *

    + *

    * Calling {@link #setConf(Configuration)} will trigger a * re-evaluation of the configuration settings and so be used to * set up the mapping script. @@ -125,7 +125,7 @@ public String toString() { /** * {@inheritDoc} - *

    + *

    * This will get called in the superclass constructor, so a check is needed * to ensure that the raw mapping is defined before trying to relaying a null * configuration. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/ScriptBasedMappingWithDependency.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/ScriptBasedMappingWithDependency.java index 086650bd7d31a..e05fae6496a15 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/ScriptBasedMappingWithDependency.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/ScriptBasedMappingWithDependency.java @@ -32,10 +32,9 @@ * the {@link DNSToSwitchMappingWithDependency} interface using * a script configured via the * {@link CommonConfigurationKeys#NET_DEPENDENCY_SCRIPT_FILE_NAME_KEY} option. - *

    + *

    * It contains a static class RawScriptBasedMappingWithDependency * that performs the getDependency work. - *

    */ @InterfaceAudience.Private @InterfaceStability.Evolving @@ -52,7 +51,7 @@ public class ScriptBasedMappingWithDependency extends ScriptBasedMapping /** * Create an instance with the default configuration. - *

    + *

    * Calling {@link #setConf(Configuration)} will trigger a * re-evaluation of the configuration settings and so be used to * set up the mapping script. @@ -76,7 +75,7 @@ public String toString() { /** * {@inheritDoc} - *

    + *

    * This will get called in the superclass constructor, so a check is needed * to ensure that the raw mapping is defined before trying to relaying a null * configuration. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/SocketOutputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/SocketOutputStream.java index ead1d7b2b05a1..93f4f56d78d63 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/SocketOutputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/SocketOutputStream.java @@ -32,7 +32,6 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.metrics2.lib.MutableRate; /** * This implements an output stream that can have a timeout while writing. @@ -187,7 +186,7 @@ public void waitForWritable() throws IOException { * @param count number of bytes to transfer. * @param waitForWritableTime nanoseconds spent waiting for the socket * to become writable - * @param transferTime nanoseconds spent transferring data + * @param transferToTime nanoseconds spent transferring data * * @throws EOFException * If end of input file is reached before requested number of @@ -253,7 +252,8 @@ public void transferToFully(FileChannel fileCh, long position, int count, /** * Call - * {@link #transferToFully(FileChannel, long, int, MutableRate, MutableRate)} + * {@link #transferToFully(FileChannel, long, int, LongWritable, LongWritable) + * } * with null waitForWritableTime and transferToTime */ public void transferToFully(FileChannel fileCh, long position, int count) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/AuthenticationFilterInitializer.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/AuthenticationFilterInitializer.java index 7e6b3a8bf76f7..b25d5d71616f9 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/AuthenticationFilterInitializer.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/AuthenticationFilterInitializer.java @@ -31,11 +31,11 @@ /** * Initializes hadoop-auth AuthenticationFilter which provides support for * Kerberos HTTP SPNEGO authentication. - *

    + *

* It enables anonymous access, simple/pseudo and Kerberos HTTP SPNEGO * authentication for Hadoop JobTracker, NameNode, DataNodes and * TaskTrackers. - *

    + *

    * Refer to the core-default.xml file, after the comment * 'HTTP Authentication' for details on the configuration options. * All related configuration properties have 'hadoop.http.authentication.' @@ -47,7 +47,7 @@ public class AuthenticationFilterInitializer extends FilterInitializer { /** * Initializes hadoop-auth AuthenticationFilter. - *

    + *

    * Propagates to hadoop-auth AuthenticationFilter configuration all Hadoop * configuration properties prefixed with "hadoop.http.authentication." * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/IdMappingServiceProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/IdMappingServiceProvider.java index 4a1185e8e7fe3..86edab7de7097 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/IdMappingServiceProvider.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/IdMappingServiceProvider.java @@ -25,8 +25,9 @@ import org.apache.hadoop.fs.CommonConfigurationKeysPublic; /** - * An interface for the implementation of mapping - * and mapping + * An interface for the implementation of {@literal <}userId, + * userName{@literal >} mapping and {@literal <}groupId, groupName{@literal >} + * mapping. */ @InterfaceAudience.Public @InterfaceStability.Evolving diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslRpcClient.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslRpcClient.java index 11714b15bd18a..d236ab0c0e948 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslRpcClient.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SaslRpcClient.java @@ -343,13 +343,9 @@ String getServerPrincipal(SaslAuth authType) throws IOException { } /** - * Do client side SASL authentication with server via the given InputStream - * and OutputStream - * - * @param inS - * InputStream to use - * @param outS - * OutputStream to use + * Do client side SASL authentication with server via the given IpcStreams. + * + * @param ipcStreams * @return AuthMethod used to negotiate the connection * @throws IOException */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java index dbf1328a17eac..03c5b58795c5d 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/SecurityUtil.java @@ -332,7 +332,8 @@ public static String buildDTServiceName(URI uri, int defPort) { } /** - * Get the host name from the principal name of format /host@realm. + * Get the host name from the principal name of format {@literal <}service + * {@literal >}/host@realm. * @param principalName principal name of format as described above * @return host name if the the string conforms to the above format, else null */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java index 23f3ae9bacc8c..6eb04a62d106e 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java @@ -684,7 +684,7 @@ public static UserGroupInformation getLoginUser() throws IOException { /** * remove the login method that is followed by a space from the username - * e.g. "jack (auth:SIMPLE)" -> "jack" + * e.g. 
"jack (auth:SIMPLE)" {@literal ->} "jack" * * @param userName * @return userName without login method diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/alias/JavaKeyStoreProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/alias/JavaKeyStoreProvider.java index 52f39ef3c19e6..5028482dfc4aa 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/alias/JavaKeyStoreProvider.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/alias/JavaKeyStoreProvider.java @@ -33,9 +33,9 @@ /** * CredentialProvider based on Java's KeyStore file format. The file may be * stored in any Hadoop FileSystem using the following name mangling: - * jceks://hdfs@nn1.example.com/my/creds.jceks -> - * hdfs://nn1.example.com/my/creds.jceks jceks://file/home/larry/creds.jceks -> - * file:///home/larry/creds.jceks + * jceks://hdfs@nn1.example.com/my/creds.jceks {@literal ->} + * hdfs://nn1.example.com/my/creds.jceks jceks://file/home/larry/creds.jceks + * {@literal ->} file:///home/larry/creds.jceks */ @InterfaceAudience.Private public class JavaKeyStoreProvider extends AbstractJavaKeyStoreProvider { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/alias/LocalJavaKeyStoreProvider.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/alias/LocalJavaKeyStoreProvider.java index 9ea9a579655e3..ce0eb7d2bfe65 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/alias/LocalJavaKeyStoreProvider.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/alias/LocalJavaKeyStoreProvider.java @@ -44,7 +44,8 @@ /** * CredentialProvider based on Java's KeyStore file format. The file may be * stored only on the local filesystem using the following name mangling: - * localjceks://file/home/larry/creds.jceks -> file:///home/larry/creds.jceks + * localjceks://file/home/larry/creds.jceks {@literal ->} + * file:///home/larry/creds.jceks */ @InterfaceAudience.Private public final class LocalJavaKeyStoreProvider extends diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/authorize/ProxyUsers.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/authorize/ProxyUsers.java index a387cbe4867fe..6829a2aebcafe 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/authorize/ProxyUsers.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/authorize/ProxyUsers.java @@ -129,12 +129,12 @@ private static ImpersonationProvider getSip() { * @param remoteAddress * @param conf * @throws AuthorizationException - * @deprecated use {@link #authorize(UserGroupInformation, String) instead. + * @deprecated use {@link #authorize(UserGroupInformation, String)} instead. 
*/ @Deprecated public static void authorize(UserGroupInformation user, String remoteAddress, Configuration conf) throws AuthorizationException { - authorize(user,remoteAddress); + authorize(user, remoteAddress); } @VisibleForTesting diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/ssl/FileBasedKeyStoresFactory.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/ssl/FileBasedKeyStoresFactory.java index b0df8f012127f..e5bdab3347f9a 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/ssl/FileBasedKeyStoresFactory.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/ssl/FileBasedKeyStoresFactory.java @@ -38,7 +38,7 @@ /** * {@link KeyStoresFactory} implementation that reads the certificates from * keystore files. - *

    + *

    * if the trust certificates keystore file changes, the {@link TrustManager} * is refreshed with the new trust certificate entries (using a * {@link ReloadingX509TrustManager} trustmanager). @@ -87,7 +87,7 @@ public class FileBasedKeyStoresFactory implements KeyStoresFactory { /** * Resolves a property name to its client/server version if applicable. - *

    + *

    * NOTE: This method is public for testing purposes. * * @param mode client/server mode. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/ssl/SSLFactory.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/ssl/SSLFactory.java index 8e8421b9bb66a..32e5728ed5259 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/ssl/SSLFactory.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/ssl/SSLFactory.java @@ -44,10 +44,10 @@ /** * Factory that creates SSLEngine and SSLSocketFactory instances using * Hadoop configuration information. - *

    + *

    * This SSLFactory uses a {@link ReloadingX509TrustManager} instance, * which reloads public keys if the truststore file changes. - *

    + *

    * This factory is used to configure HTTPS in Hadoop HTTP based endpoints, both * client and server. */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/ssl/SSLHostnameVerifier.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/ssl/SSLHostnameVerifier.java index 47546b3d993b5..f9236b64b4616 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/ssl/SSLHostnameVerifier.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/ssl/SSLHostnameVerifier.java @@ -58,15 +58,15 @@ * or X509Certificate, or ultimately (they all end up calling this one), * String. (It's easier to supply JUnit with Strings instead of mock * SSLSession objects!) - *

    Our check() methods throw exceptions if the name is + *

    Our check() methods throw exceptions if the name is * invalid, whereas javax.net.ssl.HostnameVerifier just returns true/false. - *

    + *

    * We provide the HostnameVerifier.DEFAULT, HostnameVerifier.STRICT, and * HostnameVerifier.ALLOW_ALL implementations. We also provide the more * specialized HostnameVerifier.DEFAULT_AND_LOCALHOST, as well as * HostnameVerifier.STRICT_IE6. But feel free to define your own * implementations! - *

    + *

    * Inspired by Sebastian Hauer's original StrictSSLProtocolSocketFactory in the * HttpClient "contrib" repository. */ @@ -109,10 +109,10 @@ void check(String[] hosts, String[] cns, String[] subjectAlts) /** * The DEFAULT HostnameVerifier works the same way as Curl and Firefox. - *

    + *

    * The hostname must match either the first CN, or any of the subject-alts. * A wildcard can occur in the CN, and in any of the subject-alts. - *

    + *

    * The only difference between DEFAULT and STRICT is that a wildcard (such * as "*.foo.com") with DEFAULT matches all subdomains, including * "a.b.foo.com". @@ -158,13 +158,13 @@ public final void check(final String[] hosts, final String[] cns, * Java 1.4, Sun Java 5, Sun Java 6. It's also pretty close to IE6. * This implementation appears to be compliant with RFC 2818 for dealing * with wildcards. - *

    + *

    * The hostname must match either the first CN, or any of the subject-alts. * A wildcard can occur in the CN, and in any of the subject-alts. The * one divergence from IE6 is how we only check the first CN. IE6 allows * a match against any of the CNs present. We decided to follow in * Sun Java 1.4's footsteps and only check the first CN. - *

    + *

    * A wildcard such as "*.foo.com" matches only subdomains in the same * level, for example "a.foo.com". It does not match deeper subdomains * such as "a.b.foo.com". @@ -229,7 +229,7 @@ abstract class AbstractVerifier implements SSLHostnameVerifier { * This contains a list of 2nd-level domains that aren't allowed to * have wildcards when combined with country-codes. * For example: [*.co.uk]. - *

    + *

    * The [*.co.uk] problem is an interesting one. Should we just hope * that CA's would never foolishly allow such a certificate to happen? * Looks like we're the only implementation guarding against this. @@ -564,11 +564,11 @@ Looks like toString() even works with non-ascii domain names! /** * Extracts the array of SubjectAlt DNS names from an X509Certificate. * Returns null if there aren't any. - *

    + *

    * Note: Java doesn't appear able to extract international characters * from the SubjectAlts. It can only extract international characters * from the CN field. - *

    + *

    * (Or maybe the version of OpenSSL I'm using to test isn't storing the * international characters correctly in the SubjectAlts?). * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenAuthenticatedURL.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenAuthenticatedURL.java index 0a70998b2ebdc..504592ef16462 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenAuthenticatedURL.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenAuthenticatedURL.java @@ -44,19 +44,19 @@ * The DelegationTokenAuthenticatedURL is a * {@link AuthenticatedURL} sub-class with built-in Hadoop Delegation Token * functionality. - *

    + *

    * The authentication mechanisms supported by default are Hadoop Simple * authentication (also known as pseudo authentication) and Kerberos SPNEGO * authentication. - *

    + *

    * Additional authentication mechanisms can be supported via {@link * DelegationTokenAuthenticator} implementations. - *

    + *

    * The default {@link DelegationTokenAuthenticator} is the {@link * KerberosDelegationTokenAuthenticator} class which supports * automatic fallback from Kerberos SPNEGO to Hadoop Simple authentication via * the {@link PseudoDelegationTokenAuthenticator} class. - *

    + *

    * AuthenticatedURL instances are not thread-safe. */ @InterfaceAudience.Public @@ -115,7 +115,7 @@ public static void setDefaultDelegationTokenAuthenticator( * Returns the default {@link DelegationTokenAuthenticator} class to use when * an {@link DelegationTokenAuthenticatedURL} instance is created without * specifying one. - *

    + *

    * The default class is {@link KerberosDelegationTokenAuthenticator} * * @return the delegation token authenticator class to use as default. @@ -143,7 +143,7 @@ public static void setDefaultDelegationTokenAuthenticator( /** * Creates an DelegationTokenAuthenticatedURL. - *

    + *

    * An instance of the default {@link DelegationTokenAuthenticator} will be * used. */ @@ -191,7 +191,7 @@ public DelegationTokenAuthenticatedURL( * Sets if delegation token should be transmitted in the URL query string. * By default it is transmitted using the * {@link DelegationTokenAuthenticator#DELEGATION_TOKEN_HEADER} HTTP header. - *

    + *

    * This method is provided to enable WebHDFS backwards compatibility. * * @param useQueryString TRUE if the token is transmitted in the diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenAuthenticationFilter.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenAuthenticationFilter.java index f5e798e2556fe..5275526202f2b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenAuthenticationFilter.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenAuthenticationFilter.java @@ -60,7 +60,7 @@ /** * The DelegationTokenAuthenticationFilter filter is a * {@link AuthenticationFilter} with Hadoop Delegation Token support. - *

    + *

    * By default it uses it own instance of the {@link * AbstractDelegationTokenSecretManager}. For situations where an external * AbstractDelegationTokenSecretManager is required (i.e. one that @@ -86,7 +86,7 @@ public class DelegationTokenAuthenticationFilter /** * Sets an external DelegationTokenSecretManager instance to * manage creation and verification of Delegation Tokens. - *

    + *

    * This is useful for use cases where secrets must be shared across multiple * services. */ @@ -148,7 +148,7 @@ protected void setAuthHandlerClass(Properties props) /** * Returns the proxyuser configuration. All returned properties must start * with proxyuser.' - *

    + *

    * Subclasses may override this method if the proxyuser configuration is * read from other place than the filter init parameters. * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenAuthenticationHandler.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenAuthenticationHandler.java index 6ee59f1d17522..284044fd938a8 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenAuthenticationHandler.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenAuthenticationHandler.java @@ -59,7 +59,7 @@ /** * An {@link AuthenticationHandler} that implements Kerberos SPNEGO mechanism * for HTTP and supports Delegation Token functionality. - *

    + *

    * In addition to the wrapped {@link AuthenticationHandler} configuration * properties, this handler supports the following properties prefixed * with the type of the wrapped AuthenticationHandler: @@ -135,7 +135,7 @@ public void init(Properties config) throws ServletException { /** * Sets an external DelegationTokenSecretManager instance to * manage creation and verification of Delegation Tokens. - *

    + *

    * This is useful for use cases where secrets must be shared across multiple * services. * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenManager.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenManager.java index fd19b67fb6dbb..e1445fb5ca05b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenManager.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/DelegationTokenManager.java @@ -127,7 +127,7 @@ public DelegationTokenManager(Configuration conf, Text tokenKind) { /** * Sets an external DelegationTokenSecretManager instance to * manage creation and verification of Delegation Tokens. - *

    + *

    * This is useful for use cases where secrets must be shared across multiple * services. * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/KerberosDelegationTokenAuthenticationHandler.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/KerberosDelegationTokenAuthenticationHandler.java index 395d2f2f27037..28509e1330e0c 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/KerberosDelegationTokenAuthenticationHandler.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/KerberosDelegationTokenAuthenticationHandler.java @@ -25,7 +25,7 @@ /** * An {@link AuthenticationHandler} that implements Kerberos SPNEGO mechanism * for HTTP and supports Delegation Token functionality. - *

    + *

    * In addition to the {@link KerberosAuthenticationHandler} configuration * properties, this handler supports: *

      diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/KerberosDelegationTokenAuthenticator.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/KerberosDelegationTokenAuthenticator.java index 7e0e26610923d..8b8a4bdccdbbc 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/KerberosDelegationTokenAuthenticator.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/KerberosDelegationTokenAuthenticator.java @@ -26,7 +26,7 @@ * The KerberosDelegationTokenAuthenticator provides support for * Kerberos SPNEGO authentication mechanism and support for Hadoop Delegation * Token operations. - *

      + *

      * It falls back to the {@link PseudoDelegationTokenAuthenticator} if the HTTP * endpoint does not trigger a SPNEGO authentication */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/MultiSchemeDelegationTokenAuthenticationHandler.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/MultiSchemeDelegationTokenAuthenticationHandler.java index fc32a19d1fdd9..0661fb2b5a2a7 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/MultiSchemeDelegationTokenAuthenticationHandler.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/MultiSchemeDelegationTokenAuthenticationHandler.java @@ -29,6 +29,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.security.authentication.client.AuthenticationException; +import org.apache.hadoop.security.authentication.server.AuthenticationHandler; import org.apache.hadoop.security.authentication.server.AuthenticationHandlerUtil; import org.apache.hadoop.security.authentication.server.AuthenticationToken; import org.apache.hadoop.security.authentication.server.CompositeAuthenticationHandler; @@ -52,7 +53,7 @@ * required to ensure that only schemes with strongest level of security should * be used for delegation token management. * - *

      + *

      * In addition to the wrapped {@link AuthenticationHandler} configuration * properties, this handler supports the following properties prefixed with the * type of the wrapped AuthenticationHandler: diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/PseudoDelegationTokenAuthenticationHandler.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/PseudoDelegationTokenAuthenticationHandler.java index 6846fdb87e9be..9a4527a90640d 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/PseudoDelegationTokenAuthenticationHandler.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/PseudoDelegationTokenAuthenticationHandler.java @@ -26,7 +26,7 @@ /** * An {@link AuthenticationHandler} that implements Kerberos SPNEGO mechanism * for HTTP and supports Delegation Token functionality. - *

      + *

      * In addition to the {@link KerberosAuthenticationHandler} configuration * properties, this handler supports: *

        diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/PseudoDelegationTokenAuthenticator.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/PseudoDelegationTokenAuthenticator.java index 8713aa47b8069..3478f3989ae3c 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/PseudoDelegationTokenAuthenticator.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/token/delegation/web/PseudoDelegationTokenAuthenticator.java @@ -29,7 +29,7 @@ * Hadoop's pseudo authentication mechanism that accepts * the user name specified as a query string parameter and support for Hadoop * Delegation Token operations. - *

        + *

        * This mimics the model of Hadoop Simple authentication trusting the * {@link UserGroupInformation#getCurrentUser()} value. */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/service/ServiceOperations.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/service/ServiceOperations.java index d064ef96f7c88..726a83da2572b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/service/ServiceOperations.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/service/ServiceOperations.java @@ -42,9 +42,9 @@ private ServiceOperations() { /** * Stop a service. - *

        Do nothing if the service is null or not + *

        Do nothing if the service is null or not * in a state in which it can be/needs to be stopped. - *

        + *

        * The service state is checked before the operation begins. * This process is not thread safe. * @param service a service or null diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/service/launcher/ServiceLauncher.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/service/launcher/ServiceLauncher.java index 6b0b4e8628091..da91a3d0e6c70 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/service/launcher/ServiceLauncher.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/service/launcher/ServiceLauncher.java @@ -268,7 +268,7 @@ public String toString() { *

 * <li>Parse the command line.</li>
 * <li>Build the service configuration from it.</li>
- * <li>Start the service.</li>.
+ * <li>Start the service.</li>
 * <li>If it is a {@link LaunchableService}: execute it</li>
 * <li>Otherwise: wait for it to finish.</li>
 * <li>
        13. Exit passing the status code to the {@link #exit(int, String)} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/service/launcher/package-info.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/service/launcher/package-info.java index 85163575d7854..f582fa2d97875 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/service/launcher/package-info.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/service/launcher/package-info.java @@ -379,7 +379,7 @@ interface listing common exception codes. These are exception codes a new configuration is created:
          - public Configuration bindArgs(Configuration config, List args)
          + public Configuration bindArgs(Configuration config, List<String> args)
               throws Exception {
              Configuration newConf = new YarnConfiguration(config);
              return newConf;
          diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ClassUtil.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ClassUtil.java
          index 6f949891508c1..44c94669f515f 100644
          --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ClassUtil.java
          +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ClassUtil.java
          @@ -34,7 +34,6 @@ public class ClassUtil {
              * 
              * @param clazz the class to find.
              * @return a jar file that contains the class, or null.
          -   * @throws IOException
              */
             public static String findContainingJar(Class clazz) {
               ClassLoader loader = clazz.getClassLoader();
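The method documented in this hunk, ClassUtil.findContainingJar(Class), is easiest to see from a short example. This is only an illustrative sketch; the class name FindJarExample and the choice of Configuration.class are assumptions for demonstration and are not part of the patch:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.ClassUtil;

    public class FindJarExample {
      public static void main(String[] args) {
        // Returns the path of the jar that contains the class, or null when the
        // class was not loaded from a jar (for example, from a classes/ directory).
        String jar = ClassUtil.findContainingJar(Configuration.class);
        System.out.println(jar == null ? "not loaded from a jar" : jar);
      }
    }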
          diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ComparableVersion.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ComparableVersion.java
          index 1f3429113df0b..bcb17d0f1adbb 100644
          --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ComparableVersion.java
          +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ComparableVersion.java
          @@ -64,7 +64,7 @@
            *     Unknown qualifiers are considered after known qualifiers, with lexical order (always case insensitive),
            *   
        14. *
        15. a dash usually precedes a qualifier, and is always less important than something preceded with a dot.
        16. - *

      + *

    * * @see "Versioning" on Maven Wiki */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/FindClass.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/FindClass.java index b7feb22d34d21..690d09755171f 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/FindClass.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/FindClass.java @@ -33,9 +33,9 @@ * This entry point exists for diagnosing classloader problems: * is a class or resource present -and if so, where? * - *

    + *

    * Actions - *

    + *
    *

      *
    • load
      : load a class but do not attempt to create it
    • *
    • create
      : load and create a class, print its string value
    • diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/GenericOptionsParser.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/GenericOptionsParser.java index a8a513dfb1335..8ca7a904fdc84 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/GenericOptionsParser.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/GenericOptionsParser.java @@ -15,7 +15,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.util;import java.io.File; +package org.apache.hadoop.util; +import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintStream; @@ -54,9 +55,9 @@ * line arguments, enabling applications to easily specify a namenode, a * ResourceManager, additional configuration resources etc. * - *

      Generic Options

      + *

      Generic Options

      * - *

      The supported generic options are:

      + *

      The supported generic options are: *

        *     -conf <configuration file>     specify a configuration file
        *     -D <property=value>            use value for given property
      @@ -69,12 +70,12 @@
        *     -archives <comma separated list of archives>    specify comma
        *             separated archives to be unarchived on the compute machines.
       
      - * 

      + *

    * *

    The general command line syntax is:

    - *

    + * 

    
      * bin/hadoop command [genericOptions] [commandOptions]
    - * 

    + *

    * *

    Generic command line arguments might modify * Configuration objects, given to constructors.

    @@ -104,7 +105,7 @@ * $ bin/hadoop jar -libjars testlib.jar * -archives test.tgz -files file.txt inputjar args * job submission with libjars, files and archives - *

    + *

    * * @see Tool * @see ToolRunner @@ -141,8 +142,8 @@ public GenericOptionsParser(String[] args) } /** - * Create a GenericOptionsParser to parse only the generic Hadoop - * arguments. + * Create a GenericOptionsParser to parse only the generic + * Hadoop arguments. * * The array of string arguments other than the generic arguments can be * obtained by {@link #getRemainingArgs()}. @@ -217,7 +218,7 @@ public boolean isParseSuccessful() { /** * Specify properties of each generic option. - * ImportantImportant: as {@link OptionBuilder} is not thread safe, subclasses * must synchronize use on {@code OptionBuilder.class} */ @SuppressWarnings("static-access") diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/HttpExceptionUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/HttpExceptionUtils.java index 24ed5e4302be8..12d1ef01201a2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/HttpExceptionUtils.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/HttpExceptionUtils.java @@ -34,10 +34,10 @@ /** * HTTP utility class to help propagate server side exception to the client * over HTTP as a JSON payload. - *

    + *

    * It creates HTTP Servlet and JAX-RPC error responses including details of the * exception that allows a client to recreate the remote exception. - *

    + *

    * It parses HTTP client connections and recreates the exception. */ @InterfaceAudience.Private @@ -125,7 +125,7 @@ private static void throwException(Throwable ex) * expected HTTP status code. If the current status code is not the expected * one it throws an exception with a detail message using Server side error * messages if available. - *

    + *

    * NOTE: this method will throw the deserialized exception even if not * declared in the throws of the method signature. * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/JsonSerialization.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/JsonSerialization.java index cbc8560a406c0..7e09a61ba9787 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/JsonSerialization.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/JsonSerialization.java @@ -185,7 +185,7 @@ public synchronized T load(File jsonFile) * Save to a local file. Any existing file is overwritten unless * the OS blocks that. * @param file file - * @param path path + * @param instance instance * @throws IOException IO exception */ public void save(File file, T instance) throws diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LightWeightCache.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LightWeightCache.java index d79aade315888..79de1ac554476 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LightWeightCache.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LightWeightCache.java @@ -99,11 +99,11 @@ private static int updateRecommendedLength(int recommendedLength, /** * @param recommendedLength Recommended size of the internal array. * @param sizeLimit the limit of the size of the cache. - * The limit is disabled if it is <= 0. - * @param creationExpirationPeriod the time period C > 0 in nanoseconds that - * the creation of an entry is expired if it is added to the cache - * longer than C. - * @param accessExpirationPeriod the time period A >= 0 in nanoseconds that + * The limit is disabled if it is <= 0. + * @param creationExpirationPeriod the time period C > 0 in nanoseconds + * that the creation of an entry is expired if it is added to the + * cache longer than C. + * @param accessExpirationPeriod the time period A >= 0 in nanoseconds that * the access of an entry is expired if it is not accessed * longer than A. */ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LineReader.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LineReader.java index a1cf7099ada32..e2cd3048d5843 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LineReader.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LineReader.java @@ -62,7 +62,6 @@ public class LineReader implements Closeable { * Create a line reader that reads from the given stream using the * default buffer-size (64k). * @param in The input stream - * @throws IOException */ public LineReader(InputStream in) { this(in, DEFAULT_BUFFER_SIZE); @@ -73,7 +72,6 @@ public LineReader(InputStream in) { * given buffer-size. 
* @param in The input stream * @param bufferSize Size of the read buffer - * @throws IOException */ public LineReader(InputStream in, int bufferSize) { this.in = in; @@ -115,7 +113,6 @@ public LineReader(InputStream in, byte[] recordDelimiterBytes) { * @param in The input stream * @param bufferSize Size of the read buffer * @param recordDelimiterBytes The delimiter - * @throws IOException */ public LineReader(InputStream in, int bufferSize, byte[] recordDelimiterBytes) { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ShutdownHookManager.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ShutdownHookManager.java index 19adf80573742..5cbc22d0cace9 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ShutdownHookManager.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ShutdownHookManager.java @@ -46,7 +46,7 @@ /** * The ShutdownHookManager enables running shutdownHook * in a deterministic order, higher priority first. - *

    + *

    * The JVM runs ShutdownHooks in a non-deterministic order or in parallel. * This class registers a single JVM shutdownHook and run all the * shutdownHooks registered to it (to this class) in order based on their diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ShutdownThreadsHelper.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ShutdownThreadsHelper.java index 5405d7756afab..50a728e568a4e 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ShutdownThreadsHelper.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ShutdownThreadsHelper.java @@ -39,7 +39,6 @@ public class ShutdownThreadsHelper { * @param thread {@link Thread to be shutdown} * @return true if the thread is successfully interrupted, * false otherwise - * @throws InterruptedException */ public static boolean shutdownThread(Thread thread) { return shutdownThread(thread, SHUTDOWN_WAIT_MS); @@ -51,7 +50,6 @@ public static boolean shutdownThread(Thread thread) { * interrupted * @return true if the thread is successfully interrupted, * false otherwise - * @throws InterruptedException */ public static boolean shutdownThread(Thread thread, long timeoutInMilliSeconds) { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java index 2e50963cd7c59..cf7b04ab61a7e 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/StringUtils.java @@ -454,7 +454,7 @@ public static Collection getTrimmedStringCollection(String str, * Splits a comma separated value String, trimming leading and * trailing whitespace on each value. Duplicate and empty values are removed. * - * @param str a comma separated with values, may be null + * @param str a comma separated String with values, may be null * @return a Collection of String values, empty * Collection if null String input */ @@ -1032,8 +1032,8 @@ public static String camelize(String s) { * @param template String template to receive replacements * @param pattern Pattern to match for identifying tokens, must use a capturing * group - * @param replacements Map mapping tokens identified by the - * capturing group to their replacement values + * @param replacements Map<String, String> mapping tokens identified by + * the capturing group to their replacement values * @return String template with replacements */ public static String replaceTokens(String template, Pattern pattern, diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/Tool.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/Tool.java index b6e3d7db71353..a4fbce4ace86f 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/Tool.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/Tool.java @@ -69,7 +69,7 @@ * System.exit(res); * } * } - *

    + *

    * * @see GenericOptionsParser * @see ToolRunner diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ZKUtil.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ZKUtil.java index c6f8a959b1cf5..48cef5f06fcc9 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ZKUtil.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/ZKUtil.java @@ -90,7 +90,7 @@ public static int removeSpecificPerms(int perms, int remove) { * sasl:hdfs/host1@MY.DOMAIN:cdrwa,sasl:hdfs/host2@MY.DOMAIN:cdrwa * * @return ACL list - * @throws {@link BadAclFormatException} if an ACL is invalid + * @throws BadAclFormatException if an ACL is invalid */ public static List parseACLs(String aclString) throws BadAclFormatException { @@ -128,7 +128,7 @@ public static List parseACLs(String aclString) throws * * @param authString the comma-separated auth mechanisms * @return a list of parsed authentications - * @throws {@link BadAuthFormatException} if the auth format is invalid + * @throws BadAuthFormatException if the auth format is invalid */ public static List parseAuth(String authString) throws BadAuthFormatException{ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/bloom/DynamicBloomFilter.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/bloom/DynamicBloomFilter.java index be9a4077c8bbc..8a7ec6954c76b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/bloom/DynamicBloomFilter.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/bloom/DynamicBloomFilter.java @@ -64,8 +64,9 @@ * process of a DBF is iterative. At the start, the DBF is a 1 * m * bit matrix, i.e., it is composed of a single standard Bloom filter. * It assumes that nr elements are recorded in the - * initial bit vector, where nr <= n (n is - * the cardinality of the set A to record in the filter). + * initial bit vector, where nr {@literal <=} n + * (n is the cardinality of the set A to record in + * the filter). *

    * As the size of A grows during the execution of the application, * several keys must be inserted in the DBF. When inserting a key into the DBF, diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/concurrent/AsyncGet.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/concurrent/AsyncGet.java index f124890dd53f4..9304b483952d0 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/concurrent/AsyncGet.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/concurrent/AsyncGet.java @@ -37,7 +37,7 @@ public interface AsyncGet { * * @param timeout The maximum time period to wait. * When timeout == 0, it does not wait at all. - * When timeout < 0, it waits indefinitely. + * When timeout < 0, it waits indefinitely. * @param unit The unit of the timeout value * @return the result, which is possibly null. * @throws E an exception thrown by the underlying implementation. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/hash/JenkinsHash.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/hash/JenkinsHash.java index f3895d0248414..8c3b9da4ae337 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/hash/JenkinsHash.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/hash/JenkinsHash.java @@ -69,11 +69,11 @@ private static long rot(long val, int pos) { *

    The best hash table sizes are powers of 2. There is no need to do mod * a prime (mod is sooo slow!). If you need less than 32 bits, use a bitmask. * For example, if you need only 10 bits, do - * h = (h & hashmask(10)); + * h = (h & hashmask(10)); * In which case, the hash table should have hashsize(10) elements. * *

    If you are hashing n strings byte[][] k, do it like this: - * for (int i = 0, h = 0; i < n; ++i) h = hash( k[i], h); + * for (int i = 0, h = 0; i < n; ++i) h = hash( k[i], h); * *

    By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. You may use this * code any way you wish, private, educational, or commercial. It's free. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/cli/util/CLICommand.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/cli/util/CLICommand.java index 8823f5c3832c8..a7a8fa516a104 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/cli/util/CLICommand.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/cli/util/CLICommand.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - *

    + *

    * http://www.apache.org/licenses/LICENSE-2.0 - *

    + *

    * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,7 +20,7 @@ import org.apache.hadoop.conf.Configuration; /** - * This interface is to generalize types of test command for upstream projects + * This interface is to generalize types of test command for upstream projects. */ public interface CLICommand { public CommandExecutor getExecutor(String tag, Configuration conf) diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/cli/util/CLICommandFS.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/cli/util/CLICommandFS.java index eb96a06709b14..e22c2001cbbae 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/cli/util/CLICommandFS.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/cli/util/CLICommandFS.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - *

    + *

    * http://www.apache.org/licenses/LICENSE-2.0 - *

    + *

    * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/cli/util/CLICommandTypes.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/cli/util/CLICommandTypes.java index 44e0c307d3e47..8efe70c1f727f 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/cli/util/CLICommandTypes.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/cli/util/CLICommandTypes.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - *

    + *

    * http://www.apache.org/licenses/LICENSE-2.0 - *

    + *

    * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -18,7 +18,7 @@ package org.apache.hadoop.cli.util; /** - * This interface is to provide command type for test commands enums + * This interface is to provide command type for test commands enums. */ public interface CLICommandTypes { } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/erasurecode/rawcoder/RawErasureCoderBenchmark.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/erasurecode/rawcoder/RawErasureCoderBenchmark.java index df8c54b9cdddf..362bde9806327 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/erasurecode/rawcoder/RawErasureCoderBenchmark.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/erasurecode/rawcoder/RawErasureCoderBenchmark.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - *

    + *

    * http://www.apache.org/licenses/LICENSE-2.0 - *

    + *

    * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestDefaultRetryPolicy.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestDefaultRetryPolicy.java index 56dec3a203141..6b82077e8df91 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestDefaultRetryPolicy.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/io/retry/TestDefaultRetryPolicy.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - *

    + *

    * http://www.apache.org/licenses/LICENSE-2.0 - *

    + *

    * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/StaticMapping.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/StaticMapping.java index 493d86000eec2..eb518209f1f7c 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/StaticMapping.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/net/StaticMapping.java @@ -44,9 +44,9 @@ public class StaticMapping extends AbstractDNSToSwitchMapping { /** * Key to define the node mapping as a comma-delimited list of host=rack * mappings, e.g. host1=r1,host2=r1,host3=r2. - *

    + *

    * Value: {@value} - *

    + *

    * Important: spaces not trimmed and are considered significant. */ public static final String KEY_HADOOP_CONFIGURED_NODE_MAPPING = diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/tracing/SetSpanReceiver.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/tracing/SetSpanReceiver.java index 09d637e1ba387..d87da0ac301c7 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/tracing/SetSpanReceiver.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/tracing/SetSpanReceiver.java @@ -35,7 +35,7 @@ /** * Span receiver that puts all spans into a single set. * This is useful for testing. - *

    + *

    * We're not using HTrace's POJOReceiver here so as that doesn't * push all the metrics to a static place, and would make testing * SpanReceiverHost harder. From 8c8f1b0aff78b05a340e5fb40926d399459e53db Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Tue, 5 Feb 2019 17:21:02 +0530 Subject: [PATCH 26/40] HADOOP-15229. Add FileSystem builder-based openFile() API to match createFile(); S3A to implement S3 Select through this API. The new openFile() API is asynchronous, and implemented across FileSystem and FileContext. The MapReduce V2 inputs are moved to this API, and you can actually set must/may options to pass in. This is more useful for setting things like s3a seek policy than for S3 select, as the existing input format/record readers can't handle S3 select output where the stream is shorter than the file length, and splitting plain text is suboptimal. Future work is needed there. In the meantime, any/all filesystem connectors are now free to add their own filesystem-specific configuration parameters which can be set in jobs and used to set filesystem input stream options (seek policy, retry, encryption secrets, etc). Contributed by Steve Loughran --- .../apache/hadoop/fs/AbstractFileSystem.java | 67 +- .../hadoop/fs/DelegateToFileSystem.java | 23 +- .../java/org/apache/hadoop/fs/FSBuilder.java | 131 ++ .../hadoop/fs/FSDataOutputStreamBuilder.java | 193 +-- .../org/apache/hadoop/fs/FileContext.java | 96 +- .../java/org/apache/hadoop/fs/FileSystem.java | 188 ++- .../apache/hadoop/fs/FilterFileSystem.java | 41 +- .../java/org/apache/hadoop/fs/FilterFs.java | 20 +- .../fs/FutureDataInputStreamBuilder.java | 50 + .../hadoop/fs/impl/AbstractFSBuilderImpl.java | 356 ++++++ .../FutureDataInputStreamBuilderImpl.java | 141 +++ .../hadoop/fs/impl/FutureIOSupport.java | 191 +++ .../hadoop/fs/impl/WrappedIOException.java | 56 + .../hadoop/io/compress/PassthroughCodec.java | 246 ++++ .../org/apache/hadoop/util/LambdaUtils.java | 59 + .../src/main/resources/core-default.xml | 112 ++ .../site/markdown/filesystem/filesystem.md | 87 +- .../markdown/filesystem/fsdatainputstream.md | 14 + .../filesystem/fsdatainputstreambuilder.md | 112 ++ .../filesystem/fsdataoutputstreambuilder.md | 6 +- .../fs/FileContextMainOperationsBaseTest.java | 95 +- .../apache/hadoop/fs/TestHarFileSystem.java | 20 + .../apache/hadoop/fs/TestLocalFileSystem.java | 2 +- .../fs/contract/AbstractContractOpenTest.java | 135 +- .../AbstractContractPathHandleTest.java | 61 + .../hadoop/fs/contract/ContractTestUtils.java | 50 +- .../apache/hadoop/test/LambdaTestUtils.java | 223 +++- .../hadoop/test/TestLambdaTestUtils.java | 114 +- .../hadoop/hdfs/DistributedFileSystem.java | 2 +- .../contract/hdfs/TestHDFSContractOpen.java | 2 +- .../hadoop/mapred/LineRecordReader.java | 12 +- .../apache/hadoop/mapreduce/MRJobConfig.java | 14 + .../lib/input/FixedLengthRecordReader.java | 14 +- .../mapreduce/lib/input/LineRecordReader.java | 12 +- .../mapreduce/lib/input/NLineInputFormat.java | 12 +- .../dev-support/findbugs-exclude.xml | 5 + .../hadoop/fs/s3a/InternalConstants.java | 53 + .../apache/hadoop/fs/s3a/S3AFileSystem.java | 908 +++++++++----- .../apache/hadoop/fs/s3a/S3AInputStream.java | 54 +- .../hadoop/fs/s3a/S3AInstrumentation.java | 54 +- .../apache/hadoop/fs/s3a/S3AOpContext.java | 25 + .../hadoop/fs/s3a/S3AReadOpContext.java | 23 +- .../org/apache/hadoop/fs/s3a/S3AUtils.java | 9 +- .../hadoop/fs/s3a/S3ObjectAttributes.java | 27 +- .../org/apache/hadoop/fs/s3a/Statistic.java | 2 + .../hadoop/fs/s3a/WriteOperationHelper.java | 105 
+- .../hadoop/fs/s3a/s3guard/S3GuardTool.java | 309 +++-- .../s3a/select/InternalSelectConstants.java | 77 ++ .../hadoop/fs/s3a/select/SelectBinding.java | 431 +++++++ .../hadoop/fs/s3a/select/SelectConstants.java | 296 +++++ .../fs/s3a/select/SelectInputStream.java | 457 +++++++ .../hadoop/fs/s3a/select/SelectTool.java | 355 ++++++ .../hadoop/fs/s3a/select/package-info.java | 27 + .../markdown/tools/hadoop-aws/s3_select.md | 1100 +++++++++++++++++ .../site/markdown/tools/hadoop-aws/testing.md | 16 + .../s3a/ITestS3AAWSCredentialsProvider.java | 4 +- .../fs/s3a/ITestS3AFailureHandling.java | 11 +- .../apache/hadoop/fs/s3a/S3ATestUtils.java | 235 +++- .../fs/s3a/TestS3AAWSCredentialsProvider.java | 3 +- .../fs/s3a/commit/AbstractCommitITest.java | 2 +- .../s3guard/AbstractS3GuardToolTestBase.java | 217 ++-- .../s3a/s3guard/ITestS3GuardToolDynamoDB.java | 2 +- .../fs/s3a/s3guard/ITestS3GuardToolLocal.java | 16 +- .../fs/s3a/s3guard/S3GuardToolTestHelper.java | 89 ++ .../fs/s3a/select/AbstractS3SelectTest.java | 746 +++++++++++ .../apache/hadoop/fs/s3a/select/CsvFile.java | 138 +++ .../hadoop/fs/s3a/select/ITestS3Select.java | 967 +++++++++++++++ .../fs/s3a/select/ITestS3SelectCLI.java | 347 ++++++ .../fs/s3a/select/ITestS3SelectLandsat.java | 432 +++++++ .../fs/s3a/select/ITestS3SelectMRJob.java | 206 +++ .../mapreduce/StreamInputFormat.java | 14 +- 71 files changed, 9694 insertions(+), 1025 deletions(-) create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSBuilder.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AbstractFSBuilderImpl.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureIOSupport.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/WrappedIOException.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/PassthroughCodec.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LambdaUtils.java create mode 100644 hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstreambuilder.md create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/InternalConstants.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/InternalSelectConstants.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectBinding.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectConstants.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectInputStream.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/package-info.java create mode 100644 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/s3_select.md create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardToolTestHelper.java create mode 100644 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/AbstractS3SelectTest.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/CsvFile.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java index cd7068025753e..dc6cd2bc2b07f 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java @@ -25,12 +25,15 @@ import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.EnumSet; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; +import java.util.Set; import java.util.StringTokenizer; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import org.apache.hadoop.HadoopIllegalArgumentException; @@ -41,6 +44,7 @@ import org.apache.hadoop.fs.Options.ChecksumOpt; import org.apache.hadoop.fs.Options.CreateOpts; import org.apache.hadoop.fs.Options.Rename; +import org.apache.hadoop.fs.impl.AbstractFSBuilderImpl; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsAction; @@ -48,14 +52,13 @@ import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.util.LambdaUtils; import org.apache.hadoop.util.Progressable; import com.google.common.annotations.VisibleForTesting; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; - /** * This class provides an interface for implementors of a Hadoop file system * (analogous to the VFS of Unix). Applications do not access this class; @@ -68,7 +71,7 @@ */ @InterfaceAudience.Public @InterfaceStability.Stable -public abstract class AbstractFileSystem implements PathCapabilities { +public abstract class AbstractFileSystem { static final Logger LOG = LoggerFactory.getLogger(AbstractFileSystem.class); /** Recording statistics per a file system class. */ @@ -398,8 +401,11 @@ public void checkPath(Path path) { thatPort = this.getUriDefaultPort(); } if (thisPort != thatPort) { - throw new InvalidPathException("Wrong FS: " + path + ", expected: " - + this.getUri()); + throw new InvalidPathException("Wrong FS: " + path + + " and port=" + thatPort + + ", expected: " + + this.getUri() + + " with port=" + thisPort); } } @@ -848,20 +854,6 @@ public abstract FileStatus getFileStatus(final Path f) throws AccessControlException, FileNotFoundException, UnresolvedLinkException, IOException; - /** - * Synchronize client metadata state. - *

    - * In some FileSystem implementations such as HDFS metadata - * synchronization is essential to guarantee consistency of read requests - * particularly in HA setting. - * @throws IOException - * @throws UnsupportedOperationException - */ - public void msync() throws IOException, UnsupportedOperationException { - throw new UnsupportedOperationException(getClass().getCanonicalName() + - " does not support method msync"); - } - /** * The specification of this method matches that of * {@link FileContext#access(Path, FsAction)} @@ -1343,16 +1335,31 @@ public boolean equals(Object other) { return myUri.equals(((AbstractFileSystem) other).myUri); } - public boolean hasPathCapability(final Path path, - final String capability) - throws IOException { - switch (validatePathCapabilityArgs(makeQualified(path), capability)) { - case CommonPathCapabilities.FS_SYMLINKS: - // delegate to the existing supportsSymlinks() call. - return supportsSymlinks(); - default: - // the feature is not implemented. - return false; - } + /** + * Open a file with the given set of options. + * The base implementation performs a blocking + * call to {@link #open(Path, int)}in this call; + * the actual outcome is in the returned {@code CompletableFuture}. + * This avoids having to create some thread pool, while still + * setting up the expectation that the {@code get()} call + * is needed to evaluate the result. + * @param path path to the file + * @param mandatoryKeys set of options declared as mandatory. + * @param options options set during the build sequence. + * @param bufferSize buffer size + * @return a future which will evaluate to the opened file. + * @throws IOException failure to resolve the link. + * @throws IllegalArgumentException unknown mandatory key + */ + public CompletableFuture openFileWithOptions(Path path, + Set mandatoryKeys, + Configuration options, + int bufferSize) throws IOException { + AbstractFSBuilderImpl.rejectUnknownMandatoryKeys(mandatoryKeys, + Collections.emptySet(), + "for " + path); + return LambdaUtils.eval( + new CompletableFuture<>(), () -> open(path, bufferSize)); } + } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java index c6c5cbb15b06c..165c56c3d5c37 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java @@ -24,6 +24,8 @@ import java.util.Arrays; import java.util.EnumSet; import java.util.List; +import java.util.Set; +import java.util.concurrent.CompletableFuture; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; @@ -262,10 +264,21 @@ public List> getDelegationTokens(String renewer) throws IOException { return Arrays.asList(fsImpl.addDelegationTokens(renewer, null)); } - @Override - public boolean hasPathCapability(final Path path, - final String capability) - throws IOException { - return fsImpl.hasPathCapability(path, capability); + /** + * Open a file by delegating to + * {@link FileSystem#openFileWithOptions(Path, Set, Configuration, int)}. + * @param path path to the file + * @param mandatoryKeys set of options declared as mandatory. + * @param options options set during the build sequence. + * @param bufferSize buffer size + * @return a future which will evaluate to the opened file. 
+ * @throws IOException failure to resolve the link. + * @throws IllegalArgumentException unknown mandatory key + */ + public CompletableFuture openFileWithOptions(Path path, + Set mandatoryKeys, + Configuration options, + int bufferSize) throws IOException { + return fsImpl.openFileWithOptions(path, mandatoryKeys, options, bufferSize); } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSBuilder.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSBuilder.java new file mode 100644 index 0000000000000..b7757a62e28ad --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSBuilder.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs; + +import javax.annotation.Nonnull; +import java.io.IOException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * The base interface which various FileSystem FileContext Builder + * interfaces can extend, and which underlying implementations + * will then implement. + * @param Return type on the {@link #build()} call. + * @param type of builder itself. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public interface FSBuilder> { + + /** + * Set optional Builder parameter. + */ + B opt(@Nonnull String key, @Nonnull String value); + + /** + * Set optional boolean parameter for the Builder. + * + * @see #opt(String, String) + */ + B opt(@Nonnull String key, boolean value); + + /** + * Set optional int parameter for the Builder. + * + * @see #opt(String, String) + */ + B opt(@Nonnull String key, int value); + + /** + * Set optional float parameter for the Builder. + * + * @see #opt(String, String) + */ + B opt(@Nonnull String key, float value); + + /** + * Set optional double parameter for the Builder. + * + * @see #opt(String, String) + */ + B opt(@Nonnull String key, double value); + + /** + * Set an array of string values as optional parameter for the Builder. + * + * @see #opt(String, String) + */ + B opt(@Nonnull String key, @Nonnull String... values); + + /** + * Set mandatory option to the Builder. + * + * If the option is not supported or unavailable, + * the client should expect {@link #build()} throws IllegalArgumentException. + */ + B must(@Nonnull String key, @Nonnull String value); + + /** + * Set mandatory boolean option. + * + * @see #must(String, String) + */ + B must(@Nonnull String key, boolean value); + + /** + * Set mandatory int option. + * + * @see #must(String, String) + */ + B must(@Nonnull String key, int value); + + /** + * Set mandatory float option. 
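The opt()/must() split described in the FSBuilder javadoc comes down to one behavioural difference at build() time. A short hedged sketch, using the openFile() builder added later in this patch; the "fs.example.*" keys are placeholders in the spirit of the examples used elsewhere in the patch, not real options.

    import java.io.IOException;
    import java.util.concurrent.CompletableFuture;

    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    final class BuilderOptionSketch {
      static CompletableFuture<FSDataInputStream> openWithHints(FileSystem fs,
          Path path) throws IOException {
        return fs.openFile(path)
            // optional hint: silently ignored by stores that do not recognise it
            .opt("fs.example.readahead.range", 512 * 1024)
            // mandatory requirement: build() throws IllegalArgumentException
            // if the store does not understand the key
            .must("fs.example.checksum.verify", true)
            .build();
      }
    }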
+ * + * @see #must(String, String) + */ + B must(@Nonnull String key, float value); + + /** + * Set mandatory double option. + * + * @see #must(String, String) + */ + B must(@Nonnull String key, double value); + + /** + * Set a string array as mandatory option. + * + * @see #must(String, String) + */ + B must(@Nonnull String key, @Nonnull String... values); + + /** + * Instantiate the object which was being built. + * + * @throws IllegalArgumentException if the parameters are not valid. + * @throws UnsupportedOperationException if the filesystem does not support + * the specific operation. + * @throws IOException on filesystem IO errors. + */ + S build() throws IllegalArgumentException, + UnsupportedOperationException, IOException; +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStreamBuilder.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStreamBuilder.java index d43129388bf2e..62a3182dfba20 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStreamBuilder.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStreamBuilder.java @@ -17,22 +17,18 @@ */ package org.apache.hadoop.fs; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Options.ChecksumOpt; +import org.apache.hadoop.fs.impl.AbstractFSBuilderImpl; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.util.Progressable; import javax.annotation.Nonnull; import java.io.IOException; -import java.util.Collections; import java.util.EnumSet; -import java.util.HashSet; -import java.util.Set; +import static com.google.common.base.Preconditions.checkNotNull; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY; @@ -87,9 +83,9 @@ @InterfaceAudience.Public @InterfaceStability.Evolving public abstract class FSDataOutputStreamBuilder - > { + > + extends AbstractFSBuilderImpl { private final FileSystem fs; - private final Path path; private FsPermission permission = null; private int bufferSize; private short replication; @@ -100,34 +96,23 @@ public abstract class FSDataOutputStreamBuilder private Progressable progress = null; private ChecksumOpt checksumOpt = null; - /** - * Contains optional and mandatory parameters. - * - * It does not load default configurations from default files. - */ - private final Configuration options = new Configuration(false); - - /** Keep track of the keys for mandatory options. */ - private final Set mandatoryKeys = new HashSet<>(); - /** * Return the concrete implementation of the builder instance. */ - protected abstract B getThisBuilder(); + public abstract B getThisBuilder(); /** * Construct from a {@link FileContext}. * * @param fc FileContext * @param p path. 
- * @throws IOException + * @throws IOException failure */ FSDataOutputStreamBuilder(@Nonnull FileContext fc, @Nonnull Path p) throws IOException { - Preconditions.checkNotNull(fc); - Preconditions.checkNotNull(p); + super(checkNotNull(p)); + checkNotNull(fc); this.fs = null; - this.path = p; AbstractFileSystem afs = fc.getFSofPath(p); FsServerDefaults defaults = afs.getServerDefaults(p); @@ -141,25 +126,20 @@ public abstract class FSDataOutputStreamBuilder */ protected FSDataOutputStreamBuilder(@Nonnull FileSystem fileSystem, @Nonnull Path p) { - Preconditions.checkNotNull(fileSystem); - Preconditions.checkNotNull(p); + super(checkNotNull(p)); + checkNotNull(fileSystem); fs = fileSystem; - path = p; bufferSize = fs.getConf().getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT); - replication = fs.getDefaultReplication(path); + replication = fs.getDefaultReplication(p); blockSize = fs.getDefaultBlockSize(p); } protected FileSystem getFS() { - Preconditions.checkNotNull(fs); + checkNotNull(fs); return fs; } - protected Path getPath() { - return path; - } - protected FsPermission getPermission() { if (permission == null) { permission = FsPermission.getFileDefault(); @@ -171,7 +151,7 @@ protected FsPermission getPermission() { * Set permission for the file. */ public B permission(@Nonnull final FsPermission perm) { - Preconditions.checkNotNull(perm); + checkNotNull(perm); permission = perm; return getThisBuilder(); } @@ -235,7 +215,7 @@ protected Progressable getProgress() { * Set the facility of reporting progress. */ public B progress(@Nonnull final Progressable prog) { - Preconditions.checkNotNull(prog); + checkNotNull(prog); progress = prog; return getThisBuilder(); } @@ -282,154 +262,11 @@ protected ChecksumOpt getChecksumOpt() { * Set checksum opt. */ public B checksumOpt(@Nonnull final ChecksumOpt chksumOpt) { - Preconditions.checkNotNull(chksumOpt); + checkNotNull(chksumOpt); checksumOpt = chksumOpt; return getThisBuilder(); } - /** - * Set optional Builder parameter. - */ - public B opt(@Nonnull final String key, @Nonnull final String value) { - mandatoryKeys.remove(key); - options.set(key, value); - return getThisBuilder(); - } - - /** - * Set optional boolean parameter for the Builder. - * - * @see #opt(String, String) - */ - public B opt(@Nonnull final String key, boolean value) { - mandatoryKeys.remove(key); - options.setBoolean(key, value); - return getThisBuilder(); - } - - /** - * Set optional int parameter for the Builder. - * - * @see #opt(String, String) - */ - public B opt(@Nonnull final String key, int value) { - mandatoryKeys.remove(key); - options.setInt(key, value); - return getThisBuilder(); - } - - /** - * Set optional float parameter for the Builder. - * - * @see #opt(String, String) - */ - public B opt(@Nonnull final String key, float value) { - mandatoryKeys.remove(key); - options.setFloat(key, value); - return getThisBuilder(); - } - - /** - * Set optional double parameter for the Builder. - * - * @see #opt(String, String) - */ - public B opt(@Nonnull final String key, double value) { - mandatoryKeys.remove(key); - options.setDouble(key, value); - return getThisBuilder(); - } - - /** - * Set an array of string values as optional parameter for the Builder. - * - * @see #opt(String, String) - */ - public B opt(@Nonnull final String key, @Nonnull final String... values) { - mandatoryKeys.remove(key); - options.setStrings(key, values); - return getThisBuilder(); - } - - /** - * Set mandatory option to the Builder. 
- * - * If the option is not supported or unavailable on the {@link FileSystem}, - * the client should expect {@link #build()} throws IllegalArgumentException. - */ - public B must(@Nonnull final String key, @Nonnull final String value) { - mandatoryKeys.add(key); - options.set(key, value); - return getThisBuilder(); - } - - /** - * Set mandatory boolean option. - * - * @see #must(String, String) - */ - public B must(@Nonnull final String key, boolean value) { - mandatoryKeys.add(key); - options.setBoolean(key, value); - return getThisBuilder(); - } - - /** - * Set mandatory int option. - * - * @see #must(String, String) - */ - public B must(@Nonnull final String key, int value) { - mandatoryKeys.add(key); - options.setInt(key, value); - return getThisBuilder(); - } - - /** - * Set mandatory float option. - * - * @see #must(String, String) - */ - public B must(@Nonnull final String key, float value) { - mandatoryKeys.add(key); - options.setFloat(key, value); - return getThisBuilder(); - } - - /** - * Set mandatory double option. - * - * @see #must(String, String) - */ - public B must(@Nonnull final String key, double value) { - mandatoryKeys.add(key); - options.setDouble(key, value); - return getThisBuilder(); - } - - /** - * Set a string array as mandatory option. - * - * @see #must(String, String) - */ - public B must(@Nonnull final String key, @Nonnull final String... values) { - mandatoryKeys.add(key); - options.setStrings(key, values); - return getThisBuilder(); - } - - protected Configuration getOptions() { - return options; - } - - /** - * Get all the keys that are set as mandatory keys. - */ - @VisibleForTesting - protected Set getMandatoryKeys() { - return Collections.unmodifiableSet(mandatoryKeys); - } - /** * Create the FSDataOutputStream to write on the file system. * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java index 05fbc34730d0f..f65074856bf3e 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java @@ -35,6 +35,7 @@ import java.util.Stack; import java.util.TreeSet; import java.util.Map.Entry; +import java.util.concurrent.CompletableFuture; import javax.annotation.Nonnull; @@ -44,7 +45,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem.Statistics; import org.apache.hadoop.fs.Options.CreateOpts; -import org.apache.hadoop.fs.impl.FsLinkResolution; +import org.apache.hadoop.fs.impl.FutureDataInputStreamBuilderImpl; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsAction; @@ -63,13 +64,10 @@ import org.apache.hadoop.util.ShutdownHookManager; import com.google.common.base.Preconditions; -import com.google.common.annotations.VisibleForTesting; import org.apache.htrace.core.Tracer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; - /** * The FileContext class provides an interface for users of the Hadoop * file system. It exposes a number of file system operations, e.g. 
create, @@ -173,7 +171,7 @@ @InterfaceAudience.Public @InterfaceStability.Stable -public class FileContext implements PathCapabilities { +public class FileContext { public static final Logger LOG = LoggerFactory.getLogger(FileContext.class); /** @@ -505,9 +503,10 @@ public static FileContext getLocalFSFileContext(final Configuration aConf) return getFileContext(FsConstants.LOCAL_FS_URI, aConf); } - @VisibleForTesting + /* This method is needed for tests. */ @InterfaceAudience.Private - @InterfaceStability.Unstable + @InterfaceStability.Unstable /* return type will change to AFS once + HADOOP-6223 is completed */ public AbstractFileSystem getDefaultFileSystem() { return defaultFS; } @@ -716,7 +715,7 @@ private FCDataOutputStreamBuilder( } @Override - protected FCDataOutputStreamBuilder getThisBuilder() { + public FCDataOutputStreamBuilder getThisBuilder() { return this; } @@ -1246,16 +1245,6 @@ public FileStatus next(final AbstractFileSystem fs, final Path p) }.resolve(this, absF); } - /** - * Synchronize client metadata state. - * - * @throws IOException - * @throws UnsupportedOperationException - */ - public void msync() throws IOException, UnsupportedOperationException { - defaultFS.msync(); - } - /** * Checks if the user can access a path. The mode specifies which access * checks to perform. If the requested permissions are granted, then the @@ -2883,19 +2872,66 @@ Tracer getTracer() { } /** - * Return the path capabilities of the bonded {@code AbstractFileSystem}. - * @param path path to query the capability of. - * @param capability string to query the stream support for. - * @return true iff the capability is supported under that FS. - * @throws IOException path resolution or other IO failure - * @throws IllegalArgumentException invalid arguments + * Open a file for reading through a builder API. + * Ultimately calls {@link #open(Path, int)} unless a subclass + * executes the open command differently. + * + * The semantics of this call are therefore the same as that of + * {@link #open(Path, int)} with one special point: it is in + * {@code FSDataInputStreamBuilder.build()} in which the open operation + * takes place -it is there where all preconditions to the operation + * are checked. + * @param path file path + * @return a FSDataInputStreamBuilder object to build the input stream + * @throws IOException if some early checks cause IO failures. + * @throws UnsupportedOperationException if support is checked early. */ - public boolean hasPathCapability(Path path, String capability) - throws IOException { - validatePathCapabilityArgs(path, capability); - return FsLinkResolution.resolve(this, - fixRelativePart(path), - (fs, p) -> fs.hasPathCapability(p, capability)); + @InterfaceStability.Unstable + public FutureDataInputStreamBuilder openFile(Path path) + throws IOException, UnsupportedOperationException { + + return new FSDataInputStreamBuilder(path); } + /** + * Builder returned for {@link #openFile(Path)}. + */ + private class FSDataInputStreamBuilder + extends FutureDataInputStreamBuilderImpl { + + /** + * Path Constructor. + * @param path path to open. + */ + protected FSDataInputStreamBuilder( + @Nonnull final Path path) throws IOException { + super(FileContext.this, path); + } + + /** + * Perform the open operation. + * + * @return a future to the input stream. + * @throws IOException early failure to open + * @throws UnsupportedOperationException if the specific operation + * is not supported. + * @throws IllegalArgumentException if the parameters are not valid. 
+ */ + @Override + public CompletableFuture build() throws IOException { + final Path absF = fixRelativePart(getPath()); + return new FSLinkResolver>() { + @Override + public CompletableFuture next( + final AbstractFileSystem fs, + final Path p) + throws IOException { + return fs.openFileWithOptions(p, + getMandatoryKeys(), + getOptions(), + getBufferSize()); + } + }.resolve(FileContext.this, absF); + } + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java index 78608ffc9b73c..61fa43bdf6fa2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.fs; +import javax.annotation.Nonnull; import java.io.Closeable; import java.io.FileNotFoundException; import java.io.IOException; @@ -27,6 +28,7 @@ import java.security.PrivilegedExceptionAction; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; @@ -35,11 +37,13 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; +import java.util.Optional; import java.util.ServiceConfigurationError; import java.util.ServiceLoader; import java.util.Set; import java.util.Stack; import java.util.TreeSet; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.logging.Log; @@ -52,6 +56,8 @@ import org.apache.hadoop.fs.Options.ChecksumOpt; import org.apache.hadoop.fs.Options.HandleOpt; import org.apache.hadoop.fs.Options.Rename; +import org.apache.hadoop.fs.impl.AbstractFSBuilderImpl; +import org.apache.hadoop.fs.impl.FutureDataInputStreamBuilderImpl; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsAction; @@ -67,6 +73,7 @@ import org.apache.hadoop.security.token.DelegationTokenIssuer; import org.apache.hadoop.util.ClassUtil; import org.apache.hadoop.util.DataChecksum; +import org.apache.hadoop.util.LambdaUtils; import org.apache.hadoop.util.Progressable; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.ShutdownHookManager; @@ -118,6 +125,11 @@ *

  • The term "file" refers to a file in the remote filesystem, rather than instances of {@code java.io.File}.
  • * + * + * This is a carefully evolving class. + * New methods may be marked as Unstable or Evolving for their initial release, + * as a warning that they are new and may change based on the + * experience of use in applications. *****************************************************************/ @SuppressWarnings("DeprecatedIsStillUsed") @InterfaceAudience.Public @@ -4308,6 +4320,8 @@ protected FileSystemDataOutputStreamBuilder(FileSystem fileSystem, Path p) { @Override public FSDataOutputStream build() throws IOException { + rejectUnknownMandatoryKeys(Collections.emptySet(), + " for " + getPath()); if (getFlags().contains(CreateFlag.CREATE) || getFlags().contains(CreateFlag.OVERWRITE)) { if (isRecursive()) { @@ -4322,11 +4336,12 @@ public FSDataOutputStream build() throws IOException { } else if (getFlags().contains(CreateFlag.APPEND)) { return getFS().append(getPath(), getBufferSize(), getProgress()); } - throw new IOException("Must specify either create, overwrite or append"); + throw new PathIOException(getPath().toString(), + "Must specify either create, overwrite or append"); } @Override - protected FileSystemDataOutputStreamBuilder getThisBuilder() { + public FileSystemDataOutputStreamBuilder getThisBuilder() { return this; } } @@ -4370,4 +4385,173 @@ private void methodNotSupported() { public FSDataOutputStreamBuilder appendFile(Path path) { return new FileSystemDataOutputStreamBuilder(this, path).append(); } + + /** + * Open a file for reading through a builder API. + * Ultimately calls {@link #open(Path, int)} unless a subclass + * executes the open command differently. + * + * The semantics of this call are therefore the same as that of + * {@link #open(Path, int)} with one special point: it is in + * {@code FSDataInputStreamBuilder.build()} in which the open operation + * takes place -it is there where all preconditions to the operation + * are checked. + * @param path file path + * @return a FSDataInputStreamBuilder object to build the input stream + * @throws IOException if some early checks cause IO failures. + * @throws UnsupportedOperationException if support is checked early. + */ + @InterfaceStability.Unstable + public FutureDataInputStreamBuilder openFile(Path path) + throws IOException, UnsupportedOperationException { + return new FSDataInputStreamBuilder(this, path).getThisBuilder(); + } + + /** + * Open a file for reading through a builder API. + * Ultimately calls {@link #open(PathHandle, int)} unless a subclass + * executes the open command differently. + * + * If PathHandles are unsupported, this may fail in the + * {@code FSDataInputStreamBuilder.build()} command, + * rather than in this {@code openFile()} operation. + * @param pathHandle path handle. + * @return a FSDataInputStreamBuilder object to build the input stream + * @throws IOException if some early checks cause IO failures. + * @throws UnsupportedOperationException if support is checked early. + */ + @InterfaceStability.Unstable + public FutureDataInputStreamBuilder openFile(PathHandle pathHandle) + throws IOException, UnsupportedOperationException { + return new FSDataInputStreamBuilder(this, pathHandle) + .getThisBuilder(); + } + + /** + * Execute the actual open file operation. + * + * This is invoked from {@code FSDataInputStreamBuilder.build()} + * and from {@link DelegateToFileSystem} and is where + * the action of opening the file should begin. 
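A minimal caller-side sketch of the new openFile() builder, not part of the patch itself: build() returns the future, and with the default implementation the open has already happened by the time build() returns, while an asynchronous filesystem may still be working; either way the stream is collected from the future. The path is illustrative.

    import java.util.concurrent.CompletableFuture;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    final class OpenFileSketch {
      static int readFirstByte(Configuration conf) throws Exception {
        Path path = new Path("hdfs:///tmp/example.dat");   // illustrative path
        FileSystem fs = FileSystem.get(path.toUri(), conf);
        CompletableFuture<FSDataInputStream> future = fs.openFile(path).build();
        // evaluate the future; this is where a lazy implementation would do the IO
        try (FSDataInputStream in = future.get()) {
          return in.read();
        }
      }
    }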
+ * + * The base implementation performs a blocking + * call to {@link #open(Path, int)}in this call; + * the actual outcome is in the returned {@code CompletableFuture}. + * This avoids having to create some thread pool, while still + * setting up the expectation that the {@code get()} call + * is needed to evaluate the result. + * @param path path to the file + * @param mandatoryKeys set of options declared as mandatory. + * @param options options set during the build sequence. + * @param bufferSize buffer size + * @return a future which will evaluate to the opened file. + * @throws IOException failure to resolve the link. + * @throws IllegalArgumentException unknown mandatory key + */ + protected CompletableFuture openFileWithOptions( + final Path path, + final Set mandatoryKeys, + final Configuration options, + final int bufferSize) throws IOException { + AbstractFSBuilderImpl.rejectUnknownMandatoryKeys(mandatoryKeys, + Collections.emptySet(), + "for " + path); + return LambdaUtils.eval( + new CompletableFuture<>(), () -> open(path, bufferSize)); + } + + /** + * Execute the actual open file operation. + * The base implementation performs a blocking + * call to {@link #open(Path, int)}in this call; + * the actual outcome is in the returned {@code CompletableFuture}. + * This avoids having to create some thread pool, while still + * setting up the expectation that the {@code get()} call + * is needed to evaluate the result. + * @param pathHandle path to the file + * @param mandatoryKeys set of options declared as mandatory. + * @param options options set during the build sequence. + * @param bufferSize buffer size + * @return a future which will evaluate to the opened file. + * @throws IOException failure to resolve the link. + * @throws IllegalArgumentException unknown mandatory key + * @throws UnsupportedOperationException PathHandles are not supported. + * This may be deferred until the future is evaluated. + */ + protected CompletableFuture openFileWithOptions( + final PathHandle pathHandle, + final Set mandatoryKeys, + final Configuration options, + final int bufferSize) throws IOException { + AbstractFSBuilderImpl.rejectUnknownMandatoryKeys(mandatoryKeys, + Collections.emptySet(), ""); + CompletableFuture result = new CompletableFuture<>(); + try { + result.complete(open(pathHandle, bufferSize)); + } catch (UnsupportedOperationException tx) { + // fail fast here + throw tx; + } catch (Throwable tx) { + // fail lazily here to ensure callers expect all File IO operations to + // surface later + result.completeExceptionally(tx); + } + return result; + } + + /** + * Builder returned for {@code #openFile(Path)} + * and {@code #openFile(PathHandle)}. + */ + private static class FSDataInputStreamBuilder + extends FutureDataInputStreamBuilderImpl + implements FutureDataInputStreamBuilder { + + /** + * Path Constructor. + * @param fileSystem owner + * @param path path to open. + */ + protected FSDataInputStreamBuilder( + @Nonnull final FileSystem fileSystem, + @Nonnull final Path path) { + super(fileSystem, path); + } + + /** + * Construct from a path handle. + * @param fileSystem owner + * @param pathHandle path handle of file to open. + */ + protected FSDataInputStreamBuilder( + @Nonnull final FileSystem fileSystem, + @Nonnull final PathHandle pathHandle) { + super(fileSystem, pathHandle); + } + + /** + * Perform the open operation. 
+ * Returns a future which, when get() or a chained completion + * operation is invoked, will supply the input stream of the file + * referenced by the path/path handle. + * @return a future to the input stream. + * @throws IOException early failure to open + * @throws UnsupportedOperationException if the specific operation + * is not supported. + * @throws IllegalArgumentException if the parameters are not valid. + */ + @Override + public CompletableFuture build() throws IOException { + Optional optionalPath = getOptionalPath(); + if(optionalPath.isPresent()) { + return getFS().openFileWithOptions(optionalPath.get(), + getMandatoryKeys(), getOptions(), getBufferSize()); + } else { + return getFS().openFileWithOptions(getPathHandle(), + getMandatoryKeys(), getOptions(), getBufferSize()); + } + } + + } + } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java index bcdac5b2d6984..99c18b6646cd6 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java @@ -25,6 +25,8 @@ import java.util.EnumSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.concurrent.CompletableFuture; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; @@ -332,10 +334,6 @@ public boolean mkdirs(Path f, FsPermission permission) throws IOException { return fs.mkdirs(f, permission); } - @Override - public boolean mkdirs(Path f) throws IOException { - return fs.mkdirs(f); - } /** * The src file is on the local disk. 
Add it to FS at @@ -458,11 +456,6 @@ public FileStatus getFileStatus(Path f) throws IOException { return fs.getFileStatus(f); } - @Override - public void msync() throws IOException, UnsupportedOperationException { - fs.msync(); - } - @Override public void access(Path path, FsAction mode) throws AccessControlException, FileNotFoundException, IOException { @@ -703,9 +696,33 @@ public FSDataOutputStreamBuilder appendFile(Path path) { } @Override - public boolean hasPathCapability(final Path path, final String capability) - throws IOException { - return fs.hasPathCapability(path, capability); + public FutureDataInputStreamBuilder openFile(final Path path) + throws IOException, UnsupportedOperationException { + return fs.openFile(path); + } + + @Override + public FutureDataInputStreamBuilder openFile(final PathHandle pathHandle) + throws IOException, UnsupportedOperationException { + return fs.openFile(pathHandle); } + @Override + protected CompletableFuture openFileWithOptions( + final Path path, + final Set mandatoryKeys, + final Configuration options, + final int bufferSize) throws IOException { + return fs.openFileWithOptions(path, mandatoryKeys, options, bufferSize); + } + + @Override + protected CompletableFuture openFileWithOptions( + final PathHandle pathHandle, + final Set mandatoryKeys, + final Configuration options, + final int bufferSize) throws IOException { + return fs.openFileWithOptions(pathHandle, mandatoryKeys, options, + bufferSize); + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java index e3281698a65a7..f5430d6026160 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java @@ -26,9 +26,12 @@ import java.util.EnumSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.concurrent.CompletableFuture; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem.Statistics; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; @@ -122,11 +125,6 @@ public FileStatus getFileStatus(Path f) return myFs.getFileStatus(f); } - @Override - public void msync() throws IOException, UnsupportedOperationException { - myFs.msync(); - } - @Override public void access(Path path, FsAction mode) throws AccessControlException, FileNotFoundException, UnresolvedLinkException, IOException { @@ -439,9 +437,13 @@ public Collection getAllStoragePolicies() return myFs.getAllStoragePolicies(); } - public boolean hasPathCapability(final Path path, - final String capability) - throws IOException { - return myFs.hasPathCapability(path, capability); + @Override + public CompletableFuture openFileWithOptions( + final Path path, + final Set mandatoryKeys, + final Configuration options, + final int bufferSize) throws IOException { + return myFs.openFileWithOptions(path, mandatoryKeys, options, bufferSize); } + } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java new file mode 100644 index 0000000000000..774d30927df2c --- /dev/null +++ 
b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.fs; + +import java.io.IOException; +import java.util.concurrent.CompletableFuture; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Builder for input streams and subclasses whose return value is + * actually a completable future: this allows for better asynchronous + * operation. + * + * To be more generic, {@link #opt(String, int)} and {@link #must(String, int)} + * variants provide implementation-agnostic way to customize the builder. + * Each FS-specific builder implementation can interpret the FS-specific + * options accordingly, for example: + * + * If the option is not related to the file system, the option will be ignored. + * If the option is must, but not supported by the file system, a + * {@link IllegalArgumentException} will be thrown. + * + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public interface FutureDataInputStreamBuilder + extends FSBuilder, FutureDataInputStreamBuilder> { + + @Override + CompletableFuture build() + throws IllegalArgumentException, UnsupportedOperationException, + IOException; +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AbstractFSBuilderImpl.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AbstractFSBuilderImpl.java new file mode 100644 index 0000000000000..5fc92e97be76c --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AbstractFSBuilderImpl.java @@ -0,0 +1,356 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.impl; + +import javax.annotation.Nonnull; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.NoSuchElementException; +import java.util.Optional; +import java.util.Set; + +import com.google.common.annotations.VisibleForTesting; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSBuilder; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathHandle; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; + +/** + * Builder for filesystem/filecontext operations of various kinds, + * with option support. + * + * + * .opt("foofs:option.a", true) + * .opt("foofs:option.b", "value") + * .opt("barfs:cache", true) + * .must("foofs:cache", true) + * .must("barfs:cache-size", 256 * 1024 * 1024) + * .build(); + * + * + * Configuration keys declared in an {@code opt()} may be ignored by + * a builder which does not recognise them. + * + * Configuration keys declared in a {@code must()} function set must + * be understood by the implementation or a + * {@link IllegalArgumentException} will be thrown. + * + * @param Return type on the {@link #build()} call. + * @param type of builder itself. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public abstract class + AbstractFSBuilderImpl> + implements FSBuilder { + + public static final String UNKNOWN_MANDATORY_KEY = "Unknown mandatory key"; + + @VisibleForTesting + static final String E_BOTH_A_PATH_AND_A_PATH_HANDLE + = "Both a path and a pathHandle has been provided to the constructor"; + + private final Optional optionalPath; + + private final Optional optionalPathHandle; + + /** + * Contains optional and mandatory parameters. + * + * It does not load default configurations from default files. + */ + private final Configuration options = new Configuration(false); + + /** Keep track of the keys for mandatory options. */ + private final Set mandatoryKeys = new HashSet<>(); + + /** + * Constructor with both optional path and path handle. + * Either or both argument may be empty, but it is an error for + * both to be defined. + * @param optionalPath a path or empty + * @param optionalPathHandle a path handle/empty + * @throws IllegalArgumentException if both parameters are set. + */ + protected AbstractFSBuilderImpl( + @Nonnull Optional optionalPath, + @Nonnull Optional optionalPathHandle) { + checkArgument(!(checkNotNull(optionalPath).isPresent() + && checkNotNull(optionalPathHandle).isPresent()), + E_BOTH_A_PATH_AND_A_PATH_HANDLE); + this.optionalPath = optionalPath; + this.optionalPathHandle = optionalPathHandle; + } + + protected AbstractFSBuilderImpl(@Nonnull final Path path) { + this(Optional.of(path), Optional.empty()); + } + + protected AbstractFSBuilderImpl(@Nonnull final PathHandle pathHandle) { + this(Optional.empty(), Optional.of(pathHandle)); + } + + + /** + * Get the cast builder. + * @return this object, typecast + */ + public B getThisBuilder() { + return (B)this; + } + + /** + * Get the optional path; may be empty. + * @return the optional path field. + */ + public Optional getOptionalPath() { + return optionalPath; + } + + /** + * Get the path: only valid if constructed with a path. + * @return the path + * @throws NoSuchElementException if the field is empty. 
+ */ + public Path getPath() { + return optionalPath.get(); + } + + /** + * Get the optional path handle; may be empty. + * @return the optional path handle field. + */ + public Optional getOptionalPathHandle() { + return optionalPathHandle; + } + + /** + * Get the PathHandle: only valid if constructed with a PathHandle. + * @return the PathHandle + * @throws NoSuchElementException if the field is empty. + */ + public PathHandle getPathHandle() { + return optionalPathHandle.get(); + } + + /** + * Set optional Builder parameter. + */ + @Override + public B opt(@Nonnull final String key, @Nonnull final String value) { + mandatoryKeys.remove(key); + options.set(key, value); + return getThisBuilder(); + } + + /** + * Set optional boolean parameter for the Builder. + * + * @see #opt(String, String) + */ + @Override + public B opt(@Nonnull final String key, boolean value) { + mandatoryKeys.remove(key); + options.setBoolean(key, value); + return getThisBuilder(); + } + + /** + * Set optional int parameter for the Builder. + * + * @see #opt(String, String) + */ + @Override + public B opt(@Nonnull final String key, int value) { + mandatoryKeys.remove(key); + options.setInt(key, value); + return getThisBuilder(); + } + + /** + * Set optional float parameter for the Builder. + * + * @see #opt(String, String) + */ + @Override + public B opt(@Nonnull final String key, float value) { + mandatoryKeys.remove(key); + options.setFloat(key, value); + return getThisBuilder(); + } + + /** + * Set optional double parameter for the Builder. + * + * @see #opt(String, String) + */ + @Override + public B opt(@Nonnull final String key, double value) { + mandatoryKeys.remove(key); + options.setDouble(key, value); + return getThisBuilder(); + } + + /** + * Set an array of string values as optional parameter for the Builder. + * + * @see #opt(String, String) + */ + @Override + public B opt(@Nonnull final String key, @Nonnull final String... values) { + mandatoryKeys.remove(key); + options.setStrings(key, values); + return getThisBuilder(); + } + + /** + * Set mandatory option to the Builder. + * + * If the option is not supported or unavailable on the {@link FileSystem}, + * the client should expect {@link #build()} throws IllegalArgumentException. + */ + @Override + public B must(@Nonnull final String key, @Nonnull final String value) { + mandatoryKeys.add(key); + options.set(key, value); + return getThisBuilder(); + } + + /** + * Set mandatory boolean option. + * + * @see #must(String, String) + */ + @Override + public B must(@Nonnull final String key, boolean value) { + mandatoryKeys.add(key); + options.setBoolean(key, value); + return getThisBuilder(); + } + + /** + * Set mandatory int option. + * + * @see #must(String, String) + */ + @Override + public B must(@Nonnull final String key, int value) { + mandatoryKeys.add(key); + options.setInt(key, value); + return getThisBuilder(); + } + + /** + * Set mandatory float option. + * + * @see #must(String, String) + */ + @Override + public B must(@Nonnull final String key, float value) { + mandatoryKeys.add(key); + options.setFloat(key, value); + return getThisBuilder(); + } + + /** + * Set mandatory double option. + * + * @see #must(String, String) + */ + @Override + public B must(@Nonnull final String key, double value) { + mandatoryKeys.add(key); + options.setDouble(key, value); + return getThisBuilder(); + } + + /** + * Set a string array as mandatory option. 
+ * + * @see #must(String, String) + */ + @Override + public B must(@Nonnull final String key, @Nonnull final String... values) { + mandatoryKeys.add(key); + options.setStrings(key, values); + return getThisBuilder(); + } + + /** + * Get the mutable option configuration. + * @return the option configuration. + */ + public Configuration getOptions() { + return options; + } + + /** + * Get all the keys that are set as mandatory keys. + */ + public Set getMandatoryKeys() { + return Collections.unmodifiableSet(mandatoryKeys); + } + + /** + * Reject a configuration if one or more mandatory keys are + * not in the set of mandatory keys. + * The first invalid key raises the exception; the order of the + * scan and hence the specific key raising the exception is undefined. + * @param knownKeys a possibly empty collection of known keys + * @param extraErrorText extra error text to include. + * @throws IllegalArgumentException if any key is unknown. + */ + protected void rejectUnknownMandatoryKeys(final Collection knownKeys, + String extraErrorText) + throws IllegalArgumentException { + rejectUnknownMandatoryKeys(mandatoryKeys, knownKeys, extraErrorText); + } + + /** + * Reject a configuration if one or more mandatory keys are + * not in the set of mandatory keys. + * The first invalid key raises the exception; the order of the + * scan and hence the specific key raising the exception is undefined. + * @param mandatory the set of mandatory keys + * @param knownKeys a possibly empty collection of known keys + * @param extraErrorText extra error text to include. + * @throws IllegalArgumentException if any key is unknown. + */ + public static void rejectUnknownMandatoryKeys( + final Set mandatory, + final Collection knownKeys, + final String extraErrorText) + throws IllegalArgumentException { + final String eText = extraErrorText.isEmpty() + ? "" + : (extraErrorText + " "); + mandatory.forEach((key) -> + checkArgument(knownKeys.contains(key), + UNKNOWN_MANDATORY_KEY + " %s\"%s\"", eText, key)); + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java new file mode 100644 index 0000000000000..2aa4a5d95fcc7 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.impl; + +import javax.annotation.Nonnull; +import java.io.IOException; +import java.util.concurrent.CompletableFuture; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathHandle; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY; + +/** + * Builder for input streams and subclasses whose return value is + * actually a completable future: this allows for better asynchronous + * operation. + * + * To be more generic, {@link #opt(String, int)} and {@link #must(String, int)} + * variants provide implementation-agnostic way to customize the builder. + * Each FS-specific builder implementation can interpret the FS-specific + * options accordingly, for example: + * + * If the option is not related to the file system, the option will be ignored. + * If the option is must, but not supported by the file system, a + * {@link IllegalArgumentException} will be thrown. + * + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public abstract class FutureDataInputStreamBuilderImpl + extends AbstractFSBuilderImpl, FutureDataInputStreamBuilder> + implements FutureDataInputStreamBuilder { + + private final FileSystem fileSystem; + + private int bufferSize; + + /** + * Construct from a {@link FileContext}. + * + * @param fc FileContext + * @param path path. + * @throws IOException failure + */ + protected FutureDataInputStreamBuilderImpl(@Nonnull FileContext fc, + @Nonnull Path path) throws IOException { + super(checkNotNull(path)); + checkNotNull(fc); + this.fileSystem = null; + bufferSize = IO_FILE_BUFFER_SIZE_DEFAULT; + } + + /** + * Constructor. + * @param fileSystem owner FS. + * @param path path + */ + protected FutureDataInputStreamBuilderImpl(@Nonnull FileSystem fileSystem, + @Nonnull Path path) { + super(checkNotNull(path)); + this.fileSystem = checkNotNull(fileSystem); + initFromFS(); + } + + /** + * Constructor with PathHandle. + * @param fileSystem owner FS. + * @param pathHandle path handle + */ + public FutureDataInputStreamBuilderImpl(@Nonnull FileSystem fileSystem, + @Nonnull PathHandle pathHandle) { + super(pathHandle); + this.fileSystem = fileSystem; + initFromFS(); + } + + /** + * Initialize from a filesystem. + */ + private void initFromFS() { + bufferSize = fileSystem.getConf().getInt(IO_FILE_BUFFER_SIZE_KEY, + IO_FILE_BUFFER_SIZE_DEFAULT); + } + + protected FileSystem getFS() { + checkNotNull(fileSystem); + return fileSystem; + } + + protected int getBufferSize() { + return bufferSize; + } + + /** + * Set the size of the buffer to be used. + */ + public FutureDataInputStreamBuilder bufferSize(int bufSize) { + bufferSize = bufSize; + return getThisBuilder(); + } + + /** + * Get the builder. + * This must be used after the constructor has been invoked to create + * the actual builder: it allows for subclasses to do things after + * construction. 
+ */ + public FutureDataInputStreamBuilder builder() { + return getThisBuilder(); + } + + @Override + public FutureDataInputStreamBuilder getThisBuilder() { + return this; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureIOSupport.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureIOSupport.java new file mode 100644 index 0000000000000..9d5f2bf4b6ed1 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureIOSupport.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.impl; + +import java.io.IOException; +import java.io.InterruptedIOException; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSBuilder; + +/** + * Support for future IO and the FS Builder subclasses. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public final class FutureIOSupport { + + private FutureIOSupport() { + } + + /** + * Given a future, evaluate it. Raised exceptions are + * extracted and handled. + * @param future future to evaluate + * @param type of the result. + * @return the result, if all went well. + * @throws InterruptedIOException future was interrupted + * @throws IOException if something went wrong + * @throws RuntimeException any nested RTE thrown + */ + public static T awaitFuture(final Future future) + throws InterruptedIOException, IOException, RuntimeException { + try { + return future.get(); + } catch (InterruptedException e) { + throw (InterruptedIOException)new InterruptedIOException(e.toString()) + .initCause(e); + } catch (ExecutionException e) { + return raiseInnerCause(e); + } + } + + + /** + * Given a future, evaluate it. Raised exceptions are + * extracted and handled. + * @param future future to evaluate + * @param type of the result. + * @return the result, if all went well. + * @throws InterruptedIOException future was interrupted + * @throws IOException if something went wrong + * @throws RuntimeException any nested RTE thrown + * @throws TimeoutException the future timed out. 
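A hedged sketch of pairing the builder with awaitFuture(): the helper unwraps ExecutionException so an IOException raised inside the future is rethrown as itself, a RuntimeException propagates unchanged, and anything else is wrapped in a new IOException. The timeout and the read are illustrative.

    import java.io.IOException;
    import java.util.concurrent.CompletableFuture;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.TimeoutException;

    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.impl.FutureIOSupport;

    final class AwaitFutureSketch {
      static int readOneByte(FileSystem fs, Path path)
          throws IOException, TimeoutException {
        CompletableFuture<FSDataInputStream> future = fs.openFile(path).build();
        // the timed variant also surfaces TimeoutException if the open stalls
        try (FSDataInputStream in =
                 FutureIOSupport.awaitFuture(future, 30, TimeUnit.SECONDS)) {
          return in.read();
        }
      }
    }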
+ */ + public static T awaitFuture(final Future future, + final long timeout, + final TimeUnit unit) + throws InterruptedIOException, IOException, RuntimeException, + TimeoutException { + + try { + return future.get(timeout, unit); + } catch (InterruptedException e) { + throw (InterruptedIOException)new InterruptedIOException(e.toString()) + .initCause(e); + } catch (ExecutionException e) { + return raiseInnerCause(e); + } + } + + + /** + * From the inner cause of an execution exception, extract the inner cause + * if it is an IOE or RTE. + * This will always raise an exception, either the inner IOException, + * an inner RuntimeException, or a new IOException wrapping the raised + * exception. + * + * @param e exception. + * @param type of return value. + * @return nothing, ever. + * @throws IOException either the inner IOException, or a wrapper around + * any non-Runtime-Exception + * @throws RuntimeException if that is the inner cause. + */ + public static T raiseInnerCause(final ExecutionException e) + throws IOException { + Throwable cause = e.getCause(); + if (cause instanceof IOException) { + throw (IOException) cause; + } else if (cause instanceof WrappedIOException){ + throw ((WrappedIOException) cause).getCause(); + } else if (cause instanceof RuntimeException){ + throw (RuntimeException) cause; + } else if (cause != null) { + // other type: wrap with a new IOE + throw new IOException(cause); + } else { + // this only happens if somebody deliberately raises + // an ExecutionException + throw new IOException(e); + } + } + + /** + * Propagate options to any builder, converting everything with the + * prefix to an option where, if there were 2+ dot-separated elements, + * it is converted to a schema. + *
    +   *   fs.example.s3a.option => s3a:option
    +   *   fs.example.fs.io.policy => s3a.io.policy
    +   *   fs.example.something => something
    +   * 
    + * @param builder builder to modify + * @param conf configuration to read + * @param optionalPrefix prefix for optional settings + * @param mandatoryPrefix prefix for mandatory settings + * @param type of result + * @param type of builder + * @return the builder passed in. + */ + public static > + FSBuilder propagateOptions( + final FSBuilder builder, + final Configuration conf, + final String optionalPrefix, + final String mandatoryPrefix) { + propagateOptions(builder, conf, + optionalPrefix, false); + propagateOptions(builder, conf, + mandatoryPrefix, true); + return builder; + } + + /** + * Propagate options to any builder, converting everything with the + * prefix to an option where, if there were 2+ dot-separated elements, + * it is converted to a schema. + *
    +   *   fs.example.s3a.option => s3a:option
    +   *   fs.example.fs.io.policy => s3a.io.policy
    +   *   fs.example.something => something
    +   * 
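A hedged sketch of propagateOptions() driving a builder from configuration: keys under the optional prefix become opt() calls and keys under the mandatory prefix become must() calls, with the prefix stripped from each key. The "job.input.*" prefixes and keys are invented for illustration.

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.FutureDataInputStreamBuilder;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.impl.FutureIOSupport;

    final class PropagateOptionsSketch {
      static FutureDataInputStreamBuilder configure(FileSystem fs, Path path,
          Configuration jobConf) throws IOException {
        jobConf.set("job.input.option.fadvise", "random");      // becomes opt("fadvise", ...)
        jobConf.set("job.input.must.buffer.size", "1048576");   // becomes must("buffer.size", ...)
        FutureDataInputStreamBuilder builder = fs.openFile(path);
        FutureIOSupport.propagateOptions(builder, jobConf,
            "job.input.option", "job.input.must");
        return builder;
      }
    }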
    + * @param builder builder to modify + * @param conf configuration to read + * @param prefix prefix to scan/strip + * @param mandatory are the options to be mandatory or optional? + */ + public static void propagateOptions( + final FSBuilder builder, + final Configuration conf, + final String prefix, + final boolean mandatory) { + + final String p = prefix.endsWith(".") ? prefix : (prefix + "."); + final Map propsWithPrefix = conf.getPropsWithPrefix(p); + for (Map.Entry entry : propsWithPrefix.entrySet()) { + // change the schema off each entry + String key = entry.getKey(); + String val = entry.getValue(); + if (mandatory) { + builder.must(key, val); + } else { + builder.opt(key, val); + } + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/WrappedIOException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/WrappedIOException.java new file mode 100644 index 0000000000000..1de1ecb785368 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/WrappedIOException.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.impl; + +import java.io.IOException; +import java.util.concurrent.ExecutionException; + +import com.google.common.base.Preconditions; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * A wrapper for an IOException which + * {@link FutureIOSupport#raiseInnerCause(ExecutionException)} knows to + * always extract the exception. + * + * The constructor signature guarantees the cause will be an IOException, + * and as it checks for a null-argument, non-null. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class WrappedIOException extends RuntimeException { + + private static final long serialVersionUID = 2510210974235779294L; + + /** + * Construct from a non-null IOException. + * @param cause inner cause + * @throws NullPointerException if the cause is null. 
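A hedged sketch of the round trip WrappedIOException exists for: a checked IOException thrown inside a lambda that cannot declare it is tunnelled out as an unchecked wrapper, and awaitFuture()/raiseInnerCause() later unwrap it back into the original IOException. The file-length example is invented.

    import java.io.IOException;
    import java.util.concurrent.CompletableFuture;

    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.impl.FutureIOSupport;
    import org.apache.hadoop.fs.impl.WrappedIOException;

    final class WrappedIOExceptionSketch {
      static long asyncLength(FileSystem fs, Path path) throws IOException {
        CompletableFuture<Long> future = CompletableFuture.supplyAsync(() -> {
          try {
            return fs.getFileStatus(path).getLen();   // may throw a checked IOException
          } catch (IOException e) {
            throw new WrappedIOException(e);          // a Supplier cannot throw it directly
          }
        });
        // awaitFuture() recognises the wrapper and rethrows the original IOException
        return FutureIOSupport.awaitFuture(future);
      }
    }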
+ */ + public WrappedIOException(final IOException cause) { + super(Preconditions.checkNotNull(cause)); + } + + @Override + public synchronized IOException getCause() { + return (IOException) super.getCause(); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/PassthroughCodec.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/PassthroughCodec.java new file mode 100644 index 0000000000000..a3f0bffeebc0f --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/PassthroughCodec.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io.compress; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; + +/** + * This is a special codec which does not transform the output. + * It can be declared as a codec in the option "io.compression.codecs", + * and then it will declare that it supports the file extension + * set in {@link #OPT_EXTENSION}. + * + * This allows decompression to be disabled on a job, even when there is + * a registered/discoverable decompression codec for a file extension + * -without having to change the standard codec binding mechanism. + * + * For example, to disable decompression for a gzipped files, set the + * options + *
    + *   io.compression.codecs = org.apache.hadoop.io.compress.PassthroughCodec
    + *   io.compress.passthrough.extension = .gz
    + * 
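+ * <p>
+ * A sketch of setting the same options programmatically, assuming an
+ * existing {@code Configuration} instance named {@code conf}:
+ * <pre>
+ *   conf.set("io.compression.codecs",
+ *       "org.apache.hadoop.io.compress.PassthroughCodec");
+ *   conf.set("io.compress.passthrough.extension", ".gz");
+ * </pre>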
    + * + * Note: this is not a Splittable codec: it doesn't know the + * capabilities of the passed in stream. It should be possible to + * extend this in a subclass: the inner classes are marked as protected + * to enable this. Do not retrofit splitting to this class.. + * + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public class PassthroughCodec + implements Configurable, CompressionCodec { + + private static final Logger LOG = + LoggerFactory.getLogger(PassthroughCodec.class); + + /** + * Classname of the codec: {@value}. + */ + public static final String CLASSNAME = + "org.apache.hadoop.io.compress.PassthroughCodec"; + + /** + * Option to control the extension of the code: {@value}. + */ + public static final String OPT_EXTENSION = + "io.compress.passthrough.extension"; + + /** + * This default extension is here so that if no extension has been defined, + * some value is still returned: {@value}.. + */ + public static final String DEFAULT_EXTENSION = ".passthrough"; + + private Configuration conf; + + private String extension = DEFAULT_EXTENSION; + + public PassthroughCodec() { + } + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public void setConf(final Configuration conf) { + this.conf = conf; + // update the default extension value at this point, adding + // a dot prefix if needed. + String ex = conf.getTrimmed(OPT_EXTENSION, DEFAULT_EXTENSION); + extension = ex.startsWith(".") ? ex : ("." + ex); + } + + @Override + public String getDefaultExtension() { + LOG.info("Registering fake codec for extension {}", extension); + return extension; + } + + @Override + public CompressionOutputStream createOutputStream(final OutputStream out) + throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public CompressionOutputStream createOutputStream(final OutputStream out, + final Compressor compressor) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public Class getCompressorType() { + throw new UnsupportedOperationException(); + } + + @Override + public Compressor createCompressor() { + throw new UnsupportedOperationException(); + } + + @Override + public CompressionInputStream createInputStream(final InputStream in) + throws IOException { + return createInputStream(in, null); + } + + @Override + public CompressionInputStream createInputStream(final InputStream in, + final Decompressor decompressor) throws IOException { + return new PassthroughDecompressorStream(in); + } + + @Override + public Class getDecompressorType() { + return StubDecompressor.class; + } + + @Override + public Decompressor createDecompressor() { + return new StubDecompressor(); + } + + /** + * The decompressor. 
+ */ + protected static final class PassthroughDecompressorStream + extends DecompressorStream { + + private final InputStream input; + + PassthroughDecompressorStream(final InputStream input) + throws IOException { + super(input); + this.input = input; + } + + @Override + public int read(final byte[] b) throws IOException { + return input.read(b); + } + + @Override + public int read() throws IOException { + return input.read(); + } + + @Override + public int read(final byte[] b, final int off, final int len) + throws IOException { + return input.read(b, off, len); + } + + @Override + public long skip(final long n) throws IOException { + return input.skip(n); + } + + @Override + public int available() throws IOException { + return input.available(); + } + } + + /** + * The decompressor is a no-op. It is not needed other than + * to complete the methods offered by the interface. + */ + protected static final class StubDecompressor implements Decompressor { + + @Override + public void setInput(final byte[] b, final int off, final int len) { + + } + + @Override + public boolean needsInput() { + return false; + } + + @Override + public void setDictionary(final byte[] b, final int off, final int len) { + + } + + @Override + public boolean needsDictionary() { + return false; + } + + @Override + public boolean finished() { + return false; + } + + @Override + public int decompress(final byte[] b, final int off, final int len) + throws IOException { + return 0; + } + + @Override + public int getRemaining() { + return 0; + } + + @Override + public void reset() { + + } + + @Override + public void end() { + + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LambdaUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LambdaUtils.java new file mode 100644 index 0000000000000..14c6db608aa4d --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LambdaUtils.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.util; + +import java.util.concurrent.Callable; +import java.util.concurrent.CompletableFuture; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Lambda-expression utilities be they generic or specific to + * Hadoop datatypes. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public final class LambdaUtils { + + private LambdaUtils() { + } + + /** + * Utility method to evaluate a callable and fill in the future + * with the result or the exception raised. + * Once this method returns, the future will have been evaluated to + * either a return value or an exception. 
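+   * <p>
+   * A minimal usage sketch; {@code slowComputation()} stands in for any
+   * callable returning a {@code Long}:
+   * <pre>{@code
+   *   CompletableFuture<Long> future = new CompletableFuture<>();
+   *   LambdaUtils.eval(future, () -> slowComputation());
+   *   // future is now complete, either with the value returned by the
+   *   // callable or with the exception it raised.
+   * }</pre>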
+ * @param type of future + * @param result future for the result. + * @param call callable to invoke. + * @return the future passed in + */ + public static CompletableFuture eval( + final CompletableFuture result, + final Callable call) { + try { + result.complete(call.call()); + } catch (Throwable tx) { + result.completeExceptionally(tx); + } + return result; + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml index 3ef98086e1a74..6366cc483f359 100644 --- a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml +++ b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml @@ -1781,6 +1781,118 @@ + + fs.s3a.select.enabled + true + Is S3 Select enabled? + + + + fs.s3a.select.input.csv.comment.marker + # + In S3 Select queries: the marker for comment lines in CSV files + + + + fs.s3a.select.input.csv.record.delimiter + \n + In S3 Select queries over CSV files: the record delimiter. + \t is remapped to the TAB character, \r to CR \n to newline. \\ to \ + and \" to " + + + + + fs.s3a.select.input.csv.field.delimiter + , + In S3 Select queries over CSV files: the field delimiter. + \t is remapped to the TAB character, \r to CR \n to newline. \\ to \ + and \" to " + + + + + fs.s3a.select.input.csv.quote.character + " + In S3 Select queries over CSV files: quote character. + \t is remapped to the TAB character, \r to CR \n to newline. \\ to \ + and \" to " + + + + + fs.s3a.select.input.csv.quote.escape.character + \\ + In S3 Select queries over CSV files: quote escape character. + \t is remapped to the TAB character, \r to CR \n to newline. \\ to \ + and \" to " + + + + + fs.s3a.select.input.csv.header + none + In S3 Select queries over CSV files: what is the role of the header? One of "none", "ignore" and "use" + + + + fs.s3a.select.input.compression + none + In S3 Select queries, the source compression + algorithm. One of: "none" and "gzip" + + + + fs.s3a.select.output.csv.quote.fields + always + + In S3 Select queries: should fields in generated CSV Files be quoted? + One of: "always", "asneeded". + + + + + fs.s3a.select.output.csv.quote.character + " + + In S3 Select queries: the quote character for generated CSV Files. + + + + + fs.s3a.select.output.csv.quote.escape.character + \\ + + In S3 Select queries: the quote escape character for generated CSV Files. + + + + + fs.s3a.select.output.csv.record.delimiter + \n + + In S3 Select queries: the record delimiter for generated CSV Files. + + + + + fs.s3a.select.output.csv.field.delimiter + , + + In S3 Select queries: the field delimiter for generated CSV Files. + + + + + fs.s3a.select.errors.include.sql + false + + Include the SQL statement in errors: this is useful for development but + may leak security and Personally Identifying Information in production, + so must be disabled there. + + + fs.AbstractFileSystem.s3a.impl org.apache.hadoop.fs.s3a.S3A diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md index 7e5e8f8d4e692..7b356121e1fb1 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md @@ -734,9 +734,94 @@ symbolic links exists in the metadata, but no copies of any its blocks can be located; -`FileNotFoundException` would seem more accurate and useful. 
+### `FSDataInputStreamBuilder openFile(Path path)`
+
+Creates a [`FSDataInputStreamBuilder`](fsdatainputstreambuilder.html)
+to construct an operation to open the file at `path` for reading.
+
+When `build()` is invoked on the returned `FSDataInputStreamBuilder` instance,
+the builder parameters are verified and
+`openFileWithOptions(Path, Set, Configuration, int)` invoked.
+
+This (protected) operation returns a `CompletableFuture`
+which, when its `get()` method is called, either returns an input
+stream of the contents of the opened file, or raises an exception.
+
+The base implementation of `openFileWithOptions(Path, Set, Configuration, int)`
+ultimately invokes `open(Path, int)`.
+
+Thus the chain `openFile(path).build().get()` has the same preconditions
+and postconditions as `open(Path p, int bufferSize)`.
+
+The `openFile()` operation may check the state of the filesystem during this
+call, but as the state of the filesystem may change between this call and
+the actual `build()` and `get()` operations, file-specific
+preconditions (file exists, file is readable, etc.) MUST NOT be checked here.
+
+FileSystem implementations which do not implement `open(Path, int)`
+MAY postpone raising an `UnsupportedOperationException` until either the
+`FSDataInputStreamBuilder.build()` or the subsequent `get()` call,
+else they MAY fail fast in the `openFile()` call.
+
+### Implementors notes
+
+The base implementation of `openFileWithOptions()` actually executes
+the `open(path)` operation synchronously, yet still returns the result
+or any failures in the `CompletableFuture<>`, so that code written
+against the asynchronous API behaves consistently.
+
+Any filesystem where the time to open a file may be significant SHOULD
+execute it asynchronously by submitting the operation in some executor/thread
+pool. This is particularly recommended for object stores and other filesystems
+likely to be accessed over long-haul connections.
+
+Arbitrary filesystem-specific options MAY be supported; these MUST
+be prefixed with either the filesystem schema, e.g. `hdfs.`,
+or with the "fs.SCHEMA." prefix of normal configuration settings
+(e.g. `fs.hdfs.`). The
+latter style allows the same configuration option to be used for both
+filesystem configuration and file-specific configuration.
+
+It SHOULD be possible to always open a file without specifying any options,
+so as to present a consistent model to users. However, an implementation MAY
+opt to require one or more mandatory options to be set.
+
+### `FSDataInputStreamBuilder openFile(PathHandle)`
+
+Creates a [`FSDataInputStreamBuilder`](fsdatainputstreambuilder.html)
+to construct an operation to open the file identified by the given `PathHandle`
+for reading.
+
+When `build()` is invoked on the returned `FSDataInputStreamBuilder` instance,
+the builder parameters are verified and
+`openFileWithOptions(PathHandle, Set, Configuration, int)` invoked.
+
+This (protected) operation returns a `CompletableFuture`
+which, when its `get()` method is called, either returns an input
+stream of the contents of the opened file, or raises an exception.
+
+The base implementation of the `openFileWithOptions(PathHandle, Set, Configuration, int)` method
+returns a future which invokes `open(Path, int)`.
+ +Thus the chain `openFile(pathhandle).build().get()` has the same preconditions +and postconditions as `open(Pathhandle, int)` + +As with `FSDataInputStreamBuilder openFile(PathHandle)`, the `openFile()` +call must not be where path-specific preconditions are checked -that +is postponed to the `build()` and `get()` calls. + +FileSystem implementations which do not implement `open(PathHandle handle, int bufferSize)` +MAY postpone raising an `UnsupportedOperationException` until either the +`FSDataInputStreamBuilder.build()` or the subsequent `get()` call, +else they MAY fail fast in the `openFile()` call. + +The base implementation raises this exception in the `build()` operation; +other implementations SHOULD copy this. + ### `PathHandle getPathHandle(FileStatus stat, HandleOpt... options)` -Implementaions without a compliant call MUST throw `UnsupportedOperationException` +Implementations without a compliant call MUST throw `UnsupportedOperationException` #### Preconditions diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md index 32eeb5b757447..e067b078b3114 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md @@ -200,6 +200,10 @@ Some FileSystems do not raise an exception if this condition is not met. They instead return -1 on any `read()` operation where, at the time of the read, `len(data(FSDIS)) < pos(FSDIS)`. +After a failed seek, the value of `pos(FSDIS)` may change. +As an example, seeking past the EOF may move the read position +to the end of the file, *as well as raising an `EOFException`.* + #### Postconditions FSDIS' = (s, data, True) @@ -211,6 +215,16 @@ There is an implicit invariant: a seek to the current position is a no-op Implementations may recognise this operation and bypass all other precondition checks, leaving the input stream unchanged. +The most recent connectors to object stores all implement some form +of "lazy-seek": the `seek()` call may appear to update the stream, and the value +of `getPos()` is updated, but the file is not opened/reopenend until +data is actually read. Implementations of lazy seek MUST still validate +the new seek position against the known length of the file. +However the state of the file (i.e. does it exist, what +its current length is) does not need to be refreshed at this point. +The fact that a file has been deleted or truncated may not surface until +that `read()` call. + ### `Seekable.seekToNewSource(offset)` diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstreambuilder.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstreambuilder.md new file mode 100644 index 0000000000000..f1beed862cdbf --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstreambuilder.md @@ -0,0 +1,112 @@ + + + + + + +# class `org.apache.hadoop.fs.FSDataInputStreamBuilder` + + + +An interface offering of the Builder pattern for creating Java `Future` +references to `FSDataInputStream` and its subclasses. +It is used to initate a (potentially asynchronous) operation to open an existing +file for reading. 
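+
+To give a feel for the end-to-end flow before the details below, here is a
+sketch of typical use. The path and the S3A option are purely illustrative,
+and `conf` is assumed to be an existing `Configuration`:
+
+```java
+FileSystem fs = FileSystem.get(conf);
+CompletableFuture<FSDataInputStream> future =
+    fs.openFile(new Path("/data/part-0000"))
+      .opt("fs.s3a.experimental.fadvise", "random")   // optional hint
+      .build();
+try (FSDataInputStream in = future.get()) {
+  // file existence/readability is only guaranteed to have been
+  // checked once get() returns successfully
+  in.read();
+}
+```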
+ +## Invariants + +The `FSDataInputStreamBuilder` interface does not require parameters or +or the state of `FileSystem` until [`build()`](#build) is +invoked and/or during the asynchronous open operation itself. + +Some aspects of the state of the filesystem, MAY be checked in the initial +`openFile()` call, provided they are known to be invariants which will not +change between `openFile()` and the `build().get()` sequence. For example, +path validation. + +## Implementation-agnostic parameters. + + +### `FSDataInputStreamBuilder bufferSize(int bufSize)` + +Set the size of the buffer to be used. + +### Set optional or mandatory parameters + + FSDataInputStreamBuilder opt(String key, ...) + FSDataInputStreamBuilder must(String key, ...) + +Set optional or mandatory parameters to the builder. Using `opt()` or `must()`, +client can specify FS-specific parameters without inspecting the concrete type +of `FileSystem`. + +```java +out = fs.openFile(path) + .opt("fs.s3a.experimental.fadvise", "random") + .must("fs.s3a.readahead.range", 256 * 1024) + .build() + .get(); +``` + +#### Implementation Notes + +Checking for supported options must be performed in the `build()` operation. + +1. If a mandatory parameter declared via `must(key, value)`) is not recognized, +`IllegalArgumentException` MUST be thrown. + +1. If a mandatory parameter declared via `must(key, value)`) relies on +a feature which is recognized but not supported in the specific +Filesystem/FileContext instance `UnsupportedException` MUST be thrown. + +The behavior of resolving the conflicts between the parameters set by +builder methods (i.e., `bufferSize()`) and `opt()`/`must()` is as follows: + +> The last option specified defines the value and its optional/mandatory state. + + +## Builder interface + +### `CompletableFuture build()` + + +Return an `CompletableFuture` which, when successfully +completed, returns an input stream which can read data from the filesystem. + +The `build()` operation MAY perform the validation of the file's existence, +its kind, so rejecting attempts to read from a directory or non-existent +file. **Alternatively**, the `build()` operation may delay all checks +until an asynchronous operation whose outcome is provided by the `Future` + +That is, the precondition `exists(FS, path)` and `isFile(FS, path)` are +only guaranteed to have been met after the `get()` on the returned future is successful. + +Thus, if even a file does not exist, the following call will still succeed, returning +a future to be evaluated. + +```java +Path p = new Path("file://tmp/file-which-does-not-exist"); + +CompletableFuture future = p.getFileSystem(conf) + .openFile(p) + .build; +``` + +The preconditions for opening the file are checked during the asynchronous +evaluation, and so will surface when the future is completed: + +```java +FSDataInputStream in = future.get(); +``` diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdataoutputstreambuilder.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdataoutputstreambuilder.md index 4ea1fd168f2fa..64dda2df8c63c 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdataoutputstreambuilder.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdataoutputstreambuilder.md @@ -114,10 +114,12 @@ MUST verify that implementation-agnostic parameters (i.e., "syncable") or implementation-specific parameters (i.e., "foofs:cache") are supported. 
`FileSystem` will satisfy optional parameters (via `opt(key, ...)`) on best effort. If the mandatory parameters (via `must(key, ...)`) can not be satisfied -in the `FileSystem`, `IllegalArgumentException` should be thrown in `build()`. +in the `FileSystem`, `IllegalArgumentException` must be thrown in `build()`. The behavior of resolving the conflicts between the parameters set by -builder methods (i.e., `bufferSize()`) and `opt()`/`must()` is undefined. +builder methods (i.e., `bufferSize()`) and `opt()`/`must()` is as follows: + +> The last option specified defines the value and its optional/mandatory state. ## HDFS-specific parameters. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/FileContextMainOperationsBaseTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/FileContextMainOperationsBaseTest.java index c07a6ffa34400..4c90490b090e7 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/FileContextMainOperationsBaseTest.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/FileContextMainOperationsBaseTest.java @@ -23,10 +23,13 @@ import java.io.IOException; import java.util.EnumSet; import java.util.NoSuchElementException; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.hadoop.HadoopIllegalArgumentException; import org.apache.hadoop.fs.Options.CreateOpts; import org.apache.hadoop.fs.Options.Rename; +import org.apache.hadoop.fs.contract.ContractTestUtils; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.test.GenericTestUtils; @@ -40,6 +43,8 @@ import static org.apache.hadoop.fs.FileContextTestHelper.*; import static org.apache.hadoop.fs.CreateFlag.*; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture; /** *

    @@ -1326,13 +1331,10 @@ public void testOpen2() throws IOException { final Path path = new Path(rootPath, "zoo"); createFile(path); final long length = fc.getFileStatus(path).getLen(); - FSDataInputStream fsdis = fc.open(path, 2048); - try { - byte[] bb = new byte[(int)length]; + try (FSDataInputStream fsdis = fc.open(path, 2048)) { + byte[] bb = new byte[(int) length]; fsdis.readFully(bb); assertArrayEquals(data, bb); - } finally { - fsdis.close(); } } @@ -1452,4 +1454,87 @@ public void testGetFileContext1() throws IOException { private Path getTestRootPath(FileContext fc, String pathString) { return fileContextTestHelper.getTestRootPath(fc, pathString); } + + /** + * Create a path under the test path. + * @param filepath path string in + * @return a path qualified by the test filesystem + * @throws IOException IO problems + */ + protected Path path(String filepath) throws IOException { + return getTestRootPath(fc, filepath); + } + + /** + * Describe a test. This is a replacement for javadocs + * where the tests role is printed in the log output + * @param text description + */ + protected void describe(String text) { + LOG.info(text); + } + + @Test + public void testOpenFileRead() throws Exception { + final Path path = path("testOpenFileRead"); + createFile(path); + final long length = fc.getFileStatus(path).getLen(); + try (FSDataInputStream fsdis = fc.openFile(path) + .opt("fs.test.something", true) + .opt("fs.test.something2", 3) + .opt("fs.test.something3", "3") + .build().get()) { + byte[] bb = new byte[(int) length]; + fsdis.readFully(bb); + assertArrayEquals(data, bb); + } + } + + @Test + public void testOpenFileUnknownOption() throws Throwable { + describe("calling openFile fails when a 'must()' option is unknown"); + + final Path path = path("testOpenFileUnknownOption"); + FutureDataInputStreamBuilder builder = + fc.openFile(path) + .opt("fs.test.something", true) + .must("fs.test.something", true); + intercept(IllegalArgumentException.class, + () -> builder.build()); + } + + @Test + public void testOpenFileLazyFail() throws Throwable { + describe("openFile fails on a missing file in the get() and not before"); + FutureDataInputStreamBuilder builder = + fc.openFile(path("testOpenFileUnknownOption")) + .opt("fs.test.something", true); + interceptFuture(FileNotFoundException.class, "", builder.build()); + } + + @Test + public void testOpenFileApplyRead() throws Throwable { + describe("use the apply sequence"); + Path path = path("testOpenFileApplyRead"); + createFile(path); + CompletableFuture readAllBytes = fc.openFile(path) + .build() + .thenApply(ContractTestUtils::readStream); + assertEquals("Wrong number of bytes read from stream", + data.length, + (long)readAllBytes.get()); + } + + @Test + public void testOpenFileApplyAsyncRead() throws Throwable { + describe("verify that async accept callbacks are evaluated"); + Path path = path("testOpenFileApplyAsyncRead"); + createFile(path); + CompletableFuture future = fc.openFile(path).build(); + AtomicBoolean accepted = new AtomicBoolean(false); + future.thenAcceptAsync(i -> accepted.set(true)).get(); + assertTrue("async accept operation not invoked", + accepted.get()); + } + } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java index 2f7f2b1bd9e71..b442553924fd2 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java +++ 
b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java @@ -40,6 +40,8 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.concurrent.CompletableFuture; import static org.apache.hadoop.fs.Options.ChecksumOpt; import static org.apache.hadoop.fs.Options.CreateOpts; @@ -232,6 +234,24 @@ public Collection getAllStoragePolicies() public Collection getTrashRoots(boolean allUsers) throws IOException; StorageStatistics getStorageStatistics(); + + FutureDataInputStreamBuilder openFile(Path path) + throws IOException, UnsupportedOperationException; + + FutureDataInputStreamBuilder openFile(PathHandle pathHandle) + throws IOException, UnsupportedOperationException; + + CompletableFuture openFileWithOptions( + PathHandle pathHandle, + Set mandatoryKeys, + Configuration options, + int bufferSize) throws IOException; + + CompletableFuture openFileWithOptions( + Path path, + Set mandatoryKeys, + Configuration options, + int bufferSize) throws IOException; } @Test diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestLocalFileSystem.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestLocalFileSystem.java index d5622af085186..fae3db83cf06f 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestLocalFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestLocalFileSystem.java @@ -729,7 +729,7 @@ private static class BuilderWithSupportedKeys } @Override - protected BuilderWithSupportedKeys getThisBuilder() { + public BuilderWithSupportedKeys getThisBuilder() { return this; } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java index 00591126a636e..b6e94a664165e 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java @@ -19,22 +19,30 @@ import java.io.FileNotFoundException; import java.io.IOException; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.io.IOUtils; import static org.apache.hadoop.fs.contract.ContractTestUtils.createFile; import static org.apache.hadoop.fs.contract.ContractTestUtils.dataset; import static org.apache.hadoop.fs.contract.ContractTestUtils.touch; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture; import org.junit.Test; /** - * Test Seek operations + * Test Open operations. 
*/ public abstract class AbstractContractOpenTest extends AbstractFSContractTestBase { @@ -63,8 +71,7 @@ public void testOpenReadZeroByteFile() throws Throwable { instream = getFileSystem().open(path); assertEquals(0, instream.getPos()); //expect initial read to fail - int result = instream.read(); - assertMinusOne("initial byte read", result); + assertMinusOne("initial byte read", instream.read()); } @Test @@ -173,4 +180,126 @@ public void testSequentialRead() throws Throwable { instream.close(); } + @Test + public void testOpenFileReadZeroByte() throws Throwable { + describe("create & read a 0 byte file through the builders"); + Path path = path("zero.txt"); + FileSystem fs = getFileSystem(); + fs.createFile(path).overwrite(true).build().close(); + try (FSDataInputStream is = fs.openFile(path) + .opt("fs.test.something", true) + .opt("fs.test.something2", 3) + .opt("fs.test.something3", "3") + .build().get()) { + assertMinusOne("initial byte read", is.read()); + } + } + + @Test + public void testOpenFileUnknownOption() throws Throwable { + describe("calling openFile fails when a 'must()' option is unknown"); + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(path("testOpenFileUnknownOption")) + .opt("fs.test.something", true) + .must("fs.test.something", true); + intercept(IllegalArgumentException.class, + () -> builder.build()); + } + + @Test + public void testOpenFileLazyFail() throws Throwable { + describe("openFile fails on a missing file in the get() and not before"); + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(path("testOpenFileLazyFail")) + .opt("fs.test.something", true); + interceptFuture(FileNotFoundException.class, "", builder.build()); + } + + @Test + public void testOpenFileFailExceptionally() throws Throwable { + describe("openFile missing file chains into exceptionally()"); + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(path("testOpenFileFailExceptionally")) + .opt("fs.test.something", true); + assertNull("exceptional uprating", + builder.build().exceptionally(ex -> null).get()); + } + + @Test + public void testAwaitFutureFailToFNFE() throws Throwable { + describe("Verify that FutureIOSupport.awaitFuture extracts IOExceptions"); + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(path("testAwaitFutureFailToFNFE")) + .opt("fs.test.something", true); + intercept(FileNotFoundException.class, + () -> FutureIOSupport.awaitFuture(builder.build())); + } + + @Test + public void testAwaitFutureTimeoutFailToFNFE() throws Throwable { + describe("Verify that FutureIOSupport.awaitFuture with a timeout works"); + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(path("testAwaitFutureFailToFNFE")) + .opt("fs.test.something", true); + intercept(FileNotFoundException.class, + () -> FutureIOSupport.awaitFuture(builder.build(), + 10, TimeUnit.DAYS)); + } + + @Test + public void testOpenFileExceptionallyTranslating() throws Throwable { + describe("openFile missing file chains into exceptionally()"); + CompletableFuture f = getFileSystem() + .openFile(path("testOpenFileUnknownOption")).build(); + interceptFuture(RuntimeException.class, + "exceptionally", + f.exceptionally(ex -> { + throw new RuntimeException("exceptionally", ex); + })); + } + + @Test + public void testChainedFailureAwaitFuture() throws Throwable { + describe("await Future handles chained failures"); + CompletableFuture f = getFileSystem() + .openFile(path("testOpenFileUnknownOption")) + .build(); + intercept(RuntimeException.class, + 
"exceptionally", + () -> FutureIOSupport.awaitFuture( + f.exceptionally(ex -> { + throw new RuntimeException("exceptionally", ex); + }))); + } + + @Test + public void testOpenFileApplyRead() throws Throwable { + describe("use the apply sequence to read a whole file"); + Path path = path("testOpenFileApplyRead"); + FileSystem fs = getFileSystem(); + int len = 4096; + createFile(fs, path, true, + dataset(len, 0x40, 0x80)); + CompletableFuture readAllBytes = fs.openFile(path) + .build() + .thenApply(ContractTestUtils::readStream); + assertEquals("Wrong number of bytes read value", + len, + (long) readAllBytes.get()); + } + + @Test + public void testOpenFileApplyAsyncRead() throws Throwable { + describe("verify that async accept callbacks are evaluated"); + Path path = path("testOpenFileApplyAsyncRead"); + FileSystem fs = getFileSystem(); + createFile(fs, path, true, + dataset(4, 0x40, 0x80)); + CompletableFuture future = fs.openFile(path).build(); + AtomicBoolean accepted = new AtomicBoolean(false); + future.thenAcceptAsync(i -> accepted.set(true)).get(); + assertTrue("async accept operation not invoked", + accepted.get()); + } + } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractPathHandleTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractPathHandleTest.java index 36cfa6ccdaf87..17043dca93e43 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractPathHandleTest.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractPathHandleTest.java @@ -17,16 +17,19 @@ */ package org.apache.hadoop.fs.contract; +import java.io.FileNotFoundException; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.InvalidPathHandleException; import org.apache.hadoop.fs.Options.HandleOpt; import org.apache.hadoop.fs.Path; @@ -38,6 +41,7 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.verifyRead; import static org.apache.hadoop.fs.contract.ContractTestUtils.verifyFileContents; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY; +import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture; import org.apache.hadoop.fs.RawPathHandle; import org.junit.Test; @@ -249,4 +253,61 @@ protected PathHandle getHandleOrSkip(FileStatus stat) { // unreachable return null; } + + + @Test + public void testOpenFileApplyRead() throws Throwable { + describe("use the apply sequence to read a whole file"); + CompletableFuture readAllBytes = getFileSystem() + .openFile( + getHandleOrSkip( + testFile(B1))) + .build() + .thenApply(ContractTestUtils::readStream); + assertEquals("Wrong number of bytes read value", + TEST_FILE_LEN, + (long) readAllBytes.get()); + } + + @Test + public void testOpenFileDelete() throws Throwable { + describe("use the apply sequence to read a whole file"); + FileStatus testFile = testFile(B1); + PathHandle handle = getHandleOrSkip(testFile); + // delete that file + FileSystem fs = getFileSystem(); + fs.delete(testFile.getPath(), false); + // now construct the builder. 
+ // even if the open happens in the build operation, + // the failure must not surface until later. + CompletableFuture builder = + fs.openFile(handle) + .opt("fs.test.something", true) + .build(); + IOException ioe = interceptFuture(IOException.class, "", builder); + if (!(ioe instanceof FileNotFoundException) + && !(ioe instanceof InvalidPathHandleException)) { + // support both FileNotFoundException + // and InvalidPathHandleException as different implementations + // support either -and with non-atomic open sequences, possibly + // both + throw ioe; + } + } + + @Test + public void testOpenFileLazyFail() throws Throwable { + describe("openFile fails on a misssng file in the get() and not before"); + FileStatus stat = testFile(B1); + CompletableFuture readAllBytes = getFileSystem() + .openFile( + getHandleOrSkip( + stat)) + .build() + .thenApply(ContractTestUtils::readStream); + assertEquals("Wrong number of bytes read value", + TEST_FILE_LEN, + (long) readAllBytes.get()); + } + } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java index c5ce46f292712..b4db3a5803ad8 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java @@ -25,7 +25,6 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathCapabilities; import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.fs.StreamCapabilities; import org.apache.hadoop.io.IOUtils; @@ -1467,61 +1466,22 @@ public static void assertCapabilities( assertTrue("Stream should be instanceof StreamCapabilities", stream instanceof StreamCapabilities); - StreamCapabilities source = (StreamCapabilities) stream; - if (shouldHaveCapabilities != null) { + if (shouldHaveCapabilities!=null) { for (String shouldHaveCapability : shouldHaveCapabilities) { assertTrue("Should have capability: " + shouldHaveCapability, - source.hasCapability(shouldHaveCapability)); + ((StreamCapabilities) stream).hasCapability(shouldHaveCapability)); } } - if (shouldNotHaveCapabilities != null) { + if (shouldNotHaveCapabilities!=null) { for (String shouldNotHaveCapability : shouldNotHaveCapabilities) { assertFalse("Should not have capability: " + shouldNotHaveCapability, - source.hasCapability(shouldNotHaveCapability)); + ((StreamCapabilities) stream) + .hasCapability(shouldNotHaveCapability)); } } } - /** - * Custom assert to test {@link PathCapabilities}. - * - * @param source source (FS, FC, etc) - * @param path path to check - * @param capabilities The array of unexpected capabilities - */ - public static void assertHasPathCapabilities( - final PathCapabilities source, - final Path path, - final String...capabilities) throws IOException { - - for (String shouldHaveCapability: capabilities) { - assertTrue("Should have capability: " + shouldHaveCapability - + " under " + path, - source.hasPathCapability(path, shouldHaveCapability)); - } - } - - /** - * Custom assert to test that the named {@link PathCapabilities} - * are not supported. 
- * - * @param source source (FS, FC, etc) - * @param path path to check - * @param capabilities The array of unexpected capabilities - */ - public static void assertLacksPathCapabilities( - final PathCapabilities source, - final Path path, - final String...capabilities) throws IOException { - - for (String shouldHaveCapability: capabilities) { - assertFalse("Path must not support capability: " + shouldHaveCapability - + " under " + path, - source.hasPathCapability(path, shouldHaveCapability)); - } - } - /** * Function which calls {@code InputStream.read()} and * downgrades an IOE to a runtime exception. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/LambdaTestUtils.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/LambdaTestUtils.java index cf12735566b9c..c1b6cc4081e5c 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/LambdaTestUtils.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/LambdaTestUtils.java @@ -23,10 +23,17 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.Time; +import java.io.IOException; +import java.security.PrivilegedExceptionAction; import java.util.Optional; import java.util.concurrent.Callable; +import java.util.concurrent.CancellationException; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; /** @@ -399,7 +406,7 @@ public static E intercept( throws Exception { try { eval.call(); - throw new AssertionError("Expected an exception of type " + clazz); + throw new AssertionError("Expected an exception"); } catch (Throwable e) { if (clazz.isAssignableFrom(e.getClass())) { return (E)e; @@ -645,6 +652,174 @@ public static void eval(VoidCallable closure) { } } + /** + * Evaluate a closure and return the result, after verifying that it is + * not null. + * @param message message to use in assertion text if the result is null + * @param eval closure to evaluate + * @param type of response + * @return the evaluated result + * @throws Exception on any problem + */ + public static T notNull(String message, Callable eval) + throws Exception { + T t = eval.call(); + Assert.assertNotNull(message, t); + return t; + } + + /** + * Execute a closure as the given user. + * @param user user to invoke the closure as + * @param eval closure to evaluate + * @param return type + * @return the result of calling the closure under the identity of the user. + * @throws IOException IO failure + * @throws InterruptedException interrupted operation. + */ + public static T doAs(UserGroupInformation user, Callable eval) + throws IOException, InterruptedException { + return user.doAs(new PrivilegedOperation<>(eval)); + } + + /** + * Execute a closure as the given user. + * @param user user to invoke the closure as + * @param eval closure to evaluate + * @throws IOException IO failure + * @throws InterruptedException interrupted operation. + */ + public static void doAs(UserGroupInformation user, VoidCallable eval) + throws IOException, InterruptedException { + user.doAs(new PrivilegedVoidOperation(eval)); + } + + /** + * Expect a future to raise a specific exception class when evaluated, + * looking inside the raised {@code ExecutionException} for it. + * @param clazz class of exception; the nested exception must be this class + * or a subclass. 
+ * + * This is simply an unwrapping of the outcome of the future. + * + * If an exception is not raised, the return value of the {@code get()} + * call is included in the exception string. + * + * If the nested cause of the raised ExecutionException is not an + * Exception (i.e its an error), then the outer ExecutionException is + * rethrown. + * This keeps the operation signatures in sync. + * + * @param contained string which must be in the {@code toString()} value + * of the exception + * @param future future to get + * @param return type of expression + * @param exception class + * @return the caught exception if it was of the expected type and contents + * @throws AssertionError if the evaluation call didn't raise an exception. + * The error includes the {@code toString()} value of the result, if this + * can be determined. + * @throws CancellationException if the computation was cancelled + * @throws ExecutionException if the raised exception didn't contain an + * exception. + * @throws InterruptedException if the current thread was interrupted + * @throws TimeoutException if the wait timed out + * @throws Exception if the wrong exception was raised, or there was + * a text mismatch. + */ + public static E interceptFuture( + Class clazz, + String contained, + Future future) throws Exception { + return intercept(clazz, + contained, + () -> { + try { + return future.get(); + } catch (ExecutionException e) { + Throwable cause = e.getCause(); + if (cause instanceof Exception) { + throw (Exception) cause; + } else { + throw e; + } + } + }); + } + + /** + * Expect a future to raise a specific exception class when evaluated, + * looking inside the raised {@code ExecutionException} for it. + * @param clazz class of exception; the nested exception must be this class + * or a subclass. + * + * This is simply an unwrapping of the outcome of the future. + * + * If an exception is not raised, the return value of the {@code get()} + * call is included in the exception string. + * + * If the nested cause of the raised ExecutionException is not an + * Exception (i.e its an error), then the outer ExecutionException is + * rethrown. + * This keeps the operation signatures in sync. + * + * @param contained string which must be in the {@code toString()} value + * of the exception + * @param future future to get + * @param return type of expression + * @param exception class + * @return the caught exception if it was of the expected type and contents + * @throws AssertionError if the evaluation call didn't raise an exception. + * The error includes the {@code toString()} value of the result, if this + * can be determined. + * @throws CancellationException if the computation was cancelled + * @throws ExecutionException if the raised exception didn't contain an + * exception. + * @throws InterruptedException if the current thread was interrupted + * @throws TimeoutException if the wait timed out + * @throws Exception if the wrong exception was raised, or there was + * a text mismatch. + */ + public static E interceptFuture( + final Class clazz, + final String contained, + final long timeout, + final TimeUnit tu, + final Future future) throws Exception { + return intercept(clazz, + contained, + () -> { + try { + return future.get(timeout, tu); + } catch (ExecutionException e) { + Throwable cause = e.getCause(); + if (cause instanceof Exception) { + throw (Exception) cause; + } else { + throw e; + } + } + }); + } + + /** + * Verify that the cause of an exception is of the given type. 
+ * @param exception class + * @param caught caught exception + * @return the extracted exception if it is of the expect type. + * @throws Exception the outer exception if there is no inner/wrong type + */ + public static E verifyCause( + Class clazz, + final Throwable caught) throws Throwable { + Throwable cause = caught.getCause(); + if (cause == null || !clazz.isAssignableFrom(cause.getClass())) { + throw caught; + } else { + return (E) caught; + } + } + /** * Returns {@code TimeoutException} on a timeout. If * there was a inner class passed in, includes it as the @@ -812,4 +987,50 @@ public Void call() throws Exception { } } + /** + * A lambda-invoker for doAs use; invokes the callable provided + * in the constructor. + * @param return type. + */ + public static class PrivilegedOperation + implements PrivilegedExceptionAction { + + private final Callable callable; + + /** + * Constructor. + * @param callable a non-null callable/closure. + */ + public PrivilegedOperation(final Callable callable) { + this.callable = Preconditions.checkNotNull(callable); + } + + @Override + public T run() throws Exception { + return callable.call(); + } + } + + /** + * VoidCaller variant of {@link PrivilegedOperation}: converts + * a void-returning closure to an action which {@code doAs} can call. + */ + public static class PrivilegedVoidOperation + implements PrivilegedExceptionAction { + + private final Callable callable; + + /** + * Constructor. + * @param callable a non-null callable/closure. + */ + public PrivilegedVoidOperation(final VoidCallable callable) { + this.callable = new VoidCaller(callable); + } + + @Override + public Void run() throws Exception { + return callable.call(); + } + } } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/TestLambdaTestUtils.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/TestLambdaTestUtils.java index 694fe73724af9..479dd35b0aa1d 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/TestLambdaTestUtils.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/TestLambdaTestUtils.java @@ -24,6 +24,10 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.concurrent.Callable; +import java.util.concurrent.CancellationException; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; @@ -516,17 +520,105 @@ public void testEvalDoesntWrapRTEs() throws Throwable { */ @Test public void testEvalDoesWrapIOEs() throws Throwable { - AssertionError ex = intercept(AssertionError.class, "ioe", - () -> eval(() -> { - throw new IOException("ioe"); - })); - Throwable cause = ex.getCause(); - if (cause == null) { - throw ex; - } - if (!(cause instanceof IOException)) { - throw cause; - } + verifyCause(IOException.class, + intercept(AssertionError.class, "ioe", + () -> eval(() -> { + throw new IOException("ioe"); + }))); + } + + @Test + public void testInterceptFutureUnwrapped() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally(new IOException("oops")); + interceptFuture(IOException.class, "oops", future); + } + + @Test + public void testInterceptFutureWrongException() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally(new 
RuntimeException("oops")); + intercept(RuntimeException.class, + "oops", + () -> interceptFuture(IOException.class, "", future)); + } + + @Test + public void testInterceptFutureNotAnException() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally(new Error("oops")); + verifyCause(Error.class, + intercept(ExecutionException.class, + "oops", + () -> interceptFuture(IOException.class, "", future))); + } + + /** + * Variant for exception catching. + */ + @Test + public void testInterceptFutureNotAnException2() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally(new Error("oops")); + verifyCause(Error.class, + interceptFuture(ExecutionException.class, "", future)); + } + + @Test + public void testInterceptFutureNoFailures() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.complete("happy"); + intercept(AssertionError.class, + "happy", + () -> interceptFuture(IOException.class, "oops", future)); + } + + /** + * This will timeout immediately and raise a TimeoutException. + */ + @Test + public void testInterceptFutureTimeout() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + intercept(TimeoutException.class, + "", + () -> interceptFuture(IOException.class, "oops", + 1, TimeUnit.NANOSECONDS, + future)); + } + + /** + * This will timeout immediately and raise a TimeoutException. + */ + @Test + public void testInterceptFutureTimeout2() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + interceptFuture(TimeoutException.class, "", + 1, TimeUnit.NANOSECONDS, + future); + } + + /** + * This will timeout immediately and raise a TimeoutException. + */ + @Test + public void testInterceptFutureTimeoutSuccess() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally(new IOException("oops")); + interceptFuture(IOException.class, "oops", + 1, TimeUnit.NANOSECONDS, + future); + } + + /** + * This will timeout immediately and raise a TimeoutException. 
+ */ + @Test + public void testInterceptFutureCancelled() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.cancel(false); + interceptFuture(CancellationException.class, "", + 1, TimeUnit.NANOSECONDS, + future); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java index abf56a5474aad..5c1002e7a0350 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java @@ -3235,7 +3235,7 @@ private HdfsDataOutputStreamBuilder(DistributedFileSystem dfs, Path path) { } @Override - protected HdfsDataOutputStreamBuilder getThisBuilder() { + public HdfsDataOutputStreamBuilder getThisBuilder() { return this; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/contract/hdfs/TestHDFSContractOpen.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/contract/hdfs/TestHDFSContractOpen.java index 125e8eec935b5..0d9e8103208ee 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/contract/hdfs/TestHDFSContractOpen.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/contract/hdfs/TestHDFSContractOpen.java @@ -27,7 +27,7 @@ import java.io.IOException; /** - * Test dir operations on a the local FS. + * Test Open operations on HDFS. */ public class TestHDFSContractOpen extends AbstractContractOpenTest { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java index bfc6c0e855b52..1fcb118a100fc 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java @@ -25,9 +25,10 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Seekable; +import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CodecPool; @@ -36,6 +37,7 @@ import org.apache.hadoop.io.compress.Decompressor; import org.apache.hadoop.io.compress.SplitCompressionInputStream; import org.apache.hadoop.io.compress.SplittableCompressionCodec; +import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.lib.input.CompressedSplitLineReader; import org.apache.hadoop.mapreduce.lib.input.SplitLineReader; import org.apache.hadoop.mapreduce.lib.input.UncompressedSplitLineReader; @@ -105,8 +107,12 @@ public LineRecordReader(Configuration job, FileSplit split, codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split - final FileSystem fs = file.getFileSystem(job); - fileIn = fs.open(file); + final FutureDataInputStreamBuilder 
builder = + file.getFileSystem(job).openFile(file); + FutureIOSupport.propagateOptions(builder, job, + MRJobConfig.INPUT_FILE_OPTION_PREFIX, + MRJobConfig.INPUT_FILE_MANDATORY_PREFIX); + fileIn = FutureIOSupport.awaitFuture(builder.build()); if (isCompressedInput()) { decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java index 4f005dff2aa00..95f0457ce27ac 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java @@ -1250,4 +1250,18 @@ public interface MRJobConfig { MR_AM_STAGING_DIR + ".erasurecoding.enabled"; boolean DEFAULT_MR_AM_STAGING_ERASURECODING_ENABLED = false; + + /** + * Prefix for options which are passed in to the filesystem + * after converting the subsequent dotted element to the schema. + */ + @Unstable + String INPUT_FILE_OPTION_PREFIX = "mapreduce.job.input.file.option."; + + /** + * Prefix for mandatory options which are passed in to the filesystem + * after converting the subsequent dotted element to the schema. + */ + @Unstable + String INPUT_FILE_MANDATORY_PREFIX = "mapreduce.job.input.file.must."; } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FixedLengthRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FixedLengthRecordReader.java index 71b2b79bea0ce..c0ae9a5cdac61 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FixedLengthRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FixedLengthRecordReader.java @@ -25,9 +25,10 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Seekable; +import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.compress.CodecPool; @@ -36,6 +37,7 @@ import org.apache.hadoop.io.compress.CompressionInputStream; import org.apache.hadoop.io.compress.Decompressor; import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.slf4j.Logger; @@ -89,9 +91,13 @@ public void initialize(Configuration job, long splitStart, long splitLength, numBytesToSkip = recordLength - partialRecordLength; } - // open the file and seek to the start of the split - final FileSystem fs = file.getFileSystem(job); - fileIn = fs.open(file); + // open the file + final FutureDataInputStreamBuilder builder = + 
file.getFileSystem(job).openFile(file); + FutureIOSupport.propagateOptions(builder, job, + MRJobConfig.INPUT_FILE_OPTION_PREFIX, + MRJobConfig.INPUT_FILE_MANDATORY_PREFIX); + fileIn = FutureIOSupport.awaitFuture(builder.build()); CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file); if (null != codec) { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java index ca85982e0ae74..160c7635658a4 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java @@ -24,9 +24,10 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Seekable; +import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CodecPool; @@ -36,6 +37,7 @@ import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.io.compress.Decompressor; import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.slf4j.Logger; @@ -82,8 +84,12 @@ public void initialize(InputSplit genericSplit, final Path file = split.getPath(); // open the file and seek to the start of the split - final FileSystem fs = file.getFileSystem(job); - fileIn = fs.open(file); + final FutureDataInputStreamBuilder builder = + file.getFileSystem(job).openFile(file); + FutureIOSupport.propagateOptions(builder, job, + MRJobConfig.INPUT_FILE_OPTION_PREFIX, + MRJobConfig.INPUT_FILE_MANDATORY_PREFIX); + fileIn = FutureIOSupport.awaitFuture(builder.build()); CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file); if (null!=codec) { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/NLineInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/NLineInputFormat.java index 758996165f45c..dfff9ad0d2b73 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/NLineInputFormat.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/NLineInputFormat.java @@ -27,13 +27,15 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.io.LongWritable; import 
org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.util.LineReader; @@ -93,10 +95,14 @@ public static List getSplitsForFile(FileStatus status, if (status.isDirectory()) { throw new IOException("Not a file: " + fileName); } - FileSystem fs = fileName.getFileSystem(conf); LineReader lr = null; try { - FSDataInputStream in = fs.open(fileName); + final FutureDataInputStreamBuilder builder = + fileName.getFileSystem(conf).openFile(fileName); + FutureIOSupport.propagateOptions(builder, conf, + MRJobConfig.INPUT_FILE_OPTION_PREFIX, + MRJobConfig.INPUT_FILE_MANDATORY_PREFIX); + FSDataInputStream in = FutureIOSupport.awaitFuture(builder.build()); lr = new LineReader(in, conf); Text line = new Text(); int numLines = 0; diff --git a/hadoop-tools/hadoop-aws/dev-support/findbugs-exclude.xml b/hadoop-tools/hadoop-aws/dev-support/findbugs-exclude.xml index 855aac974c90f..bb6808f0f66de 100644 --- a/hadoop-tools/hadoop-aws/dev-support/findbugs-exclude.xml +++ b/hadoop-tools/hadoop-aws/dev-support/findbugs-exclude.xml @@ -63,5 +63,10 @@ + + + + + diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/InternalConstants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/InternalConstants.java new file mode 100644 index 0000000000000..bcf894f96bef3 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/InternalConstants.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Constants for internal use in the org.apache.hadoop.fs.s3a module itself. + * Please don't refer to these outside of this module & its tests. + * If you find you need to then either the code is doing something it + * should not, or these constants need to be uprated to being + * public and stable entries. + */ +@InterfaceAudience.Private +public final class InternalConstants { + + private InternalConstants() { + } + + /** + * The known keys used in a standard openFile call. + * if there's a select marker in there then the keyset + * used becomes that of the select operation. 
+ */ + @InterfaceStability.Unstable + public static final Set STANDARD_OPENFILE_KEYS = + Collections.unmodifiableSet( + new HashSet<>( + Arrays.asList(Constants.INPUT_FADVISE, + Constants.READAHEAD_RANGE))); +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index 32718adaf58ad..031a80be1d718 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -34,9 +34,12 @@ import java.util.EnumSet; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.Objects; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; @@ -75,20 +78,18 @@ import com.amazonaws.event.ProgressListener; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; +import com.google.common.util.concurrent.ListeningExecutorService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.CommonPathCapabilities; import org.apache.hadoop.fs.CreateFlag; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy; -import org.apache.hadoop.fs.s3a.impl.DirectoryPolicy; -import org.apache.hadoop.fs.s3a.impl.DirectoryPolicyImpl; -import org.apache.hadoop.fs.s3a.impl.StatusProbeEnum; +import org.apache.hadoop.fs.s3a.select.InternalSelectConstants; +import org.apache.hadoop.util.LambdaUtils; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -104,9 +105,17 @@ import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.fs.StreamCapabilities; import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.fs.s3a.auth.RoleModel; +import org.apache.hadoop.fs.s3a.auth.delegation.AWSPolicyProvider; +import org.apache.hadoop.fs.s3a.auth.delegation.EncryptionSecretOperations; +import org.apache.hadoop.fs.s3a.auth.delegation.EncryptionSecrets; +import org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens; +import org.apache.hadoop.fs.s3a.auth.delegation.AbstractS3ATokenIdentifier; import org.apache.hadoop.fs.s3a.commit.CommitConstants; import org.apache.hadoop.fs.s3a.commit.PutTracker; import org.apache.hadoop.fs.s3a.commit.MagicCommitIntegration; +import org.apache.hadoop.fs.s3a.select.SelectBinding; +import org.apache.hadoop.fs.s3a.select.SelectConstants; import org.apache.hadoop.fs.s3a.s3guard.DirListingMetadata; import org.apache.hadoop.fs.s3a.s3guard.MetadataStoreListFilesIterator; import org.apache.hadoop.fs.s3a.s3guard.MetadataStore; @@ -117,18 +126,22 @@ import org.apache.hadoop.fs.store.EtagChecksum; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.BlockingThreadPoolExecutorService; +import org.apache.hadoop.security.token.Token; import org.apache.hadoop.util.Progressable; import org.apache.hadoop.util.ReflectionUtils; import 
org.apache.hadoop.util.SemaphoredDelegatingExecutor; -import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; +import static org.apache.hadoop.fs.impl.AbstractFSBuilderImpl.rejectUnknownMandatoryKeys; import static org.apache.hadoop.fs.s3a.Constants.*; import static org.apache.hadoop.fs.s3a.Invoker.*; import static org.apache.hadoop.fs.s3a.S3AUtils.*; -import static org.apache.hadoop.fs.s3a.S3AUtils.getServerSideEncryptionKey; import static org.apache.hadoop.fs.s3a.Statistic.*; -import static org.apache.commons.lang3.StringUtils.isNotBlank; import static org.apache.commons.lang3.StringUtils.isNotEmpty; +import static org.apache.hadoop.fs.s3a.auth.RolePolicies.STATEMENT_ALLOW_SSE_KMS_RW; +import static org.apache.hadoop.fs.s3a.auth.RolePolicies.allowS3Operations; +import static org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens.TokenIssuingPolicy.NoTokensAvailable; +import static org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens.hasDelegationTokenBinding; +import static org.apache.hadoop.io.IOUtils.cleanupWithLogger; /** * The core S3A Filesystem implementation. @@ -145,7 +158,8 @@ */ @InterfaceAudience.Private @InterfaceStability.Evolving -public class S3AFileSystem extends FileSystem implements StreamCapabilities { +public class S3AFileSystem extends FileSystem implements StreamCapabilities, + AWSPolicyProvider { /** * Default blocksize as used in blocksize and FS status queries. */ @@ -160,6 +174,7 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities { * retryable results in files being deleted. */ public static final boolean DELETE_CONSIDERED_IDEMPOTENT = true; + private URI uri; private Path workingDir; private String username; @@ -180,7 +195,7 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities { private long partSize; private boolean enableMultiObjectsDelete; private TransferManager transfers; - private ExecutorService boundedThreadPool; + private ListeningExecutorService boundedThreadPool; private ExecutorService unboundedThreadPool; private long multiPartThreshold; public static final Logger LOG = LoggerFactory.getLogger(S3AFileSystem.class); @@ -188,33 +203,41 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities { LoggerFactory.getLogger("org.apache.hadoop.fs.s3a.S3AFileSystem.Progress"); private LocalDirAllocator directoryAllocator; private CannedAccessControlList cannedACL; - private S3AEncryptionMethods serverSideEncryptionAlgorithm; + + /** + * This must never be null; until initialized it just declares that there + * is no encryption. + */ + private EncryptionSecrets encryptionSecrets = new EncryptionSecrets(); private S3AInstrumentation instrumentation; private final S3AStorageStatistics storageStatistics = createStorageStatistics(); private long readAhead; private S3AInputPolicy inputPolicy; - private ChangeDetectionPolicy changeDetectionPolicy; private final AtomicBoolean closed = new AtomicBoolean(false); private volatile boolean isClosed = false; private MetadataStore metadataStore; private boolean allowAuthoritative; + /** Delegation token integration; non-empty when DT support is enabled. */ + private Optional delegationTokens = Optional.empty(); + + /** Principal who created the FS; recorded during initialization. 
*/ + private UserGroupInformation owner; + // The maximum number of entries that can be deleted in any call to s3 private static final int MAX_ENTRIES_TO_DELETE = 1000; private String blockOutputBuffer; private S3ADataBlocks.BlockFactory blockFactory; private int blockOutputActiveBlocks; private WriteOperationHelper writeHelper; + private SelectBinding selectBinding; private boolean useListV1; private MagicCommitIntegration committerIntegration; private AWSCredentialProviderList credentials; - /** - * Directory policy. - */ - private DirectoryPolicy directoryPolicy; + private S3Guard.ITtlTimeProvider ttlTimeProvider; /** Add any deprecated keys. */ @SuppressWarnings("deprecation") @@ -243,32 +266,40 @@ private static void addDeprecatedKeys() { */ public void initialize(URI name, Configuration originalConf) throws IOException { - setUri(name); // get the host; this is guaranteed to be non-null, non-empty bucket = name.getHost(); LOG.debug("Initializing S3AFileSystem for {}", bucket); // clone the configuration into one with propagated bucket options Configuration conf = propagateBucketOptions(originalConf, bucket); + // patch the Hadoop security providers patchSecurityCredentialProviders(conf); - super.initialize(name, conf); + // look for delegation token support early. + boolean delegationTokensEnabled = hasDelegationTokenBinding(conf); + if (delegationTokensEnabled) { + LOG.debug("Using delegation tokens"); + } + // set the URI, this will do any fixup of the URI to remove secrets, + // canonicalize. + setUri(name, delegationTokensEnabled); + super.initialize(uri, conf); setConf(conf); try { - instrumentation = new S3AInstrumentation(name); + + // look for encryption data + // DT Bindings may override this + setEncryptionSecrets(new EncryptionSecrets( + getEncryptionAlgorithm(bucket, conf), + getServerSideEncryptionKey(bucket, getConf()))); + + invoker = new Invoker(new S3ARetryPolicy(getConf()), onRetry); + instrumentation = new S3AInstrumentation(uri); // Username is the current user at the time the FS was instantiated. - username = UserGroupInformation.getCurrentUser().getShortUserName(); + owner = UserGroupInformation.getCurrentUser(); + username = owner.getShortUserName(); workingDir = new Path("/user", username) .makeQualified(this.uri, this.getWorkingDirectory()); - - Class s3ClientFactoryClass = conf.getClass( - S3_CLIENT_FACTORY_IMPL, DEFAULT_S3_CLIENT_FACTORY_IMPL, - S3ClientFactory.class); - - credentials = createAWSCredentialProviderSet(name, conf); - s3 = ReflectionUtils.newInstance(s3ClientFactoryClass, conf) - .createS3Client(name, bucket, credentials); - invoker = new Invoker(new S3ARetryPolicy(getConf()), onRetry); s3guardInvoker = new Invoker(new S3GuardExistsRetryPolicy(getConf()), onRetry); writeHelper = new WriteOperationHelper(this, getConf()); @@ -315,18 +346,21 @@ public void initialize(URI name, Configuration originalConf) } useListV1 = (listVersion == 1); + // creates the AWS client, including overriding auth chain if + // the FS came with a DT + // this may do some patching of the configuration (e.g. 
setting + // the encryption algorithms) + bindAWSClient(name, delegationTokensEnabled); + initTransferManager(); initCannedAcls(conf); verifyBucketExists(); - serverSideEncryptionAlgorithm = getEncryptionAlgorithm(bucket, conf); inputPolicy = S3AInputPolicy.getPolicy( conf.getTrimmed(INPUT_FADVISE, INPUT_FADV_NORMAL)); LOG.debug("Input fadvise policy = {}", inputPolicy); - changeDetectionPolicy = ChangeDetectionPolicy.getPolicy(conf); - LOG.debug("Change detection policy = {}", changeDetectionPolicy); boolean magicCommitterEnabled = conf.getBoolean( CommitConstants.MAGIC_COMMITTER_ENABLED, CommitConstants.DEFAULT_MAGIC_COMMITTER_ENABLED); @@ -335,6 +369,9 @@ public void initialize(URI name, Configuration originalConf) committerIntegration = new MagicCommitIntegration( this, magicCommitterEnabled); + // instantiate S3 Select support + selectBinding = new SelectBinding(writeHelper); + boolean blockUploadEnabled = conf.getBoolean(FAST_UPLOAD, true); if (!blockUploadEnabled) { @@ -357,10 +394,10 @@ public void initialize(URI name, Configuration originalConf) LOG.debug("Using metadata store {}, authoritative={}", getMetadataStore(), allowAuthoritative); } - // directory policy, which may look at authoritative paths - directoryPolicy = DirectoryPolicyImpl.getDirectoryPolicy(conf); - LOG.debug("Directory marker retention policy is {}", directoryPolicy); initMultipartUploads(conf); + long authDirTtl = conf.getLong(METADATASTORE_AUTHORITATIVE_DIR_TTL, + DEFAULT_METADATASTORE_AUTHORITATIVE_DIR_TTL); + ttlTimeProvider = new S3Guard.TtlTimeProvider(authDirTtl); } catch (AmazonClientException e) { throw translateException("initializing ", new Path(name), e); } @@ -402,6 +439,80 @@ public S3AInstrumentation getInstrumentation() { return instrumentation; } + /** + * Set up the client bindings. + * If delegation tokens are enabled, the FS first looks for a DT + * ahead of any other bindings;. + * If there is a DT it uses that to do the auth + * and switches to the DT authenticator automatically (and exclusively) + * @param name URI of the FS + * @param dtEnabled are delegation tokens enabled? + * @throws IOException failure. + */ + private void bindAWSClient(URI name, boolean dtEnabled) throws IOException { + Configuration conf = getConf(); + credentials = null; + String uaSuffix = ""; + + if (dtEnabled) { + // Delegation support. + // Create and start the DT integration. + // Then look for an existing DT for this bucket, switch to authenticating + // with it if so. + + LOG.debug("Using delegation tokens"); + S3ADelegationTokens tokens = new S3ADelegationTokens(); + this.delegationTokens = Optional.of(tokens); + tokens.bindToFileSystem(getCanonicalUri(), this); + tokens.init(conf); + tokens.start(); + // switch to the DT provider and bypass all other configured + // providers. + if (tokens.isBoundToDT()) { + // A DT was retrieved. + LOG.debug("Using existing delegation token"); + // and use the encryption settings from that client, whatever they were + } else { + LOG.debug("No delegation token for this instance"); + } + // Get new credential chain + credentials = tokens.getCredentialProviders(); + // and any encryption secrets which came from a DT + tokens.getEncryptionSecrets() + .ifPresent(this::setEncryptionSecrets); + // and update the UA field with any diagnostics provided by + // the DT binding. 
+ uaSuffix = tokens.getUserAgentField(); + } else { + // DT support is disabled, so create the normal credential chain + credentials = createAWSCredentialProviderSet(name, conf); + } + LOG.debug("Using credential provider {}", credentials); + Class s3ClientFactoryClass = conf.getClass( + S3_CLIENT_FACTORY_IMPL, DEFAULT_S3_CLIENT_FACTORY_IMPL, + S3ClientFactory.class); + + s3 = ReflectionUtils.newInstance(s3ClientFactoryClass, conf) + .createS3Client(getUri(), bucket, credentials, uaSuffix); + } + + /** + * Set the encryption secrets for requests. + * @param secrets secrets + */ + protected void setEncryptionSecrets(final EncryptionSecrets secrets) { + this.encryptionSecrets = secrets; + } + + /** + * Get the encryption secrets. + * This potentially sensitive information and must be treated with care. + * @return the current encryption secrets. + */ + public EncryptionSecrets getEncryptionSecrets() { + return encryptionSecrets; + } + private void initTransferManager() { TransferManagerConfiguration transferConfiguration = new TransferManagerConfiguration(); @@ -477,18 +588,30 @@ public URI getUri() { } /** - * Set the URI field through {@link S3xLoginHelper}. + * Set the URI field through {@link S3xLoginHelper} and + * optionally {@link #canonicalizeUri(URI)} * Exported for testing. - * @param uri filesystem URI. + * @param fsUri filesystem URI. + * @param canonicalize true if the URI should be canonicalized. */ @VisibleForTesting - protected void setUri(URI uri) { - this.uri = S3xLoginHelper.buildFSURI(uri); + protected void setUri(URI fsUri, boolean canonicalize) { + URI u = S3xLoginHelper.buildFSURI(fsUri); + this.uri = canonicalize ? u : canonicalizeUri(u); + } + + /** + * Get the canonical URI. + * @return the canonical URI of this FS. + */ + public URI getCanonicalUri() { + return uri; } + @VisibleForTesting @Override public int getDefaultPort() { - return Constants.S3A_DEFAULT_PORT; + return 0; } /** @@ -564,21 +687,12 @@ public S3AInputPolicy getInputPolicy() { return inputPolicy; } - /** - * Get the change detection policy for this FS instance. - * @return the change detection policy - */ - @VisibleForTesting - ChangeDetectionPolicy getChangeDetectionPolicy() { - return changeDetectionPolicy; - } - /** * Get the encryption algorithm of this endpoint. * @return the encryption algorithm. */ public S3AEncryptionMethods getServerSideEncryptionAlgorithm() { - return serverSideEncryptionAlgorithm; + return encryptionSecrets.getEncryptionMethod(); } /** @@ -710,6 +824,13 @@ public void checkPath(Path path) { S3xLoginHelper.checkPath(getConf(), getUri(), path, getDefaultPort()); } + /** + * Override the base canonicalization logic and relay to + * {@link S3xLoginHelper#canonicalizeUri(URI, int)}. + * This allows for the option of changing this logic for better DT handling. + * @param rawUri raw URI. + * @return the canonical URI to use in delegation tokens and file context. + */ @Override protected URI canonicalizeUri(URI rawUri) { return S3xLoginHelper.canonicalizeUri(rawUri, getDefaultPort()); @@ -720,31 +841,50 @@ protected URI canonicalizeUri(URI rawUri) { * @param f the file name to open * @param bufferSize the size of the buffer to be used. */ + @Retries.RetryTranslated public FSDataInputStream open(Path f, int bufferSize) throws IOException { + return open(f, Optional.empty()); + } + + /** + * Opens an FSDataInputStream at the indicated Path. + * @param path the file to open + * @param options configuration options if opened with the builder API. + * @throws IOException IO failure. 
+ */ + @Retries.RetryTranslated + private FSDataInputStream open( + final Path path, + final Optional options) + throws IOException { + entryPoint(INVOCATION_OPEN); - LOG.debug("Opening '{}' for reading; input policy = {}", f, inputPolicy); - final FileStatus fileStatus = getFileStatus(f); + final FileStatus fileStatus = getFileStatus(path); if (fileStatus.isDirectory()) { - throw new FileNotFoundException("Can't open " + f + throw new FileNotFoundException("Can't open " + path + " because it is a directory"); } S3AReadOpContext readContext; - readContext = createReadContext( - fileStatus, - inputPolicy, - changeDetectionPolicy, - readAhead); + if (options.isPresent()) { + Configuration o = options.get(); + // normal path. Open the file with the chosen seek policy, if different + // from the normal one. + // and readahead. + S3AInputPolicy policy = S3AInputPolicy.getPolicy( + o.get(INPUT_FADVISE, inputPolicy.toString())); + long readAheadRange2 = o.getLong(READAHEAD_RANGE, readAhead); + readContext = createReadContext(fileStatus, policy, readAheadRange2); + } else { + readContext = createReadContext(fileStatus, inputPolicy, readAhead); + } LOG.debug("Opening '{}'", readContext); return new FSDataInputStream( new S3AInputStream( readContext, - new S3ObjectAttributes(bucket, - pathToKey(f), - serverSideEncryptionAlgorithm, - getServerSideEncryptionKey(bucket, getConf())), + createObjectAttributes(path), fileStatus.getLen(), s3)); } @@ -760,7 +900,6 @@ public FSDataInputStream open(Path f, int bufferSize) private S3AReadOpContext createReadContext( final FileStatus fileStatus, final S3AInputPolicy seekPolicy, - final ChangeDetectionPolicy changePolicy, final long readAheadRange) { return new S3AReadOpContext(fileStatus.getPath(), hasMetadataStore(), @@ -770,10 +909,21 @@ private S3AReadOpContext createReadContext( instrumentation, fileStatus, seekPolicy, - changePolicy, readAheadRange); } + /** + * Create the attributes of an object for a get/select request. + * @param f path path of the request. + * @return attributes to use when building the query. + */ + private S3ObjectAttributes createObjectAttributes(final Path f) { + return new S3ObjectAttributes(bucket, + pathToKey(f), + getServerSideEncryptionAlgorithm(), + encryptionSecrets.getEncryptionKey()); + } + /** * Create an FSDataOutputStream at the indicated Path with write-progress * reporting. @@ -1030,10 +1180,6 @@ private boolean innerRename(Path source, Path dest) } // TODO S3Guard HADOOP-13761: retries when source paths are not visible yet // TODO S3Guard: performance: mark destination dirs as authoritative - // The path to whichever file or directory is created by the - // rename. When deleting markers all parents of - // this path will need their markers pruned. - Path destCreated = dst; // Ok! 
Time to start if (srcStatus.isFile()) { @@ -1044,11 +1190,9 @@ private boolean innerRename(Path source, Path dest) String filename = srcKey.substring(pathToKey(src.getParent()).length()+1); newDstKey = newDstKey + filename; - destCreated = keyToQualifiedPath(newDstKey); - copyFile(srcKey, newDstKey, length); S3Guard.addMoveFile(metadataStore, srcPaths, dstMetas, src, - destCreated, length, getDefaultBlockSize(dst), + keyToQualifiedPath(newDstKey), length, getDefaultBlockSize(dst), username); } else { copyFile(srcKey, dstKey, srcStatus.getLen()); @@ -1130,9 +1274,9 @@ destCreated, length, getDefaultBlockSize(dst), metadataStore.move(srcPaths, dstMetas); - if (!src.getParent().equals(destCreated.getParent())) { + if (!src.getParent().equals(dst.getParent())) { LOG.debug("source & dest parents are different; fix up dir markers"); - deleteUnnecessaryFakeDirectories(destCreated.getParent()); + deleteUnnecessaryFakeDirectories(dst.getParent()); maybeCreateFakeParentDirectory(src); } return true; @@ -1145,9 +1289,26 @@ destCreated, length, getDefaultBlockSize(dst), * @throws IOException IO and object access problems. */ @VisibleForTesting - @Retries.RetryRaw + @Retries.RetryTranslated public ObjectMetadata getObjectMetadata(Path path) throws IOException { - return getObjectMetadata(pathToKey(path)); + return once("getObjectMetadata", path.toString(), + () -> + // this always does a full HEAD to the object + getObjectMetadata(pathToKey(path))); + } + + /** + * Get all the headers of the object of a path, if the object exists. + * @param path path to probe + * @return an immutable map of object headers. + * @throws IOException failure of the query + */ + @Retries.RetryTranslated + public Map getObjectHeaders(Path path) throws IOException { + LOG.debug("getObjectHeaders({})", path); + checkNotClosed(); + incrementReadOperations(); + return getObjectMetadata(path).getRawMetadata(); } /** @@ -1297,10 +1458,7 @@ protected ObjectMetadata getObjectMetadata(String key) throws IOException { GetObjectMetadataRequest request = new GetObjectMetadataRequest(bucket, key); //SSE-C requires to be filled in if enabled for object metadata - if(S3AEncryptionMethods.SSE_C.equals(serverSideEncryptionAlgorithm) && - isNotBlank(getServerSideEncryptionKey(bucket, getConf()))){ - request.setSSECustomerKey(generateSSECustomerKey()); - } + generateSSECustomerKey().ifPresent(request::setSSECustomerKey); ObjectMetadata meta = invoker.retryUntranslated("GET " + key, true, () -> { incrementStatistic(OBJECT_METADATA_REQUESTS); @@ -1879,16 +2037,20 @@ private boolean innerDelete(S3AFileStatus status, boolean recursive) */ private boolean rejectRootDirectoryDelete(S3AFileStatus status, boolean recursive) throws IOException { - LOG.info("s3a delete the {} root directory of {}", bucket, recursive); + LOG.info("s3a delete the {} root directory. Path: {}. 
Recursive: {}", + bucket, status.getPath(), recursive); boolean emptyRoot = status.isEmptyDirectory() == Tristate.TRUE; if (emptyRoot) { return true; } if (recursive) { + LOG.error("Cannot delete root path: {}", status.getPath()); return false; } else { // reject - throw new PathIOException(bucket, "Cannot delete root path"); + String msg = "Cannot delete root path: " + status.getPath(); + LOG.error(msg); + throw new PathIOException(bucket, msg); } } @@ -1965,7 +2127,8 @@ public FileStatus[] innerListStatus(Path f) throws FileNotFoundException, key = key + '/'; } - DirListingMetadata dirMeta = metadataStore.listChildren(path); + DirListingMetadata dirMeta = + S3Guard.listChildrenWithTtl(metadataStore, path, ttlTimeProvider); if (allowAuthoritative && dirMeta != null && dirMeta.isAuthoritative()) { return S3Guard.dirMetaToStatuses(dirMeta); } @@ -1983,7 +2146,7 @@ public FileStatus[] innerListStatus(Path f) throws FileNotFoundException, result.add(files.next()); } return S3Guard.dirListingUnion(metadataStore, path, result, dirMeta, - allowAuthoritative); + allowAuthoritative, ttlTimeProvider); } else { LOG.debug("Adding: rd (not a dir): {}", path); FileStatus[] stats = new FileStatus[1]; @@ -2060,6 +2223,14 @@ public String getUsername() { return username; } + /** + * Get the owner of this FS: who created it? + * @return the owner of the FS. + */ + public UserGroupInformation getOwner() { + return owner; + } + /** * * Make the given path and all non-existent parents into @@ -2189,43 +2360,37 @@ S3AFileStatus innerGetFileStatus(final Path f, FileStatus msStatus = pm.getFileStatus(); if (needEmptyDirectoryFlag && msStatus.isDirectory()) { - // the caller needs to know if a directory is empty, - // and that this is a directory. if (pm.isEmptyDirectory() != Tristate.UNKNOWN) { // We have a definitive true / false from MetadataStore, we are done. return S3AFileStatus.fromFileStatus(msStatus, pm.isEmptyDirectory()); } else { - DirListingMetadata children = metadataStore.listChildren(path); + DirListingMetadata children = + S3Guard.listChildrenWithTtl(metadataStore, path, ttlTimeProvider); if (children != null) { tombstones = children.listTombstones(); } - LOG.debug("MetadataStore doesn't know if {} is empty, using S3.", - path); + LOG.debug("MetadataStore doesn't know if dir is empty, using S3."); } } else { // Either this is not a directory, or we don't care if it is empty return S3AFileStatus.fromFileStatus(msStatus, pm.isEmptyDirectory()); } - // now issue the S3 getFileStatus call. + // If the metadata store has no children for it and it's not listed in + // S3 yet, we'll assume the empty directory is true; + S3AFileStatus s3FileStatus; try { - S3AFileStatus s3FileStatus = s3GetFileStatus(path, key, - StatusProbeEnum.ALL, - tombstones, - true); - // entry was found, so save in S3Guard and return the final value. - return S3Guard.putAndReturn(metadataStore, s3FileStatus, - instrumentation); + s3FileStatus = s3GetFileStatus(path, key, tombstones); } catch (FileNotFoundException e) { return S3AFileStatus.fromFileStatus(msStatus, Tristate.TRUE); } + // entry was found, save in S3Guard + return S3Guard.putAndReturn(metadataStore, s3FileStatus, instrumentation); } else { // there was no entry in S3Guard // retrieve the data and update the metadata store in the process. 
return S3Guard.putAndReturn(metadataStore, - s3GetFileStatus(path, key, StatusProbeEnum.ALL, - tombstones, needEmptyDirectoryFlag), - instrumentation); + s3GetFileStatus(path, key, tombstones), instrumentation); } } @@ -2236,96 +2401,87 @@ S3AFileStatus innerGetFileStatus(final Path f, * Retry policy: retry translated. * @param path Qualified path * @param key Key string for the path - * @param probes probes to make - * @param tombstones tombstones to filter - * @param needEmptyDirectoryFlag if true, implementation will calculate - * a TRUE or FALSE value for {@link S3AFileStatus#isEmptyDirectory()} * @return Status - * @throws FileNotFoundException the supplied probes failed. + * @throws FileNotFoundException when the path does not exist * @throws IOException on other problems. */ - @VisibleForTesting @Retries.RetryTranslated - S3AFileStatus s3GetFileStatus(final Path path, - final String key, - final Set probes, - @Nullable final Set tombstones, - final boolean needEmptyDirectoryFlag) throws IOException { - LOG.debug("S3GetFileStatus {}", path); - Preconditions.checkArgument(!needEmptyDirectoryFlag - || probes.contains(StatusProbeEnum.List), String.format( - "s3GetFileStatus(%s) wants to know if a directory is empty but" - + " does not request a list probe", path)); - - if (!key.isEmpty() && !key.endsWith("/") - && probes.contains(StatusProbeEnum.Head)) { + private S3AFileStatus s3GetFileStatus(final Path path, String key, + Set tombstones) throws IOException { + if (!key.isEmpty()) { try { - // look for the simple file ObjectMetadata meta = getObjectMetadata(key); - LOG.debug("Found exact file: normal file {}", key); + + if (objectRepresentsDirectory(key, meta.getContentLength())) { + LOG.debug("Found exact file: fake directory"); + return new S3AFileStatus(Tristate.TRUE, path, username); + } else { + LOG.debug("Found exact file: normal file"); return new S3AFileStatus(meta.getContentLength(), dateToLong(meta.getLastModified()), path, getDefaultBlockSize(path), username); + } } catch (AmazonServiceException e) { - // if the response is a 404 error, it just means that there is - // no file at that path...the remaining checks will be needed. if (e.getStatusCode() != 404) { throw translateException("getFileStatus", path, e); } } catch (AmazonClientException e) { throw translateException("getFileStatus", path, e); } - } - // execute the list - if (probes.contains(StatusProbeEnum.List)) { + // Necessary? + if (!key.endsWith("/")) { + String newKey = key + "/"; try { - // this will find a marker dir / as well as an entry. - // When making a simple "is this a dir check" all is good. - // but when looking for an empty dir, we need to verify there are no - // children, so ask for two entries, so as to find - // a child - String dirKey = maybeAddTrailingSlash(key); - // list size is dir marker + at least one non-tombstone entry - // there's a corner case: more tombstones than you have in a - // single page list. We assume that if you have been deleting - // that many files, then the AWS listing will have purged some - // by the time of listing so that the response includes some - // which have not. - - int listSize; - if (tombstones == null) { - // no tombstones so look for a marker and at least one child. - listSize = 2; + ObjectMetadata meta = getObjectMetadata(newKey); + + if (objectRepresentsDirectory(newKey, meta.getContentLength())) { + LOG.debug("Found file (with /): fake directory"); + return new S3AFileStatus(Tristate.TRUE, path, username); } else { - // build a listing > tombstones. 
If the caller has many thousands - // of tombstones this won't work properly, which is why pruning - // of expired tombstones matters. - listSize = Math.min(2 + tombstones.size(), Math.max(2, maxKeys)); + LOG.warn("Found file (with /): real file? should not happen: {}", + key); + + return new S3AFileStatus(meta.getContentLength(), + dateToLong(meta.getLastModified()), + path, + getDefaultBlockSize(path), + username); + } + } catch (AmazonServiceException e) { + if (e.getStatusCode() != 404) { + throw translateException("getFileStatus", newKey, e); + } + } catch (AmazonClientException e) { + throw translateException("getFileStatus", newKey, e); + } } - S3ListRequest request = createListObjectsRequest(dirKey, "/", - listSize); - // execute the request - S3ListResult listResult = listObjects(request); + } + try { + key = maybeAddTrailingSlash(key); + S3ListRequest request = createListObjectsRequest(key, "/", 1); + + S3ListResult objects = listObjects(request); - if (listResult.hasPrefixesOrObjects(this::keyToPath, tombstones)) { + Collection prefixes = objects.getCommonPrefixes(); + Collection summaries = objects.getObjectSummaries(); + if (!isEmptyOfKeys(prefixes, tombstones) || + !isEmptyOfObjects(summaries, tombstones)) { if (LOG.isDebugEnabled()) { - LOG.debug("Found path as directory (with /)"); - listResult.logAtDebug(LOG); + LOG.debug("Found path as directory (with /): {}/{}", + prefixes.size(), summaries.size()); + + for (S3ObjectSummary summary : summaries) { + LOG.debug("Summary: {} {}", summary.getKey(), summary.getSize()); } - // At least one entry has been found. - // If looking for an empty directory, the marker must exist but no children. - // So the listing must contain the marker entry only. - if (needEmptyDirectoryFlag - && listResult.representsEmptyDirectory( - this::keyToPath, dirKey, tombstones)) { - return new S3AFileStatus(Tristate.TRUE, path, username); + for (String prefix : prefixes) { + LOG.debug("Prefix: {}", prefix); } - // either an empty directory is not needed, or the - // listing does not meet the requirements. + } + return new S3AFileStatus(Tristate.FALSE, path, username); } else if (key.isEmpty()) { LOG.debug("Found root directory"); @@ -2338,12 +2494,53 @@ S3AFileStatus s3GetFileStatus(final Path path, } catch (AmazonClientException e) { throw translateException("getFileStatus", path, e); } - } LOG.debug("Not Found: {}", path); throw new FileNotFoundException("No such file or directory: " + path); } + /** + * Helper function to determine if a collection of paths is empty + * after accounting for tombstone markers (if provided). + * @param keys Collection of path (prefixes / directories or keys). + * @param tombstones Set of tombstone markers, or null if not applicable. + * @return false if summaries contains objects not accounted for by + * tombstones. + */ + private boolean isEmptyOfKeys(Collection keys, Set + tombstones) { + if (tombstones == null) { + return keys.isEmpty(); + } + for (String key : keys) { + Path qualified = keyToQualifiedPath(key); + if (!tombstones.contains(qualified)) { + return false; + } + } + return true; + } + + /** + * Helper function to determine if a collection of object summaries is empty + * after accounting for tombstone markers (if provided). + * @param summaries Collection of objects as returned by listObjects. + * @param tombstones Set of tombstone markers, or null if not applicable. + * @return false if summaries contains objects not accounted for by + * tombstones. 
+ */ + private boolean isEmptyOfObjects(Collection summaries, + Set tombstones) { + if (tombstones == null) { + return summaries.isEmpty(); + } + Collection stringCollection = new ArrayList<>(summaries.size()); + for (S3ObjectSummary summary : summaries) { + stringCollection.add(summary.getKey()); + } + return isEmptyOfKeys(stringCollection, tombstones); + } + /** * Raw version of {@link FileSystem#exists(Path)} which uses S3 only: * S3Guard MetadataStore, if any, will be skipped. @@ -2356,8 +2553,7 @@ private boolean s3Exists(final Path f) throws IOException { Path path = qualify(f); String key = pathToKey(path); try { - s3GetFileStatus(path, key, StatusProbeEnum.ALL, - null, false); + s3GetFileStatus(path, key, null); return true; } catch (FileNotFoundException e) { return false; @@ -2482,11 +2678,14 @@ UploadResult executePut(PutObjectRequest putObjectRequest, * Wait for an upload to complete. * If the waiting for completion is interrupted, the upload will be * aborted before an {@code InterruptedIOException} is thrown. - * @param upload upload to wait for + * If the upload (or its result collection) failed, this is where + * the failure is raised as an AWS exception * @param key destination key + * @param uploadInfo upload to wait for * @return the upload result * @throws InterruptedIOException if the blocking was interrupted. */ + @Retries.OnceRaw UploadResult waitForUploadCompletion(String key, UploadInfo uploadInfo) throws InterruptedIOException { Upload upload = uploadInfo.getUpload(); @@ -2528,6 +2727,7 @@ public void close() throws IOException { metadataStore = null; instrumentation = null; closeAutocloseables(LOG, credentials); + cleanupWithLogger(LOG, delegationTokens.orElse(null)); credentials = null; } } @@ -2544,12 +2744,88 @@ private void checkNotClosed() throws IOException { } /** - * Override getCanonicalServiceName because we don't support token in S3A. + * Get the delegation token support for this filesystem; + * not null iff delegation support is enabled. + * @return the token support, or an empty option. + */ + @VisibleForTesting + public Optional getDelegationTokens() { + return delegationTokens; + } + + /** + * Return a service name iff delegation tokens are enabled and the + * token binding is issuing delegation tokens. + * @return the canonical service name or null */ @Override public String getCanonicalServiceName() { - // Does not support Token - return null; + // this could all be done in map statements, but it'd be harder to + // understand and maintain. + // Essentially: no DTs, no canonical service name. + if (!delegationTokens.isPresent()) { + return null; + } + // DTs present: ask the binding if it is willing to + // serve tokens (or fail noisily). + S3ADelegationTokens dt = delegationTokens.get(); + return dt.getTokenIssuingPolicy() != NoTokensAvailable + ? dt.getCanonicalServiceName() + : null; + } + + /** + * Get a delegation token if the FS is set up for them. + * If the user already has a token, it is returned, + * even if it has expired. + * @param renewer the account name that is allowed to renew the token. 
+ * @return the delegation token or null + * @throws IOException IO failure + */ + @Override + public Token getDelegationToken(String renewer) + throws IOException { + entryPoint(Statistic.INVOCATION_GET_DELEGATION_TOKEN); + LOG.debug("Delegation token requested"); + if (delegationTokens.isPresent()) { + return delegationTokens.get().getBoundOrNewDT(encryptionSecrets); + } else { + // Delegation token support is not set up + LOG.debug("Token support is not enabled"); + return null; + } + } + + /** + * Build the AWS policy for restricted access to the resources needed + * by this bucket. + * The policy generated includes S3 access, S3Guard access + * if needed, and KMS operations. + * @param access access level desired. + * @return a policy for use in roles + */ + @Override + public List listAWSPolicyRules( + final Set access) { + if (access.isEmpty()) { + return Collections.emptyList(); + } + List statements = new ArrayList<>( + allowS3Operations(bucket, + access.contains(AccessLevel.WRITE) + || access.contains(AccessLevel.ADMIN))); + + // no attempt is made to qualify KMS access; there's no + // way to predict read keys, and not worried about granting + // too much encryption access. + statements.add(STATEMENT_ALLOW_SSE_KMS_RW); + + // add any metastore policies + if (metadataStore instanceof AWSPolicyProvider) { + statements.addAll( + ((AWSPolicyProvider) metadataStore).listAWSPolicyRules(access)); + } + return statements; } /** @@ -2601,20 +2877,15 @@ private void copyFile(String srcKey, String dstKey, long size) }); } + /** + * Set the optional parameters when initiating the request (encryption, + * headers, storage, etc). + * @param request request to patch. + */ protected void setOptionalMultipartUploadRequestParameters( - InitiateMultipartUploadRequest req) { - switch (serverSideEncryptionAlgorithm) { - case SSE_KMS: - req.setSSEAwsKeyManagementParams(generateSSEAwsKeyParams()); - break; - case SSE_C: - if (isNotBlank(getServerSideEncryptionKey(bucket, getConf()))) { - //at the moment, only supports copy using the same key - req.setSSECustomerKey(generateSSECustomerKey()); - } - break; - default: - } + InitiateMultipartUploadRequest request) { + generateSSEAwsKeyParams().ifPresent(request::setSSEAwsKeyManagementParams); + generateSSECustomerKey().ifPresent(request::setSSECustomerKey); } /** @@ -2624,14 +2895,7 @@ protected void setOptionalMultipartUploadRequestParameters( */ protected void setOptionalUploadPartRequestParameters( UploadPartRequest request) { - switch (serverSideEncryptionAlgorithm) { - case SSE_C: - if (isNotBlank(getServerSideEncryptionKey(bucket, getConf()))) { - request.setSSECustomerKey(generateSSECustomerKey()); - } - break; - default: - } + generateSSECustomerKey().ifPresent(request::setSSECustomerKey); } /** @@ -2652,71 +2916,53 @@ InitiateMultipartUploadResult initiateMultipartUpload( protected void setOptionalCopyObjectRequestParameters( CopyObjectRequest copyObjectRequest) throws IOException { - switch (serverSideEncryptionAlgorithm) { + switch (getServerSideEncryptionAlgorithm()) { case SSE_KMS: - copyObjectRequest.setSSEAwsKeyManagementParams( - generateSSEAwsKeyParams() - ); + generateSSEAwsKeyParams().ifPresent( + copyObjectRequest::setSSEAwsKeyManagementParams); break; case SSE_C: - if (isNotBlank(getServerSideEncryptionKey(bucket, getConf()))) { - //at the moment, only supports copy using the same key - SSECustomerKey customerKey = generateSSECustomerKey(); + generateSSECustomerKey().ifPresent(customerKey -> { 
copyObjectRequest.setSourceSSECustomerKey(customerKey); copyObjectRequest.setDestinationSSECustomerKey(customerKey); - } + }); break; default: } } private void setOptionalPutRequestParameters(PutObjectRequest request) { - switch (serverSideEncryptionAlgorithm) { - case SSE_KMS: - request.setSSEAwsKeyManagementParams(generateSSEAwsKeyParams()); - break; - case SSE_C: - if (isNotBlank(getServerSideEncryptionKey(bucket, getConf()))) { - request.setSSECustomerKey(generateSSECustomerKey()); - } - break; - default: - } + generateSSEAwsKeyParams().ifPresent(request::setSSEAwsKeyManagementParams); + generateSSECustomerKey().ifPresent(request::setSSECustomerKey); } private void setOptionalObjectMetadata(ObjectMetadata metadata) { - if (S3AEncryptionMethods.SSE_S3.equals(serverSideEncryptionAlgorithm)) { - metadata.setSSEAlgorithm(serverSideEncryptionAlgorithm.getMethod()); + final S3AEncryptionMethods algorithm + = getServerSideEncryptionAlgorithm(); + if (S3AEncryptionMethods.SSE_S3.equals(algorithm)) { + metadata.setSSEAlgorithm(algorithm.getMethod()); } } /** - * Create the AWS SDK structure used to configure SSE, based on the - * configuration. - * @return an instance of the class, which main contain the encryption key + * Create the AWS SDK structure used to configure SSE, + * if the encryption secrets contain the information/settings for this. + * @return an optional set of KMS Key settings */ - @Retries.OnceExceptionsSwallowed - private SSEAwsKeyManagementParams generateSSEAwsKeyParams() { - //Use specified key, otherwise default to default master aws/s3 key by AWS - SSEAwsKeyManagementParams sseAwsKeyManagementParams = - new SSEAwsKeyManagementParams(); - String encryptionKey = getServerSideEncryptionKey(bucket, getConf()); - if (isNotBlank(encryptionKey)) { - sseAwsKeyManagementParams = new SSEAwsKeyManagementParams(encryptionKey); - } - return sseAwsKeyManagementParams; + private Optional generateSSEAwsKeyParams() { + return EncryptionSecretOperations.createSSEAwsKeyManagementParams( + encryptionSecrets); } /** - * Create the SSE-C structure for the AWS SDK. + * Create the SSE-C structure for the AWS SDK, if the encryption secrets + * contain the information/settings for this. * This will contain a secret extracted from the bucket/configuration. - * @return the customer key. + * @return an optional customer key. */ - @Retries.OnceExceptionsSwallowed - private SSECustomerKey generateSSECustomerKey() { - SSECustomerKey customerKey = new SSECustomerKey( - getServerSideEncryptionKey(bucket, getConf())); - return customerKey; + private Optional generateSSECustomerKey() { + return EncryptionSecretOperations.createSSECustomerKey( + encryptionSecrets); } /** @@ -2904,14 +3150,6 @@ public long getDefaultBlockSize() { return getConf().getLongBytes(FS_S3A_BLOCK_SIZE, DEFAULT_BLOCKSIZE); } - /** - * Get the directory marker policy of this filesystem. - * @return the marker policy. 
- */ - public DirectoryPolicy getDirectoryMarkerPolicy() { - return directoryPolicy; - } - @Override public String toString() { final StringBuilder sb = new StringBuilder( @@ -2930,9 +3168,9 @@ public String toString() { sb.append(", blockSize=").append(getDefaultBlockSize()); } sb.append(", multiPartThreshold=").append(multiPartThreshold); - if (serverSideEncryptionAlgorithm != null) { + if (getServerSideEncryptionAlgorithm() != null) { sb.append(", serverSideEncryptionAlgorithm='") - .append(serverSideEncryptionAlgorithm) + .append(getServerSideEncryptionAlgorithm()) .append('\''); } if (blockFactory != null) { @@ -2947,7 +3185,8 @@ public String toString() { sb.append(", boundedExecutor=").append(boundedThreadPool); sb.append(", unboundedExecutor=").append(unboundedThreadPool); sb.append(", credentials=").append(credentials); - sb.append(", ").append(directoryPolicy); + sb.append(", delegation tokens=") + .append(delegationTokens.map(Objects::toString).orElse("disabled")); sb.append(", statistics {") .append(statistics) .append("}"); @@ -3085,13 +3324,9 @@ public EtagChecksum getFileChecksum(Path f, final long length) ETAG_CHECKSUM_ENABLED_DEFAULT)) { Path path = qualify(f); LOG.debug("getFileChecksum({})", path); - return once("getFileChecksum", path.toString(), - () -> { - // this always does a full HEAD to the object - ObjectMetadata headers = getObjectMetadata(path); - String eTag = headers.getETag(); - return eTag != null ? new EtagChecksum(eTag) : null; - }); + ObjectMetadata headers = getObjectMetadata(path); + String eTag = headers.getETag(); + return eTag != null ? new EtagChecksum(eTag) : null; } else { // disabled return null; @@ -3171,7 +3406,8 @@ private RemoteIterator innerListFiles(Path f, boolean tombstones = metadataStoreListFilesIterator.listTombstones(); cachedFilesIterator = metadataStoreListFilesIterator; } else { - DirListingMetadata meta = metadataStore.listChildren(path); + DirListingMetadata meta = + S3Guard.listChildrenWithTtl(metadataStore, path, ttlTimeProvider); if (meta != null) { tombstones = meta.listTombstones(); } else { @@ -3244,7 +3480,9 @@ public RemoteIterator listLocatedStatus(final Path f, final String key = maybeAddTrailingSlash(pathToKey(path)); final Listing.FileStatusAcceptor acceptor = new Listing.AcceptAllButSelfAndS3nDirs(path); - DirListingMetadata meta = metadataStore.listChildren(path); + DirListingMetadata meta = + S3Guard.listChildrenWithTtl(metadataStore, path, + ttlTimeProvider); final RemoteIterator cachedFileStatusIterator = listing.createProvidedFileStatusIterator( S3Guard.dirMetaToStatuses(meta), filter, acceptor); @@ -3364,61 +3602,25 @@ public S3AInstrumentation.CommitterStatistics newCommitterStatistics() { return instrumentation.newCommitterStatistics(); } - @SuppressWarnings("deprecation") + /** + * Return the capabilities of this filesystem instance. + * @param capability string to query the stream support for. + * @return whether the FS instance has the capability. 
+ */ @Override - public boolean hasPathCapability(final Path path, final String capability) - throws IOException { - final Path p = makeQualified(path); - String cap = validatePathCapabilityArgs(p, capability); - switch (cap) { + public boolean hasCapability(String capability) { + + switch (capability.toLowerCase(Locale.ENGLISH)) { case CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER: - case CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER_OLD: // capability depends on FS configuration return isMagicCommitEnabled(); - case CommonPathCapabilities.FS_CHECKSUMS: - // capability depends on FS configuration - return getConf().getBoolean(ETAG_CHECKSUM_ENABLED, - ETAG_CHECKSUM_ENABLED_DEFAULT); - - - // this client is safe to use with buckets - // containing directory markers anywhere in - // the hierarchy - case STORE_CAPABILITY_DIRECTORY_MARKER_AWARE: - return true; - - /* - * Marker policy capabilities are handed off. - */ - case STORE_CAPABILITY_DIRECTORY_MARKER_POLICY_KEEP: - case STORE_CAPABILITY_DIRECTORY_MARKER_POLICY_DELETE: - case STORE_CAPABILITY_DIRECTORY_MARKER_POLICY_AUTHORITATIVE: - case STORE_CAPABILITY_DIRECTORY_MARKER_ACTION_KEEP: - case STORE_CAPABILITY_DIRECTORY_MARKER_ACTION_DELETE: - return getDirectoryMarkerPolicy().hasPathCapability(path, cap); + case SelectConstants.S3_SELECT_CAPABILITY: + // select is only supported if enabled + return selectBinding.isEnabled(); default: - return super.hasPathCapability(p, cap); - } - } - - /** - * Return the capabilities of this filesystem instance. - * - * This has been supplanted by {@link #hasPathCapability(Path, String)}. - * @param capability string to query the stream support for. - * @return whether the FS instance has the capability. - */ - @Deprecated - @Override - public boolean hasCapability(String capability) { - try { - return hasPathCapability(workingDir, capability); - } catch (IOException ex) { - // should never happen, so log and downgrade. - LOG.debug("Ignoring exception on hasCapability({}})", capability, ex); return false; } } @@ -3435,4 +3637,114 @@ public AWSCredentialProviderList shareCredentials(final String purpose) { LOG.debug("Sharing credentials for: {}", purpose); return credentials.share(); } + + @VisibleForTesting + protected S3Guard.ITtlTimeProvider getTtlTimeProvider() { + return ttlTimeProvider; + } + + @VisibleForTesting + protected void setTtlTimeProvider(S3Guard.ITtlTimeProvider ttlTimeProvider) { + this.ttlTimeProvider = ttlTimeProvider; + } + + /** + * This is a proof of concept of a select API. + * Once a proper factory mechanism for opening files is added to the + * FileSystem APIs, this will be deleted without any warning. + * @param source path to source data + * @param expression select expression + * @param options request configuration from the builder. + * @return the stream of the results + * @throws IOException IO failure + */ + @Retries.RetryTranslated + private FSDataInputStream select(final Path source, + final String expression, + final Configuration options) + throws IOException { + entryPoint(OBJECT_SELECT_REQUESTS); + requireSelectSupport(source); + final Path path = makeQualified(source); + // call getFileStatus(), which will look at S3Guard first, + // so the operation will fail if it is not there or S3Guard believes it has + // been deleted. + // validation of the file status are delegated to the binding. 
+ final FileStatus fileStatus = getFileStatus(path); + + // readahead range can be dynamically set + long ra = options.getLong(READAHEAD_RANGE, readAhead); + // build and execute the request + return selectBinding.select( + createReadContext(fileStatus, inputPolicy, ra), + expression, + options, + generateSSECustomerKey(), + createObjectAttributes(path)); + } + + /** + * Verify the FS supports S3 Select. + * @param source source file. + * @throws UnsupportedOperationException if not. + */ + private void requireSelectSupport(final Path source) throws + UnsupportedOperationException { + if (!selectBinding.isEnabled()) { + throw new UnsupportedOperationException( + SelectConstants.SELECT_UNSUPPORTED); + } + } + + /** + * Initiate the open or select operation. + * This is invoked from both the FileSystem and FileContext APIs + * @param path path to the file + * @param mandatoryKeys set of options declared as mandatory. + * @param options options set during the build sequence. + * @return a future which will evaluate to the opened/selected file. + * @throws IOException failure to resolve the link. + * @throws PathIOException operation is a select request but S3 select is + * disabled + * @throws IllegalArgumentException unknown mandatory key + */ + @Override + @Retries.RetryTranslated + public CompletableFuture openFileWithOptions( + final Path path, + final Set mandatoryKeys, + final Configuration options, + final int bufferSize) throws IOException { + String sql = options.get(SelectConstants.SELECT_SQL, null); + boolean isSelect = sql != null; + // choice of keys depends on open type + if (isSelect) { + rejectUnknownMandatoryKeys( + mandatoryKeys, + InternalSelectConstants.SELECT_OPTIONS, + "for " + path + " in S3 Select operation"); + } else { + rejectUnknownMandatoryKeys( + mandatoryKeys, + InternalConstants.STANDARD_OPENFILE_KEYS, + "for " + path + " in non-select file I/O"); + } + CompletableFuture result = new CompletableFuture<>(); + if (!isSelect) { + // normal path. + unboundedThreadPool.submit(() -> + LambdaUtils.eval(result, + () -> open(path, Optional.of(options)))); + } else { + // it is a select statement. 
+ // fail fast if the method is not present + requireSelectSupport(path); + // submit the query + unboundedThreadPool.submit(() -> + LambdaUtils.eval(result, + () -> select(path, sql, options))); + } + return result; + } + } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java index 031dc717d22d1..ccc86d03a5f68 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java @@ -18,6 +18,8 @@ package org.apache.hadoop.fs.s3a; +import javax.annotation.Nullable; + import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.GetObjectRequest; import com.amazonaws.services.s3.model.S3Object; @@ -30,8 +32,6 @@ import org.apache.hadoop.fs.CanSetReadahead; import org.apache.hadoop.fs.FSExceptionMessages; import org.apache.hadoop.fs.FSInputStream; -import org.apache.hadoop.fs.PathIOException; -import org.apache.hadoop.fs.s3a.impl.ChangeTracker; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -63,8 +63,8 @@ @InterfaceStability.Evolving public class S3AInputStream extends FSInputStream implements CanSetReadahead { - public static final String OPERATION_OPEN = "open"; - public static final String OPERATION_REOPEN = "re-open"; + public static final String E_NEGATIVE_READAHEAD_VALUE + = "Negative readahead value"; /** * This is the public position; the one set in {@link #seek(long)} @@ -110,9 +110,6 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead { */ private long contentRangeStart; - /** change tracker. */ - private final ChangeTracker changeTracker; - /** * Create the stream. * This does not attempt to open it; that is only done on the first @@ -141,9 +138,6 @@ public S3AInputStream(S3AReadOpContext ctx, this.serverSideEncryptionAlgorithm = s3Attributes.getServerSideEncryptionAlgorithm(); this.serverSideEncryptionKey = s3Attributes.getServerSideEncryptionKey(); - this.changeTracker = new ChangeTracker(uri, - ctx.getChangeDetectionPolicy(), - streamStatistics.getVersionMismatchCounter()); setInputPolicy(ctx.getInputPolicy()); setReadahead(ctx.getReadahead()); } @@ -188,20 +182,15 @@ private synchronized void reopen(String reason, long targetPos, long length, StringUtils.isNotBlank(serverSideEncryptionKey)){ request.setSSECustomerKey(new SSECustomerKey(serverSideEncryptionKey)); } - String operation = opencount == 0 ? OPERATION_OPEN : OPERATION_REOPEN; - String text = String.format("%s %s at %d", - operation, uri, targetPos); - changeTracker.maybeApplyConstraint(request); + String text = String.format("Failed to %s %s at %d", + (opencount == 0 ? 
"open" : "re-open"), uri, targetPos); S3Object object = Invoker.once(text, uri, () -> client.getObject(request)); - - changeTracker.processResponse(object, operation, - targetPos); wrappedStream = object.getObjectContent(); contentRangeStart = targetPos; if (wrappedStream == null) { - throw new PathIOException(uri, - "Null IO stream from " + operation + " of (" + reason + ") "); + throw new IOException("Null IO stream from reopen of (" + reason + ") " + + uri); } this.pos = targetPos; @@ -272,7 +261,7 @@ private void seekInStream(long targetPos, long length) throws IOException { long forwardSeekLimit = Math.min(remainingInCurrentRequest, forwardSeekRange); boolean skipForward = remainingInCurrentRequest > 0 - && diff < forwardSeekLimit; + && diff <= forwardSeekLimit; if (skipForward) { // the forward seek range is within the limits LOG.debug("Forward seek on {}, of {} bytes", uri, diff); @@ -286,8 +275,6 @@ private void seekInStream(long targetPos, long length) throws IOException { if (pos == targetPos) { // all is well - LOG.debug("Now at {}: bytes remaining in current request: {}", - pos, remainingInCurrentRequest()); return; } else { // log a warning; continue to attempt to re-open @@ -681,7 +668,6 @@ public String toString() { sb.append(" contentRangeFinish=").append(contentRangeFinish); sb.append(" remainingInCurrentRequest=") .append(remainingInCurrentRequest()); - sb.append(changeTracker); sb.append('\n').append(s); sb.append('}'); return sb.toString(); @@ -741,12 +727,7 @@ public S3AInstrumentation.InputStreamStatistics getS3AStreamStatistics() { @Override public synchronized void setReadahead(Long readahead) { - if (readahead == null) { - this.readahead = Constants.DEFAULT_READAHEAD_RANGE; - } else { - Preconditions.checkArgument(readahead >= 0, "Negative readahead value"); - this.readahead = readahead; - } + this.readahead = validateReadahead(readahead); } /** @@ -799,4 +780,19 @@ static long calculateRequestLimit( return rangeLimit; } + /** + * from a possibly null Long value, return a valid + * readahead. + * @param readahead new readahead + * @return a natural number. + * @throws IllegalArgumentException if the range is invalid. 
+ */ + public static long validateReadahead(@Nullable Long readahead) { + if (readahead == null) { + return Constants.DEFAULT_READAHEAD_RANGE; + } else { + Preconditions.checkArgument(readahead >= 0, E_NEGATIVE_READAHEAD_VALUE); + return readahead; + } + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java index 9ab184067176c..17c5aff9af7b0 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java @@ -120,8 +120,7 @@ public class S3AInstrumentation implements Closeable, MetricsSource { private final MutableCounterLong streamBytesReadInClose; private final MutableCounterLong streamBytesDiscardedInAbort; private final MutableCounterLong ignoredErrors; - private final MutableQuantiles putLatencyQuantile; - private final MutableQuantiles throttleRateQuantile; + private final MutableCounterLong numberOfFilesCreated; private final MutableCounterLong numberOfFilesCopied; private final MutableCounterLong bytesOfFilesCopied; @@ -140,6 +139,7 @@ public class S3AInstrumentation implements Closeable, MetricsSource { INVOCATION_CREATE_NON_RECURSIVE, INVOCATION_DELETE, INVOCATION_EXISTS, + INVOCATION_GET_DELEGATION_TOKEN, INVOCATION_GET_FILE_CHECKSUM, INVOCATION_GET_FILE_STATUS, INVOCATION_GLOB_STATUS, @@ -160,7 +160,7 @@ public class S3AInstrumentation implements Closeable, MetricsSource { OBJECT_PUT_BYTES, OBJECT_PUT_REQUESTS, OBJECT_PUT_REQUESTS_COMPLETED, - STREAM_READ_VERSION_MISMATCHES, + OBJECT_SELECT_REQUESTS, STREAM_WRITE_FAILURES, STREAM_WRITE_BLOCK_UPLOADS, STREAM_WRITE_BLOCK_UPLOADS_COMMITTED, @@ -183,7 +183,8 @@ public class S3AInstrumentation implements Closeable, MetricsSource { S3GUARD_METADATASTORE_INITIALIZATION, S3GUARD_METADATASTORE_RETRY, S3GUARD_METADATASTORE_THROTTLED, - STORE_IO_THROTTLED + STORE_IO_THROTTLED, + DELEGATION_TOKENS_ISSUED }; private static final Statistic[] GAUGES_TO_CREATE = { @@ -237,9 +238,9 @@ public S3AInstrumentation(URI name) { } //todo need a config for the quantiles interval? int interval = 1; - putLatencyQuantile = quantiles(S3GUARD_METADATASTORE_PUT_PATH_LATENCY, + quantiles(S3GUARD_METADATASTORE_PUT_PATH_LATENCY, "ops", "latency", interval); - throttleRateQuantile = quantiles(S3GUARD_METADATASTORE_THROTTLE_RATE, + quantiles(S3GUARD_METADATASTORE_THROTTLE_RATE, "events", "frequency (Hz)", interval); registerAsMetricsSource(name); @@ -550,7 +551,7 @@ public void decrementGauge(Statistic op, long count) { * Create a stream input statistics instance. 
* @return the new instance */ - InputStreamStatistics newInputStreamStatistics() { + public InputStreamStatistics newInputStreamStatistics() { return new InputStreamStatistics(); } @@ -593,8 +594,6 @@ private void mergeInputStreamStatistics(InputStreamStatistics statistics) { streamReadsIncomplete.incr(statistics.readsIncomplete); streamBytesReadInClose.incr(statistics.bytesReadInClose); streamBytesDiscardedInAbort.incr(statistics.bytesDiscardedInAbort); - incrementCounter(STREAM_READ_VERSION_MISMATCHES, - statistics.versionMismatches.get()); } @Override @@ -604,8 +603,6 @@ public void getMetrics(MetricsCollector collector, boolean all) { public void close() { synchronized (metricsSystemLock) { - putLatencyQuantile.stop(); - throttleRateQuantile.stop(); metricsSystem.unregisterSource(metricsSourceName); int activeSources = --metricsSourceActiveCounter; if (activeSources == 0) { @@ -642,8 +639,6 @@ public final class InputStreamStatistics implements AutoCloseable { public long bytesDiscardedInAbort; public long policySetCount; public long inputPolicy; - /** This is atomic so that it can be passed as a reference. */ - private final AtomicLong versionMismatches = new AtomicLong(0); private InputStreamStatistics() { } @@ -768,14 +763,6 @@ public void inputPolicySet(int updatedPolicy) { inputPolicy = updatedPolicy; } - /** - * Get a reference to the version mismatch counter. - * @return a counter which can be incremented. - */ - public AtomicLong getVersionMismatchCounter() { - return versionMismatches; - } - /** * String operator describes all the current statistics. * Important: there are no guarantees as to the stability @@ -809,7 +796,6 @@ public String toString() { sb.append(", BytesDiscardedInAbort=").append(bytesDiscardedInAbort); sb.append(", InputPolicy=").append(inputPolicy); sb.append(", InputPolicySetCount=").append(policySetCount); - sb.append(", versionMismatches=").append(versionMismatches.get()); sb.append('}'); return sb.toString(); } @@ -1118,6 +1104,30 @@ public void jobCompleted(boolean success) { } /** + * Create a delegation token statistics instance. + * @return an instance of delegation token statistics + */ + public DelegationTokenStatistics newDelegationTokenStatistics() { + return new DelegationTokenStatistics(); + } + + /** + * Instrumentation exported to S3A Delegation Token support. + */ + @InterfaceAudience.Private + @InterfaceStability.Unstable + public final class DelegationTokenStatistics { + + private DelegationTokenStatistics() { + } + + /** A token has been issued. */ + public void tokenIssued() { + incrementCounter(DELEGATION_TOKENS_ISSUED, 1); + } + } + + /** * Copy all the metrics to a map of (name, long-value). 
* @return a map of the metrics */ diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AOpContext.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AOpContext.java index fba39b9a5f4b9..553d02fb76113 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AOpContext.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AOpContext.java @@ -84,4 +84,29 @@ public S3AOpContext(boolean isS3GuardEnabled, Invoker invoker, dstFileStatus); } + public boolean isS3GuardEnabled() { + return isS3GuardEnabled; + } + + public Invoker getInvoker() { + return invoker; + } + + @Nullable + public FileSystem.Statistics getStats() { + return stats; + } + + public S3AInstrumentation getInstrumentation() { + return instrumentation; + } + + @Nullable + public Invoker getS3guardInvoker() { + return s3guardInvoker; + } + + public FileStatus getDstFileStatus() { + return dstFileStatus; + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AReadOpContext.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AReadOpContext.java index 8c24f2b8617dd..73c219498f0e7 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AReadOpContext.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AReadOpContext.java @@ -21,7 +21,6 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy; import javax.annotation.Nullable; @@ -44,11 +43,6 @@ public class S3AReadOpContext extends S3AOpContext { */ private final S3AInputPolicy inputPolicy; - /** - * How to detect and deal with the object being updated during read. - */ - private final ChangeDetectionPolicy changeDetectionPolicy; - /** * Readahead for GET operations/skip, etc. */ @@ -65,7 +59,6 @@ public class S3AReadOpContext extends S3AOpContext { * @param dstFileStatus target file status * @param inputPolicy the input policy * @param readahead readahead for GET operations/skip, etc. - * @param changeDetectionPolicy change detection policy. */ public S3AReadOpContext( final Path path, @@ -76,21 +69,20 @@ public S3AReadOpContext( S3AInstrumentation instrumentation, FileStatus dstFileStatus, S3AInputPolicy inputPolicy, - ChangeDetectionPolicy changeDetectionPolicy, final long readahead) { super(isS3GuardEnabled, invoker, s3guardInvoker, stats, instrumentation, dstFileStatus); this.path = checkNotNull(path); Preconditions.checkArgument(readahead >= 0, - String.format("invalid readahead %d", readahead)); + "invalid readahead %d", readahead); this.inputPolicy = checkNotNull(inputPolicy); - this.changeDetectionPolicy = checkNotNull(changeDetectionPolicy); this.readahead = readahead; } /** - * Get invoker to use for read operations. When S3Guard is enabled we use - * the S3Guard invoker, which deals with things like FileNotFoundException + * Get invoker to use for read operations. + * When S3Guard is enabled we use the S3Guard invoker, + * which deals with things like FileNotFoundException * differently. * @return invoker to use for read codepaths */ @@ -118,13 +110,9 @@ public S3AInputPolicy getInputPolicy() { return inputPolicy; } - public ChangeDetectionPolicy getChangeDetectionPolicy() { - return changeDetectionPolicy; - } - /** * Get the readahead for this operation. 
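The readahead held by this read context is the value that S3AInputStream.setReadahead() now funnels through the new validateReadahead() helper shown earlier in this patch. A small sketch of that helper's contract; the values printed are illustrative only:

    import org.apache.hadoop.fs.s3a.Constants;
    import org.apache.hadoop.fs.s3a.S3AInputStream;

    public class ReadaheadCheck {
      public static void main(String[] args) {
        // null falls back to the default readahead range
        long fromNull = S3AInputStream.validateReadahead(null);
        System.out.println("default: " + Constants.DEFAULT_READAHEAD_RANGE
            + ", from null: " + fromNull);
        // any non-negative value is returned unchanged
        long explicit = S3AInputStream.validateReadahead(128 * 1024L);
        System.out.println("explicit: " + explicit);
        try {
          S3AInputStream.validateReadahead(-1L);
        } catch (IllegalArgumentException expected) {
          // message is S3AInputStream.E_NEGATIVE_READAHEAD_VALUE
          System.out.println(expected.getMessage());
        }
      }
    }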
- * @return a value {@literal >=} 0 + * @return a value >= 0 */ public long getReadahead() { return readahead; @@ -137,7 +125,6 @@ public String toString() { sb.append("path=").append(path); sb.append(", inputPolicy=").append(inputPolicy); sb.append(", readahead=").append(readahead); - sb.append(", changeDetectionPolicy=").append(changeDetectionPolicy); sb.append('}'); return sb.toString(); } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java index 65d5f1aec82b1..a00fb1a79c4e8 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java @@ -78,6 +78,7 @@ import java.util.Optional; import java.util.concurrent.ExecutionException; +import static org.apache.commons.lang3.StringUtils.isEmpty; import static org.apache.hadoop.fs.s3a.Constants.*; /** @@ -238,6 +239,12 @@ public static IOException translateException(@Nullable String operation, ioe.initCause(ase); break; + // method not allowed; seen on S3 Select. + // treated as a bad request + case 405: + ioe = new AWSBadRequestException(message, s3Exception); + break; + // out of range. This may happen if an object is overwritten with // a shorter one while it is being read. case 416: @@ -864,7 +871,7 @@ private static String getPassword(Configuration conf, String key, String val, String defVal) throws IOException { - return StringUtils.isEmpty(val) + return isEmpty(val) ? lookupPassword(conf, key, defVal) : val; } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java index d1bff8a054b6c..d67e3e1e8cbc6 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java @@ -18,19 +18,24 @@ package org.apache.hadoop.fs.s3a; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + /** * This class is only a holder for bucket, key, SSE Algorithm and SSE key - * attributes. It is only used in {@link S3AInputStream} + * attributes. It is used in {@link S3AInputStream} and the select equivalent. * as a way to reduce parameters being passed * to the constructor of such class. 
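Since the class becomes public and immutable here, other parts of the module (such as the select binding added later in this patch) can construct it directly. A minimal sketch with placeholder values; only the constructor and getters come from the patch:

    import org.apache.hadoop.fs.s3a.S3AEncryptionMethods;
    import org.apache.hadoop.fs.s3a.S3ObjectAttributes;

    public class AttributesExample {
      public static void main(String[] args) {
        // bucket, key and encryption settings are placeholders for the sketch
        S3ObjectAttributes attrs = new S3ObjectAttributes(
            "example-bucket",
            "data/cars.csv",
            S3AEncryptionMethods.NONE,
            null);
        System.out.println(attrs.getBucket() + "/" + attrs.getKey());
      }
    }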
*/ -class S3ObjectAttributes { - private String bucket; - private String key; - private S3AEncryptionMethods serverSideEncryptionAlgorithm; - private String serverSideEncryptionKey; +@InterfaceAudience.Private +@InterfaceStability.Evolving +public class S3ObjectAttributes { + private final String bucket; + private final String key; + private final S3AEncryptionMethods serverSideEncryptionAlgorithm; + private final String serverSideEncryptionKey; - S3ObjectAttributes( + public S3ObjectAttributes( String bucket, String key, S3AEncryptionMethods serverSideEncryptionAlgorithm, @@ -41,19 +46,19 @@ class S3ObjectAttributes { this.serverSideEncryptionKey = serverSideEncryptionKey; } - String getBucket() { + public String getBucket() { return bucket; } - String getKey() { + public String getKey() { return key; } - S3AEncryptionMethods getServerSideEncryptionAlgorithm() { + public S3AEncryptionMethods getServerSideEncryptionAlgorithm() { return serverSideEncryptionAlgorithm; } - String getServerSideEncryptionKey() { + public String getServerSideEncryptionKey() { return serverSideEncryptionKey; } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java index 9bb5583a6338e..919cad4f35d9e 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java @@ -100,6 +100,8 @@ public enum Statistic { OBJECT_PUT_BYTES("object_put_bytes", "number of bytes uploaded"), OBJECT_PUT_BYTES_PENDING("object_put_bytes_pending", "number of bytes queued for upload/being actively uploaded"), + OBJECT_SELECT_REQUESTS("object_select_requests", + "Count of S3 Select requests issued"), STREAM_ABORTED("stream_aborted", "Count of times the TCP stream was aborted"), STREAM_BACKWARD_SEEK_OPERATIONS("stream_backward_seek_operations", diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java index 93c701f2aad6b..e16f7229ac18c 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java @@ -26,6 +26,7 @@ import java.util.List; import java.util.concurrent.atomic.AtomicInteger; +import com.amazonaws.services.s3.model.AmazonS3Exception; import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest; import com.amazonaws.services.s3.model.CompleteMultipartUploadResult; import com.amazonaws.services.s3.model.InitiateMultipartUploadRequest; @@ -34,6 +35,8 @@ import com.amazonaws.services.s3.model.PartETag; import com.amazonaws.services.s3.model.PutObjectRequest; import com.amazonaws.services.s3.model.PutObjectResult; +import com.amazonaws.services.s3.model.SelectObjectContentRequest; +import com.amazonaws.services.s3.model.SelectObjectContentResult; import com.amazonaws.services.s3.model.UploadPartRequest; import com.amazonaws.services.s3.model.UploadPartResult; import com.amazonaws.services.s3.transfer.model.UploadResult; @@ -45,17 +48,19 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.fs.s3a.select.SelectBinding; import static 
com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; import static org.apache.hadoop.fs.s3a.Invoker.*; /** - * Helper for low-level operations against an S3 Bucket for writing data - * and creating and committing pending writes. + * Helper for low-level operations against an S3 Bucket for writing data, + * creating and committing pending writes, and other S3-layer operations. *

<p>
 * It hides direct access to the S3 API
- * and is a location where the object upload process can be evolved/enhanced.
+ * and is a location where the object operations can be evolved/enhanced.
 * <p>
 * Features
 * <ul>
@@ -65,8 +70,10 @@
 *   errors.
 *   <li>Callbacks to let the FS know of events in the output stream
 *   upload process.</li>
+ *   <li>Other low-level access to S3 functions, for private use.</li>
 *   <li>Failure handling, including converting exceptions to IOEs.</li>
 *   <li>Integration with instrumentation and S3Guard.</li>
+ *   <li>Evolution to add more low-level operations, such as S3 select.</li>
 * </ul>
    * * This API is for internal use only. @@ -76,9 +83,24 @@ public class WriteOperationHelper { private static final Logger LOG = LoggerFactory.getLogger(WriteOperationHelper.class); + + /** + * Owning filesystem. + */ private final S3AFileSystem owner; + + /** + * Invoker for operations; uses the S3A retry policy and calls int + * {@link #operationRetried(String, Exception, int, boolean)} on retries. + */ private final Invoker invoker; + /** Configuration of the owner. This is a reference, not a copy. */ + private final Configuration conf; + + /** Bucket of the owner FS. */ + private final String bucket; + /** * Constructor. * @param owner owner FS creating the helper @@ -89,6 +111,8 @@ protected WriteOperationHelper(S3AFileSystem owner, Configuration conf) { this.owner = owner; this.invoker = new Invoker(new S3ARetryPolicy(conf), this::operationRetried); + this.conf = conf; + bucket = owner.getBucket(); } /** @@ -189,7 +213,7 @@ public ObjectMetadata newObjectMetadata(long length) { public String initiateMultiPartUpload(String destKey) throws IOException { LOG.debug("Initiating Multipart upload to {}", destKey); final InitiateMultipartUploadRequest initiateMPURequest = - new InitiateMultipartUploadRequest(owner.getBucket(), + new InitiateMultipartUploadRequest(bucket, destKey, newObjectMetadata(-1)); initiateMPURequest.setCannedACL(owner.getCannedACL()); @@ -231,7 +255,7 @@ private CompleteMultipartUploadResult finalizeMultipartUpload( // attempt to sort an unmodifiable list. CompleteMultipartUploadResult result = owner.getAmazonS3Client().completeMultipartUpload( - new CompleteMultipartUploadRequest(owner.getBucket(), + new CompleteMultipartUploadRequest(bucket, destKey, uploadId, new ArrayList<>(partETags))); @@ -381,7 +405,7 @@ public UploadPartRequest newUploadPartRequest( LOG.debug("Creating part upload request for {} #{} size {}", uploadId, partNumber, size); UploadPartRequest request = new UploadPartRequest() - .withBucketName(owner.getBucket()) + .withBucketName(bucket) .withKey(destKey) .withUploadId(uploadId) .withPartNumber(partNumber) @@ -409,7 +433,7 @@ public UploadPartRequest newUploadPartRequest( @Override public String toString() { final StringBuilder sb = new StringBuilder( - "WriteOperationHelper {bucket=").append(owner.getBucket()); + "WriteOperationHelper {bucket=").append(bucket); sb.append('}'); return sb.toString(); } @@ -478,4 +502,71 @@ public UploadPartResult uploadPart(UploadPartRequest request) () -> owner.uploadPart(request)); } + /** + * Get the configuration of this instance; essentially the owning + * filesystem configuration. + * @return the configuration. + */ + public Configuration getConf() { + return conf; + } + + /** + * Create a S3 Select request for the destination path. + * This does not build the query. + * @param path pre-qualified path for query + * @return the request + */ + public SelectObjectContentRequest newSelectRequest(Path path) { + SelectObjectContentRequest request = new SelectObjectContentRequest(); + request.setBucketName(bucket); + request.setKey(owner.pathToKey(path)); + return request; + } + + /** + * Execute an S3 Select operation. + * On a failure, the request is only logged at debug to avoid the + * select exception being printed. + * @param source source for selection + * @param request Select request to issue. 
+ * @param action the action for use in exception creation + * @return response + * @throws IOException failure + */ + @Retries.RetryTranslated + public SelectObjectContentResult select( + final Path source, + final SelectObjectContentRequest request, + final String action) + throws IOException { + String bucketName = request.getBucketName(); + Preconditions.checkArgument(bucket.equals(bucketName), + "wrong bucket: %s", bucketName); + if (LOG.isDebugEnabled()) { + LOG.debug("Initiating select call {} {}", + source, request.getExpression()); + LOG.debug(SelectBinding.toString(request)); + } + return invoker.retry( + action, + source.toString(), + true, + () -> { + try (DurationInfo ignored = + new DurationInfo(LOG, "S3 Select operation")) { + try { + return owner.getAmazonS3Client().selectObjectContent(request); + } catch (AmazonS3Exception e) { + LOG.error("Failure of S3 Select request against {}", + source); + LOG.debug("S3 Select request against {}:\n{}", + source, + SelectBinding.toString(request), + e); + throw e; + } + } + }); + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java index 4c88dd5eb6d58..e4fd06436ae9a 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java @@ -18,7 +18,6 @@ package org.apache.hadoop.fs.s3a.s3guard; - import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintStream; @@ -33,7 +32,6 @@ import java.util.Scanner; import java.util.Set; import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; import com.amazonaws.services.s3.model.MultipartUpload; import com.google.common.annotations.VisibleForTesting; @@ -42,8 +40,6 @@ import org.slf4j.LoggerFactory; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileStatus; @@ -55,10 +51,11 @@ import org.apache.hadoop.fs.s3a.S3AFileStatus; import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.S3AUtils; +import org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens; import org.apache.hadoop.fs.s3a.commit.CommitConstants; -import org.apache.hadoop.fs.s3a.impl.DirectoryPolicy; -import org.apache.hadoop.fs.s3a.impl.DirectoryPolicyImpl; +import org.apache.hadoop.fs.s3a.select.SelectTool; import org.apache.hadoop.fs.shell.CommandFormat; +import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.Tool; @@ -71,9 +68,8 @@ /** * CLI to manage S3Guard Metadata Store. 
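S3GuardTool subcommands are normally run from the shell as "hadoop s3guard <command> ..."; with this patch the command table also routes the new SelectTool subcommand. A sketch of driving the tool programmatically; the "bucket-info" subcommand name and the bucket URI are assumptions for the example, only S3GuardTool.run() comes from the patch:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.s3a.s3guard.S3GuardTool;

    public class BucketInfoProbe {
      public static void main(String[] args) throws Exception {
        // Roughly equivalent to "hadoop s3guard bucket-info s3a://example-bucket/"
        int exit = S3GuardTool.run(new Configuration(),
            "bucket-info", "s3a://example-bucket/");
        System.out.println("exit code: " + exit);
      }
    }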
*/ -@InterfaceAudience.LimitedPrivate("management tools") -@InterfaceStability.Evolving public abstract class S3GuardTool extends Configured implements Tool { + private static final Logger LOG = LoggerFactory.getLogger(S3GuardTool.class); private static final String NAME = "s3guard"; @@ -95,11 +91,15 @@ public abstract class S3GuardTool extends Configured implements Tool { "\t" + Uploads.NAME + " - " + Uploads.PURPOSE + "\n" + "\t" + Diff.NAME + " - " + Diff.PURPOSE + "\n" + "\t" + Prune.NAME + " - " + Prune.PURPOSE + "\n" + - "\t" + SetCapacity.NAME + " - " +SetCapacity.PURPOSE + "\n"; + "\t" + SetCapacity.NAME + " - " + SetCapacity.PURPOSE + "\n" + + "\t" + SelectTool.NAME + " - " + SelectTool.PURPOSE + "\n"; private static final String DATA_IN_S3_IS_PRESERVED = "(all data in S3 is preserved)"; - abstract public String getUsage(); + public static final String E_NO_METASTORE_OR_FILESYSTEM + = "No metastore or filesystem specified"; + + public abstract String getUsage(); // Exit codes static final int SUCCESS = EXIT_SUCCESS; @@ -146,19 +146,19 @@ protected S3GuardTool(Configuration conf, String...opts) { /** * Return sub-command name. */ - abstract String getName(); + public abstract String getName(); /** * Parse DynamoDB region from either -m option or a S3 path. * - * This function should only be called from {@link Init} or - * {@link Destroy}. + * This function should only be called from {@link S3GuardTool.Init} or + * {@link S3GuardTool.Destroy}. * * @param paths remaining parameters from CLI. * @throws IOException on I/O errors. * @throws ExitUtil.ExitException on validation errors */ - void parseDynamoDBRegion(List paths) throws IOException { + protected void parseDynamoDBRegion(List paths) throws IOException { Configuration conf = getConf(); String fromCli = getCommandFormat().getOptValue(REGION_FLAG); String fromConf = conf.get(S3GUARD_DDB_REGION_KEY); @@ -226,22 +226,58 @@ protected void addAgeOptions() { format.addOptionWithValue(SECONDS_FLAG); } + protected void checkIfS3BucketIsGuarded(List paths) + throws IOException { + // be sure that path is provided in params, so there's no IOoBE + String s3Path = ""; + if(!paths.isEmpty()) { + s3Path = paths.get(0); + } + + // Check if DynamoDB url is set from arguments. + String metadataStoreUri = getCommandFormat().getOptValue(META_FLAG); + if(metadataStoreUri == null || metadataStoreUri.isEmpty()) { + // If not set, check if filesystem is guarded by creating an + // S3AFileSystem and check if hasMetadataStore is true + try (S3AFileSystem s3AFileSystem = (S3AFileSystem) + S3AFileSystem.newInstance(toUri(s3Path), getConf())){ + Preconditions.checkState(s3AFileSystem.hasMetadataStore(), + "The S3 bucket is unguarded. " + getName() + + " can not be used on an unguarded bucket."); + } + } + } + + /** + * Check if bucket or DDB table name is set. + */ + protected void checkBucketNameOrDDBTableNameProvided(List paths) { + String s3Path = null; + if(!paths.isEmpty()) { + s3Path = paths.get(0); + } + + String metadataStoreUri = getCommandFormat().getOptValue(META_FLAG); + + if(metadataStoreUri == null && s3Path == null) { + throw invalidArgs("S3 bucket url or DDB table name have to be provided " + + "explicitly to use " + getName() + " command."); + } + } + /** * Parse metadata store from command line option or HDFS configuration. * * @param forceCreate override the auto-creation setting to true. * @return a initialized metadata store. 
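A consequence of the new checkBucketNameOrDDBTableNameProvided() and initMetadataStore() checks below: init and destroy now fail fast when neither -meta nor an S3 bucket is supplied, rather than silently binding to a shared DynamoDB table (HADOOP-15843). A sketch of that failure mode, assuming the subcommand name "init":

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.s3a.s3guard.S3GuardTool;
    import org.apache.hadoop.util.ExitUtil;

    public class InitWithoutTarget {
      public static void main(String[] args) throws Exception {
        try {
          // no -meta option and no s3a:// path: rejected after this patch
          S3GuardTool.run(new Configuration(), "init");
        } catch (ExitUtil.ExitException e) {
          // usage error explaining that a bucket or DDB table must be provided
          System.out.println(e.getExitCode() + ": " + e.getMessage());
        }
      }
    }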
*/ - MetadataStore initMetadataStore(boolean forceCreate) throws IOException { + protected MetadataStore initMetadataStore(boolean forceCreate) + throws IOException { if (getStore() != null) { return getStore(); } - Configuration conf; - if (filesystem == null) { - conf = getConf(); - } else { - conf = filesystem.getConf(); - } + final boolean hasFileSystem = filesystem != null; + final Configuration conf = hasFileSystem ? filesystem.getConf() : getConf(); String metaURI = getCommandFormat().getOptValue(META_FLAG); if (metaURI != null && !metaURI.isEmpty()) { URI uri = URI.create(metaURI); @@ -263,6 +299,13 @@ MetadataStore initMetadataStore(boolean forceCreate) throws IOException { String.format("Metadata store %s is not supported", uri)); } } else { + if (!hasFileSystem) { + // command didn't declare a metadata store URI or a bucket. + // to avoid problems related to picking up a shared table for actions + // line init and destroy (HADOOP-15843), this is rejected + printHelp(this); + throw usageError(E_NO_METASTORE_OR_FILESYSTEM); + } // CLI does not specify metadata store URI, it uses default metadata store // DynamoDB instead. setStore(new DynamoDBMetadataStore()); @@ -271,10 +314,10 @@ MetadataStore initMetadataStore(boolean forceCreate) throws IOException { } } - if (filesystem == null) { - getStore().initialize(conf); - } else { + if (hasFileSystem) { getStore().initialize(filesystem); + } else { + getStore().initialize(conf); } LOG.info("Metadata store {} is initialized.", getStore()); return getStore(); @@ -294,7 +337,7 @@ MetadataStore initMetadataStore(boolean forceCreate) throws IOException { * @throws IOException failure to init filesystem * @throws ExitUtil.ExitException if the FS is not an S3A FS */ - void initS3AFileSystem(String path) throws IOException { + protected void initS3AFileSystem(String path) throws IOException { URI uri = toUri(path); // Make sure that S3AFileSystem does not hold an actual MetadataStore // implementation. @@ -309,8 +352,8 @@ void initS3AFileSystem(String path) throws IOException { S3_METADATA_STORE_IMPL); LOG.debug("updated bucket store option {}", updatedBucketOption); Preconditions.checkState(S3GUARD_METASTORE_NULL.equals(updatedBucketOption), - String.format("Expected bucket option to be %s but was %s", - S3GUARD_METASTORE_NULL, updatedBucketOption)); + "Expected bucket option to be %s but was %s", + S3GUARD_METASTORE_NULL, updatedBucketOption); FileSystem fs = FileSystem.newInstance(uri, conf); if (!(fs instanceof S3AFileSystem)) { @@ -327,7 +370,7 @@ void initS3AFileSystem(String path) throws IOException { * @param args command line arguments. * @return the position arguments from CLI. */ - List parseArgs(String[] args) { + protected List parseArgs(String[] args) { return getCommandFormat().parse(args, 1); } @@ -364,16 +407,16 @@ public final int run(String[] args) throws Exception { * * As well as returning an exit code, the implementations can choose to * throw an instance of {@link ExitUtil.ExitException} with their exit - * code set to the desired exit value. The exit code of auch an exception + * code set to the desired exit value. The exit code of such an exception * is used for the tool's exit code, and the stack trace only logged at * debug. * @param args argument list * @param out output stream * @return the exit code to return. 
* @throws Exception on any failure - * @throws ExitUtil.ExitException for an alternative clean exit */ - public abstract int run(String[] args, PrintStream out) throws Exception; + public abstract int run(String[] args, PrintStream out) throws Exception, + ExitUtil.ExitException; /** * Create the metadata store. @@ -408,7 +451,7 @@ static class Init extends S3GuardTool { } @Override - String getName() { + public String getName() { return NAME; } @@ -420,6 +463,12 @@ public String getUsage() { @Override public int run(String[] args, PrintStream out) throws Exception { List paths = parseArgs(args); + try { + checkBucketNameOrDDBTableNameProvided(paths); + } catch (ExitUtil.ExitException e) { + errorln(USAGE); + throw e; + } String readCap = getCommandFormat().getOptValue(READ_FLAG); if (readCap != null && !readCap.isEmpty()) { @@ -495,7 +544,7 @@ static class SetCapacity extends S3GuardTool { } @Override - String getName() { + public String getName() { return NAME; } @@ -508,20 +557,7 @@ public String getUsage() { public int run(String[] args, PrintStream out) throws Exception { List paths = parseArgs(args); Map options = new HashMap<>(); - String s3Path = paths.get(0); - - // Check if DynamoDB url is set from arguments. - String metadataStoreUri = getCommandFormat().getOptValue(META_FLAG); - if(metadataStoreUri == null || metadataStoreUri.isEmpty()) { - // If not set, check if filesystem is guarded by creating an - // S3AFileSystem and check if hasMetadataStore is true - try (S3AFileSystem s3AFileSystem = (S3AFileSystem) - S3AFileSystem.newInstance(toUri(s3Path), getConf())){ - Preconditions.checkState(s3AFileSystem.hasMetadataStore(), - "The S3 bucket is unguarded. " + getName() - + " can not be used on an unguarded bucket."); - } - } + checkIfS3BucketIsGuarded(paths); String readCap = getCommandFormat().getOptValue(READ_FLAG); if (StringUtils.isNotEmpty(readCap)) { @@ -580,7 +616,7 @@ static class Destroy extends S3GuardTool { } @Override - String getName() { + public String getName() { return NAME; } @@ -592,6 +628,8 @@ public String getUsage() { public int run(String[] args, PrintStream out) throws Exception { List paths = parseArgs(args); try { + checkBucketNameOrDDBTableNameProvided(paths); + checkIfS3BucketIsGuarded(paths); parseDynamoDBRegion(paths); } catch (ExitUtil.ExitException e) { errorln(USAGE); @@ -643,7 +681,7 @@ static class Import extends S3GuardTool { } @Override - String getName() { + public String getName() { return NAME; } @@ -775,7 +813,7 @@ static class Diff extends S3GuardTool { } @Override - String getName() { + public String getName() { return NAME; } @@ -983,7 +1021,7 @@ void setMetadataStore(MetadataStore ms) { } @Override - String getName() { + public String getName() { return NAME; } @@ -1052,8 +1090,6 @@ static class BucketInfo extends S3GuardTool { public static final String NONAUTH_FLAG = "nonauth"; public static final String ENCRYPTION_FLAG = "encryption"; public static final String MAGIC_FLAG = "magic"; - public static final String MARKERS_FLAG = "markers"; - public static final String MARKERS_AWARE = "aware"; public static final String PURPOSE = "provide/check S3Guard information" + " about a specific bucket"; @@ -1061,31 +1097,21 @@ static class BucketInfo extends S3GuardTool { + "\t" + PURPOSE + "\n\n" + "Common options:\n" + " -" + GUARDED_FLAG + " - Require S3Guard\n" - + " -" + UNGUARDED_FLAG + " - Require S3Guard to be disabled\n" + + " -" + UNGUARDED_FLAG + " - Force S3Guard to be disabled\n" + " -" + AUTH_FLAG + " - Require the S3Guard mode to 
be \"authoritative\"\n" + " -" + NONAUTH_FLAG + " - Require the S3Guard mode to be \"non-authoritative\"\n" + " -" + MAGIC_FLAG + " - Require the S3 filesystem to be support the \"magic\" committer\n" + " -" + ENCRYPTION_FLAG - + " (none, sse-s3, sse-kms) - Require encryption policy\n" - + " -" + MARKERS_FLAG - + " (aware, keep, delete, authoritative) - directory markers policy\n"; + + " -require {none, sse-s3, sse-kms} - Require encryption policy"; - @VisibleForTesting - public static final String IS_MARKER_AWARE = - "The S3A connector can read data in S3 buckets where" - + " directory markers%n" - + "are not deleted (optional with later hadoop releases),%n" - + "and with buckets where they are.%n"; - - public BucketInfo(Configuration conf) { + BucketInfo(Configuration conf) { super(conf, GUARDED_FLAG, UNGUARDED_FLAG, AUTH_FLAG, NONAUTH_FLAG, MAGIC_FLAG); CommandFormat format = getCommandFormat(); format.addOptionWithValue(ENCRYPTION_FLAG); - format.addOptionWithValue(MARKERS_FLAG); } @Override - String getName() { + public String getName() { return NAME; } @@ -1102,6 +1128,15 @@ public int run(String[] args, PrintStream out) throw invalidArgs("No bucket specified"); } String s3Path = paths.get(0); + CommandFormat commands = getCommandFormat(); + + // check if UNGUARDED_FLAG is passed and use NullMetadataStore in + // config to avoid side effects like creating the table if not exists + if (commands.getOpt(UNGUARDED_FLAG)) { + LOG.debug("Unguarded flag is passed to command :" + this.getName()); + getConf().set(S3_METADATA_STORE_IMPL, S3GUARD_METASTORE_NULL); + } + S3AFileSystem fs = (S3AFileSystem) FileSystem.newInstance( toUri(s3Path), getConf()); setFilesystem(fs); @@ -1122,8 +1157,7 @@ public int run(String[] args, PrintStream out) } else { println(out, "Filesystem %s is not using S3Guard", fsUri); } - boolean magic = fs.hasPathCapability( - new Path(s3Path), + boolean magic = fs.hasCapability( CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER); println(out, "The \"magic\" committer %s supported", magic ? "is" : "is not"); @@ -1139,7 +1173,23 @@ public int run(String[] args, PrintStream out) "none"); printOption(out, "\tInput seek policy", INPUT_FADVISE, INPUT_FADV_NORMAL); - CommandFormat commands = getCommandFormat(); + // look at delegation token support + if (fs.getDelegationTokens().isPresent()) { + // DT is enabled + S3ADelegationTokens dtIntegration = fs.getDelegationTokens().get(); + println(out, "Delegation Support enabled: token kind = %s", + dtIntegration.getTokenKind()); + UserGroupInformation.AuthenticationMethod authenticationMethod + = UserGroupInformation.getCurrentUser().getAuthenticationMethod(); + println(out, "Hadoop security mode: %s", authenticationMethod); + if (UserGroupInformation.isSecurityEnabled()) { + println(out, + "Warning: security is disabled; tokens will not be collected"); + } + } else { + println(out, "Delegation token support is disabled"); + } + if (usingS3Guard) { if (commands.getOpt(UNGUARDED_FLAG)) { throw badState("S3Guard is enabled for %s", fsUri); @@ -1170,57 +1220,10 @@ public int run(String[] args, PrintStream out) fsUri, desiredEncryption, encryption); } - // directory markers - processMarkerOption(out, fs, - getCommandFormat().getOptValue(MARKERS_FLAG)); - - // and finally flush the output and report a success. out.flush(); return SUCCESS; } - /** - * Validate the marker options. - * @param out output stream - * @param fs filesystem - * @param path test path - * @param marker desired marker option -may be null. 
- */ - private void processMarkerOption(final PrintStream out, - final S3AFileSystem fs, - final String marker) { - DirectoryPolicy markerPolicy = fs.getDirectoryMarkerPolicy(); - String desc = markerPolicy.describe(); - println(out, "%nThe directory marker policy is \"%s\"%n", desc); - - DirectoryPolicy.MarkerPolicy mp = markerPolicy.getMarkerPolicy(); - - String desiredMarker = marker == null - ? "" - : marker.trim(); - final String optionName = mp.getOptionName(); - if (!desiredMarker.isEmpty()) { - if (MARKERS_AWARE.equalsIgnoreCase(desiredMarker)) { - // simple awareness test -provides a way to validate compatibility - // on the command line - println(out, IS_MARKER_AWARE); - String pols = DirectoryPolicyImpl.availablePolicies() - .stream() - .map(DirectoryPolicy.MarkerPolicy::getOptionName) - .collect(Collectors.joining(", ")); - println(out, "Available Policies: %s", pols); - - } else { - // compare with current policy - if (!optionName.equalsIgnoreCase(desiredMarker)) { - throw badState("Bucket %s: required marker policy is \"%s\"" - + " but actual policy is \"%s\"", - fs.getUri(), desiredMarker, optionName); - } - } - } - } - private String printOption(PrintStream out, String description, String key, String defVal) { String t = getFilesystem().getConf().getTrimmed(key, defVal); @@ -1290,7 +1293,7 @@ private enum Mode { LIST, EXPECT, ABORT }; } @Override - String getName() { + public String getName() { return NAME; } @@ -1457,23 +1460,23 @@ protected static URI toUri(String s3Path) { return uri; } - private static void printHelp() { - if (command == null) { + protected static void printHelp(S3GuardTool tool) { + if (tool == null) { errorln("Usage: hadoop " + USAGE); errorln("\tperform S3Guard metadata store " + "administrative commands."); } else { - errorln("Usage: hadoop " + command.getUsage()); + errorln("Usage: hadoop " + tool.getUsage()); } errorln(); errorln(COMMON_USAGE); } - private static void errorln() { + protected static void errorln() { System.err.println(); } - private static void errorln(String x) { + protected static void errorln(String x) { System.err.println(x); } @@ -1483,7 +1486,9 @@ private static void errorln(String x) { * @param format format string * @param args optional arguments */ - private static void println(PrintStream out, String format, Object... args) { + protected static void println(PrintStream out, + String format, + Object... args) { out.println(String.format(format, args)); } @@ -1503,7 +1508,6 @@ protected static void printStoreDiagnostics(PrintStream out, } } - /** * Handle store not found by converting to an exit exception * with specific error code. 
@@ -1524,8 +1528,7 @@ protected static ExitUtil.ExitException storeNotFound( */ protected static ExitUtil.ExitException invalidArgs( String format, Object...args) { - return new ExitUtil.ExitException(INVALID_ARGUMENT, - String.format(format, args)); + return exitException(INVALID_ARGUMENT, format, args); } /** @@ -1536,8 +1539,8 @@ protected static ExitUtil.ExitException invalidArgs( */ protected static ExitUtil.ExitException badState( String format, Object...args) { - return new ExitUtil.ExitException(E_BAD_STATE, - String.format(format, args)); + int exitCode = E_BAD_STATE; + return exitException(exitCode, format, args); } /** @@ -1548,7 +1551,34 @@ protected static ExitUtil.ExitException badState( */ protected static ExitUtil.ExitException userAborted( String format, Object...args) { - return new ExitUtil.ExitException(ERROR, String.format(format, args)); + return exitException(ERROR, format, args); + } + + /** + * Build a exception to throw with a formatted message. + * @param exitCode exit code to use + * @param format string format + * @param args optional arguments for the string + * @return a new exception to throw + */ + protected static ExitUtil.ExitException exitException( + final int exitCode, + final String format, + final Object... args) { + return new ExitUtil.ExitException(exitCode, + String.format(format, args)); + } + + + /** + * Build the exception to raise on a usage error + * @param format string format + * @param args optional arguments for the string + * @return a new exception to throw + */ + protected static ExitUtil.ExitException usageError( + String format, Object...args) { + return new ExitUtil.ExitException(E_USAGE, String.format(format, args)); } /** @@ -1566,8 +1596,8 @@ public static int run(Configuration conf, String...args) throws String[] otherArgs = new GenericOptionsParser(conf, args) .getRemainingArgs(); if (otherArgs.length == 0) { - printHelp(); - throw new ExitUtil.ExitException(E_USAGE, "No arguments provided"); + printHelp(null); + throw usageError("No arguments provided"); } final String subCommand = otherArgs[0]; LOG.debug("Executing command {}", subCommand); @@ -1596,8 +1626,13 @@ public static int run(Configuration conf, String...args) throws case Uploads.NAME: command = new Uploads(conf); break; + case SelectTool.NAME: + // the select tool is not technically a S3Guard tool, but it's on the CLI + // because this is the defacto S3 CLI. + command = new SelectTool(conf); + break; default: - printHelp(); + printHelp(null); throw new ExitUtil.ExitException(E_USAGE, "Unknown command " + subCommand); } @@ -1614,11 +1649,17 @@ public static void main(String[] args) { exit(ret, ""); } catch (CommandFormat.UnknownOptionException e) { errorln(e.getMessage()); - printHelp(); + printHelp(command); exit(E_USAGE, e.getMessage()); } catch (ExitUtil.ExitException e) { // explicitly raised exit code exit(e.getExitCode(), e.toString()); + } catch (FileNotFoundException e) { + // bucket doesn't exist or similar. 
+ // skip the stack trace and choose the return code of 44, "404" + errorln(e.toString()); + LOG.debug("Not found:", e); + exit(EXIT_NOT_FOUND, e.toString()); } catch (Throwable e) { e.printStackTrace(System.err); exit(ERROR, e.toString()); diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/InternalSelectConstants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/InternalSelectConstants.java new file mode 100644 index 0000000000000..ae3dc0816d6f4 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/InternalSelectConstants.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.fs.s3a.InternalConstants; + +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; + +/** + * Constants for internal use in the org.apache.hadoop.fs.s3a module itself. + * Please don't refer to these outside of this module & its tests. + * If you find you need to then either the code is doing something it + * should not, or these constants need to be uprated to being + * public and stable entries. + */ +@InterfaceAudience.Private +public final class InternalSelectConstants { + + private InternalSelectConstants() { + } + + /** + * An unmodifiable set listing the options + * supported in {@code openFile()}. + */ + public static final Set SELECT_OPTIONS; + + /* + * Build up the options, pulling in the standard set too. + */ + static { + // when adding to this, please keep in alphabetical order after the + // common options and the SQL. 
+ HashSet options = new HashSet<>(Arrays.asList( + SELECT_SQL, + SELECT_ERRORS_INCLUDE_SQL, + SELECT_INPUT_COMPRESSION, + SELECT_INPUT_FORMAT, + SELECT_OUTPUT_FORMAT, + CSV_INPUT_COMMENT_MARKER, + CSV_INPUT_HEADER, + CSV_INPUT_INPUT_FIELD_DELIMITER, + CSV_INPUT_QUOTE_CHARACTER, + CSV_INPUT_QUOTE_ESCAPE_CHARACTER, + CSV_INPUT_RECORD_DELIMITER, + CSV_OUTPUT_FIELD_DELIMITER, + CSV_OUTPUT_QUOTE_CHARACTER, + CSV_OUTPUT_QUOTE_ESCAPE_CHARACTER, + CSV_OUTPUT_QUOTE_FIELDS, + CSV_OUTPUT_RECORD_DELIMITER + )); + options.addAll(InternalConstants.STANDARD_OPENFILE_KEYS); + SELECT_OPTIONS = Collections.unmodifiableSet(options); + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectBinding.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectBinding.java new file mode 100644 index 0000000000000..ff39b9ad958e6 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectBinding.java @@ -0,0 +1,431 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.IOException; +import java.util.Locale; +import java.util.Optional; + +import com.amazonaws.services.s3.model.CSVInput; +import com.amazonaws.services.s3.model.CSVOutput; +import com.amazonaws.services.s3.model.ExpressionType; +import com.amazonaws.services.s3.model.InputSerialization; +import com.amazonaws.services.s3.model.OutputSerialization; +import com.amazonaws.services.s3.model.QuoteFields; +import com.amazonaws.services.s3.model.SSECustomerKey; +import com.amazonaws.services.s3.model.SelectObjectContentRequest; +import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathIOException; +import org.apache.hadoop.fs.s3a.Retries; +import org.apache.hadoop.fs.s3a.S3AReadOpContext; +import org.apache.hadoop.fs.s3a.S3ObjectAttributes; +import org.apache.hadoop.fs.s3a.WriteOperationHelper; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.commons.lang3.StringUtils.isNotEmpty; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; + +/** + * Class to do the S3 select binding and build a select request from the + * supplied arguments/configuration. + * + * This class is intended to be instantiated by the owning S3AFileSystem + * instance to handle the construction of requests: IO is still done exclusively + * in the filesystem. 
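SelectBinding, defined next, sits behind the openFile() builder path added earlier in this patch: callers supply the SQL statement as an option and read the query results from the returned stream. A sketch under that assumption; the literal option keys, the sample object and the query are illustrative, only the builder API and the existence of the select options come from the patch:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class SelectExample {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // placeholder object: any CSV file in an S3 Select-enabled bucket
        Path csv = new Path("s3a://example-bucket/data/cars.csv");
        FileSystem fs = csv.getFileSystem(conf);
        try (FSDataInputStream in = fs.openFile(csv)
            .must("fs.s3a.select.sql",
                "SELECT s._1 FROM S3OBJECT s")
            .opt("fs.s3a.select.input.csv.header", "ignore")
            .build()
            .get()) {
          // the stream carries the query results, not the raw object
          byte[] buffer = new byte[8192];
          int read;
          while ((read = in.read(buffer)) != -1) {
            System.out.write(buffer, 0, read);
          }
          System.out.flush();
        }
      }
    }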
+ */ +public class SelectBinding { + + static final Logger LOG = + LoggerFactory.getLogger(SelectBinding.class); + + /** Operations on the store. */ + private final WriteOperationHelper operations; + + /** Is S3 Select enabled? */ + private final boolean enabled; + private final boolean errorsIncludeSql; + + /** + * Constructor. + * @param operations owning FS. + */ + public SelectBinding(final WriteOperationHelper operations) { + this.operations = checkNotNull(operations); + Configuration conf = getConf(); + this.enabled = conf.getBoolean(FS_S3A_SELECT_ENABLED, true); + this.errorsIncludeSql = conf.getBoolean(SELECT_ERRORS_INCLUDE_SQL, false); + } + + Configuration getConf() { + return operations.getConf(); + } + + /** + * Is the service supported? + * @return true iff select is enabled. + */ + public boolean isEnabled() { + return enabled; + } + + /** + * Build and execute a select request. + * @param readContext the read context, which includes the source path. + * @param expression the SQL expression. + * @param builderOptions query options + * @param sseKey optional SSE customer key + * @param objectAttributes object attributes from a HEAD request + * @return an FSDataInputStream whose wrapped stream is a SelectInputStream + * @throws IllegalArgumentException argument failure + * @throws IOException failure building, validating or executing the request. + * @throws PathIOException source path is a directory. + */ + @Retries.RetryTranslated + public FSDataInputStream select( + final S3AReadOpContext readContext, + final String expression, + final Configuration builderOptions, + final Optional sseKey, + final S3ObjectAttributes objectAttributes) throws IOException { + + return new FSDataInputStream( + executeSelect(readContext, + objectAttributes, + builderOptions, + buildSelectRequest( + readContext.getPath(), + expression, + builderOptions, + sseKey))); + } + + /** + * Build a select request. + * @param path source path. + * @param expression the SQL expression. + * @param builderOptions config to extract other query options from + * @param sseKey optional SSE customer key + * @return the request to serve + * @throws IllegalArgumentException argument failure + * @throws IOException problem building/validating the request + */ + public SelectObjectContentRequest buildSelectRequest( + final Path path, + final String expression, + final Configuration builderOptions, + final Optional sseKey) + throws IOException { + Preconditions.checkState(isEnabled(), + "S3 Select is not enabled for %s", path); + + SelectObjectContentRequest request = operations.newSelectRequest(path); + buildRequest(request, expression, builderOptions); + // optionally set an SSE key in the input + sseKey.ifPresent(request::withSSECustomerKey); + return request; + } + + /** + * Execute the select request. + * @param readContext read context + * @param objectAttributes object attributes from a HEAD request + * @param builderOptions the options which came in from the openFile builder. + * @param request the built up select request. + * @return a SelectInputStream + * @throws IOException failure + * @throws PathIOException source path is a directory. 
+ */ + @Retries.RetryTranslated + private SelectInputStream executeSelect( + final S3AReadOpContext readContext, + final S3ObjectAttributes objectAttributes, + final Configuration builderOptions, + final SelectObjectContentRequest request) throws IOException { + + Path path = readContext.getPath(); + if (readContext.getDstFileStatus().isDirectory()) { + throw new PathIOException(path.toString(), + "Can't select " + path + + " because it is a directory"); + } + boolean sqlInErrors = builderOptions.getBoolean(SELECT_ERRORS_INCLUDE_SQL, + errorsIncludeSql); + String expression = request.getExpression(); + final String errorText = sqlInErrors ? expression : "Select"; + if (sqlInErrors) { + LOG.info("Issuing SQL request {}", expression); + } + return new SelectInputStream(readContext, + objectAttributes, + operations.select(path, request, errorText)); + } + + /** + * Build the select request from the configuration built up + * in {@code S3AFileSystem.openFile(Path)} and the default + * options in the cluster configuration. + * + * Options are picked up in the following order. + *
      + *
+ * <ol>
+ *   <li>Options in {@code openFileOptions}.</li>
+ *   <li>Options in the owning filesystem configuration.</li>
+ *   <li>The default values in {@link SelectConstants}.</li>
+ * </ol>
    + * + * @param request request to build up + * @param expression SQL expression + * @param builderOptions the options which came in from the openFile builder. + * @throws IllegalArgumentException if an option is somehow invalid. + * @throws IOException if an option is somehow invalid. + */ + void buildRequest( + final SelectObjectContentRequest request, + final String expression, + final Configuration builderOptions) + throws IllegalArgumentException, IOException { + Preconditions.checkArgument(StringUtils.isNotEmpty(expression), + "No expression provided in parameter " + SELECT_SQL); + + final Configuration ownerConf = operations.getConf(); + + + String inputFormat = builderOptions.get(SELECT_INPUT_FORMAT, + SELECT_FORMAT_CSV).toLowerCase(Locale.ENGLISH); + Preconditions.checkArgument(SELECT_FORMAT_CSV.equals(inputFormat), + "Unsupported input format %s", inputFormat); + String outputFormat = builderOptions.get(SELECT_OUTPUT_FORMAT, + SELECT_FORMAT_CSV) + .toLowerCase(Locale.ENGLISH); + Preconditions.checkArgument(SELECT_FORMAT_CSV.equals(outputFormat), + "Unsupported output format %s", outputFormat); + + request.setExpressionType(ExpressionType.SQL); + request.setExpression(expandBackslashChars(expression)); + + InputSerialization inputSerialization = buildCsvInputRequest(ownerConf, + builderOptions); + String compression = opt(builderOptions, + ownerConf, + SELECT_INPUT_COMPRESSION, + COMPRESSION_OPT_NONE, + true).toUpperCase(Locale.ENGLISH); + if (isNotEmpty(compression)) { + inputSerialization.setCompressionType(compression); + } + request.setInputSerialization(inputSerialization); + + request.setOutputSerialization(buildCSVOutput(ownerConf, builderOptions)); + + } + + /** + * Build the CSV input request. + * @param ownerConf FS owner configuration + * @param builderOptions options on the specific request + * @return the constructed request + * @throws IllegalArgumentException argument failure + * @throws IOException validation failure + */ + public InputSerialization buildCsvInputRequest( + final Configuration ownerConf, + final Configuration builderOptions) + throws IllegalArgumentException, IOException { + + String headerInfo = opt(builderOptions, + ownerConf, + CSV_INPUT_HEADER, + CSV_INPUT_HEADER_OPT_DEFAULT, + true).toUpperCase(Locale.ENGLISH); + String commentMarker = xopt(builderOptions, + ownerConf, + CSV_INPUT_COMMENT_MARKER, + CSV_INPUT_COMMENT_MARKER_DEFAULT); + String fieldDelimiter = xopt(builderOptions, + ownerConf, + CSV_INPUT_INPUT_FIELD_DELIMITER, + CSV_INPUT_FIELD_DELIMITER_DEFAULT); + String recordDelimiter = xopt(builderOptions, + ownerConf, + CSV_INPUT_RECORD_DELIMITER, + CSV_INPUT_RECORD_DELIMITER_DEFAULT); + String quoteCharacter = xopt(builderOptions, + ownerConf, + CSV_INPUT_QUOTE_CHARACTER, + CSV_INPUT_QUOTE_CHARACTER_DEFAULT); + String quoteEscapeCharacter = xopt(builderOptions, + ownerConf, + CSV_INPUT_QUOTE_ESCAPE_CHARACTER, + CSV_INPUT_QUOTE_ESCAPE_CHARACTER_DEFAULT); + + // CSV input + CSVInput csv = new CSVInput(); + csv.setFieldDelimiter(fieldDelimiter); + csv.setRecordDelimiter(recordDelimiter); + csv.setComments(commentMarker); + csv.setQuoteCharacter(quoteCharacter); + if (StringUtils.isNotEmpty(quoteEscapeCharacter)) { + csv.setQuoteEscapeCharacter(quoteEscapeCharacter); + } + csv.setFileHeaderInfo(headerInfo); + + InputSerialization inputSerialization = new InputSerialization(); + inputSerialization.setCsv(csv); + + return inputSerialization; + + } + + /** + * Build CSV output for a request. 
+ * @param ownerConf FS owner configuration + * @param builderOptions options on the specific request + * @return the constructed request + * @throws IllegalArgumentException argument failure + * @throws IOException validation failure + */ + public OutputSerialization buildCSVOutput( + final Configuration ownerConf, + final Configuration builderOptions) + throws IllegalArgumentException, IOException { + String fieldDelimiter = xopt(builderOptions, + ownerConf, + CSV_OUTPUT_FIELD_DELIMITER, + CSV_OUTPUT_FIELD_DELIMITER_DEFAULT); + String recordDelimiter = xopt(builderOptions, + ownerConf, + CSV_OUTPUT_RECORD_DELIMITER, + CSV_OUTPUT_RECORD_DELIMITER_DEFAULT); + String quoteCharacter = xopt(builderOptions, + ownerConf, + CSV_OUTPUT_QUOTE_CHARACTER, + CSV_OUTPUT_QUOTE_CHARACTER_DEFAULT); + String quoteEscapeCharacter = xopt(builderOptions, + ownerConf, + CSV_OUTPUT_QUOTE_ESCAPE_CHARACTER, + CSV_OUTPUT_QUOTE_ESCAPE_CHARACTER_DEFAULT); + String quoteFields = xopt(builderOptions, + ownerConf, + CSV_OUTPUT_QUOTE_FIELDS, + CSV_OUTPUT_QUOTE_FIELDS_ALWAYS).toUpperCase(Locale.ENGLISH); + + // output is CSV, always + OutputSerialization outputSerialization + = new OutputSerialization(); + CSVOutput csvOut = new CSVOutput(); + csvOut.setQuoteCharacter(quoteCharacter); + csvOut.setQuoteFields( + QuoteFields.fromValue(quoteFields)); + csvOut.setFieldDelimiter(fieldDelimiter); + csvOut.setRecordDelimiter(recordDelimiter); + if (!quoteEscapeCharacter.isEmpty()) { + csvOut.setQuoteEscapeCharacter(quoteEscapeCharacter); + } + + outputSerialization.setCsv(csvOut); + return outputSerialization; + } + + /** + * Stringify the given SelectObjectContentRequest, as its + * toString() operator doesn't. + * @param request request to convert to a string + * @return a string to print. Does not contain secrets. + */ + public static String toString(final SelectObjectContentRequest request) { + StringBuilder sb = new StringBuilder(); + sb.append("SelectObjectContentRequest{") + .append("bucket name=").append(request.getBucketName()) + .append("; key=").append(request.getKey()) + .append("; expressionType=").append(request.getExpressionType()) + .append("; expression=").append(request.getExpression()); + InputSerialization input = request.getInputSerialization(); + if (input != null) { + sb.append("; Input") + .append(input.toString()); + } else { + sb.append("; Input Serialization: none"); + } + OutputSerialization out = request.getOutputSerialization(); + if (out != null) { + sb.append("; Output") + .append(out.toString()); + } else { + sb.append("; Output Serialization: none"); + } + return sb.append("}").toString(); + } + + /** + * Resolve an option. + * @param builderOptions the options which came in from the openFile builder. + * @param fsConf configuration of the owning FS. + * @param base base option (no s3a: prefix) + * @param defVal default value. Must not be null. + * @param trim should the result be trimmed. + * @return the possibly trimmed value. + */ + static String opt(Configuration builderOptions, + Configuration fsConf, + String base, + String defVal, + boolean trim) { + String r = builderOptions.get(base, fsConf.get(base, defVal)); + return trim ? r.trim() : r; + } + + /** + * Get an option with backslash arguments transformed. + * These are not trimmed, so whitespace is significant. 
+ * @param selectOpts options in the select call + * @param fsConf filesystem conf + * @param base base option name + * @param defVal default value + * @return the transformed value + */ + static String xopt(Configuration selectOpts, + Configuration fsConf, + String base, + String defVal) { + return expandBackslashChars( + opt(selectOpts, fsConf, base, defVal, false)); + } + + /** + * Perform escaping. + * @param src source string. + * @return the replaced value + */ + static String expandBackslashChars(String src) { + return src.replace("\\n", "\n") + .replace("\\\"", "\"") + .replace("\\t", "\t") + .replace("\\r", "\r") + .replace("\\\"", "\"") + // backslash substitution must come last + .replace("\\\\", "\\"); + } + +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectConstants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectConstants.java new file mode 100644 index 0000000000000..d74411d2f92ca --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectConstants.java @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Options related to S3 Select. + * + * These options are set for the entire filesystem unless overridden + * as an option in the URI + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public final class SelectConstants { + + public static final String SELECT_UNSUPPORTED = "S3 Select is not supported"; + + private SelectConstants() { + } + + public static final String FS_S3A_SELECT = "fs.s3a.select."; + + + /** + * This is the big SQL expression: {@value}. + * When used in an open() call, switch to a select operation. + * This is only used in the open call, never in a filesystem configuration. + */ + public static final String SELECT_SQL = FS_S3A_SELECT + "sql"; + + /** + * Does the FS Support S3 Select? + * Value: {@value}. + */ + public static final String S3_SELECT_CAPABILITY = "s3a:fs.s3a.select.sql"; + + /** + * Flag: is S3 select enabled? + * Value: {@value}. + */ + public static final String FS_S3A_SELECT_ENABLED = FS_S3A_SELECT + + "enabled"; + + /** + * Input format for data. + * Value: {@value}. + */ + public static final String SELECT_INPUT_FORMAT = + "fs.s3a.select.input.format"; + + /** + * Output format for data -that is, what the results are generated + * as. + * Value: {@value}. + */ + public static final String SELECT_OUTPUT_FORMAT = + "fs.s3a.select.output.format"; + + /** + * CSV as an input or output format: {@value}. 
+ */ + public static final String SELECT_FORMAT_CSV = "csv"; + + /** + * JSON as an input or output format: {@value}. + */ + public static final String SELECT_FORMAT_JSON = "json"; + + /** + * Should Select errors include the SQL statement? + * It is easier to debug but a security risk if the exceptions + * ever get printed/logged and the query contains secrets. + */ + public static final String SELECT_ERRORS_INCLUDE_SQL = + FS_S3A_SELECT + "errors.include.sql"; + + /** + * How is the input compressed? This applies to all formats. + * Value: {@value}. + */ + public static final String SELECT_INPUT_COMPRESSION = FS_S3A_SELECT + + "input.compression"; + + /** + * No compression. + * Value: {@value}. + */ + public static final String COMPRESSION_OPT_NONE = "none"; + + /** + * Gzipped. + * Value: {@value}. + */ + public static final String COMPRESSION_OPT_GZIP = "gzip"; + + /** + * Prefix for all CSV input options. + * Value: {@value}. + */ + public static final String FS_S3A_SELECT_INPUT_CSV = + "fs.s3a.select.input.csv."; + + /** + * Prefix for all CSV output options. + * Value: {@value}. + */ + public static final String FS_S3A_SELECT_OUTPUT_CSV = + "fs.s3a.select.output.csv."; + + /** + * String which indicates the row is actually a comment. + * Value: {@value}. + */ + public static final String CSV_INPUT_COMMENT_MARKER = + FS_S3A_SELECT_INPUT_CSV + "comment.marker"; + + /** + * Default marker. + * Value: {@value}. + */ + public static final String CSV_INPUT_COMMENT_MARKER_DEFAULT = "#"; + + /** + * Record delimiter. CR, LF, etc. + * Value: {@value}. + */ + public static final String CSV_INPUT_RECORD_DELIMITER = + FS_S3A_SELECT_INPUT_CSV + "record.delimiter"; + + /** + * Default delimiter + * Value: {@value}. + */ + public static final String CSV_INPUT_RECORD_DELIMITER_DEFAULT = "\n"; + + /** + * Field delimiter. + * Value: {@value}. + */ + public static final String CSV_INPUT_INPUT_FIELD_DELIMITER = + FS_S3A_SELECT_INPUT_CSV + "field.delimiter"; + + /** + * Default field delimiter. + * Value: {@value}. + */ + public static final String CSV_INPUT_FIELD_DELIMITER_DEFAULT = ","; + + /** + * Quote Character. + * Value: {@value}. + */ + public static final String CSV_INPUT_QUOTE_CHARACTER = + FS_S3A_SELECT_INPUT_CSV + "quote.character"; + + /** + * Default Quote Character. + * Value: {@value}. + */ + public static final String CSV_INPUT_QUOTE_CHARACTER_DEFAULT = "\""; + + /** + * Character to escape quotes. + * If empty: no escaping. + * Value: {@value}. + */ + public static final String CSV_INPUT_QUOTE_ESCAPE_CHARACTER = + FS_S3A_SELECT_INPUT_CSV + "quote.escape.character"; + + /** + * Default quote escape character. + * Value: {@value}. + */ + public static final String CSV_INPUT_QUOTE_ESCAPE_CHARACTER_DEFAULT = "\\"; + + /** + * How should headers be used? + * Value: {@value}. + */ + public static final String CSV_INPUT_HEADER = + FS_S3A_SELECT_INPUT_CSV + "header"; + + /** + * No header: first row is data. + * Value: {@value}. + */ + public static final String CSV_HEADER_OPT_NONE = "none"; + + /** + * Ignore the header. + * Value: {@value}. + */ + public static final String CSV_HEADER_OPT_IGNORE = "ignore"; + + /** + * Use the header. + * Value: {@value}. + */ + public static final String CSV_HEADER_OPT_USE = "use"; + + /** + * Default header mode: {@value}. + */ + public static final String CSV_INPUT_HEADER_OPT_DEFAULT = + CSV_HEADER_OPT_IGNORE; + + /** + * Record delimiter. CR, LF, etc. + * Value: {@value}. 
+ */ + public static final String CSV_OUTPUT_RECORD_DELIMITER = + FS_S3A_SELECT_OUTPUT_CSV + "record.delimiter"; + + /** + * Default delimiter + * Value: {@value}. + */ + public static final String CSV_OUTPUT_RECORD_DELIMITER_DEFAULT = "\n"; + + /** + * Field delimiter. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_FIELD_DELIMITER = + FS_S3A_SELECT_OUTPUT_CSV + "field.delimiter"; + + /** + * Default field delimiter. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_FIELD_DELIMITER_DEFAULT = ","; + + /** + * Quote Character. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_CHARACTER = + FS_S3A_SELECT_OUTPUT_CSV + "quote.character"; + + /** + * Default Quote Character. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_CHARACTER_DEFAULT = "\""; + + /** + * Should CSV fields be quoted? + * One of : ALWAYS, ASNEEDED + * Value: {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_FIELDS = + FS_S3A_SELECT_OUTPUT_CSV + "quote.fields"; + + /** + * Output quotation policy (default): {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_FIELDS_ALWAYS = "always"; + + /** + * Output quotation policy: {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED = "asneeded"; + + /** + * Character to escape quotes. + * If empty: no escaping. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_ESCAPE_CHARACTER = + FS_S3A_SELECT_OUTPUT_CSV + "quote.escape.character"; + + /** + * Default quote escape character. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_ESCAPE_CHARACTER_DEFAULT = ""; + +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectInputStream.java new file mode 100644 index 0000000000000..f4bd8d11708ef --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectInputStream.java @@ -0,0 +1,457 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.EOFException; +import java.io.IOException; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; + +import com.amazonaws.AbortedException; +import com.amazonaws.services.s3.model.SelectObjectContentEvent; +import com.amazonaws.services.s3.model.SelectObjectContentEventVisitor; +import com.amazonaws.services.s3.model.SelectObjectContentResult; +import com.amazonaws.services.s3.model.SelectRecordsInputStream; +import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.CanSetReadahead; +import org.apache.hadoop.fs.FSExceptionMessages; +import org.apache.hadoop.fs.FSInputStream; +import org.apache.hadoop.fs.PathIOException; +import org.apache.hadoop.fs.s3a.Retries; +import org.apache.hadoop.fs.s3a.S3AInstrumentation; +import org.apache.hadoop.fs.s3a.S3AReadOpContext; +import org.apache.hadoop.fs.s3a.S3ObjectAttributes; +import org.apache.hadoop.io.IOUtils; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.commons.lang3.StringUtils.isNotEmpty; +import static org.apache.hadoop.fs.s3a.Invoker.once; +import static org.apache.hadoop.fs.s3a.S3AInputStream.validateReadahead; + +/** + * An input stream for S3 Select return values. + * This is simply an end-to-end GET request, without any + * form of seek or recovery from connectivity failures. + * + * Currently only seek and positioned read operations on the current + * location are supported. + * + * The normal S3 input counters are updated by this stream. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class SelectInputStream extends FSInputStream implements + CanSetReadahead { + + private static final Logger LOG = + LoggerFactory.getLogger(SelectInputStream.class); + + public static final String SEEK_UNSUPPORTED = "seek()"; + + /** + * Same set of arguments as for an S3AInputStream. + */ + private final S3ObjectAttributes objectAttributes; + + /** + * Tracks the current position. + */ + private AtomicLong pos = new AtomicLong(0); + + /** + * Closed flag. + */ + private final AtomicBoolean closed = new AtomicBoolean(false); + + /** + * Did the read complete successfully? + */ + private final AtomicBoolean completedSuccessfully = new AtomicBoolean(false); + + /** + * Abortable response stream. + * This is guaranteed to never be null. + */ + private final SelectRecordsInputStream wrappedStream; + + private final String bucket; + + private final String key; + + private final String uri; + + private final S3AReadOpContext readContext; + + private final S3AInstrumentation.InputStreamStatistics streamStatistics; + + private long readahead; + + /** + * Create the stream. + * The read attempt is initiated immediately. 
+ * @param readContext read context + * @param objectAttributes object attributes from a HEAD request + * @param selectResponse response from the already executed call + * @throws IOException failure + */ + @Retries.OnceTranslated + public SelectInputStream( + final S3AReadOpContext readContext, + final S3ObjectAttributes objectAttributes, + final SelectObjectContentResult selectResponse) throws IOException { + Preconditions.checkArgument(isNotEmpty(objectAttributes.getBucket()), + "No Bucket"); + Preconditions.checkArgument(isNotEmpty(objectAttributes.getKey()), + "No Key"); + this.objectAttributes = objectAttributes; + this.bucket = objectAttributes.getBucket(); + this.key = objectAttributes.getKey(); + this.uri = "s3a://" + this.bucket + "/" + this.key; + this.readContext = readContext; + this.readahead = readContext.getReadahead(); + this.streamStatistics = readContext.getInstrumentation() + .newInputStreamStatistics(); + SelectRecordsInputStream stream = once( + "S3 Select", + uri, + () -> selectResponse.getPayload() + .getRecordsInputStream(new SelectObjectContentEventVisitor() { + @Override + public void visit(final SelectObjectContentEvent.EndEvent event) { + LOG.debug("Completed successful S3 select read from {}", uri); + completedSuccessfully.set(true); + } + })); + this.wrappedStream = checkNotNull(stream); + // this stream is already opened, so mark as such in the statistics. + streamStatistics.streamOpened(); + } + + @Override + public void close() throws IOException { + long skipped = 0; + boolean aborted = false; + if (!closed.getAndSet(true)) { + try { + // set up for aborts. + // if we know the available amount > readahead. Abort. + // + boolean shouldAbort = wrappedStream.available() > readahead; + if (!shouldAbort) { + // read our readahead range worth of data + skipped = wrappedStream.skip(readahead); + shouldAbort = wrappedStream.read() >= 0; + } + // now, either there is data left or not. + if (shouldAbort) { + // yes, more data. Abort and add this fact to the stream stats + aborted = true; + wrappedStream.abort(); + } + } catch (IOException | AbortedException e) { + LOG.debug("While closing stream", e); + } finally { + IOUtils.cleanupWithLogger(LOG, wrappedStream); + streamStatistics.streamClose(aborted, skipped); + streamStatistics.close(); + super.close(); + } + } + } + + /** + * Verify that the input stream is open. Non blocking; this gives + * the last state of the atomic {@link #closed} field. + * @throws PathIOException if the connection is closed. + */ + private void checkNotClosed() throws IOException { + if (closed.get()) { + throw new PathIOException(uri, FSExceptionMessages.STREAM_IS_CLOSED); + } + } + + @Override + public int available() throws IOException { + checkNotClosed(); + return wrappedStream.available(); + } + + @Override + @Retries.OnceTranslated + public synchronized long skip(final long n) throws IOException { + checkNotClosed(); + long skipped = once("skip", uri, () -> wrappedStream.skip(n)); + pos.addAndGet(skipped); + // treat as a forward skip for stats + streamStatistics.seekForwards(skipped); + return skipped; + } + + @Override + public long getPos() { + return pos.get(); + } + + /** + * Set the readahead. + * @param readahead The readahead to use. null means to use the default. + */ + @Override + public void setReadahead(Long readahead) { + this.readahead = validateReadahead(readahead); + } + + /** + * Get the current readahead value. 
+ * @return the readahead + */ + public long getReadahead() { + return readahead; + } + + /** + * Read a byte. There's no attempt to recover, but AWS-SDK exceptions + * such as {@code SelectObjectContentEventException} are translated into + * IOExceptions. + * @return a byte read or -1 for an end of file. + * @throws IOException failure. + */ + @Override + @Retries.OnceTranslated + public synchronized int read() throws IOException { + checkNotClosed(); + int byteRead; + try { + byteRead = once("read()", uri, () -> wrappedStream.read()); + } catch (EOFException e) { + // this could be one of: end of file, some IO failure + if (completedSuccessfully.get()) { + // read was successful + return -1; + } else { + // the stream closed prematurely + LOG.info("Reading of S3 Select data from {} failed before all results " + + " were generated.", uri); + streamStatistics.readException(); + throw new PathIOException(uri, + "Read of S3 Select data did not complete"); + } + } + + if (byteRead >= 0) { + incrementBytesRead(1); + } + return byteRead; + } + + @SuppressWarnings("NullableProblems") + @Override + @Retries.OnceTranslated + public synchronized int read(final byte[] buf, final int off, final int len) + throws IOException { + checkNotClosed(); + validatePositionedReadArgs(pos.get(), buf, off, len); + if (len == 0) { + return 0; + } + + int bytesRead; + try { + streamStatistics.readOperationStarted(pos.get(), len); + bytesRead = wrappedStream.read(buf, off, len); + } catch (EOFException e) { + streamStatistics.readException(); + // the base implementation swallows EOFs. + return -1; + } + + incrementBytesRead(bytesRead); + streamStatistics.readOperationCompleted(len, bytesRead); + return bytesRead; + } + + /** + * Forward seeks are supported, but not backwards ones. + * Forward seeks are implemented using read, so + * means that long-distance seeks will be (literally) expensive. + * + * @param newPos new seek position. + * @throws PathIOException Backwards seek attempted. + * @throws EOFException attempt to seek past the end of the stream. + * @throws IOException IO failure while skipping bytes + */ + @Override + @Retries.OnceTranslated + public synchronized void seek(long newPos) throws IOException { + long current = getPos(); + long distance = newPos - current; + if (distance < 0) { + throw unsupported(SEEK_UNSUPPORTED + + " backwards from " + current + " to " + newPos); + } + if (distance == 0) { + LOG.debug("ignoring seek to current position."); + } else { + // the complicated one: Forward seeking. Useful for split files. + LOG.debug("Forward seek by reading {} bytes", distance); + long bytesSkipped = 0; + // read byte-by-byte, hoping that buffering will compensate for this. + // doing it this way ensures that the seek stops at exactly the right + // place. skip(len) can return a smaller value, at which point + // it's not clear what to do. + while(distance > 0) { + int r = read(); + if (r == -1) { + // reached an EOF too early + throw new EOFException("Seek to " + newPos + + " reached End of File at offset " + getPos()); + } + distance--; + bytesSkipped++; + } + // read has finished. + streamStatistics.seekForwards(bytesSkipped); + } + } + + /** + * Build an exception to raise when an operation is not supported here. + * @param action action which is unsupported. + * @return an exception to throw. 
+ */ + protected PathIOException unsupported(final String action) { + return new PathIOException( + String.format("s3a://%s/%s", bucket, key), + action + " not supported"); + } + + @Override + public boolean seekToNewSource(long targetPos) throws IOException { + return false; + } + + // Not supported. + @Override + public boolean markSupported() { + return false; + } + + @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") + @Override + public void mark(int readLimit) { + // Do nothing + } + + @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") + @Override + public void reset() throws IOException { + throw unsupported("Mark"); + } + + /** + * Aborts the IO. + */ + public void abort() { + if (!closed.get()) { + LOG.debug("Aborting"); + wrappedStream.abort(); + } + } + + /** + * Read at a specific position. + * Reads at a position earlier than the current {@link #getPos()} position + * will fail with a {@link PathIOException}. See {@link #seek(long)}. + * Unlike the base implementation And the requirements of the filesystem + * specification, this updates the stream position as returned in + * {@link #getPos()}. + * @param position offset in the stream. + * @param buffer buffer to read in to. + * @param offset offset within the buffer + * @param length amount of data to read. + * @return the result. + * @throws PathIOException Backwards seek attempted. + * @throws EOFException attempt to seek past the end of the stream. + * @throws IOException IO failure while seeking in the stream or reading data. + */ + @Override + public int read(final long position, + final byte[] buffer, + final int offset, + final int length) + throws IOException { + // maybe seek forwards to the position. + seek(position); + return read(buffer, offset, length); + } + + /** + * Increment the bytes read counter if there is a stats instance + * and the number of bytes read is more than zero. + * This also updates the {@link #pos} marker by the same value. + * @param bytesRead number of bytes read + */ + private void incrementBytesRead(long bytesRead) { + if (bytesRead > 0) { + pos.addAndGet(bytesRead); + } + streamStatistics.bytesRead(bytesRead); + if (readContext.getStats() != null && bytesRead > 0) { + readContext.getStats().incrementBytesRead(bytesRead); + } + } + + /** + * Get the Stream statistics. + * @return the statistics for this stream. + */ + @InterfaceAudience.Private + @InterfaceStability.Unstable + public S3AInstrumentation.InputStreamStatistics getS3AStreamStatistics() { + return streamStatistics; + } + + /** + * String value includes statistics as well as stream state. + * Important: there are no guarantees as to the stability + * of this value. + * @return a string value for printing in logs/diagnostics + */ + @Override + @InterfaceStability.Unstable + public String toString() { + String s = streamStatistics.toString(); + synchronized (this) { + final StringBuilder sb = new StringBuilder( + "SelectInputStream{"); + sb.append(uri); + sb.append("; state ").append(!closed.get() ? 
"open" : "closed"); + sb.append("; pos=").append(getPos()); + sb.append("; readahead=").append(readahead); + sb.append('\n').append(s); + sb.append('}'); + return sb.toString(); + } + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java new file mode 100644 index 0000000000000..8c87694570334 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java @@ -0,0 +1,355 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Locale; +import java.util.Optional; +import java.util.Scanner; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.impl.FutureIOSupport; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.commit.Duration; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.fs.s3a.s3guard.S3GuardTool; +import org.apache.hadoop.fs.shell.CommandFormat; +import org.apache.hadoop.util.ExitUtil; + +import static org.apache.commons.lang3.StringUtils.isNotEmpty; +import static org.apache.hadoop.io.IOUtils.cleanupWithLogger; +import static org.apache.hadoop.service.launcher.LauncherExitCodes.*; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; + +/** + * This is a CLI tool for the select operation, which is available + * through the S3Guard command. + * + * Usage: + *
+ * <pre>
+ *   hadoop s3guard select [options] Path Statement
+ * </pre>
    + */ +public class SelectTool extends S3GuardTool { + + private static final Logger LOG = + LoggerFactory.getLogger(SelectTool.class); + + public static final String NAME = "select"; + + public static final String PURPOSE = "make an S3 Select call"; + + private static final String USAGE = NAME + + " [OPTIONS]" + + " [-limit rows]" + + " [-header (use|none|ignore)]" + + " [-out path]" + + " [-expected rows]" + + " [-compression (gzip|bzip2|none)]" + + " [-inputformat csv]" + + " [-outputformat csv]" + + " +``` + +The output is printed, followed by some summary statistics, unless the `-out` +option is used to declare a destination file. In this mode +status will be logged to the console, but the output of the query will be +saved directly to the output file. + +### Example 1 + +Read the first 100 rows of the landsat dataset where cloud cover is zero: + +```bash +hadoop s3guard select -header use -compression gzip -limit 100 \ + s3a://landsat-pds/scene_list.gz \ + "SELECT * FROM S3OBJECT s WHERE s.cloudCover = '0.0'" +``` + +### Example 2 + +Return the `entityId` column for all rows in the dataset where the cloud +cover was "0.0", and save it to the file `output.csv`: + +```bash +hadoop s3guard select -header use -out s3a://mybucket/output.csv \ + -compression gzip \ + s3a://landsat-pds/scene_list.gz \ + "SELECT s.entityId from S3OBJECT s WHERE s.cloudCover = '0.0'" +``` + +This file will: + +1. Be UTF-8 encoded. +1. Have quotes on all columns returned. +1. Use commas as a separator. +1. Not have any header. + +The output can be saved to a file with the `-out` option. Note also that +`-D key=value` settings can be used to control the operation, if placed after +the `s3guard` command and before `select` + + +```bash +hadoop s3guard \ + -D s.s3a.select.output.csv.quote.fields=asneeded \ + select \ + -header use \ + -compression gzip \ + -limit 500 \ + -inputformat csv \ + -outputformat csv \ + -out s3a://hwdev-steve-new/output.csv \ + s3a://landsat-pds/scene_list.gz \ + "SELECT s.entityId from S3OBJECT s WHERE s.cloudCover = '0.0'" +``` + + +## Use in MR/Analytics queries: Work in Progress + +S3 Select support in analytics queries is a work in progress. It does +not work reliably with large source files where the work is split up. + +As a proof of concept *only*, S3 Select queries can be made through +MapReduce jobs which use any Hadoop `RecordReader` +class which uses the new `openFile()` API. + +Currently this consists of the following MRv2 readers. + +``` +org.apache.hadoop.mapreduce.lib.input.LineRecordReader +org.apache.hadoop.mapreduce.lib.input.FixedLengthRecordReader +``` + +And a limited number of the MRv1 record readers: + +``` +org.apache.hadoop.mapred.LineRecordReader +``` + +All of these readers use the new API and can be have its optional/mandatory +options set via the `JobConf` used when creating/configuring the reader. + +These readers are instantiated within input formats; the following +formats therefore support S3 Select. 
+ +``` +org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat +org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat +org.apache.hadoop.mapreduce.lib.input.NLineInputFormat +org.apache.hadoop.mapreduce.lib.input.TextInputFormat +org.apache.hadoop.mapred.KeyValueTextInputFormat +org.apache.hadoop.mapred.TextInputFormat +org.apache.hadoop.mapred.lib.NLineInputFormat +``` + +All `JobConf` options which begin with the prefix `mapreduce.job.input.file.option.` +will have that prefix stripped and the remainder used as the name for an option +when opening the file. + +All `JobConf` options which being with the prefix `mapreduce.job.input.file.must.` +will be converted into mandatory options. + +To use an S3 Select call, set the following options + +``` +mapreduce.job.input.file.must.fs.s3a.select.sql = +mapreduce.job.input.file.must.fs.s3a.select.input.format = CSV +mapreduce.job.input.file.must.fs.s3a.select.output.format = CSV +``` + +Further options may be set to tune the behaviour, for example: + +```java +jobConf.set("mapreduce.job.input.file.must.fs.s3a.select.input.csv.header", "use"); +``` + +*Note* How to tell if a reader has migrated to the new `openFile()` builder +API: + +Set a mandatory option which is not known; if the job does not fail then +an old reader is being used. + +```java +jobConf.set("mapreduce.job.input.file.must.unknown.option", "anything"); +``` + + +### Querying Compressed objects + +S3 Select queries can be made against gzipped source files; the S3A input +stream receives the output in text format, rather than as a (re)compressed +stream. + +To read a gzip file, set `fs.s3a.select.input.compression` to `gzip`. + +```java +jobConf.set("mapreduce.job.input.file.must.fs.s3a.select.input.compression", + "gzip"); +``` + + +Most of the Hadoop RecordReader classes automatically choose a decompressor +based on the extension of the source file. This causes problems when +reading `.gz` files, because S3 Select is automatically decompressing and +returning csv-formatted text. + +By default, a query across gzipped files will fail with the error +"IOException: not a gzip file" + +To avoid this problem, declare that the job should switch to the +"Passthrough Codec" for all files with a ".gz" extension: + +```java +jobConf.set("io.compression.codecs", + "org.apache.hadoop.io.compress.PassthroughCodec"); +jobConf.set("io.compress.passthrough.extension", ".gz"); +``` + +Obviously, this breaks normal `.gz` decompression: only set it on S3 Select +jobs. + +## S3 Select configuration options. + +Consult the javadocs for `org.apache.hadoop.fs.s3a.select.SelectConstants`. + +The listed options can be set in `core-site.xml`, supported by S3A per-bucket +configuration, and can be set programmatically on the `Configuration` object +use to configure a new filesystem instance. + +Any of these options can be set in the builder returned by the `openFile()` call +—simply set them through a chain of `builder.must()` operations. + +```xml + + fs.s3a.select.input.format + csv + Input format + + + + fs.s3a.select.output.format + csv + Output format + + + + fs.s3a.select.input.csv.comment.marker + # + In S3 Select queries: the marker for comment lines in CSV files + + + + fs.s3a.select.input.csv.record.delimiter + \n + In S3 Select queries over CSV files: the record delimiter. + \t is remapped to the TAB character, \r to CR \n to newline. \\ to \ + and \" to " + + + + + fs.s3a.select.input.csv.field.delimiter + , + In S3 Select queries over CSV files: the field delimiter. 
+ \t is remapped to the TAB character, \r to CR \n to newline. \\ to \ + and \" to " + + + + + fs.s3a.select.input.csv.quote.character + " + In S3 Select queries over CSV files: quote character. + \t is remapped to the TAB character, \r to CR \n to newline. \\ to \ + and \" to " + + + + + fs.s3a.select.input.csv.quote.escape.character + \\ + In S3 Select queries over CSV files: quote escape character. + \t is remapped to the TAB character, \r to CR \n to newline. \\ to \ + and \" to " + + + + + fs.s3a.select.input.csv.header + none + In S3 Select queries over CSV files: what is the role of the header? One of "none", "ignore" and "use" + + + + fs.s3a.select.input.compression + none + In S3 Select queries, the source compression + algorithm. One of: "none" and "gzip" + + + + fs.s3a.select.output.csv.quote.fields + always + + In S3 Select queries: should fields in generated CSV Files be quoted? + One of: "always", "asneeded". + + + + + fs.s3a.select.output.csv.quote.character + " + + In S3 Select queries: the quote character for generated CSV Files. + + + + + fs.s3a.select.output.csv.quote.escape.character + \\ + + In S3 Select queries: the quote escape character for generated CSV Files. + + + + + fs.s3a.select.output.csv.record.delimiter + \n + + In S3 Select queries: the record delimiter for generated CSV Files. + + + + + fs.s3a.select.output.csv.field.delimiter + , + + In S3 Select queries: the field delimiter for generated CSV Files. + + + + + fs.s3a.select.errors.include.sql + false + + Include the SQL statement in errors: this is useful for development but + may leak security and Personally Identifying Information in production, + so must be disabled there. + + +``` + +## Security and Privacy + +SQL Injection attacks are the classic attack on data. +Because S3 Select is a read-only API, the classic ["Bobby Tables"](https://xkcd.com/327/) +attack to gain write access isn't going to work. Even so: sanitize your inputs. + +CSV does have security issues of its own, specifically: + +*Excel and other spreadsheets may interpret some fields beginning with special +characters as formula, and execute them* + +S3 Select does not appear vulnerable to this, but in workflows where untrusted +data eventually ends up in a spreadsheet (including Google Document spreadsheets), +the data should be sanitized/audited first. There is no support for +such sanitization in S3 Select or in the S3A connector. + +Logging Select statements may expose secrets if they are in the statement. +Even if they are just logged, this may potentially leak Personally Identifying +Information as covered in the EU GDPR legislation and equivalents. + +For both privacy and security reasons, SQL statements are not included +in exception strings by default, nor logged at INFO level. + +To enable them, set `fs.s3a.select.errors.include.sql` to `true`, either in the +site/application configuration, or as an option in the builder for a +single request. When set, the request will also be logged at +the INFO level of the log `org.apache.hadoop.fs.s3a.select.SelectBinding`. + +Personal Identifiable Information is not printed in the AWS S3 logs. +Those logs contain only the SQL keywords from the query planner. +All column names and literals are masked. Following is a sample log example: + +*Query:* + +```sql +SELECT * FROM S3OBJECT s; +``` + +*Log:* + +```sql +select (project (list (project_all))) (from (as str0 (id str1 case_insensitive))) +``` + +Note also that: + +1. 
Debug-level Hadoop logs for the module `org.apache.hadoop.fs.s3a` and other +components's debug logs may also log the SQL statements (e.g. aws-sdk HTTP logs). + +The best practise here is: only enable SQL in exceptions while developing +SQL queries, especially in an application/notebook where the exception +text is a lot easier to see than the application logs. + +In production: don't log or report. If you do, all logs and output must be +considered sensitive from security and privacy perspectives. + +The `hadoop s3guard select` command does enable the logging, so +can be used as an initial place to experiment with the SQL syntax. +Rationale: if you are constructing SQL queries on the command line, +your shell history is already tainted with the query. + +### Links + +* [CVE-2014-3524](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2014-3524). +* [The Absurdly Underestimated Dangers of CSV Injection](http://georgemauer.net/2017/10/07/csv-injection.html). +* [Comma Separated Vulnerabilities](https://www.contextis.com/blog/comma-separated-vulnerabilities). + +### SQL Syntax + +The SQL Syntax directly supported by the AWS S3 Select API is [documented by +Amazon](https://docs.aws.amazon.com/AmazonS3/latest/dev/s3-glacier-select-sql-reference.html). + +* Use single quotes for all constants, not double quotes. +* All CSV column values are strings unless cast to a type +* Simple `SELECT` calls, no `JOIN`. + +### CSV formats + +"CSV" is less a format, more "a term meaning the data is in some nonstandard +line-by-line" text file, and there are even "multiline CSV files". + +S3 Select only supports a subset of the loose "CSV" concept, as covered in +the AWS documentation. There are also limits on how many columns and how +large a single line may be. + +The specific quotation character, field and record delimiters, comments and escape +characters can be configured in the Hadoop configuration. + +### Consistency, Concurrency and Error handling + +**Consistency** + +* Assume the usual S3 consistency model applies. + +* When enabled, S3Guard's DynamoDB table will declare whether or not +a newly deleted file is visible: if it is marked as deleted, the +select request will be rejected with a `FileNotFoundException`. + +* When an existing S3-hosted object is changed, the S3 select operation +may return the results of a SELECT call as applied to either the old +or new version. + +* We don't know whether you can get partially consistent reads, or whether +an extended read ever picks up a later value. + +* The AWS S3 load balancers can briefly cache 404/Not-Found entries +from a failed HEAD/GET request against a nonexistent file; this cached +entry can briefly create create inconsistency, despite the +AWS "Create is consistent" model. There is no attempt to detect or recover from +this. + +**Concurrency** + +The outcome of what happens when source file is overwritten while the result of +a select call is overwritten is undefined. + +The input stream returned by the operation is *NOT THREAD SAFE*. + +**Error Handling** + +If an attempt to issue an S3 select call fails, the S3A connector will +reissue the request if-and-only-if it believes a retry may succeed. +That is: it considers the operation to be idempotent and if the failure is +considered to be a recoverable connectivity problem or a server-side rejection +which can be retried (500, 503). 
+ +If an attempt to read data from an S3 select stream (`org.apache.hadoop.fs.s3a.select.SelectInputStream)` fails partway through the read, *no attempt is made to retry the operation* + +In contrast, the normal S3A input stream tries to recover from (possibly transient) +failures by attempting to reopen the file. + + +## Performance + +The select operation is best when the least amount of data is returned by +the query, as this reduces the amount of data downloaded. + +* Limit the number of columns projected to only those needed. +* Use `LIMIT` to set an upper limit on the rows read, rather than implementing +a row counter in application code and closing the stream when reached. +This avoids having to abort the HTTPS connection and negotiate a new one +on the next S3 request. + +The select call itself can be slow, especially when the source is a multi-MB +compressed file with aggressive filtering in the `WHERE` clause. +Assumption: the select query starts at row 1 and scans through each row, +and does not return data until it has matched one or more rows. + +If the asynchronous nature of the `openFile().build().get()` sequence +can be taken advantage of, by performing other work before or in parallel +to the `get()` call: do it. + +## Troubleshooting + +Getting S3 Select code to work is hard, though those knowledgeable in SQL +will find it easier. + +Problems can be split into: + +1. Basic configuration of the client to issue the query. +1. Bad SQL select syntax and grammar. +1. Datatype casting issues +1. Bad records/data in source files. +1. Failure to configure MR jobs to work correctly. +1. Failure of MR jobs due to + +The exceptions here are all based on the experience during writing tests; +more may surface with broader use. + +All failures other than network errors on request initialization are considered +unrecoverable and will not be reattempted. + +As parse-time errors always state the line and column of an error, you can +simplify debugging by breaking a SQL statement across lines, e.g. + +```java +String sql = "SELECT\n" + + "s.entityId \n" + + "FROM " + "S3OBJECT s WHERE\n" + + "s.\"cloudCover\" = '100.0'\n" + + " LIMIT 100"; +``` +Now if the error is declared as "line 4", it will be on the select conditions; +the column offset will begin from the first character on that row. + +The SQL Statements issued are only included in exceptions if `fs.s3a.select.errors.include.sql` +is explicitly set to true. This can be done in an application during development, +or in a `openFile()` option parameter. This should only be done during development, +to reduce the risk of logging security or privacy information. + + +### "mid-query" failures on large datasets + +S3 Select returns paged results; the source file is _not_ filtered in +one go in the initial request. + +This means that errors related to the content of the data (type casting, etc) +may only surface partway through the read. The errors reported in such a +case may be different than those raised on reading the first page of data, +where it will happen earlier on in the read process. 
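As a minimal sketch of where such failures appear (the bucket, object and query below are placeholders, and `FutureIOSupport` is the `org.apache.hadoop.fs.impl` helper used elsewhere in this module to await the `openFile()` future), a bad record may only be reported inside the read loop, long after `build()` has succeeded:

```java
// Sketch only: the path, SQL and option values are illustrative.
public static void readSelection(FileSystem fs) throws IOException {
  Path source = new Path("s3a://example-bucket/data/records.csv.gz");
  try (FSDataInputStream in = FutureIOSupport.awaitFuture(
      fs.openFile(source)
          .must("fs.s3a.select.sql",
              "SELECT s.entityId FROM S3OBJECT s WHERE s.cloudCover = '0.0'")
          .must("fs.s3a.select.input.compression", "gzip")
          .must("fs.s3a.select.input.csv.header", "use")
          .build())) {
    byte[] buffer = new byte[8192];
    while (in.read(buffer) >= 0) {
      // A type-casting or parsing error in a later page of results surfaces
      // here as an IOException, not when the stream was opened.
    }
  }
}
```

For this reason, error handling and reporting should wrap the whole read loop, not just the call which opens the stream.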
+ +### External Resources on for troubleshooting + +See: + +* [SELECT Command Reference](https://docs.aws.amazon.com/AmazonS3/latest/dev/s3-glacier-select-sql-reference-select.html) +* [SELECT Object Content](https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html) + +### IOException: "not a gzip file" + +This surfaces when trying to read in data from a `.gz` source file through an MR +or other analytics query, and the gzip codec has tried to parse it. + +``` +java.io.IOException: not a gzip file +at org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.processBasicHeader(BuiltInGzipDecompressor.java:496) +at org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.executeHeaderState(BuiltInGzipDecompressor.java:257) +at org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.decompress(BuiltInGzipDecompressor.java:186) +at org.apache.hadoop.io.compress.DecompressorStream.decompress(DecompressorStream.java:111) +at org.apache.hadoop.io.compress.DecompressorStream.read(DecompressorStream.java:105) +at java.io.InputStream.read(InputStream.java:101) +at org.apache.hadoop.util.LineReader.fillBuffer(LineReader.java:182) +at org.apache.hadoop.util.LineReader.readCustomLine(LineReader.java:306) +at org.apache.hadoop.util.LineReader.readLine(LineReader.java:174) +at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.skipUtfByteOrderMark(LineRecordReader.java:158) +at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.nextKeyValue(LineRecordReader.java:198) +``` + +The underlying problem is that the gzip decompressor is automatically enabled +when the the source file ends with the ".gz" extension. Because S3 Select +returns decompressed data, the codec fails. + +The workaround here is to declare that the job should add the "Passthrough Codec" +to its list of known decompressors, and that this codec should declare the +file format it supports to be ".gz". + +``` +io.compression.codecs = org.apache.hadoop.io.compress.PassthroughCodec +io.compress.passthrough.extension = .gz +``` + +### AWSBadRequestException `InvalidColumnIndex` + + +Your SQL is wrong and the element at fault is considered an unknown column +name. + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: + Select: SELECT * FROM S3OBJECT WHERE odd = true on test/testSelectOddLines.csv: + com.amazonaws.services.s3.model.AmazonS3Exception: + The column index at line 1, column 30 is invalid. + Please check the service documentation and try again. + (Service: Amazon S3; Status Code: 400; Error Code: InvalidColumnIndex; +``` + +Here it's the first line of the query, column 30. Paste the query +into an editor and position yourself on the line and column at fault. + +```sql +SELECT * FROM S3OBJECT WHERE odd = true + ^ HERE +``` + +Another example: + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: Select: +SELECT * FROM S3OBJECT s WHERE s._1 = "true" on test/testSelectOddLines.csv: + com.amazonaws.services.s3.model.AmazonS3Exception: + The column index at line 1, column 39 is invalid. + Please check the service documentation and try again. + (Service: Amazon S3; Status Code: 400; + Error Code: InvalidColumnIndex; +``` + +Here it is because strings must be single quoted, not double quoted. + +```sql +SELECT * FROM S3OBJECT s WHERE s._1 = "true" + ^ HERE +``` + +S3 select uses double quotes to wrap column names, interprets the string +as column "true", and fails with a non-intuitive message. 
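Rewriting the constant with single quotes gives the intended query:

```sql
SELECT * FROM S3OBJECT s WHERE s._1 = 'true'
```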
+ +*Tip*: look for the element at fault and treat the `InvalidColumnIndex` +message as a parse-time message, rather than the definitive root +cause of the problem. + +### AWSBadRequestException `ParseInvalidPathComponent` + +Your SQL is wrong. + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: +Select: SELECT * FROM S3OBJECT s WHERE s.'odd' is "true" on test/testSelectOddLines.csv +: com.amazonaws.services.s3.model.AmazonS3Exception: Invalid Path component, + expecting either an IDENTIFIER or STAR, got: LITERAL,at line 1, column 34. + (Service: Amazon S3; Status Code: 400; Error Code: ParseInvalidPathComponent; + +``` + +``` +SELECT * FROM S3OBJECT s WHERE s.'odd' is "true" on test/testSelectOddLines.csv + ^ HERE +``` + + +### AWSBadRequestException `ParseExpectedTypeName` + +Your SQL is still wrong. + +``` + +org.apache.hadoop.fs.s3a.AWSBadRequestException: + Select: SELECT * FROM S3OBJECT s WHERE s.odd = "true" +on test/testSelectOddLines.csv: +com.amazonaws.services.s3.model.AmazonS3Exception +: Expected type name, found QUOTED_IDENTIFIER:'true' at line 1, column 41. +(Service: Amazon S3; Status Code: 400; Error Code: ParseExpectedTypeName; +``` + +### `ParseUnexpectedToken` + +Your SQL is broken. + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: +Select: SELECT * FROM S3OBJECT s WHERE s.5 = `true` on test/testSelectOddLines.csv: +com.amazonaws.services.s3.model.AmazonS3Exception: +Unexpected token found LITERAL:5d-1 at line 1, column 33. +(Service: Amazon S3; Status Code: 400; Error Code: ParseUnexpectedToken; +``` +### `ParseUnexpectedOperator` + +Your SQL is broken. + +``` +com.amazonaws.services.s3.model.AmazonS3Exception: Unexpected operator OPERATOR:'%' at line 1, column 45. +(Service: Amazon S3; Status Code: 400; +Error Code: ParseUnexpectedOperator; Request ID: E87F30C57436B459; +S3 Extended Request ID: UBFOIgkQxBBL+bcBFPaZaPBsjdnd8NRz3NFWAgcctqm3n6f7ib9FMOpR+Eu1Cy6cNMYHCpJbYEY + =:ParseUnexpectedOperator: Unexpected operator OPERATOR:'%' at line 1, column 45. +at java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357) +at java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1895) +``` + +### `MissingHeaders` + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: +Select: SELECT * FROM S3OBJECT s WHERE s."odd" = `true` on test/testSelectOddLines.csv: +com.amazonaws.services.s3.model.AmazonS3Exception: +Some headers in the query are missing from the file. +Please check the file and try again. +(Service: Amazon S3; Status Code: 400; Error Code: MissingHeaders; +``` + +1. There's a header used in the query which doesn't match any in the document +itself. +1. The header option for the select query is set to "none" or "ignore", and +you are trying to use a header named there. + +This can happen if you are trying to use double quotes for constants in the +SQL expression. + +``` +SELECT * FROM S3OBJECT s WHERE s."odd" = "true" on test/testSelectOddLines.csv: + ^ HERE +``` + +Double quotes (") may only be used when naming columns; for constants +single quotes are required. + +### Method not allowed + +``` +org.apache.hadoop.fs.s3a.AWSS3IOException: Select on test/testSelectWholeFile: +com.amazonaws.services.s3.model.AmazonS3Exception: The specified method is not +allowed against this resource. (Service: Amazon S3; Status Code: 405; +Error Code: MethodNotAllowed; +``` + +You are trying to use S3 Select to read data which for some reason +you are not allowed to. 
+

### AWSBadRequestException `InvalidTextEncoding`

The file couldn't be parsed. This can happen if you try to read a `.gz` file
and forget to set the compression in the select request.

That can be done through the `fs.s3a.select.compression` option.

```
org.apache.hadoop.fs.s3a.AWSBadRequestException:
  Select: '" SELECT * FROM S3OBJECT s WHERE endstation_name = 'Bayswater Road: Hyde Park' "
  on s3a://example/dataset.csv.gz:
  com.amazonaws.services.s3.model.AmazonS3Exception:
  UTF-8 encoding is required. The text encoding error was found near byte 8,192.
  (Service: Amazon S3; Status Code: 400; Error Code: InvalidTextEncoding
```

### AWSBadRequestException `InvalidCompressionFormat` "GZIP is not applicable to the queried object"

A SELECT call has been made with a compression setting which doesn't match that of the
source object; for example, the source is a plain text file.

```
org.apache.hadoop.fs.s3a.AWSBadRequestException: Select:
 '" SELECT * FROM S3OBJECT s WHERE endstation_name = 'Bayswater Road: Hyde Park' "
  on s3a://example/dataset.csv:
  com.amazonaws.services.s3.model.AmazonS3Exception:
  GZIP is not applicable to the queried object. Please correct the request and try again.
  (Service: Amazon S3; Status Code: 400; Error Code: InvalidCompressionFormat;
  at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:212)
  at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:111)
...
Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: GZIP is not applicable to the queried object.
  Please correct the request and try again.
  Service: Amazon S3; Status Code: 400; Error Code: InvalidCompressionFormat;
  at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse
  ...
```

### `PathIOException`: "seek() not supported"

The input stream returned by the select call does not support seeking
backwards in the stream.

Similarly, `PositionedReadable` operations will fail when used to read
data at any offset other than that of `getPos()`.

```
org.apache.hadoop.fs.PathIOException: `s3a://landsat-pds/landsat.csv.gz': seek() not supported

  at org.apache.hadoop.fs.s3a.select.SelectInputStream.unsupported(SelectInputStream.java:254)
  at org.apache.hadoop.fs.s3a.select.SelectInputStream.seek(SelectInputStream.java:243)
  at org.apache.hadoop.fs.FSDataInputStream.seek(FSDataInputStream.java:66)
```

There is no fix for this. You can move forward in a file using `skip(offset)`;
bear in mind that the return value indicates the offset actually skipped, which
may be less than expected.

### `IllegalArgumentException`: "Unknown mandatory key "fs.s3a.select.sql""

The filesystem is not an S3A filesystem, so the S3A select option is not recognized.

```
java.lang.IllegalArgumentException: Unknown mandatory key "fs.s3a.select.sql"
at com.google.common.base.Preconditions.checkArgument(Preconditions.java:88)
at org.apache.hadoop.fs.AbstractFSBuilder.lambda$rejectUnknownMandatoryKeys$0(AbstractFSBuilder.java:331)
at java.lang.Iterable.forEach(Iterable.java:75)
at java.util.Collections$UnmodifiableCollection.forEach(Collections.java:1080)
at org.apache.hadoop.fs.AbstractFSBuilder.rejectUnknownMandatoryKeys(AbstractFSBuilder.java:330)
at org.apache.hadoop.fs.filesystem.openFileWithOptions(FileSystem.java:3541)
at org.apache.hadoop.fs.FileSystem$FSDataInputStreamBuilder.build(FileSystem.java:4442)
```

* Verify that the URL has an "s3a:" prefix.
+
* If it does, there may be a non-standard S3A implementation, or a
filtering/relaying class has been placed in front of the S3AFilesystem.

### `IllegalArgumentException`: "Unknown mandatory key in non-select file I/O"

The file options to tune an S3 select call are only valid when a SQL expression
is set in the `fs.s3a.select.sql` option. If not, any such option added as a `must()` value
will fail.

```
java.lang.IllegalArgumentException: Unknown mandatory key for s3a://example/test/testSelectOptionsOnlyOnSelectCalls.csv in non-select file I/O "fs.s3a.select.input.csv.header"

  at com.google.common.base.Preconditions.checkArgument(Preconditions.java:115)
  at org.apache.hadoop.fs.impl.AbstractFSBuilderImpl.lambda$rejectUnknownMandatoryKeys$0(AbstractFSBuilderImpl.java:352)
  at java.lang.Iterable.forEach(Iterable.java:75)
  at java.util.Collections$UnmodifiableCollection.forEach(Collections.java:1080)
  at org.apache.hadoop.fs.impl.AbstractFSBuilderImpl.rejectUnknownMandatoryKeys(AbstractFSBuilderImpl.java:351)
  at org.apache.hadoop.fs.s3a.S3AFileSystem.openFileWithOptions(S3AFileSystem.java:3736)
  at org.apache.hadoop.fs.FileSystem$FSDataInputStreamBuilder.build(FileSystem.java:4471)
```

Requiring these options without providing a SQL query is invariably an error.
Fix: add the SQL statement, or use `opt()` calls to set the option.

If the `fs.s3a.select.sql` option is set, and a key is still rejected, then
either the spelling of the key is wrong, it has leading or trailing spaces,
or it is an option not supported in that specific release of Hadoop.


### `PathIOException`: "seek() backwards from not supported"

Backwards seeks in an S3 Select `SelectInputStream` are not supported.

```
org.apache.hadoop.fs.PathIOException: `s3a://landsat-pds/scene_list.gz':
  seek() backwards from 16387 to 0 not supported

  at org.apache.hadoop.fs.s3a.select.SelectInputStream.unsupported(SelectInputStream.java:288)
  at org.apache.hadoop.fs.s3a.select.SelectInputStream.seek(SelectInputStream.java:253)
  at org.apache.hadoop.fs.FSDataInputStream.seek(FSDataInputStream.java:66)
```

### `InvalidTableAlias`

The SELECT refers to the name of a column which is not recognized. Possible causes:

* the name of a column is wrong, here `s.oddf`.
* headers are not enabled for the CSV source file. Fix: enable them.
* a generated alias is used, e.g. `s._1`, but headers have been enabled.
Fix: disable the headers, or use the header name.

```
org.apache.hadoop.fs.s3a.AWSBadRequestException:
  SELECT * FROM S3OBJECT WHERE s."oddf" = 'true'
  on s3a://example/test/testParseBrokenCSVFile:
  com.amazonaws.services.s3.model.AmazonS3Exception:
  Invalid table alias is specified at line 1, column 30.
  Please check the file and try again. (Service: Amazon S3; Status Code: 400; Error Code: InvalidTableAlias;
  Invalid table alias is specified at line 1, column 30. Please check the file and try again.
  (Service: Amazon S3;
  Status Code: 400;
  Error Code: InvalidTableAlias;
  Request ID: 8693B86A52CFB91C;
  at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:225)
  at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:111)
  at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$3(Invoker.java:265)
  ...
Caused by: com.amazonaws.services.s3.model.AmazonS3Exception:
  Invalid table alias is specified at line 1, column 30.
  Please check the file and try again.
+ (Service: Amazon S3; Status Code: 400; Error Code: InvalidTableAlias; Request ID: 8693B86A52CFB91C; + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1640) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1304) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1058) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:743) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667) + at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649) +``` + +### `AWSBadRequestException` "Attempt to convert from one data type to another failed: cast from STRING to TIMESTAMP." + +A string field could not be converted to a timestamp because one or more of its entries were not parseable +with the given timestamp. + +Example, from a spreadsheet where "timestamp" is normally a well-formatted timestamp field, +but in one column it is just "Tuesday" + +```sql +SELECT CAST(s.date AS TIMESTAMP) FROM S3OBJECT s +``` + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: Select on s3a://example/test/testParseBrokenCSVFile: +com.amazonaws.services.s3.model.AmazonS3Exception: +Attempt to convert from one data type to another failed: cast from STRING to TIMESTAMP. +(Service: Amazon S3; Status Code: 400; Error Code: CastFailed; +Request ID: E2158FE45AF2049A; S3 Extended Request ID: iM40fzGuaPt6mQo0QxDDX+AY1bAgSVD1sKErFq6Y4GDJYHIAnmc00i0EvGGnH+0MFCFhKIivIrQ=), +S3 Extended Request ID: iM40fzGuaPt6mQo0QxDDX+AY1bAgSVD1sKErFq6Y4GDJYHIAnmc00i0EvGGnH+0MFCFhKIivIrQ=:CastFailed: +Attempt to convert from one data type to another failed: cast from STRING to TIMESTAMP. +(Service: Amazon S3; Status Code: 400; Error Code: CastFailed; Request ID: E2158FE45AF2049A; S3 Extended Request ID: iM40fzGuaPt6mQo0QxDDX+AY1bAgSVD1sKErFq6Y4GDJYHIAnmc00i0EvGGnH+0MFCFhKIivIrQ=) + at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:225) + at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:111) + at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$3(Invoker.java:265) +Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: + Attempt to convert from one data type to another failed: cast from STRING to TIMESTAMP. + (Service: Amazon S3; Status Code: 400; Error Code: CastFailed;) + +``` + +There's no way to recover from a bad record here; no option to skip invalid +rows. + +*Note:* This is an example stack trace *without* the SQL being printed. diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md index 09b6d4b32b584..058fb35e259c5 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md @@ -489,6 +489,22 @@ the `fs.s3a.scale.test.csvfile` option set to its path. (yes, the space is necessary. The Hadoop `Configuration` class treats an empty value as "do not override the default"). +### Turning off S3 Select + +The S3 select tests are skipped when the S3 endpoint doesn't support S3 Select. 
+ +```xml + + fs.s3a.select.enabled + false + +``` + +If your endpoint doesn't support that feature, this option should be in +your `core-site.xml` file, so that trying to use S3 select fails fast with +a meaningful error ("S3 Select not supported") rather than a generic Bad Request +exception. + ### Testing Session Credentials diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java index 267646ca258e4..9e8a871ef7aad 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java @@ -39,6 +39,7 @@ import static org.apache.hadoop.fs.s3a.Constants.*; import static org.apache.hadoop.fs.s3a.S3ATestConstants.*; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.getCSVTestPath; import static org.apache.hadoop.fs.s3a.S3AUtils.*; import static org.junit.Assert.*; @@ -150,8 +151,7 @@ public void testAnonymousProvider() throws Exception { Configuration conf = new Configuration(); conf.set(AWS_CREDENTIALS_PROVIDER, AnonymousAWSCredentialsProvider.class.getName()); - Path testFile = new Path( - conf.getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE)); + Path testFile = getCSVTestPath(conf); FileSystem fs = FileSystem.newInstance(testFile.toUri(), conf); assertNotNull(fs); assertTrue(fs instanceof S3AFileSystem); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java index da9ecc0d90c5f..8f8d8605653b1 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java @@ -24,7 +24,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; -import org.junit.Assume; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -34,6 +33,7 @@ import java.util.List; import static org.apache.hadoop.fs.contract.ContractTestUtils.*; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.getLandsatCSVPath; import static org.apache.hadoop.test.LambdaTestUtils.*; /** @@ -89,12 +89,9 @@ public void testMultiObjectDeleteSomeFiles() throws Throwable { @Test public void testMultiObjectDeleteNoPermissions() throws Throwable { - Configuration conf = getConfiguration(); - String csvFile = conf.getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE); - Assume.assumeTrue("CSV test file is not the default", - DEFAULT_CSVTEST_FILE.equals(csvFile)); - Path testFile = new Path(csvFile); - S3AFileSystem fs = (S3AFileSystem)testFile.getFileSystem(conf); + Path testFile = getLandsatCSVPath(getConfiguration()); + S3AFileSystem fs = (S3AFileSystem)testFile.getFileSystem( + getConfiguration()); intercept(MultiObjectDeleteException.class, () -> removeKeys(fs, fs.pathToKey(testFile))); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java index 11cbd4ff8a86b..e15c24aced88f 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java @@ -29,15 +29,24 @@ import 
org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials; import org.apache.hadoop.fs.s3a.commit.CommitConstants; +import org.apache.hadoop.fs.s3a.s3guard.MetadataStore; +import org.apache.hadoop.fs.s3a.s3guard.MetadataStoreCapabilities; +import org.apache.hadoop.fs.s3native.S3xLoginHelper; +import org.apache.hadoop.io.DataInputBuffer; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.io.Writable; import org.apache.hadoop.service.Service; import org.apache.hadoop.service.ServiceOperations; +import org.apache.hadoop.util.ReflectionUtils; +import com.amazonaws.auth.AWSCredentialsProvider; import org.hamcrest.core.Is; import org.junit.Assert; import org.junit.Assume; -import org.junit.internal.AssumptionViolatedException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -49,9 +58,11 @@ import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.List; +import java.util.Map; import java.util.concurrent.Callable; -import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_CREDENTIAL_PROVIDER_PATH; +import static org.apache.commons.lang3.StringUtils.isNotEmpty; import static org.apache.hadoop.fs.contract.ContractTestUtils.skip; import static org.apache.hadoop.fs.s3a.FailureInjectionPolicy.*; import static org.apache.hadoop.fs.s3a.S3ATestConstants.*; @@ -77,6 +88,27 @@ public final class S3ATestUtils { public static final String UNSET_PROPERTY = "unset"; public static final int PURGE_DELAY_SECONDS = 60 * 60; + /** Add any deprecated keys. */ + @SuppressWarnings("deprecation") + private static void addDeprecatedKeys() { + // STS endpoint configuration option + Configuration.DeprecationDelta[] deltas = { + // STS endpoint configuration option + new Configuration.DeprecationDelta( + S3ATestConstants.TEST_STS_ENDPOINT, + ASSUMED_ROLE_STS_ENDPOINT) + }; + + if (deltas.length > 0) { + Configuration.addDeprecations(deltas); + Configuration.reloadExistingConfigurations(); + } + } + + static { + addDeprecatedKeys(); + } + /** * Get S3A FS name. * @param conf configuration. 
@@ -112,7 +144,6 @@ public static S3AFileSystem createTestFileSystem(Configuration conf) * @param purge flag to enable Multipart purging * @return the FS * @throws IOException IO Problems - * @throws AssumptionViolatedException if the FS is not named */ public static S3AFileSystem createTestFileSystem(Configuration conf, boolean purge) @@ -126,12 +157,10 @@ public static S3AFileSystem createTestFileSystem(Configuration conf, testURI = URI.create(fsname); liveTest = testURI.getScheme().equals(Constants.FS_S3A); } - if (!liveTest) { - // This doesn't work with our JUnit 3 style test cases, so instead we'll - // make this whole class not run by default - throw new AssumptionViolatedException( - "No test filesystem in " + TEST_FS_S3A_NAME); - } + // This doesn't work with our JUnit 3 style test cases, so instead we'll + // make this whole class not run by default + Assume.assumeTrue("No test filesystem in " + TEST_FS_S3A_NAME, + liveTest); // patch in S3Guard options maybeEnableS3Guard(conf); S3AFileSystem fs1 = new S3AFileSystem(); @@ -160,7 +189,6 @@ public static void enableMultipartPurge(Configuration conf, int seconds) { * @param conf configuration * @return the FS * @throws IOException IO Problems - * @throws AssumptionViolatedException if the FS is not named */ public static FileContext createTestFileContext(Configuration conf) throws IOException { @@ -172,12 +200,10 @@ public static FileContext createTestFileContext(Configuration conf) testURI = URI.create(fsname); liveTest = testURI.getScheme().equals(Constants.FS_S3A); } - if (!liveTest) { - // This doesn't work with our JUnit 3 style test cases, so instead we'll - // make this whole class not run by default - throw new AssumptionViolatedException("No test filesystem in " - + TEST_FS_S3A_NAME); - } + // This doesn't work with our JUnit 3 style test cases, so instead we'll + // make this whole class not run by default + Assume.assumeTrue("No test filesystem in " + TEST_FS_S3A_NAME, + liveTest); // patch in S3Guard options maybeEnableS3Guard(conf); FileContext fc = FileContext.getFileContext(testURI, conf); @@ -295,10 +321,56 @@ public static String getTestProperty(Configuration conf, String defVal) { String confVal = conf != null ? conf.getTrimmed(key, defVal) : defVal; String propval = System.getProperty(key); - return StringUtils.isNotEmpty(propval) && !UNSET_PROPERTY.equals(propval) + return isNotEmpty(propval) && !UNSET_PROPERTY.equals(propval) ? propval : confVal; } + /** + * Get the test CSV file; assume() that it is not empty. + * @param conf test configuration + * @return test file. + */ + public static String getCSVTestFile(Configuration conf) { + String csvFile = conf + .getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE); + Assume.assumeTrue("CSV test file is not the default", + isNotEmpty(csvFile)); + return csvFile; + } + + /** + * Get the test CSV path; assume() that it is not empty. + * @param conf test configuration + * @return test file as a path. + */ + public static Path getCSVTestPath(Configuration conf) { + return new Path(getCSVTestFile(conf)); + } + + /** + * Get the test CSV file; assume() that it is not modified (i.e. we haven't + * switched to a new storage infrastructure where the bucket is no longer + * read only). + * @return test file. 
+ * @param conf test configuration + */ + public static String getLandsatCSVFile(Configuration conf) { + String csvFile = getCSVTestFile(conf); + Assume.assumeTrue("CSV test file is not the default", + DEFAULT_CSVTEST_FILE.equals(csvFile)); + return csvFile; + } + /** + * Get the test CSV file; assume() that it is not modified (i.e. we haven't + * switched to a new storage infrastructure where the bucket is no longer + * read only). + * @param conf test configuration + * @return test file as a path. + */ + public static Path getLandsatCSVPath(Configuration conf) { + return new Path(getLandsatCSVFile(conf)); + } + /** * Verify the class of an exception. If it is not as expected, rethrow it. * Comparison is on the exact class, not subclass-of inference as @@ -516,15 +588,111 @@ public static Configuration prepareTestConfiguration(final Configuration conf) { } /** - * Get the name of the test bucket. - * @param conf configuration to scan. - * @return the bucket name from the config. - * @throws NullPointerException: no test bucket + * Clear any Hadoop credential provider path. + * This is needed if people's test setups switch to credential providers, + * and the test case is altering FS login details: changes made in the + * config will not be picked up. + * @param conf configuration to update + */ + public static void unsetHadoopCredentialProviders(final Configuration conf) { + conf.unset(HADOOP_SECURITY_CREDENTIAL_PROVIDER_PATH); + } + + /** + * Build AWS credentials to talk to the STS. Also where checks for the + * session tests being disabled are implemented. + * @return a set of credentials + * @throws IOException on a failure + */ + public static AWSCredentialsProvider buildAwsCredentialsProvider( + final Configuration conf) + throws IOException { + assumeSessionTestsEnabled(conf); + + S3xLoginHelper.Login login = S3AUtils.getAWSAccessKeys( + URI.create("s3a://foobar"), conf); + if (!login.hasLogin()) { + skip("testSTS disabled because AWS credentials not configured"); + } + return new SimpleAWSCredentialsProvider(login); + } + + /** + * Skip the current test if STS tess are not enabled. + * @param conf configuration to examine + */ + public static void assumeSessionTestsEnabled(final Configuration conf) { + if (!conf.getBoolean(TEST_STS_ENABLED, true)) { + skip("STS functional tests disabled"); + } + } + + /** + * Request session credentials for the default time (900s). + * @param conf configuration to use for login + * @param bucket Optional bucket to use to look up per-bucket proxy secrets + * @return the credentials + * @throws IOException on a failure + */ + public static MarshalledCredentials requestSessionCredentials( + final Configuration conf, + final String bucket) + throws IOException { + return requestSessionCredentials(conf, bucket, + TEST_SESSION_TOKEN_DURATION_SECONDS); + } + + /** + * Request session credentials. + * @param conf The Hadoop configuration + * @param bucket Optional bucket to use to look up per-bucket proxy secrets + * @param duration duration in seconds. 
+ * @return the credentials + * @throws IOException on a failure */ - public static String getTestBucketName(final Configuration conf) { - String bucket = checkNotNull(conf.get(TEST_FS_S3A_NAME), - "No test bucket"); - return URI.create(bucket).getHost(); + public static MarshalledCredentials requestSessionCredentials( + final Configuration conf, + final String bucket, + final int duration) + throws IOException { + assumeSessionTestsEnabled(conf); + MarshalledCredentials sc = MarshalledCredentialBinding + .requestSessionCredentials( + buildAwsCredentialsProvider(conf), + S3AUtils.createAwsConf(conf, bucket), + conf.getTrimmed(ASSUMED_ROLE_STS_ENDPOINT, + DEFAULT_ASSUMED_ROLE_STS_ENDPOINT), + conf.getTrimmed(ASSUMED_ROLE_STS_ENDPOINT_REGION, + ASSUMED_ROLE_STS_ENDPOINT_REGION_DEFAULT), + duration, + new Invoker(new S3ARetryPolicy(conf), Invoker.LOG_EVENT)); + sc.validate("requested session credentials: ", + MarshalledCredentials.CredentialTypeRequired.SessionOnly); + return sc; + } + + /** + * Round trip a writable to a new instance. + * @param source source object + * @param conf configuration + * @param type + * @return an unmarshalled instance of the type + * @throws Exception on any failure. + */ + @SuppressWarnings("unchecked") + public static T roundTrip( + final T source, + final Configuration conf) + throws Exception { + DataOutputBuffer dob = new DataOutputBuffer(); + source.write(dob); + + DataInputBuffer dib = new DataInputBuffer(); + dib.reset(dob.getData(), dob.getLength()); + + T after = ReflectionUtils.newInstance((Class) source.getClass(), conf); + after.readFields(dib); + return after; } /** @@ -1000,12 +1168,9 @@ public static void skipDuringFaultInjection(S3AFileSystem fs) { * Skip a test if the FS isn't marked as supporting magic commits. 
* @param fs filesystem */ - public static void assumeMagicCommitEnabled(S3AFileSystem fs) - throws IOException { + public static void assumeMagicCommitEnabled(S3AFileSystem fs) { assume("Magic commit option disabled on " + fs, - fs.hasPathCapability( - fs.getWorkingDirectory(), - CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER)); + fs.hasCapability(CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER)); } /** @@ -1023,4 +1188,14 @@ public static boolean authenticationContains(Configuration conf, .contains(providerClassname); } + public static boolean metadataStorePersistsAuthoritativeBit(MetadataStore ms) + throws IOException { + Map diags = ms.getDiagnostics(); + String persists = + diags.get(MetadataStoreCapabilities.PERSISTS_AUTHORITATIVE_BIT); + if(persists == null){ + return false; + } + return Boolean.valueOf(persists); + } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java index 66f7e0a3d3c70..e7f836be728a6 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java @@ -93,8 +93,7 @@ public void testInstantiationChain() throws Throwable { TemporaryAWSCredentialsProvider.NAME + ", \t" + SimpleAWSCredentialsProvider.NAME + " ,\n " + AnonymousAWSCredentialsProvider.NAME); - Path testFile = new Path( - conf.getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE)); + Path testFile = getCSVTestPath(conf); URI uri = testFile.toUri(); AWSCredentialProviderList list = S3AUtils.createAWSCredentialProviderSet( diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractCommitITest.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractCommitITest.java index 0a3d07a195b45..ef594e62a7f02 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractCommitITest.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractCommitITest.java @@ -183,7 +183,7 @@ public void setup() throws Exception { * @return fork ID string in a format parseable by Jobs * @throws Exception failure */ - protected String randomJobId() throws Exception { + public static String randomJobId() throws Exception { String testUniqueForkId = System.getProperty(TEST_UNIQUE_FORK_ID, "0001"); int l = testUniqueForkId.length(); String trailingDigits = testUniqueForkId.substring(l - 4, l); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java index 9c814f4a7f46d..71e9975c7326c 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java @@ -24,7 +24,6 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; -import java.io.PrintStream; import java.net.URI; import java.util.Collection; import java.util.HashSet; @@ -37,7 +36,6 @@ import org.apache.hadoop.util.StopWatch; import com.google.common.base.Preconditions; import org.apache.hadoop.fs.FileSystem; -import org.junit.Assume; import org.junit.Test; import 
org.apache.hadoop.conf.Configuration; @@ -53,13 +51,18 @@ import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.StringUtils; +import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_AUTHORITATIVE; +import static org.apache.hadoop.fs.s3a.Constants.S3GUARD_DDB_REGION_KEY; import static org.apache.hadoop.fs.s3a.Constants.S3GUARD_DDB_TABLE_CREATE_KEY; import static org.apache.hadoop.fs.s3a.Constants.S3GUARD_DDB_TABLE_NAME_KEY; import static org.apache.hadoop.fs.s3a.Constants.S3GUARD_METASTORE_NULL; import static org.apache.hadoop.fs.s3a.Constants.S3_METADATA_STORE_IMPL; import static org.apache.hadoop.fs.s3a.S3AUtils.clearBucketOption; import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.E_BAD_STATE; +import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.E_NO_METASTORE_OR_FILESYSTEM; +import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.E_USAGE; import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.SUCCESS; +import static org.apache.hadoop.fs.s3a.s3guard.S3GuardToolTestHelper.exec; import static org.apache.hadoop.test.LambdaTestUtils.intercept; /** @@ -84,11 +87,21 @@ protected static void expectResult(int expected, assertEquals(message, expected, tool.run(args)); } - protected static void expectSuccess( + /** + * Expect a command to succeed. + * @param message any extra text to include in the assertion error message + * @param tool tool to run + * @param args arguments to the command + * @return the output of any successful run + * @throws Exception failure + */ + protected static String expectSuccess( String message, S3GuardTool tool, String... args) throws Exception { - assertEquals(message, SUCCESS, tool.run(args)); + ByteArrayOutputStream buf = new ByteArrayOutputStream(); + exec(SUCCESS, message, tool, buf, args); + return buf.toString(); } /** @@ -153,7 +166,7 @@ public void setup() throws Exception { conf.set(S3_METADATA_STORE_IMPL, S3GUARD_METASTORE_NULL); URI fsUri = fs.getUri(); S3AUtils.setBucketOption(conf,fsUri.getHost(), - S3_METADATA_STORE_IMPL, + METADATASTORE_AUTHORITATIVE, S3GUARD_METASTORE_NULL); rawFs = (S3AFileSystem) FileSystem.newInstance(fsUri, conf); } @@ -315,31 +328,89 @@ public void testSetCapacityFailFastOnReadWriteOfZero() throws Exception{ S3GuardTool.SetCapacity.WRITE_CAP_INVALID, () -> cmdW.run(argsW)); } + @Test + public void testBucketInfoUnguarded() throws Exception { + final Configuration conf = getConfiguration(); + conf.set(S3GUARD_DDB_TABLE_CREATE_KEY, Boolean.FALSE.toString()); + conf.set(S3GUARD_DDB_TABLE_NAME_KEY, + "testBucketInfoUnguarded-" + UUID.randomUUID()); + + // run a bucket info command and look for + // confirmation that it got the output from DDB diags + S3GuardTool.BucketInfo infocmd = new S3GuardTool.BucketInfo(conf); + String info = exec(infocmd, S3GuardTool.BucketInfo.NAME, + "-" + S3GuardTool.BucketInfo.UNGUARDED_FLAG, + getFileSystem().getUri().toString()); + + assertTrue("Output should contain information about S3A client " + info, + info.contains("S3A Client")); + } + @Test public void testSetCapacityFailFastIfNotGuarded() throws Exception{ Configuration conf = getConfiguration(); - conf.set(S3GUARD_DDB_TABLE_NAME_KEY, UUID.randomUUID().toString()); - conf.set(S3GUARD_DDB_TABLE_CREATE_KEY, Boolean.FALSE.toString()); + bindToNonexistentTable(conf); + String bucket = rawFs.getBucket(); + clearBucketOption(conf, bucket, S3_METADATA_STORE_IMPL); + clearBucketOption(conf, bucket, S3GUARD_DDB_TABLE_NAME_KEY); + clearBucketOption(conf, bucket, S3GUARD_DDB_TABLE_CREATE_KEY); 
conf.set(S3_METADATA_STORE_IMPL, S3GUARD_METASTORE_NULL); S3GuardTool.SetCapacity cmdR = new S3GuardTool.SetCapacity(conf); - String[] argsR = new String[]{cmdR.getName(), - "s3a://" + getFileSystem().getBucket()}; + String[] argsR = new String[]{ + cmdR.getName(), + "s3a://" + getFileSystem().getBucket() + }; intercept(IllegalStateException.class, "unguarded", - () -> run(argsR)); + () -> cmdR.run(argsR)); + } + + /** + * Binds the configuration to a nonexistent table. + * @param conf + */ + protected void bindToNonexistentTable(final Configuration conf) { + conf.set(S3GUARD_DDB_TABLE_NAME_KEY, UUID.randomUUID().toString()); + conf.setBoolean(S3GUARD_DDB_TABLE_CREATE_KEY, false); } @Test public void testDestroyNoBucket() throws Throwable { + describe("Destroy a bucket which doesn't exist"); + + Configuration conf = getConfiguration(); + // set a table as a safety check in case the test goes wrong + // and deletes it. + bindToNonexistentTable(conf); + + S3GuardTool.Destroy cmdR = new S3GuardTool.Destroy(conf); + String[] argsR = new String[]{ + S3GuardTool.Destroy.NAME, + S3A_THIS_BUCKET_DOES_NOT_EXIST + }; intercept(FileNotFoundException.class, - new Callable() { - @Override - public Integer call() throws Exception { - return run(S3GuardTool.Destroy.NAME, - S3A_THIS_BUCKET_DOES_NOT_EXIST); - } - }); + () -> cmdR.run(argsR)); + } + + @Test + public void testDestroyNoArgs() throws Throwable { + describe("Destroy a bucket which doesn't exist"); + + Configuration conf = getConfiguration(); + // set a table as a safety check in case the test goes wrong + // and deletes it. + conf.set(S3GUARD_DDB_TABLE_NAME_KEY, UUID.randomUUID().toString()); + conf.set(S3GUARD_DDB_REGION_KEY, "us-gov-west-1"); + conf.setBoolean(S3GUARD_DDB_TABLE_CREATE_KEY, false); + + S3GuardTool.Destroy cmdR = new S3GuardTool.Destroy(conf); + + assertExitCode(E_USAGE, + intercept(ExitUtil.ExitException.class, + E_NO_METASTORE_OR_FILESYSTEM, + () -> cmdR.run(new String[]{}))); + } @Test @@ -348,70 +419,44 @@ public void testProbeForMagic() throws Throwable { String name = fs.getUri().toString(); S3GuardTool.BucketInfo cmd = new S3GuardTool.BucketInfo( getConfiguration()); - if (fs.hasPathCapability(fs.getWorkingDirectory(), + if (fs.hasCapability( CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER)) { // if the FS is magic, expect this to work exec(cmd, S3GuardTool.BucketInfo.MAGIC_FLAG, name); } else { // if the FS isn't magic, expect the probe to fail - ExitUtil.ExitException e = intercept(ExitUtil.ExitException.class, - () -> exec(cmd, S3GuardTool.BucketInfo.MAGIC_FLAG, name)); - if (e.getExitCode() != E_BAD_STATE) { - throw e; - } + assertExitCode(E_BAD_STATE, + intercept(ExitUtil.ExitException.class, + () -> exec(cmd, S3GuardTool.BucketInfo.MAGIC_FLAG, name))); } } /** - * Get the test CSV file; assume() that it is not modified (i.e. we haven't - * switched to a new storage infrastructure where the bucket is no longer - * read only). - * @return test file. + * Assert that an exit exception had a specific error code. + * @param expectedErrorCode expected code. + * @param e exit exception + * @throws AssertionError with the exit exception nested inside */ - protected String getLandsatCSVFile() { - String csvFile = getConfiguration() - .getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE); - Assume.assumeTrue("CSV test file is not the default", - DEFAULT_CSVTEST_FILE.equals(csvFile)); - return csvFile; - } - - /** - * Execute a command, returning the buffer if the command actually completes. 
- * If an exception is raised the output is logged instead. - * @param cmd command - * @param args argument list - * @throws Exception on any failure - */ - public String exec(S3GuardTool cmd, String...args) throws Exception { - ByteArrayOutputStream buf = new ByteArrayOutputStream(); - try { - exec(cmd, buf, args); - return buf.toString(); - } catch (AssertionError e) { - throw e; - } catch (Exception e) { - LOG.error("Command {} failed: \n{}", cmd, buf); - throw e; + protected void assertExitCode(final int expectedErrorCode, + final ExitUtil.ExitException e) { + if (e.getExitCode() != expectedErrorCode) { + throw new AssertionError("Expected error code " + expectedErrorCode + + " in " + e, + e); } } - /** - * Execute a command, saving the output into the buffer. - * @param cmd command - * @param buf buffer to use for tool output (not SLF4J output) - * @param args argument list - * @throws Exception on any failure - */ - protected void exec(S3GuardTool cmd, ByteArrayOutputStream buf, String...args) + @Test + public void testDestroyFailsIfNoBucketNameOrDDBTableSet() throws Exception { - LOG.info("exec {}", (Object) args); - int r = 0; - try(PrintStream out =new PrintStream(buf)) { - r = cmd.run(args, out); - out.flush(); - } - assertEquals("Command " + cmd + " failed\n"+ buf, 0, r); + intercept(ExitUtil.ExitException.class, + () -> run(S3GuardTool.Destroy.NAME)); + } + + @Test + public void testInitFailsIfNoBucketNameOrDDBTableSet() throws Exception { + intercept(ExitUtil.ExitException.class, + () -> run(S3GuardTool.Init.NAME)); } @Test @@ -449,7 +494,7 @@ protected void exec(S3GuardTool cmd, ByteArrayOutputStream buf, String...args) ByteArrayOutputStream buf = new ByteArrayOutputStream(); S3GuardTool.Diff cmd = new S3GuardTool.Diff(fs.getConf()); cmd.setStore(ms); - exec(cmd, buf, "diff", "-meta", DYNAMODB_TABLE, testPath.toString()); + exec(0, "", cmd, buf, "diff", "-meta", DYNAMODB_TABLE, testPath.toString()); Set actualOnS3 = new HashSet<>(); Set actualOnMS = new HashSet<>(); @@ -481,40 +526,4 @@ protected void exec(S3GuardTool cmd, ByteArrayOutputStream buf, String...args) assertEquals("Mismatched s3 outputs: " + actualOut, filesOnS3, actualOnS3); assertFalse("Diff contained duplicates", duplicates); } - - @Test - public void testLandsatBucketMarkerAware() throws Throwable { - describe("verify that -markers aware succeeds"); - run(S3GuardTool.BucketInfo.NAME, - "-" + S3GuardTool.BucketInfo.MARKERS_FLAG, - S3GuardTool.BucketInfo.MARKERS_AWARE, - getLandsatCSVFile()); - } - - @Test - public void testLandsatBucketMarkerDelete() throws Throwable { - describe("verify that -markers delete succeeds"); - run(S3GuardTool.BucketInfo.NAME, - "-" + S3GuardTool.BucketInfo.MARKERS_FLAG, "delete", - getLandsatCSVFile()); - } - - @Test - public void testLandsatBucketMarkerKeepFails() throws Throwable { - describe("verify that -markers keep fails"); - runToFailure(E_BAD_STATE, - S3GuardTool.BucketInfo.NAME, - "-" + S3GuardTool.BucketInfo.MARKERS_FLAG, "keep", - getLandsatCSVFile()); - } - - @Test - public void testLandsatBucketMarkerAuthFails() throws Throwable { - describe("verify that -markers authoritative fails"); - runToFailure(E_BAD_STATE, - S3GuardTool.BucketInfo.NAME, - "-" + S3GuardTool.BucketInfo.MARKERS_FLAG, "authoritative", - getLandsatCSVFile()); - } - } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolDynamoDB.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolDynamoDB.java index 
65e2619fe7524..13df6e3da8ece 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolDynamoDB.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolDynamoDB.java @@ -49,7 +49,7 @@ import static org.apache.hadoop.fs.s3a.Constants.S3GUARD_DDB_TABLE_TAG; import static org.apache.hadoop.fs.s3a.s3guard.DynamoDBMetadataStore.*; import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.*; -import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.apache.hadoop.fs.s3a.s3guard.S3GuardToolTestHelper.exec; /** * Test S3Guard related CLI commands against DynamoDB. diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolLocal.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolLocal.java index 1ee3cde80d996..6a4d45e9ea170 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolLocal.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolLocal.java @@ -40,7 +40,9 @@ import org.apache.hadoop.fs.s3a.S3AFileSystem; import static org.apache.hadoop.fs.s3a.MultipartTestUtils.*; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.getLandsatCSVFile; import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.*; +import static org.apache.hadoop.fs.s3a.s3guard.S3GuardToolTestHelper.exec; import static org.apache.hadoop.test.LambdaTestUtils.intercept; /** @@ -97,7 +99,7 @@ public void testImportCommand() throws Exception { public void testDestroyBucketExistsButNoTable() throws Throwable { run(Destroy.NAME, "-meta", LOCAL_METADATA, - getLandsatCSVFile()); + getLandsatCSVFile(getConfiguration())); } @Test @@ -161,7 +163,7 @@ public void testInitTwice() throws Throwable { public void testLandsatBucketUnguarded() throws Throwable { run(BucketInfo.NAME, "-" + BucketInfo.UNGUARDED_FLAG, - getLandsatCSVFile()); + getLandsatCSVFile(getConfiguration())); } @Test @@ -169,14 +171,15 @@ public void testLandsatBucketRequireGuarded() throws Throwable { runToFailure(E_BAD_STATE, BucketInfo.NAME, "-" + BucketInfo.GUARDED_FLAG, - ITestS3GuardToolLocal.this.getLandsatCSVFile()); + getLandsatCSVFile( + ITestS3GuardToolLocal.this.getConfiguration())); } @Test public void testLandsatBucketRequireUnencrypted() throws Throwable { run(BucketInfo.NAME, "-" + BucketInfo.ENCRYPTION_FLAG, "none", - getLandsatCSVFile()); + getLandsatCSVFile(getConfiguration())); } @Test @@ -184,7 +187,8 @@ public void testLandsatBucketRequireEncrypted() throws Throwable { runToFailure(E_BAD_STATE, BucketInfo.NAME, "-" + BucketInfo.ENCRYPTION_FLAG, - "AES256", ITestS3GuardToolLocal.this.getLandsatCSVFile()); + "AES256", getLandsatCSVFile( + ITestS3GuardToolLocal.this.getConfiguration())); } @Test @@ -367,7 +371,7 @@ private void uploadCommandAssertCount(S3AFileSystem fs, String options[], allOptions.add(String.valueOf(ageSeconds)); } allOptions.add(path.toString()); - exec(cmd, buf, allOptions.toArray(new String[0])); + exec(0, "", cmd, buf, allOptions.toArray(new String[0])); try (BufferedReader reader = new BufferedReader( new InputStreamReader(new ByteArrayInputStream(buf.toByteArray())))) { diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardToolTestHelper.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardToolTestHelper.java new file mode 100644 index 0000000000000..f22aa3606baa4 --- /dev/null +++ 
b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardToolTestHelper.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.s3guard; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static org.junit.Assert.assertEquals; + +/** + * Helper class for tests which make CLI invocations of the S3Guard tools. + * That's {@link AbstractS3GuardToolTestBase} and others. + */ +public final class S3GuardToolTestHelper { + + private static final Logger LOG = LoggerFactory.getLogger( + S3GuardToolTestHelper.class); + + private S3GuardToolTestHelper() { + } + + /** + * Execute a command, returning the buffer if the command actually completes. + * If an exception is raised the output is logged instead. + * @param cmd command + * @param args argument list + * @throws Exception on any failure + */ + public static String exec(S3GuardTool cmd, String... args) throws Exception { + ByteArrayOutputStream buf = new ByteArrayOutputStream(); + try { + exec(0, "", cmd, buf, args); + return buf.toString(); + } catch (AssertionError e) { + throw e; + } catch (Exception e) { + LOG.error("Command {} failed: \n{}", cmd, buf); + throw e; + } + } + + /** + * Execute a command, saving the output into the buffer. + * @param expectedResult expected result of the command. + * @param errorText error text to include in the assertion. + * @param cmd command + * @param buf buffer to use for tool output (not SLF4J output) + * @param args argument list + * @throws Exception on any failure + */ + public static void exec(final int expectedResult, + final String errorText, + final S3GuardTool cmd, + final ByteArrayOutputStream buf, + final String... args) + throws Exception { + LOG.info("exec {}", (Object) args); + int r; + try (PrintStream out = new PrintStream(buf)) { + r = cmd.run(args, out); + out.flush(); + } + if (expectedResult != r) { + String message = errorText.isEmpty() ? "" : (errorText + ": ") + + "Command " + cmd + " failed\n" + buf; + assertEquals(message, expectedResult, r); + } + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/AbstractS3SelectTest.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/AbstractS3SelectTest.java new file mode 100644 index 0000000000000..18138a616bbe4 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/AbstractS3SelectTest.java @@ -0,0 +1,746 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.BufferedReader; +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStreamReader; +import java.time.Duration; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.Optional; +import java.util.Scanner; +import java.util.function.Consumer; + +import org.junit.Assume; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.StreamCapabilities; +import org.apache.hadoop.fs.s3a.AWSServiceIOException; +import org.apache.hadoop.fs.s3a.AbstractS3ATestBase; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.commit.AbstractCommitITest; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.PassthroughCodec; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.MRJobConfig; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.TaskAttemptID; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; + +import static org.apache.hadoop.fs.impl.FutureIOSupport.awaitFuture; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.getLandsatCSVPath; +import static org.apache.hadoop.fs.s3a.select.CsvFile.ALL_QUOTES; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; + +/** + * Superclass for S3 Select tests. + * A lot of the work here goes into creating and querying a simple CSV test + * format, with various datatypes which can be used in type-casting queries. + *
    + * 1  "ID": index of the row
    + * 2  "date": date as ISO 8601
    + * 3  "timestamp": timestamp in seconds of epoch
    + * 4  "name", entry-$row
    + * 5  "odd", odd/even as boolean. True means odd,
    + * 6  "oddint", odd/even as int : 1 for odd, 0 for even
    + * 7  "oddrange": odd/even as 1 for odd, -1 for even
    + * 
    + */ +public abstract class AbstractS3SelectTest extends AbstractS3ATestBase { + + /** + * Number of columns in the CSV file: {@value}. + */ + public static final int CSV_COLUMN_COUNT = 7; + + protected static final String TRUE = q("TRUE"); + + protected static final String FALSE = q("FALSE"); + + public static final String SELECT_EVERYTHING = "SELECT * FROM S3OBJECT s"; + + public static final String SELECT_EVEN_ROWS_NO_HEADER = + "SELECT * FROM S3OBJECT s WHERE s._5 = " + TRUE; + public static final String SELECT_ODD_ROWS + = "SELECT s.name FROM S3OBJECT s WHERE s.odd = " + TRUE; + + public static final String SELECT_ODD_ENTRIES + = "SELECT * FROM S3OBJECT s WHERE s.odd = `TRUE`"; + + public static final String SELECT_ODD_ENTRIES_BOOL + = "SELECT * FROM S3OBJECT s WHERE CAST(s.odd AS BOOL) = TRUE"; + + public static final String SELECT_ODD_ENTRIES_INT + = "SELECT * FROM S3OBJECT s WHERE CAST(s.\"oddint\" AS INT) = 1"; + + public static final String SELECT_ODD_ENTRIES_DECIMAL + = "SELECT * FROM S3OBJECT s WHERE CAST(s.\"oddint\" AS DECIMAL) = 1"; + + /** + * Playing with timestamps: {@value}. + */ + public static final String SELECT_TO_DATE + = "SELECT\n" + + "CAST(s.\"date\" AS TIMESTAMP)\n" + + "FROM S3OBJECT s"; + + + /** + * How many rows are being generated. + */ + protected static final int ALL_ROWS_COUNT = 10; + + /** + * Row count of all rows + header. + */ + protected static final int ALL_ROWS_COUNT_WITH_HEADER = ALL_ROWS_COUNT + 1; + + /** + * Number of odd rows expected: {@value}. + */ + protected static final int ODD_ROWS_COUNT = ALL_ROWS_COUNT / 2; + + /** + * Number of even rows expected: {@value}. + * This is the same as the odd row count; it's separate just to + * be consistent on tests which select even results. + */ + protected static final int EVEN_ROWS_COUNT = ODD_ROWS_COUNT; + + protected static final String ENTRY_0001 = "\"entry-0001\""; + + protected static final String ENTRY_0002 = "\"entry-0002\""; + + /** + * Path to the landsat csv.gz file. + */ + private Path landsatGZ; + + /** + * The filesystem with the landsat data. + */ + private S3AFileSystem landsatFS; + + + // A random task attempt id for testing. + private String attempt0; + + private TaskAttemptID taskAttempt0; + + private String jobId; + + /** + * Base CSV file is headers. + *
    +   * 1  "ID": index of the row
    +   * 2  "date": date as Date.toString
    +   * 3  "timestamp": timestamp in seconds of epoch
    +   * 4  "name", entry-$row
    +   * 5  "odd", odd/even as boolean
    +   * 6  "oddint", odd/even as int : 1 for odd, 0 for even
    +   * 7  "oddrange": odd/even as 1 for odd, -1 for even
    +   * 
    + * @param fs filesystem + * @param path path to write + * @param header should the standard header be printed? + * @param quoteHeaderPolicy what the header quote policy is. + * @param quoteRowPolicy what the row quote policy is. + * @param rows number of rows + * @param separator column separator + * @param eol end of line characters + * @param quote quote char + * @param footer callback to run after the main CSV file is written + * @throws IOException IO failure. + */ + public static void createStandardCsvFile( + final FileSystem fs, + final Path path, + final boolean header, + final long quoteHeaderPolicy, + final long quoteRowPolicy, + final int rows, + final String separator, + final String eol, + final String quote, + final Consumer footer) throws IOException { + try (CsvFile csv = new CsvFile(fs, + path, + true, + separator, + eol, + quote)) { + + if (header) { + writeStandardHeader(csv, quoteHeaderPolicy); + } + DateTimeFormatter formatter + = DateTimeFormatter.ISO_OFFSET_DATE_TIME; + ZonedDateTime timestamp = ZonedDateTime.now(); + Duration duration = Duration.ofHours(20); + // loop is at 1 for use in counters and flags + for (int i = 1; i <= rows; i++) { + // flip the odd flags + boolean odd = (i & 1) == 1; + // and move the timestamp back + timestamp = timestamp.minus(duration); + csv.row(quoteRowPolicy, + i, + timestamp.format(formatter), + timestamp.toEpochSecond(), + String.format("entry-%04d", i), + odd ? "TRUE" : "FALSE", + odd ? 1 : 0, + odd ? 1 : -1 + ); + } + // write the footer + footer.accept(csv); + } + } + + /** + * Write out the standard header to a CSV file. + * @param csv CSV file to use. + * @param quoteHeaderPolicy quote policy. + * @return the input file. + * @throws IOException failure to write. + */ + private static CsvFile writeStandardHeader(final CsvFile csv, + final long quoteHeaderPolicy) throws IOException { + return csv.row(quoteHeaderPolicy, + "id", + "date", + "timestamp", + "name", + "odd", + "oddint", + "oddrange"); + } + + /** + * Verify that an exception has a specific error code. + * if not: an assertion is raised containing the original value. + * @param code expected code. + * @param ex exception caught + * @throws AssertionError on a mismatch + */ + protected static AWSServiceIOException verifyErrorCode(final String code, + final AWSServiceIOException ex) { + logIntercepted(ex); + if (!code.equals(ex.getErrorCode())) { + throw new AssertionError("Expected Error code" + code + + " actual " + ex.getErrorCode(), + ex); + } + return ex; + } + + /** + * Probe for a filesystem instance supporting S3 Select. + * @param filesystem filesystem + * @return true iff the filesystem supports S3 Select. + */ + boolean isSelectAvailable(final FileSystem filesystem) { + return filesystem instanceof StreamCapabilities + && ((StreamCapabilities) filesystem) + .hasCapability(S3_SELECT_CAPABILITY); + } + + /** + * Setup: requires select to be available. 
+ */ + @Override + public void setup() throws Exception { + super.setup(); + Assume.assumeTrue("S3 Select is not enabled on " + + getFileSystem().getUri(), + isSelectAvailable(getFileSystem())); + Configuration conf = getConfiguration(); + landsatGZ = getLandsatCSVPath(conf); + landsatFS = (S3AFileSystem) landsatGZ.getFileSystem(conf); + Assume.assumeTrue("S3 Select is not enabled on " + landsatFS.getUri(), + isSelectAvailable(landsatFS)); + // create some job info + jobId = AbstractCommitITest.randomJobId(); + attempt0 = "attempt_" + jobId + "_m_000000_0"; + taskAttempt0 = TaskAttemptID.forName(attempt0); + } + + /** + * Build the SQL statement, using String.Format rules. + * @param template template + * @param args arguments for the template + * @return the template to use + */ + protected static String sql( + final String template, + final Object... args) { + return args.length > 0 ? String.format(template, args) : template; + } + + /** + * Quote a constant with the SQL quote logic. + * @param c constant + * @return quoted constant + */ + protected static String q(String c) { + return '\'' + c + '\''; + } + + /** + * Select from a source file. + * @param fileSystem FS. + * @param source source file. + * @param conf config for the select call. + * @param sql template for a formatted SQL request. + * @param args arguments for the formatted request. + * @return the input stream. + * @throws IOException failure + */ + protected FSDataInputStream select( + final FileSystem fileSystem, + final Path source, + final Configuration conf, + final String sql, + final Object... args) + throws IOException { + String expression = sql(sql, args); + describe("Execution Select call: %s", expression); + FutureDataInputStreamBuilder builder = + fileSystem.openFile(source) + .must(SELECT_SQL, expression); + // propagate all known options + for (String key : InternalSelectConstants.SELECT_OPTIONS) { + String value = conf.get(key); + if (value != null) { + builder.must(key, value); + } + } + return awaitFuture(builder.build()); + } + + /** + * Select from a source file via the file context API. + * @param fc file context + * @param source source file. + * @param conf config for the select call. + * @param sql template for a formatted SQL request. + * @param args arguments for the formatted request. + * @return the input stream. + * @throws IOException failure + */ + protected FSDataInputStream select( + final FileContext fc, + final Path source, + final Configuration conf, + final String sql, + final Object... args) + throws IOException { + String expression = sql(sql, args); + describe("Execution Select call: %s", expression); + FutureDataInputStreamBuilder builder = fc.openFile(source) + .must(SELECT_SQL, expression); + // propagate all known options + InternalSelectConstants.SELECT_OPTIONS.forEach((key) -> + Optional.ofNullable(conf.get(key)) + .map((v) -> builder.must(key, v))); + return awaitFuture(builder.build()); + } + + /** + * Parse a selection to lines; log at info. + * @param selection selection input + * @return a list of lines. + * @throws IOException if raised during the read. + */ + protected List parseToLines(final FSDataInputStream selection) + throws IOException { + return parseToLines(selection, getMaxLines()); + } + + /** + * Enable the passthrough codec for a job, with the given extension. 
+ * @param conf configuration to update + * @param extension extension to use + */ + protected void enablePassthroughCodec(final Configuration conf, + final String extension) { + conf.set(CommonConfigurationKeys.IO_COMPRESSION_CODECS_KEY, + PassthroughCodec.CLASSNAME); + conf.set(PassthroughCodec.OPT_EXTENSION, extension); + } + + /** + * Override if a test suite is likely to ever return more lines. + * @return the max number for parseToLines/1 + */ + protected int getMaxLines() { + return 100; + } + + /** + * Parse a selection to lines; log at info. + * @param selection selection input + * @param maxLines maximum number of lines. + * @return a list of lines. + * @throws IOException if raised during the read. + */ + protected List parseToLines(final FSDataInputStream selection, + int maxLines) + throws IOException { + List result = new ArrayList<>(); + String stats; + // the scanner assumes that any IOE => EOF; we don't want + // that and so will check afterwards. + try (Scanner scanner = new Scanner( + new BufferedReader(new InputStreamReader(selection)))) { + scanner.useDelimiter(CSV_INPUT_RECORD_DELIMITER_DEFAULT); + while (maxLines > 0) { + try { + String l = scanner.nextLine(); + LOG.info("{}", l); + result.add(l); + maxLines--; + } catch (NoSuchElementException e) { + // EOL or an error + break; + } + } + stats = selection.toString(); + describe("Result line count: %s\nStatistics\n%s", + result.size(), stats); + // look for any raised error. + IOException ioe = scanner.ioException(); + if (ioe != null && !(ioe instanceof EOFException)) { + throw ioe; + } + } + return result; + } + + /** + * Verify the selection count; return the original list. + * If there's a mismatch, the whole list is logged at error, then + * an assertion raised. + * @param expected expected value. + * @param expression expression -for error messages. + * @param selection selected result. + * @return the input list. + */ + protected List verifySelectionCount( + final int expected, + final String expression, + final List selection) { + return verifySelectionCount(expected, expected, expression, selection); + } + + /** + * Verify the selection count is within a given range; + * return the original list. + * If there's a mismatch, the whole list is logged at error, then + * an assertion raised. + * @param min min value (exclusive). + * @param max max value (exclusive). If -1: no maximum. + * @param expression expression -for error messages. + * @param selection selected result. + * @return the input list. + */ + protected List verifySelectionCount( + final int min, + final int max, + final String expression, + final List selection) { + int size = selection.size(); + if (size < min || (max > -1 && size > max)) { + // mismatch: log and then fail + String listing = prepareToPrint(selection); + LOG.error("\n{} => \n{}", expression, listing); + fail("row count from select call " + expression + + " is out of range " + min + " to " + max + + ": " + size + + " \n" + listing); + } + return selection; + } + + /** + * Do whatever is needed to prepare a string for logging. + * @param selection selection + * @return something printable. + */ + protected String prepareToPrint(final List selection) { + return String.join("\n", selection); + } + + /** + * Create "the standard" CSV file with the default row count. + * @param fs filesystem + * @param path path to write + * @param quoteRowPolicy what the row quote policy is. + * @throws IOException IO failure. 
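+ * Note that the quote policies are bitmasks: bit 0 governs column 0,
+ * bit 1 column 1 and so on, so a policy of (for example) {@code 5}
+ * (binary 101) would quote columns 0 and 2 only, while
+ * {@code ALL_QUOTES} quotes them all.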
+ */ + protected void createStandardCsvFile( + final FileSystem fs, + final Path path, + final long quoteRowPolicy) + throws IOException { + createStandardCsvFile( + fs, path, + true, + ALL_QUOTES, + quoteRowPolicy, + ALL_ROWS_COUNT, + ",", + "\n", + "\"", + c -> {}); + } + + /** + * Set an MR Job input option. + * @param conf configuration + * @param key key to set + * @param val value + */ + void inputOpt(Configuration conf, String key, String val) { + conf.set(MRJobConfig.INPUT_FILE_OPTION_PREFIX + key, val); + } + + /** + * Set a mandatory MR Job input option. + * @param conf configuration + * @param key key to set + * @param val value + */ + void inputMust(Configuration conf, String key, String val) { + conf.set(MRJobConfig.INPUT_FILE_MANDATORY_PREFIX + key, + val); + } + + /** + * Reads lines through a v2 RecordReader, as if it were part of a + * MRv2 job. + * @param conf job conf + * @param path path to query + * @param sql sql to add to the configuration. + * @param initialCapacity capacity of the read + * @param reader reader: this is closed after the read + * @return the selected lines. + * @throws Exception failure + */ + protected List readRecords(JobConf conf, + Path path, + String sql, + RecordReader reader, + int initialCapacity) throws Exception { + + inputMust(conf, SELECT_SQL, sql); + List lines = new ArrayList<>(initialCapacity); + try { + reader.initialize( + createSplit(conf, path), + createTaskAttemptContext(conf)); + while (reader.nextKeyValue()) { + lines.add(reader.getCurrentValue().toString()); + } + } finally { + reader.close(); + } + return lines; + } + /** + * Reads lines through a v1 RecordReader, as if it were part of a + * MRv1 job. + * @param conf job conf + * @param reader reader: this is closed after the read + * @param initialCapacity capacity of the read + * @return the selected lines. + * @throws Exception failure + */ + protected List readRecordsV1(JobConf conf, + org.apache.hadoop.mapred.RecordReader reader, + K key, + V value, + int initialCapacity) throws Exception { + List lines = new ArrayList<>(initialCapacity); + try { + while (reader.next(key, value)) { + lines.add(value.toString()); + } + } finally { + reader.close(); + } + return lines; + } + + /** + * Create a task attempt context for a job, creating a random JobID to + * do this. + * @param conf job configuration. + * @return a new task attempt context containing the job conf + * @throws Exception failure. + */ + protected TaskAttemptContext createTaskAttemptContext(final JobConf conf) + throws Exception { + String id = AbstractCommitITest.randomJobId(); + return new TaskAttemptContextImpl(conf, + TaskAttemptID.forName("attempt_" + id + "_m_000000_0")); + } + + /** + * Create an MRv2 file input split. + * @param conf job configuration + * @param path path to file + * @return the split + * @throws IOException problems reading the file. + */ + protected FileSplit createSplit(final JobConf conf, final Path path) + throws IOException { + FileSystem fs = path.getFileSystem(conf); + FileStatus status = fs.getFileStatus(path); + return new FileSplit(path, 0, status.getLen(), + new String[]{"localhost"}); + } + + /** + * Create an MRv1 file input split. + * @param conf job configuration + * @param path path to file + * @return the split + * @throws IOException problems reading the file. 
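+ * The split covers the whole file and declares a localhost location,
+ * which is enough for these single-file test queries.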
+ */ + protected org.apache.hadoop.mapred.FileSplit + createSplitV1(final JobConf conf, final Path path) + throws IOException { + FileSystem fs = path.getFileSystem(conf); + FileStatus status = fs.getFileStatus(path); + return new org.apache.hadoop.mapred.FileSplit(path, 0, status.getLen(), + new String[]{"localhost"}); + } + + /** + * Create a v2 line record reader expecting newlines as the EOL marker. + * @return a reader + */ + protected RecordReader createLineRecordReader() { + return new LineRecordReader(new byte[]{'\n'}); + } + + /** + * Create a v1 line record reader. + * @return a reader + */ + protected org.apache.hadoop.mapred.RecordReader + createLineRecordReaderV1( + final JobConf conf, + final Path path) throws IOException { + return new org.apache.hadoop.mapred.LineRecordReader( + conf, createSplitV1(conf, path)); + } + + /** + * Get the path to the landsat file. + * @return the landsat CSV.GZ path. + */ + protected Path getLandsatGZ() { + return landsatGZ; + } + + /** + * Get the filesystem for the landsat file. + * @return the landsat FS. + */ + protected S3AFileSystem getLandsatFS() { + return landsatFS; + } + + /** + * Perform a seek: log duration of the operation. + * @param stream stream to seek. + * @param target target position. + * @throws IOException on an error + */ + protected void seek(final FSDataInputStream stream, final long target) + throws IOException { + try(DurationInfo ignored = + new DurationInfo(LOG, "Seek to %d", target)) { + stream.seek(target); + } + } + + /** + * Execute a seek so far past the EOF that it will be rejected. + * If the seek did not fail, the exception raised includes the toString() + * value of the stream. + * @param seekStream stream to seek in. + * @param newpos new position + * @return the EOF Exception raised. + * @throws Exception any other exception. + */ + protected EOFException expectSeekEOF(final FSDataInputStream seekStream, + final int newpos) throws Exception { + return intercept(EOFException.class, + () -> { + seek(seekStream, newpos); + // return this for the test failure reports. + return "Stream after seek to " + newpos + ": " + seekStream; + }); + } + + public String getAttempt0() { + return attempt0; + } + + public TaskAttemptID getTaskAttempt0() { + return taskAttempt0; + } + + public String getJobId() { + return jobId; + } + + /** + * Logs intercepted exceptions. + * This generates the stack traces for the documentation. + * @param ex exception + * @return the exception passed in (for chaining) + */ + protected static T logIntercepted(T ex) { + LOG.info("Intercepted Exception is ", ex); + return ex; + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/CsvFile.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/CsvFile.java new file mode 100644 index 0000000000000..06e6d2a78aef7 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/CsvFile.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.Closeable; +import java.io.IOException; +import java.io.PrintWriter; + +import com.google.common.base.Preconditions; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +/** + * Writer for generating test CSV files. + * + * Quotes are manged by passing in a long whose specific bits control + * whether or not a row is quoted, bit 0 for column 0, etc. + */ +class CsvFile implements Closeable { + + + /** constant to quote all columns. */ + public static final long ALL_QUOTES = 0x7fffffff; + + /** quote nothing: {@value}. */ + public static final long NO_QUOTES = 0; + + private final Path path; + + private final PrintWriter out; + + private final String separator; + + private final String eol; + + private final String quote; + + CsvFile(final FileSystem fs, + final Path path, + boolean overwrite, + final String separator, + final String eol, + final String quote) throws IOException { + this.path = path; + this.separator = Preconditions.checkNotNull(separator); + this.eol = Preconditions.checkNotNull(eol); + this.quote = Preconditions.checkNotNull(quote); + out = new PrintWriter(fs.create(path, overwrite)); + } + + + /** + * Close the file, if not already done. + * @throws IOException on a failure. + */ + @Override + public synchronized void close() throws IOException { + if (out != null) { + out.close(); + } + } + + public Path getPath() { + return path; + } + + public String getSeparator() { + return separator; + } + + public String getEol() { + return eol; + } + + /** + * Write a row. + * Entries are quoted if the bit for that column is true. + * @param quotes quote policy: every bit defines the rule for that element + * @param columns columns to write + * @return self for ease of chaining. + */ + public CsvFile row(long quotes, Object... columns) { + for (int i = 0; i < columns.length; i++) { + if (i != 0) { + out.write(separator); + } + boolean toQuote = (quotes & 1) == 1; + // unsigned right shift to make next column flag @ position 0 + quotes = quotes >>> 1; + if (toQuote) { + out.write(quote); + } + out.write(columns[i].toString()); + if (toQuote) { + out.write(quote); + } + } + out.write(eol); + return this; + } + + /** + * Write a line. + * @param line line to print + * @return self for ease of chaining. + * @throws IOException IO failure + */ + public CsvFile line(String line) { + out.write(line); + out.write(eol); + return this; + } + + /** + * Get the output stream. + * @return the stream. + */ + public PrintWriter getOut() { + return out; + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java new file mode 100644 index 0000000000000..5fe4e2bb6709c --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java @@ -0,0 +1,967 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.List; +import java.util.concurrent.CompletableFuture; + +import org.junit.Assume; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSExceptionMessages; +import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathIOException; +import org.apache.hadoop.fs.Seekable; +import org.apache.hadoop.fs.contract.ContractTestUtils; +import org.apache.hadoop.fs.impl.AbstractFSBuilderImpl; +import org.apache.hadoop.fs.s3a.AWSBadRequestException; +import org.apache.hadoop.fs.s3a.AWSServiceIOException; +import org.apache.hadoop.fs.s3a.Constants; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.S3AInputStream; +import org.apache.hadoop.fs.s3a.S3AInstrumentation; +import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.Statistic; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.task.JobContextImpl; + +import static org.apache.hadoop.fs.s3a.Constants.INPUT_FADVISE; +import static org.apache.hadoop.fs.s3a.Constants.INPUT_FADV_NORMAL; +import static org.apache.hadoop.fs.s3a.Constants.READAHEAD_RANGE; +import static org.apache.hadoop.fs.s3a.select.CsvFile.ALL_QUOTES; +import static org.apache.hadoop.fs.s3a.select.SelectBinding.expandBackslashChars; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture; +import static org.hamcrest.CoreMatchers.hasItem; +import static org.hamcrest.CoreMatchers.not; +import static org.hamcrest.collection.IsCollectionWithSize.hasSize; + +/** + * Test the S3 Select feature with some basic SQL Commands. + * Executed if the destination store declares its support for the feature. 
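+ * The gate is the {@code S3_SELECT_CAPABILITY} stream capability probed
+ * in {@code setup()}; when it is absent the entire suite is skipped via
+ * an assumption.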
+ */ +public class ITestS3Select extends AbstractS3SelectTest { + + private static final Logger LOG = + LoggerFactory.getLogger(ITestS3Select.class); + + public static final String E_CAST_FAILED = "CastFailed"; + + public static final String E_PARSE_INVALID_PATH_COMPONENT + = "ParseInvalidPathComponent"; + + public static final String E_INVALID_TABLE_ALIAS = "InvalidTableAlias"; + + private Configuration selectConf; + + /** well formed CSV. */ + private Path csvPath; + + /** CSV file with fewer columns than expected, all fields parse badly. */ + private Path brokenCSV; + + @Override + public void setup() throws Exception { + super.setup(); + Assume.assumeTrue("S3 Select is not enabled", + getFileSystem().hasCapability(S3_SELECT_CAPABILITY)); + csvPath = path(getMethodName() + ".csv"); + selectConf = new Configuration(false); + selectConf.setBoolean(SELECT_ERRORS_INCLUDE_SQL, true); + createStandardCsvFile(getFileSystem(), csvPath, ALL_QUOTES); + // create the broken CSV file. + brokenCSV = path("testParseBrokenCSVFile"); + createStandardCsvFile( + getFileSystem(), brokenCSV, + true, + ALL_QUOTES, + ALL_ROWS_COUNT, + ALL_ROWS_COUNT, + ",", + "\n", + "\"", + csv -> csv + .line("# comment") + .row(ALL_QUOTES, "bad", "Tuesday", 0, "entry-bad", "yes", false)); + } + + @Override + public void teardown() throws Exception { + describe("teardown"); + try { + if (csvPath != null) { + getFileSystem().delete(csvPath, false); + } + if (brokenCSV != null) { + getFileSystem().delete(brokenCSV, false); + } + } finally { + super.teardown(); + } + } + + @Test + public void testCapabilityProbe() throws Throwable { + + // this should always hold true if we get past test setup + assertTrue("Select is not available on " + getFileSystem(), + isSelectAvailable(getFileSystem())); + } + + @SuppressWarnings("NestedAssignment") + @Test + public void testReadWholeFileClassicAPI() throws Throwable { + describe("create and read the whole file. 
Verifies setup working"); + int lines; + try (BufferedReader reader = new BufferedReader( + new InputStreamReader( + getFileSystem().open(csvPath)))) { + lines = 0; + // seek to 0, which is what some input formats do + String line; + while ((line = reader.readLine()) != null) { + lines++; + LOG.info("{}", line); + } + } + assertEquals("line count", ALL_ROWS_COUNT_WITH_HEADER, lines); + } + + @Test + public void testSelectWholeFileNoHeader() throws Throwable { + describe("Select the entire file, expect all rows but the header"); + expectSelected( + ALL_ROWS_COUNT, + selectConf, + CSV_HEADER_OPT_USE, + "SELECT * FROM S3OBJECT"); + } + + @Test + public void testSelectFirstColumnNoHeader() throws Throwable { + describe("Select the entire file, expect all rows but the header"); + expectSelected( + ALL_ROWS_COUNT_WITH_HEADER, + selectConf, + CSV_HEADER_OPT_NONE, + "SELECT s._1 FROM S3OBJECT s"); + } + + @Test + public void testSelectSelfNoHeader() throws Throwable { + describe("Select the entire file, expect all rows but the header"); + expectSelected( + ALL_ROWS_COUNT_WITH_HEADER, + selectConf, + CSV_HEADER_OPT_NONE, + "SELECT s._1 FROM S3OBJECT s WHERE s._1 = s._1"); + } + + @Test + public void testSelectSelfUseHeader() throws Throwable { + describe("Select the entire file, expect all rows including the header"); + expectSelected( + ALL_ROWS_COUNT, + selectConf, + CSV_HEADER_OPT_USE, + "SELECT s.id FROM S3OBJECT s WHERE s.id = s.id"); + } + + @Test + public void testSelectID2UseHeader() throws Throwable { + describe("Select where ID=2; use the header"); + expectSelected( + 1, + selectConf, + CSV_HEADER_OPT_USE, + "SELECT s.id FROM S3OBJECT s WHERE s.id = '2'"); + } + + @Test + public void testSelectNoMatchingID() throws Throwable { + describe("Select where there is no match; expect nothing back"); + expectSelected( + 0, + selectConf, + CSV_HEADER_OPT_USE, + "SELECT s.id FROM S3OBJECT s WHERE s.id = '0x8000'"); + } + + @Test + public void testSelectId1() throws Throwable { + describe("Select the first element in the file"); + expectSelected( + 1, + selectConf, + CSV_HEADER_OPT_NONE, + "SELECT * FROM S3OBJECT s WHERE s._1 = '1'", + TRUE); + } + + @Test + public void testSelectEmptySQL() throws Throwable { + describe("An empty SQL statement fails fast"); + FutureDataInputStreamBuilder builder = getFileSystem().openFile( + csvPath) + .must(SELECT_SQL, ""); + interceptFuture(IllegalArgumentException.class, + SELECT_SQL, + builder.build()); + } + + @Test + public void testSelectEmptyFile() throws Throwable { + describe("Select everything from an empty file"); + Path path = path("testSelectEmptyFile"); + S3AFileSystem fs = getFileSystem(); + ContractTestUtils.touch(fs, path); + parseToLines(fs.openFile(path) + .must(SELECT_SQL, SELECT_EVERYTHING) + .build() + .get(), + 0); + } + + @Test + public void testSelectEmptyFileWithConditions() throws Throwable { + describe("Select everything from an empty file with a more complex SQL"); + Path path = path("testSelectEmptyFileWithConditions"); + S3AFileSystem fs = getFileSystem(); + ContractTestUtils.touch(fs, path); + String sql = "SELECT * FROM S3OBJECT s WHERE s._1 = `TRUE`"; + CompletableFuture future = fs.openFile(path) + .must(SELECT_SQL, sql).build(); + assertEquals("Not at the end of the file", -1, future.get().read()); + } + + @Test + public void testSelectSeek() throws Throwable { + describe("Verify forward seeks work, not others"); + + // start: read in the full data through the initial select + // this makes asserting that contents match possible 
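+ // (the file is then re-opened through a second select stream whose
+ // seek/read behaviour is checked against that reference copy)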
+ Path path = csvPath; + S3AFileSystem fs = getFileSystem(); + int len = (int) fs.getFileStatus(path).getLen(); + byte[] fullData = new byte[len]; + int actualLen; + try (DurationInfo ignored = + new DurationInfo(LOG, "Initial read of %s", path); + FSDataInputStream sourceStream = + select(fs, path, + selectConf, + SELECT_EVERYTHING)) { + // read it in + actualLen = IOUtils.read(sourceStream, fullData); + } + int seekRange = 20; + + try (FSDataInputStream seekStream = + select(fs, path, + selectConf, + SELECT_EVERYTHING)) { + SelectInputStream sis + = (SelectInputStream) seekStream.getWrappedStream(); + S3AInstrumentation.InputStreamStatistics streamStats + = sis.getS3AStreamStatistics(); + // lazy seek doesn't raise a problem here + seekStream.seek(0); + assertEquals("first byte read", fullData[0], seekStream.read()); + + // and now the pos has moved, again, seek will be OK + seekStream.seek(1); + seekStream.seek(1); + // but trying to seek elsewhere now fails + PathIOException ex = intercept(PathIOException.class, + SelectInputStream.SEEK_UNSUPPORTED, + () -> seekStream.seek(0)); + LOG.info("Seek error is as expected", ex); + // positioned reads from the current location work. + byte[] buffer = new byte[1]; + long pos = seekStream.getPos(); + seekStream.readFully(pos, buffer); + // but positioned backwards fail. + intercept(PathIOException.class, + SelectInputStream.SEEK_UNSUPPORTED, + () -> seekStream.readFully(0, buffer)); + // the position has now moved on. + assertPosition(seekStream, pos + 1); + // so a seek to the old pos will fail + intercept(PathIOException.class, + SelectInputStream.SEEK_UNSUPPORTED, + () -> seekStream.readFully(pos, buffer)); + + // set the readahead to the default. + // This verifies it reverts to the default. + seekStream.setReadahead(null); + assertEquals("Readahead in ", + Constants.DEFAULT_READAHEAD_RANGE, sis.getReadahead()); + // forward seeks are implemented as 1+ skip + long target = seekStream.getPos() + seekRange; + seek(seekStream, target); + assertPosition(seekStream, target); + // now do a read and compare values + assertEquals("byte at seek position", + fullData[(int)seekStream.getPos()], seekStream.read()); + assertEquals("Seek bytes skipped in " + streamStats, + seekRange, streamStats.bytesSkippedOnSeek); + + // try an invalid readahead range + intercept(IllegalArgumentException.class, + S3AInputStream.E_NEGATIVE_READAHEAD_VALUE, + () -> seekStream.setReadahead(-1L)); + + // do a slightly forward offset read + int read = seekStream.read(seekStream.getPos() + 2, buffer, 0, 1); + assertEquals(1, read); + + // final fun: seek way past the EOF + logIntercepted(expectSeekEOF(seekStream, actualLen * 2)); + assertPosition(seekStream, actualLen); + assertEquals(-1, seekStream.read()); + LOG.info("Seek statistics {}", streamStats); + // this will return no, but not fail + assertFalse("Failed to seek to new source in " + seekStream, + seekStream.seekToNewSource(0)); + // and set the readahead to 0 to see that close path works + seekStream.setReadahead(0L); + // then do a manual close even though there's one in the try resource. + // which will verify that a double close is harmless + seekStream.close(); + LOG.info("Final stream state {}", sis); + } + } + + /** + * Assert that a stream is in a specific position. + * @param stream stream or other seekable. + * @param pos expected position. + * @throws IOException failure of the getPos() call. + * @throws AssertionError mismatch between expected and actual. 
+ */ + private void assertPosition(Seekable stream, long pos) + throws IOException { + assertEquals("Wrong stream position in " + stream, + pos, stream.getPos()); + } + + @Test + public void testSelectOddLinesNoHeader() throws Throwable { + describe("Select odd lines, ignoring the header"); + expectSelected( + ODD_ROWS_COUNT, + selectConf, + CSV_HEADER_OPT_IGNORE, + "SELECT * FROM S3OBJECT s WHERE s._5 = `TRUE`"); + // and do a quick check on the instrumentation + long bytesRead = getFileSystem().getInstrumentation() + .getCounterValue(Statistic.STREAM_SEEK_BYTES_READ); + assertNotEquals("No bytes read count", 0, bytesRead); + } + + @Test + public void testSelectOddLinesHeader() throws Throwable { + describe("Select the odd values"); + List selected = expectSelected( + ODD_ROWS_COUNT, + selectConf, + CSV_HEADER_OPT_USE, + SELECT_ODD_ROWS); + // the list includes odd values + assertThat(selected, hasItem(ENTRY_0001)); + // but not the evens + assertThat(selected, not(hasItem(ENTRY_0002))); + } + + @Test + public void testSelectOddLinesHeaderTSVOutput() throws Throwable { + describe("Select the odd values with tab spaced output"); + selectConf.set(CSV_OUTPUT_FIELD_DELIMITER, "\t"); + selectConf.set(CSV_OUTPUT_QUOTE_CHARACTER, "'"); + selectConf.set(CSV_OUTPUT_QUOTE_FIELDS, + CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED); + selectConf.set(CSV_OUTPUT_RECORD_DELIMITER, "\r"); + List selected = expectSelected( + ODD_ROWS_COUNT, + selectConf, + CSV_HEADER_OPT_USE, + SELECT_ODD_ENTRIES_BOOL); + // the list includes odd values + String row1 = selected.get(0); + + // split that first line into columns: This is why TSV is better for code + // to work with than CSV + String[] columns = row1.split("\t", -1); + assertEquals("Wrong column count from tab split line <" + row1 + ">", + CSV_COLUMN_COUNT, columns.length); + assertEquals("Wrong column value from tab split line <" + row1 + ">", + "entry-0001", columns[3]); + } + + @Test + public void testSelectNotOperationHeader() throws Throwable { + describe("Select the even values with a NOT call; quote the header name"); + List selected = expectSelected( + EVEN_ROWS_COUNT, + selectConf, + CSV_HEADER_OPT_USE, + "SELECT s.name FROM S3OBJECT s WHERE NOT s.\"odd\" = %s", + TRUE); + // the list includes no odd values + assertThat(selected, not(hasItem(ENTRY_0001))); + // but has the evens + assertThat(selected, hasItem(ENTRY_0002)); + } + + @Test + public void testBackslashExpansion() throws Throwable { + assertEquals("\t\r\n", expandBackslashChars("\t\r\n")); + assertEquals("\t", expandBackslashChars("\\t")); + assertEquals("\r", expandBackslashChars("\\r")); + assertEquals("\r \n", expandBackslashChars("\\r \\n")); + assertEquals("\\", expandBackslashChars("\\\\")); + } + + /** + * This is an expanded example for the documentation. + * Also helps catch out unplanned changes to the configuration strings. 
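+ * The option names are written here as string literals rather than as
+ * references to {@code SelectConstants}, so an unintended change to the
+ * public keys will surface as a failure of this test.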
+ */ + @Test + public void testSelectFileExample() throws Throwable { + describe("Select the entire file, expect all rows but the header"); + int len = (int) getFileSystem().getFileStatus(csvPath).getLen(); + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(csvPath) + .must("fs.s3a.select.sql", + SELECT_ODD_ENTRIES) + .must("fs.s3a.select.input.format", "CSV") + .must("fs.s3a.select.input.compression", "NONE") + .must("fs.s3a.select.input.csv.header", "use") + .must("fs.s3a.select.output.format", "CSV"); + + CompletableFuture future = builder.build(); + try (FSDataInputStream select = future.get()) { + // process the output + byte[] bytes = new byte[len]; + int actual = select.read(bytes); + LOG.info("file length is {}; length of selected data is {}", + len, actual); + } + } + + /** + * This is an expanded example for the documentation. + * Also helps catch out unplanned changes to the configuration strings. + */ + @Test + public void testSelectUnsupportedInputFormat() throws Throwable { + describe("Request an unsupported input format"); + FutureDataInputStreamBuilder builder = getFileSystem().openFile(csvPath) + .must(SELECT_SQL, SELECT_ODD_ENTRIES) + .must(SELECT_INPUT_FORMAT, "pptx"); + interceptFuture(IllegalArgumentException.class, + "pptx", + builder.build()); + } + + /** + * Ask for an invalid output format. + */ + @Test + public void testSelectUnsupportedOutputFormat() throws Throwable { + describe("Request a (currently) unsupported output format"); + FutureDataInputStreamBuilder builder = getFileSystem().openFile(csvPath) + .must(SELECT_SQL, SELECT_ODD_ENTRIES) + .must(SELECT_INPUT_FORMAT, "csv") + .must(SELECT_OUTPUT_FORMAT, "json"); + interceptFuture(IllegalArgumentException.class, + "json", + builder.build()); + } + + /** + * Missing files fail lazy. + */ + @Test + public void testSelectMissingFile() throws Throwable { + + describe("Select a missing file, expect it to surface in the future"); + + Path missing = path("missing"); + + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(missing) + .must(SELECT_SQL, SELECT_ODD_ENTRIES); + + interceptFuture(FileNotFoundException.class, + "", builder.build()); + } + + @Test + public void testSelectDirectoryFails() throws Throwable { + describe("Verify that secondary select options are only valid on select" + + " queries"); + S3AFileSystem fs = getFileSystem(); + Path dir = path("dir"); + // this will be an empty dir marker + fs.mkdirs(dir); + + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(dir) + .must(SELECT_SQL, SELECT_ODD_ENTRIES); + interceptFuture(PathIOException.class, + "", builder.build()); + + // try the parent + builder = getFileSystem().openFile(dir.getParent()) + .must(SELECT_SQL, + SELECT_ODD_ENTRIES); + interceptFuture(PathIOException.class, + "", builder.build()); + } + + @Test + public void testSelectRootFails() throws Throwable { + describe("verify root dir selection is rejected"); + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(path("/")) + .must(SELECT_SQL, SELECT_ODD_ENTRIES); + interceptFuture(PathIOException.class, + "", builder.build()); + } + + /** + * Validate the abort logic. 
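+ * With a tiny readahead range and data still pending, {@code close()} is
+ * expected to abort the connection rather than drain it; the stream
+ * statistics are asserted on to prove this.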
+ */ + @Test + public void testCloseWithAbort() throws Throwable { + describe("Close the stream with the readahead outstanding"); + S3ATestUtils.MetricDiff readOps = new S3ATestUtils.MetricDiff( + getFileSystem(), + Statistic.STREAM_READ_OPERATIONS_INCOMPLETE); + selectConf.setInt(READAHEAD_RANGE, 2); + + FSDataInputStream stream = select(getFileSystem(), csvPath, selectConf, + "SELECT * FROM S3OBJECT s"); + SelectInputStream sis = (SelectInputStream) stream.getWrappedStream(); + assertEquals("Readahead on " + sis, 2, sis.getReadahead()); + stream.setReadahead(1L); + assertEquals("Readahead on " + sis, 1, sis.getReadahead()); + stream.read(); + S3AInstrumentation.InputStreamStatistics stats + = sis.getS3AStreamStatistics(); + assertEquals("Read count in " + sis, + 1, stats.bytesRead); + stream.close(); + assertEquals("Abort count in " + sis, + 1, stats.aborted); + readOps.assertDiffEquals("Read operations are still considered active", + 0); + intercept(PathIOException.class, FSExceptionMessages.STREAM_IS_CLOSED, + () -> stream.read()); + } + + @Test + public void testCloseWithNoAbort() throws Throwable { + describe("Close the stream with the readahead outstandingV"); + FSDataInputStream stream = select(getFileSystem(), csvPath, selectConf, + "SELECT * FROM S3OBJECT s"); + stream.setReadahead(0x1000L); + SelectInputStream sis = (SelectInputStream) stream.getWrappedStream(); + S3AInstrumentation.InputStreamStatistics stats + = sis.getS3AStreamStatistics(); + stream.close(); + assertEquals("Close count in " + sis, 1, stats.closed); + assertEquals("Abort count in " + sis, 0, stats.aborted); + assertTrue("No bytes read in close of " + sis, stats.bytesReadInClose > 0); + } + + @Test + public void testFileContextIntegration() throws Throwable { + describe("Test that select works through FileContext"); + FileContext fc = S3ATestUtils.createTestFileContext(getConfiguration()); + selectConf.set(CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + + List selected = + verifySelectionCount(ODD_ROWS_COUNT, SELECT_ODD_ENTRIES_INT, + parseToLines( + select(fc, csvPath, selectConf, SELECT_ODD_ROWS))); + // the list includes odd values + assertThat(selected, hasItem(ENTRY_0001)); + // but not the evens + assertThat(selected, not(hasItem(ENTRY_0002))); + } + + @Test + public void testSelectOptionsOnlyOnSelectCalls() throws Throwable { + describe("Secondary select options are only valid on select" + + " queries"); + String key = CSV_INPUT_HEADER; + intercept(IllegalArgumentException.class, key, + () -> getFileSystem().openFile(csvPath) + .must(key, CSV_HEADER_OPT_USE).build()); + } + + @Test + public void testSelectMustBeEnabled() throws Throwable { + describe("Verify that the FS must have S3 select enabled."); + Configuration conf = new Configuration(getFileSystem().getConf()); + conf.setBoolean(FS_S3A_SELECT_ENABLED, false); + try (FileSystem fs2 = FileSystem.newInstance(csvPath.toUri(), conf)) { + intercept(UnsupportedOperationException.class, + SELECT_UNSUPPORTED, + () -> { + assertFalse("S3 Select Capability must be disabled on " + fs2, + isSelectAvailable(fs2)); + return fs2.openFile(csvPath) + .must(SELECT_SQL, SELECT_ODD_ROWS) + .build(); + }); + } + } + + @Test + public void testSelectOptionsRejectedOnNormalOpen() throws Throwable { + describe("Verify that a normal open fails on select must() options"); + intercept(IllegalArgumentException.class, + AbstractFSBuilderImpl.UNKNOWN_MANDATORY_KEY, + () -> getFileSystem().openFile(csvPath) + .must(CSV_INPUT_HEADER, CSV_HEADER_OPT_USE) + .build()); + } + + @Test + 
public void testSelectOddRecordsWithHeader() + throws Throwable { + describe("work through a record reader"); + JobConf conf = createJobConf(); + inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + expectRecordsRead(ODD_ROWS_COUNT, conf, SELECT_ODD_ENTRIES_DECIMAL); + } + + @Test + public void testSelectDatestampsConverted() + throws Throwable { + describe("timestamp conversion in record IIO"); + JobConf conf = createJobConf(); + inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + inputMust(conf, CSV_OUTPUT_QUOTE_FIELDS, + CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED); + String sql = SELECT_TO_DATE; + List records = expectRecordsRead(ALL_ROWS_COUNT, conf, sql); + LOG.info("Result of {}\n{}", sql, prepareToPrint(records)); + } + + @Test + public void testSelectNoMatch() + throws Throwable { + describe("when there's no match to a query, 0 records are returned,"); + JobConf conf = createJobConf(); + inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + expectRecordsRead(0, conf, + "SELECT * FROM S3OBJECT s WHERE s.odd = " + q("maybe")); + } + + @Test + public void testSelectOddRecordsIgnoreHeader() + throws Throwable { + describe("work through a record reader"); + JobConf conf = createJobConf(); + inputOpt(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_NONE); + inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_IGNORE); + expectRecordsRead(EVEN_ROWS_COUNT, conf, + SELECT_EVEN_ROWS_NO_HEADER); + } + + @Test + public void testSelectRecordsUnknownMustOpt() + throws Throwable { + describe("verify reader key validation is remapped"); + JobConf conf = createJobConf(); + inputOpt(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_NONE); + inputMust(conf, CSV_INPUT_HEADER + ".something", CSV_HEADER_OPT_IGNORE); + intercept(IllegalArgumentException.class, + AbstractFSBuilderImpl.UNKNOWN_MANDATORY_KEY, + () -> readRecords(conf, SELECT_EVEN_ROWS_NO_HEADER)); + } + + @Test + public void testSelectOddRecordsWithHeaderV1() + throws Throwable { + describe("work through a V1 record reader"); + JobConf conf = createJobConf(); + inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + // using a double backslash here makes the string "\t" which will then + // be parsed in the SelectBinding code as it if had come in on from an XML + // entry + inputMust(conf, CSV_OUTPUT_FIELD_DELIMITER, "\\t"); + inputMust(conf, CSV_OUTPUT_QUOTE_CHARACTER, "'"); + inputMust(conf, CSV_OUTPUT_QUOTE_FIELDS, + CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED); + inputMust(conf, CSV_OUTPUT_RECORD_DELIMITER, "\n"); + verifySelectionCount(ODD_ROWS_COUNT, + SELECT_ODD_ROWS, + readRecordsV1(conf, SELECT_ODD_ROWS)); + } + + /** + * Create a job conf for line reader tests. + * This patches the job with the passthrough codec for + * CSV files. 
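+ * (The codec is registered for the {@code .csv} extension through
+ * {@code enablePassthroughCodec()}.)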
+ * @return a job configuration + */ + private JobConf createJobConf() { + JobConf conf = new JobConf(getConfiguration()); + enablePassthroughCodec(conf, ".csv"); + return conf; + } + + @Test + public void testSelectOddRecordsIgnoreHeaderV1() + throws Throwable { + describe("work through a V1 record reader"); + JobConf conf = createJobConf(); + inputOpt(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_NONE); + inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_IGNORE); + inputMust(conf, INPUT_FADVISE, INPUT_FADV_NORMAL); + inputMust(conf, SELECT_ERRORS_INCLUDE_SQL, "true"); + verifySelectionCount(EVEN_ROWS_COUNT, + SELECT_EVEN_ROWS_NO_HEADER, + readRecordsV1(conf, SELECT_EVEN_ROWS_NO_HEADER)); + } + + protected List expectRecordsRead(final int expected, + final JobConf conf, + final String sql) throws Exception { + return verifySelectionCount(expected, sql, readRecords(conf, sql)); + } + + /** + * Reads lines through {@link LineRecordReader}, as if it were an MR + * job. + * @param conf jpb conf + * @param sql sql to add to the configuration. + * @return the selected lines. + * @throws Exception failure + */ + private List readRecords(JobConf conf, String sql) throws Exception { + return readRecords(conf, + csvPath, + sql, + createLineRecordReader(), + ALL_ROWS_COUNT_WITH_HEADER); + } + + /** + * Reads lines through a v1 LineRecordReader}. + * @param conf jpb conf + * @param sql sql to add to the configuration. + * @return the selected lines. + * @throws Exception failure + */ + private List readRecordsV1(JobConf conf, String sql) + throws Exception { + inputMust(conf, SELECT_SQL, sql); + return super.readRecordsV1(conf, + createLineRecordReaderV1(conf, csvPath), + new LongWritable(), + new Text(), + ALL_ROWS_COUNT_WITH_HEADER); + } + + /** + * Issue a select call, expect the specific number of rows back. + * Error text will include the SQL. + * @param expected expected row count. + * @param conf config for the select call. + * @param header header option + * @param sql template for a formatted SQL request. + * @param args arguments for the formatted request. + * @return the lines selected + * @throws IOException failure + */ + private List expectSelected( + final int expected, + final Configuration conf, + final String header, + final String sql, + final Object...args) throws Exception { + conf.set(CSV_INPUT_HEADER, header); + return verifySelectionCount(expected, sql(sql, args), + selectCsvFile(conf, sql, args)); + } + + /** + * Select from the CSV file. + * @param conf config for the select call. + * @param sql template for a formatted SQL request. + * @param args arguments for the formatted request. 
+ * @return the lines selected + * @throws IOException failure + */ + private List selectCsvFile( + final Configuration conf, + final String sql, + final Object...args) + throws Exception { + + return parseToLines( + select(getFileSystem(), csvPath, conf, sql, args)); + } + + @Test + public void testCommentsSkipped() throws Throwable { + describe("Verify that comments are skipped"); + selectConf.set(CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + + List lines = verifySelectionCount( + ALL_ROWS_COUNT_WITH_HEADER, + "select s.id", + parseToLines( + select(getFileSystem(), brokenCSV, selectConf, + "SELECT * FROM S3OBJECT s"))); + LOG.info("\n{}", prepareToPrint(lines)); + } + + @Test + public void testEmptyColumnsRegenerated() throws Throwable { + describe("if you ask for a column but your row doesn't have it," + + " an empty column is inserted"); + selectConf.set(CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + + List lines = verifySelectionCount( + ALL_ROWS_COUNT_WITH_HEADER, "select s.oddrange", + parseToLines( + select(getFileSystem(), brokenCSV, selectConf, + "SELECT s.oddrange FROM S3OBJECT s"))); + LOG.info("\n{}", prepareToPrint(lines)); + assertEquals("Final oddrange column is not regenerated empty", + "\"\"", lines.get(lines.size() - 1)); + } + + @Test + public void testIntCastFailure() throws Throwable { + describe("Verify that int casts fail"); + expectSelectFailure(E_CAST_FAILED, SELECT_ODD_ENTRIES_INT); + + } + + @Test + public void testSelectToDateParseFailure() throws Throwable { + describe("Verify date parsing failure"); + expectSelectFailure(E_CAST_FAILED, SELECT_TO_DATE); + } + + @Test + public void testParseInvalidPathComponent() throws Throwable { + describe("Verify bad SQL parseing"); + expectSelectFailure(E_PARSE_INVALID_PATH_COMPONENT, + "SELECT * FROM S3OBJECT WHERE s.'oddf' = true"); + } + + @Test + public void testSelectInvalidTableAlias() throws Throwable { + describe("select with unknown column name"); + expectSelectFailure(E_INVALID_TABLE_ALIAS, + "SELECT * FROM S3OBJECT WHERE s.\"oddf\" = 'true'"); + } + + @Test + public void testSelectGeneratedAliases() throws Throwable { + describe("select with a ._2 column when headers are enabled"); + expectSelectFailure(E_INVALID_TABLE_ALIAS, + "SELECT * FROM S3OBJECT WHERE s._2 = 'true'"); + } + + /** + * Expect select against the broken CSV file to fail with a specific + * AWS exception error code. + * If the is no failure, the results are included in the assertion raised. + * @param expectedErrorCode error code in getErrorCode() + * @param sql SQL to invoke + * @return the exception, if it is as expected. + * @throws Exception any other failure + * @throws AssertionError when an exception is raised, but its error code + * is different, or when no exception was raised. 
+ */ + protected AWSServiceIOException expectSelectFailure( + String expectedErrorCode, + String sql) + throws Exception { + selectConf.set(CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + return verifyErrorCode(expectedErrorCode, + intercept(AWSBadRequestException.class, + () -> + prepareToPrint( + parseToLines( + select(getFileSystem(), brokenCSV, selectConf, sql) + )))); + + } + + + @Test + public void testInputSplit() + throws Throwable { + describe("Verify that only a single file is used for splits"); + JobConf conf = new JobConf(getConfiguration()); + + + inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + final Path input = csvPath; + S3AFileSystem fs = getFileSystem(); + final Path output = path("testLandsatSelect") + .makeQualified(fs.getUri(), fs.getWorkingDirectory()); + conf.set(FileInputFormat.INPUT_DIR, input.toString()); + conf.set(FileOutputFormat.OUTDIR, output.toString()); + + final Job job = Job.getInstance(conf, "testInputSplit"); + JobContext jobCtx = new JobContextImpl(job.getConfiguration(), + getTaskAttempt0().getJobID()); + + TextInputFormat tif = new TextInputFormat(); + List splits = tif.getSplits(jobCtx); + assertThat("split count wrong", splits, hasSize(1)); + + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java new file mode 100644 index 0000000000000..c04cf8bff76c1 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java @@ -0,0 +1,347 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.List; + +import org.junit.Test; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.Statistic; +import org.apache.hadoop.fs.s3a.commit.Duration; +import org.apache.hadoop.fs.s3a.s3guard.S3GuardTool; +import org.apache.hadoop.util.ExitUtil; +import org.apache.hadoop.util.ToolRunner; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.disableFilesystemCaching; +import static org.apache.hadoop.fs.s3a.s3guard.S3GuardToolTestHelper.exec; +import static org.apache.hadoop.fs.s3a.select.ITestS3SelectLandsat.SELECT_NOTHING; +import static org.apache.hadoop.fs.s3a.select.ITestS3SelectLandsat.SELECT_SUNNY_ROWS_NO_LIMIT; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; +import static org.apache.hadoop.fs.s3a.select.SelectTool.*; +import static org.apache.hadoop.service.launcher.LauncherExitCodes.EXIT_COMMAND_ARGUMENT_ERROR; +import static org.apache.hadoop.service.launcher.LauncherExitCodes.EXIT_NOT_FOUND; +import static org.apache.hadoop.service.launcher.LauncherExitCodes.EXIT_SERVICE_UNAVAILABLE; +import static org.apache.hadoop.service.launcher.LauncherExitCodes.EXIT_SUCCESS; +import static org.apache.hadoop.service.launcher.LauncherExitCodes.EXIT_USAGE; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; + +/** + * Test the S3 Select CLI through some operations against landsat + * and files generated from it. + */ +public class ITestS3SelectCLI extends AbstractS3SelectTest { + + public static final int LINE_COUNT = 100; + + public static final String SELECT_EVERYTHING = "SELECT * FROM S3OBJECT s"; + + private SelectTool selectTool; + + private Configuration selectConf; + + public static final String D = "-D"; + + private File localFile; + + private String landsatSrc; + + @Override + public void setup() throws Exception { + super.setup(); + selectTool = new SelectTool(getConfiguration()); + selectConf = new Configuration(getConfiguration()); + localFile = getTempFilename(); + landsatSrc = getLandsatGZ().toString(); + } + + @Override + public void teardown() throws Exception { + super.teardown(); + if (localFile != null) { + localFile.delete(); + } + } + + /** + * Expect a command to succeed. + * @param message any extra text to include in the assertion error message + * @param tool tool to run + * @param args arguments to the command + * @return the output of any successful run + * @throws Exception failure + */ + protected static String expectSuccess( + String message, + S3GuardTool tool, + String... args) throws Exception { + ByteArrayOutputStream buf = new ByteArrayOutputStream(); + exec(EXIT_SUCCESS, message, tool, buf, args); + return buf.toString(); + } + + /** + * Run a S3GuardTool command from a varags list and the + * configuration returned by {@code getConfiguration()}. + * @param conf config to use + * @param args argument list + * @return the return code + * @throws Exception any exception + */ + protected int run(Configuration conf, S3GuardTool tool, + String... 
args) throws Exception { + return ToolRunner.run(conf, tool, args); + } + + /** + * Run a S3GuardTool command from a varags list, catch any raised + * ExitException and verify the status code matches that expected. + * @param status expected status code of the exception + * @param conf config to use + * @param args argument list + * @throws Exception any exception + */ + protected void runToFailure(int status, Configuration conf, + String message, + S3GuardTool tool, String... args) + throws Exception { + final ExitUtil.ExitException ex = + intercept(ExitUtil.ExitException.class, message, + () -> ToolRunner.run(conf, tool, args)); + if (ex.status != status) { + throw ex; + } + + } + + @Test + public void testLandsatToFile() throws Throwable { + describe("select part of the landsat to a file"); + int lineCount = LINE_COUNT; + S3AFileSystem landsatFS = + (S3AFileSystem) getLandsatGZ().getFileSystem(getConfiguration()); + S3ATestUtils.MetricDiff selectCount = new S3ATestUtils.MetricDiff(landsatFS, + Statistic.OBJECT_SELECT_REQUESTS); + + run(selectConf, selectTool, + D, v(CSV_OUTPUT_QUOTE_CHARACTER, "'"), + D, v(CSV_OUTPUT_QUOTE_FIELDS, CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED), + "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + o(OPT_LIMIT), Integer.toString(lineCount), + o(OPT_OUTPUT), localFile.toString(), + landsatSrc, + SELECT_SUNNY_ROWS_NO_LIMIT); + List lines = IOUtils.readLines(new FileInputStream(localFile), + Charset.defaultCharset()); + LOG.info("Result from select:\n{}", lines.get(0)); + assertEquals(lineCount, lines.size()); + selectCount.assertDiffEquals("select count", 1); + Duration duration = selectTool.getSelectDuration(); + assertTrue("Select duration was not measured", + duration.value() > 0); + } + + private File getTempFilename() throws IOException { + File dest = File.createTempFile("landat", ".csv"); + dest.delete(); + return dest; + } + + @Test + public void testLandsatToConsole() throws Throwable { + describe("select part of the landsat to the console"); + // this verifies the input stream was actually closed + S3ATestUtils.MetricDiff readOps = new S3ATestUtils.MetricDiff( + getFileSystem(), + Statistic.STREAM_READ_OPERATIONS_INCOMPLETE); + run(selectConf, selectTool, + D, v(CSV_OUTPUT_QUOTE_CHARACTER, "'"), + D, v(CSV_OUTPUT_QUOTE_FIELDS, CSV_OUTPUT_QUOTE_FIELDS_ALWAYS), + "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + o(OPT_LIMIT), Integer.toString(LINE_COUNT), + landsatSrc, + SELECT_SUNNY_ROWS_NO_LIMIT); + assertEquals("Lines read and printed to console", + LINE_COUNT, selectTool.getLinesRead()); + readOps.assertDiffEquals("Read operations are still considered active", + 0); } + + @Test + public void testSelectNothing() throws Throwable { + describe("an empty select is not an error"); + run(selectConf, selectTool, + "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + o(OPT_INPUTFORMAT), "csv", + o(OPT_OUTPUTFORMAT), "csv", + o(OPT_EXPECTED), "0", + o(OPT_LIMIT), Integer.toString(LINE_COUNT), + landsatSrc, + SELECT_NOTHING); + assertEquals("Lines read and printed to console", + 0, selectTool.getLinesRead()); + } + + @Test + public void testLandsatToRemoteFile() throws Throwable { + describe("select part of the landsat to a file"); + Path dest = path("testLandsatToRemoteFile.csv"); + run(selectConf, selectTool, + D, v(CSV_OUTPUT_QUOTE_CHARACTER, "'"), + D, v(CSV_OUTPUT_QUOTE_FIELDS, CSV_OUTPUT_QUOTE_FIELDS_ALWAYS), + "select", + o(OPT_HEADER), 
CSV_HEADER_OPT_USE, + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + o(OPT_LIMIT), Integer.toString(LINE_COUNT), + o(OPT_OUTPUT), dest.toString(), + landsatSrc, + SELECT_SUNNY_ROWS_NO_LIMIT); + FileStatus status = getFileSystem().getFileStatus(dest); + assertEquals( + "Mismatch between bytes selected and file len in " + status, + selectTool.getBytesRead(), status.getLen()); + assertIsFile(dest); + + // now select on that + Configuration conf = getConfiguration(); + SelectTool tool2 = new SelectTool(conf); + run(conf, tool2, + "select", + o(OPT_HEADER), CSV_HEADER_OPT_NONE, + dest.toString(), + SELECT_EVERYTHING); + } + + @Test + public void testUsage() throws Throwable { + runToFailure(EXIT_USAGE, getConfiguration(), TOO_FEW_ARGUMENTS, + selectTool, "select"); + } + + @Test + public void testRejectionOfNonS3FS() throws Throwable { + File dest = getTempFilename(); + runToFailure(EXIT_SERVICE_UNAVAILABLE, + getConfiguration(), + WRONG_FILESYSTEM, + selectTool, "select", dest.toString(), + SELECT_EVERYTHING); + } + + @Test + public void testFailMissingFile() throws Throwable { + Path dest = path("testFailMissingFile.csv"); + runToFailure(EXIT_NOT_FOUND, + getConfiguration(), + "", + selectTool, "select", dest.toString(), + SELECT_EVERYTHING); + } + + @Test + public void testS3SelectDisabled() throws Throwable { + Configuration conf = getConfiguration(); + conf.setBoolean(FS_S3A_SELECT_ENABLED, false); + disableFilesystemCaching(conf); + runToFailure(EXIT_SERVICE_UNAVAILABLE, + conf, + SELECT_IS_DISABLED, + selectTool, "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + o(OPT_LIMIT), Integer.toString(LINE_COUNT), + landsatSrc, + SELECT_SUNNY_ROWS_NO_LIMIT); + } + + @Test + public void testSelectBadLimit() throws Throwable { + runToFailure(EXIT_USAGE, + getConfiguration(), + "", + selectTool, "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + o(OPT_LIMIT), "-1", + landsatSrc, + SELECT_NOTHING); + } + + @Test + public void testSelectBadInputFormat() throws Throwable { + runToFailure(EXIT_COMMAND_ARGUMENT_ERROR, + getConfiguration(), + "", + selectTool, "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_INPUTFORMAT), "pptx", + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + landsatSrc, + SELECT_NOTHING); + } + + @Test + public void testSelectBadOutputFormat() throws Throwable { + runToFailure(EXIT_COMMAND_ARGUMENT_ERROR, + getConfiguration(), + "", + selectTool, "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_OUTPUTFORMAT), "pptx", + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + landsatSrc, + SELECT_NOTHING); + } + + /** + * Take an option and add the "-" prefix. + * @param in input option + * @return value for the tool args list. + */ + private static String o(String in) { + return "-" + in; + } + + /** + * Create the key=value bit of the -D key=value pair. + * @param key key to set + * @param value value to use + * @return a string for the tool args list. 
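+ * For example, {@code v(CSV_OUTPUT_QUOTE_CHARACTER, "'")} follows a
+ * {@code -D} argument in the tool invocations above.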
+ */ + private static String v(String key, String value) { + return checkNotNull(key) + "=" + checkNotNull(value); + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java new file mode 100644 index 0000000000000..780040e6a48a3 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java @@ -0,0 +1,432 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.IOException; +import java.util.List; + +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathIOException; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.S3AInstrumentation; +import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.Statistic; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.mapred.JobConf; + +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestPropertyBool; +import static org.apache.hadoop.fs.s3a.scale.S3AScaleTestBase._1KB; +import static org.apache.hadoop.fs.s3a.scale.S3AScaleTestBase._1MB; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.hamcrest.CoreMatchers.containsString; +import static org.hamcrest.CoreMatchers.not; + +/** + * Test the S3 Select feature with the Landsat dataset. + * + * This helps explore larger datasets, compression and the like. + * + * This suite is only executed if the destination store declares its support for + * the feature and the test CSV file configuration option points to the + * standard landsat GZip file. That's because these tests require the specific + * format of the landsat file. + * + * Normally working with the landsat file is a scale test. + * Here, because of the select operations, there's a lot less data + * to download. + * For this to work: write aggressive select calls: filtering, using LIMIT + * and projecting down to a few columns. 
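+ * For example, one of the queries declared below selects only the entity
+ * IDs of fully clouded-over days and caps the result set:
+ *
+ * SELECT s.entityId from S3OBJECT s WHERE s."cloudCover" = '100.0' LIMIT 250
+ *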
+ * + * For the structure, see + * Landsat on AWS + * + * + * entityId: String LC80101172015002LGN00 + * acquisitionDate: String 2015-01-02 15:49:05.571384 + * cloudCover: Float (possibly -ve) 80.81 + * processingLevel: String L1GT + * path: Int 10 + * row: Int 117 + * min_lat: Float -79.09923 + * min_lon: Float -139.66082 + * max_lat: Float -77.7544 + * max_lon: Float 125.09297 + * download_url: HTTPS URL https://s3-us-west-2.amazonaws.com/landsat-pds/L8/010/117/LC80101172015002LGN00/index.html + * + * Ranges + *
    1. Latitude should range in -180 <= lat <= 180
    2. Longitude in 0 <= lon <= 360
    3. Standard Greenwich Meridian (not the french one which still surfaces)
    4. Cloud cover Should be 0-100, but there are some negative ones.
    + * + * Head of the file: + * + entityId,acquisitionDate,cloudCover,processingLevel,path,row,min_lat,min_lon,max_lat,max_lon,download_url + * LC80101172015002LGN00,2015-01-02 15:49:05.571384,80.81,L1GT,10,117,-79.09923,-139.66082,-77.7544,-125.09297,https://s3-us-west-2.amazonaws.com/landsat-pds/L8/010/117/LC80101172015002LGN00/index.html + * LC80260392015002LGN00,2015-01-02 16:56:51.399666,90.84,L1GT,26,39,29.23106,-97.48576,31.36421,-95.16029,https://s3-us-west-2.amazonaws.com/landsat-pds/L8/026/039/LC80260392015002LGN00/index.html + * LC82270742015002LGN00,2015-01-02 13:53:02.047000,83.44,L1GT,227,74,-21.28598,-59.27736,-19.17398,-57.07423,https://s3-us-west-2.amazonaws.com/landsat-pds/L8/227/074/LC82270742015002LGN00/index.html + * LC82270732015002LGN00,2015-01-02 13:52:38.110317,52.29,L1T,227,73,-19.84365,-58.93258,-17.73324,-56.74692,https://s3-us-west-2.amazonaws.com/landsat-pds/L8/227/073/LC82270732015002LGN00/index.html + * + * + * For the Curious this is the Scala/Spark declaration of the schema. + * + * def addLandsatColumns(csv: DataFrame): DataFrame = { + * csv + * .withColumnRenamed("entityId", "id") + * .withColumn("acquisitionDate", + * csv.col("acquisitionDate").cast(TimestampType)) + * .withColumn("cloudCover", csv.col("cloudCover").cast(DoubleType)) + * .withColumn("path", csv.col("path").cast(IntegerType)) + * .withColumn("row", csv.col("row").cast(IntegerType)) + * .withColumn("min_lat", csv.col("min_lat").cast(DoubleType)) + * .withColumn("min_lon", csv.col("min_lon").cast(DoubleType)) + * .withColumn("max_lat", csv.col("max_lat").cast(DoubleType)) + * .withColumn("max_lon", csv.col("max_lon").cast(DoubleType)) + * .withColumn("year", + * year(col("acquisitionDate"))) + * .withColumn("month", + * month(col("acquisitionDate"))) + * .withColumn("day", + * month(col("acquisitionDate"))) + * } + * + */ +public class ITestS3SelectLandsat extends AbstractS3SelectTest { + + private static final Logger LOG = + LoggerFactory.getLogger(ITestS3SelectLandsat.class); + + private JobConf selectConf; + + /** + * Normal limit for select operations. + * Value: {@value}. + */ + public static final int SELECT_LIMIT = 250; + + /** + * And that select limit as a limit string. + */ + public static final String LIMITED = " LIMIT " + SELECT_LIMIT; + + /** + * Select days with 100% cloud cover, limited to {@link #SELECT_LIMIT}. + * Value: {@value}. + */ + public static final String SELECT_ENTITY_ID_ALL_CLOUDS = + "SELECT\n" + + "s.entityId from\n" + + "S3OBJECT s WHERE\n" + + "s.\"cloudCover\" = '100.0'\n" + + LIMITED; + + /** + * Select sunny days. There's no limit on the returned values, so + * set one except for a scale test. + * Value: {@value}. + */ + public static final String SELECT_SUNNY_ROWS_NO_LIMIT + = "SELECT * FROM S3OBJECT s WHERE s.cloudCover = '0.0'"; + + /** + * A Select call which returns nothing, always. + * Value: {@value}. + */ + public static final String SELECT_NOTHING + = "SELECT * FROM S3OBJECT s WHERE s.cloudCover = 'sunny'"; + + /** + * Select the processing level; no limit. + * Value: {@value}. + */ + public static final String SELECT_PROCESSING_LEVEL_NO_LIMIT = + "SELECT\n" + + "s.processingLevel from\n" + + "S3OBJECT s"; + + @Override + public void setup() throws Exception { + super.setup(); + + selectConf = new JobConf(false); + // file is compressed. 
+ selectConf.set(SELECT_INPUT_COMPRESSION, COMPRESSION_OPT_GZIP); + // and has a header + selectConf.set(CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + selectConf.setBoolean(SELECT_ERRORS_INCLUDE_SQL, true); + inputMust(selectConf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + inputMust(selectConf, SELECT_INPUT_FORMAT, SELECT_FORMAT_CSV); + inputMust(selectConf, SELECT_OUTPUT_FORMAT, SELECT_FORMAT_CSV); + inputMust(selectConf, SELECT_INPUT_COMPRESSION, COMPRESSION_OPT_GZIP); + // disable the gzip codec, so that the record readers do not + // get confused + enablePassthroughCodec(selectConf, ".gz"); + } + + protected int getMaxLines() { + return SELECT_LIMIT * 2; + } + + @Test + public void testSelectCloudcoverIgnoreHeader() throws Throwable { + describe("select ignoring the header"); + selectConf.set(CSV_INPUT_HEADER, CSV_HEADER_OPT_IGNORE); + String sql = "SELECT\n" + + "* from\n" + + "S3OBJECT s WHERE\n" + + "s._3 = '0.0'\n" + + LIMITED; + List list = selectLandsatFile(selectConf, sql); + LOG.info("Line count: {}", list.size()); + verifySelectionCount(1, SELECT_LIMIT, sql, list); + } + + @Test + public void testSelectCloudcoverUseHeader() throws Throwable { + describe("select 100% cover using the header, " + + "+ verify projection and incrementing select statistics"); + S3ATestUtils.MetricDiff selectCount = new S3ATestUtils.MetricDiff( + getLandsatFS(), + Statistic.OBJECT_SELECT_REQUESTS); + + List list = selectLandsatFile(selectConf, + SELECT_ENTITY_ID_ALL_CLOUDS); + LOG.info("Line count: {}", list.size()); + verifySelectionCount(1, SELECT_LIMIT, SELECT_ENTITY_ID_ALL_CLOUDS, list); + String line1 = list.get(0); + assertThat("no column filtering from " + SELECT_ENTITY_ID_ALL_CLOUDS, + line1, not(containsString("100.0"))); + selectCount.assertDiffEquals("select count", 1); + } + + @Test + public void testFileContextIntegration() throws Throwable { + describe("Test that select works through FileContext"); + FileContext fc = S3ATestUtils.createTestFileContext(getConfiguration()); + + // there's a limit on the number of rows to read; this is larger + // than the SELECT_LIMIT call to catch any failure where more than + // that is returned, newline parsing fails, etc etc. 
+ List list = parseToLines( + select(fc, getLandsatGZ(), selectConf, SELECT_ENTITY_ID_ALL_CLOUDS), + SELECT_LIMIT * 2); + LOG.info("Line count: {}", list.size()); + verifySelectionCount(1, SELECT_LIMIT, SELECT_ENTITY_ID_ALL_CLOUDS, list); + } + + @Test + public void testReadLandsatRecords() throws Throwable { + describe("Use a record reader to read the records"); + inputMust(selectConf, CSV_OUTPUT_FIELD_DELIMITER, "\\t"); + inputMust(selectConf, CSV_OUTPUT_QUOTE_CHARACTER, "'"); + inputMust(selectConf, CSV_OUTPUT_QUOTE_FIELDS, + CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED); + inputMust(selectConf, CSV_OUTPUT_RECORD_DELIMITER, "\n"); + List records = readRecords( + selectConf, + getLandsatGZ(), + SELECT_ENTITY_ID_ALL_CLOUDS, + createLineRecordReader(), + SELECT_LIMIT); + verifySelectionCount(1, SELECT_LIMIT, SELECT_ENTITY_ID_ALL_CLOUDS, records); + } + + @Test + public void testReadLandsatRecordsNoMatch() throws Throwable { + describe("Verify the v2 record reader does not fail" + + " when there are no results"); + verifySelectionCount(0, 0, SELECT_NOTHING, + readRecords( + selectConf, + getLandsatGZ(), + SELECT_NOTHING, + createLineRecordReader(), + SELECT_LIMIT)); + } + + @Test + public void testReadLandsatRecordsGZipEnabled() throws Throwable { + describe("Verify that by default, the gzip codec is connected to .gz" + + " files, and so fails"); + // implicitly re-enable the gzip codec. + selectConf.unset(CommonConfigurationKeys.IO_COMPRESSION_CODECS_KEY); + intercept(IOException.class, "gzip", + () -> readRecords( + selectConf, + getLandsatGZ(), + SELECT_ENTITY_ID_ALL_CLOUDS, + createLineRecordReader(), + SELECT_LIMIT)); + } + + @Test + public void testReadLandsatRecordsV1() throws Throwable { + describe("Use a record reader to read the records"); + + verifySelectionCount(1, SELECT_LIMIT, SELECT_ENTITY_ID_ALL_CLOUDS, + readRecords( + selectConf, + getLandsatGZ(), + SELECT_ENTITY_ID_ALL_CLOUDS, + createLineRecordReader(), + SELECT_LIMIT)); + } + + @Test + public void testReadLandsatRecordsV1NoResults() throws Throwable { + describe("verify that a select with no results is not an error"); + + verifySelectionCount(0, 0, SELECT_NOTHING, + readRecords( + selectConf, + getLandsatGZ(), + SELECT_NOTHING, + createLineRecordReader(), + SELECT_LIMIT)); + } + + /** + * Select from the landsat file. + * @param conf config for the select call. + * @param sql template for a formatted SQL request. + * @param args arguments for the formatted request. + * @return the lines selected + * @throws IOException failure + */ + private List selectLandsatFile( + final Configuration conf, + final String sql, + final Object... args) + throws Exception { + + // there's a limit on the number of rows to read; this is larger + // than the SELECT_LIMIT call to catch any failure where more than + // that is returned, newline parsing fails, etc etc. + return parseToLines( + select(getLandsatFS(), getLandsatGZ(), conf, sql, args)); + } + + /** + * This is a larger-scale version of {@link ITestS3Select#testSelectSeek()}. 
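+ * The contract being checked, in outline: forward seek() calls and
+ * positioned reads at or beyond the current offset succeed (forward seeks
+ * are implemented as skips), while any attempt to move backwards fails
+ * with a PathIOException carrying SEEK_UNSUPPORTED.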
+ */ + @Test + public void testSelectSeekFullLandsat() throws Throwable { + describe("Verify forward seeks work, not others"); + + boolean enabled = getTestPropertyBool( + getConfiguration(), + KEY_SCALE_TESTS_ENABLED, + DEFAULT_SCALE_TESTS_ENABLED); + assume("Scale test disabled", enabled); + + // start: read in the full data through the initial select + // this makes asserting that contents match possible + final Path path = getLandsatGZ(); + S3AFileSystem fs = getLandsatFS(); + + int len = (int) fs.getFileStatus(path).getLen(); + byte[] dataset = new byte[4 * _1MB]; + int actualLen; + try (DurationInfo ignored = + new DurationInfo(LOG, "Initial read of %s", path); + FSDataInputStream sourceStream = + select(fs, path, + selectConf, + SELECT_EVERYTHING)) { + // read it in + actualLen = IOUtils.read(sourceStream, dataset); + } + int seekRange = 16 * _1KB; + + try (FSDataInputStream seekStream = + select(fs, path, + selectConf, + SELECT_EVERYTHING)) { + SelectInputStream sis + = (SelectInputStream) seekStream.getWrappedStream(); + S3AInstrumentation.InputStreamStatistics streamStats + = sis.getS3AStreamStatistics(); + // lazy seek doesn't raise a problem here + seekStream.seek(0); + assertEquals("first byte read", dataset[0], seekStream.read()); + + // and now the pos has moved, again, seek will be OK + seekStream.seek(1); + seekStream.seek(1); + // but trying to seek elsewhere now fails + intercept(PathIOException.class, + SelectInputStream.SEEK_UNSUPPORTED, + () -> seekStream.seek(0)); + // positioned reads from the current location work. + byte[] buffer = new byte[1]; + seekStream.readFully(seekStream.getPos(), buffer); + // but positioned backwards fail. + intercept(PathIOException.class, + SelectInputStream.SEEK_UNSUPPORTED, + () -> seekStream.readFully(0, buffer)); + // forward seeks are implemented as 1+ skip + long target = seekStream.getPos() + seekRange; + seek(seekStream, target); + assertEquals("Seek position in " + seekStream, + target, seekStream.getPos()); + // now do a read and compare values + assertEquals("byte at seek position", + dataset[(int) seekStream.getPos()], seekStream.read()); + assertEquals("Seek bytes skipped in " + streamStats, + seekRange, streamStats.bytesSkippedOnSeek); + long offset; + long increment = 64 * _1KB; + + // seek forward, comparing bytes + for(offset = 32 * _1KB; offset < actualLen; offset += increment) { + seek(seekStream, offset); + assertEquals("Seek position in " + seekStream, + offset, seekStream.getPos()); + // now do a read and compare values + assertEquals("byte at seek position", + dataset[(int) seekStream.getPos()], seekStream.read()); + } + for(; offset < len; offset += _1MB) { + seek(seekStream, offset); + assertEquals("Seek position in " + seekStream, + offset, seekStream.getPos()); + } + // there's no knowledge of how much data is left, but with Gzip + // involved there can be a lot. To keep the test duration down, + // this test, unlike the simpler one, doesn't try to read past the + // EOF. Know this: it will be slow. 
+ + LOG.info("Seek statistics {}", streamStats); + } + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java new file mode 100644 index 0000000000000..86d1590fce6d0 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; + +import org.junit.Test; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.examples.WordCount; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.impl.FutureIOSupport; +import org.apache.hadoop.fs.impl.WrappedIOException; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.S3AUtils; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.fs.s3a.commit.files.SuccessData; +import org.apache.hadoop.fs.s3a.commit.staging.StagingCommitter; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.MiniYARNCluster; + +import static org.apache.hadoop.fs.s3a.commit.CommitConstants.FS_S3A_COMMITTER_NAME; +import static org.apache.hadoop.fs.s3a.commit.CommitConstants.FS_S3A_COMMITTER_STAGING_UNIQUE_FILENAMES; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; + +/** + * Run an MR job with a select query. + * This is the effective end-to-end test which verifies: + *
    1. Passing of select parameters through an MR job conf (see the sketch
       after this list).
    2. Automatic pick-up of these parameters through TextInputFormat's use
       of the mapreduce.lib.input LineRecordReader.
    3. Issuing of S3 Select queries in mapper processes.
    4. Projection of columns in a select.
    5. Ability to switch to the Passthrough decompressor in an MR job.
    6. Saving of results through the S3A Staging committer.
    7. Basic validation of results.
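    (A rough sketch of the parameter flow behind items 1 and 2, for
    orientation only: inputMust(jobConf, SELECT_SQL, query) is assumed here
    to store the SQL under the mandatory input-file option prefix that
    FutureIOSupport.propagateOptions() reads in the record reader's
    openFile() call, as shown in the StreamInputFormat change in this
    series; the S3A client then issues a SELECT request instead of a
    plain GET.)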
    + * This makes it the most complex of the MR jobs in the hadoop-aws test suite. + * + * The query used is + * {@link ITestS3SelectLandsat#SELECT_PROCESSING_LEVEL_NO_LIMIT}, + * which lists the processing level of all records in the source file, + * and counts the number in each one by way of the normal word-count + * routines. + * This works because the SQL is projecting only the processing level. + * + * The result becomes something like (with tabs between fields): + *
    + * L1GT   370231
    + * L1T    689526
    + * 
    + */ +public class ITestS3SelectMRJob extends AbstractS3SelectTest { + + private final Configuration conf = new YarnConfiguration(); + + private S3AFileSystem fs; + + private MiniYARNCluster yarnCluster; + + private Path rootPath; + + @Override + public void setup() throws Exception { + super.setup(); + fs = S3ATestUtils.createTestFileSystem(conf); + rootPath = path("ITestS3SelectMRJob"); + Path workingDir = path("working"); + fs.setWorkingDirectory(workingDir); + fs.mkdirs(new Path(rootPath, "input/")); + + yarnCluster = new MiniYARNCluster("ITestS3SelectMRJob", // testName + 1, // number of node managers + 1, // number of local log dirs per node manager + 1); // number of hdfs dirs per node manager + yarnCluster.init(conf); + yarnCluster.start(); + } + + @Override + public void teardown() throws Exception { + if (yarnCluster != null) { + yarnCluster.stop(); + } + super.teardown(); + } + + @Test + public void testLandsatSelect() throws Exception { + final Path input = getLandsatGZ(); + final Path output = path("testLandsatSelect") + .makeQualified(fs.getUri(), fs.getWorkingDirectory()); + + final Job job = Job.getInstance(conf, "process level count"); + job.setJarByClass(WordCount.class); + job.setMapperClass(WordCount.TokenizerMapper.class); + job.setCombinerClass(WordCount.IntSumReducer.class); + job.setReducerClass(WordCount.IntSumReducer.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(IntWritable.class); + FileInputFormat.addInputPath(job, input); + FileOutputFormat.setOutputPath(job, output); + + // job with use the staging committer + final JobConf jobConf = (JobConf) job.getConfiguration(); + jobConf.set(FS_S3A_COMMITTER_NAME, StagingCommitter.NAME); + jobConf.setBoolean(FS_S3A_COMMITTER_STAGING_UNIQUE_FILENAMES, + false); + + final String query + = ITestS3SelectLandsat.SELECT_PROCESSING_LEVEL_NO_LIMIT; + inputMust(jobConf, SELECT_SQL, + query); + inputMust(jobConf, SELECT_INPUT_COMPRESSION, COMPRESSION_OPT_GZIP); + + // input settings + inputMust(jobConf, SELECT_INPUT_FORMAT, SELECT_FORMAT_CSV); + inputMust(jobConf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + + // output + inputMust(jobConf, SELECT_OUTPUT_FORMAT, SELECT_FORMAT_CSV); + inputMust(jobConf, CSV_OUTPUT_QUOTE_FIELDS, + CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED); + + // disable the gzip codec, so that the record readers do not + // get confused + enablePassthroughCodec(jobConf, ".gz"); + + try (DurationInfo ignored = new DurationInfo(LOG, "SQL " + query)) { + int exitCode = job.waitForCompletion(true) ? 0 : 1; + assertEquals("Returned error code.", 0, exitCode); + } + + // log the success info + Path successPath = new Path(output, "_SUCCESS"); + SuccessData success = SuccessData.load(fs, successPath); + LOG.info("Job _SUCCESS\n{}", success); + + // process the results by ver + // + LOG.info("Results for query \n{}", query); + final AtomicLong parts = new AtomicLong(0); + S3AUtils.applyLocatedFiles(fs.listFiles(output, false), + (status) -> { + Path path = status.getPath(); + // ignore _SUCCESS, any temp files in subdirectories... + if (path.getName().startsWith("part-")) { + parts.incrementAndGet(); + String result = readStringFromFile(path); + LOG.info("{}\n{}", path, result); + String[] lines = result.split("\n", -1); + int l = lines.length; + // add a bit of slack here in case some new processing + // option was added. 
+ assertTrue("Wrong number of lines (" + l + ") in " + result, + l > 0 && l < 15); + } + }); + assertEquals("More part files created than expected", 1, parts.get()); + } + + /** + * Read a file; using Async IO for completeness and to see how + * well the async IO works in practice. + * Summary: checked exceptions cripple Async operations. + */ + private String readStringFromFile(Path path) throws IOException { + int bytesLen = (int)fs.getFileStatus(path).getLen(); + byte[] buffer = new byte[bytesLen]; + return FutureIOSupport.awaitFuture( + fs.openFile(path).build().thenApply(in -> { + try { + IOUtils.readFully(in, buffer, 0, bytesLen); + return new String(buffer); + } catch (IOException ex) { + throw new WrappedIOException(ex); + } + })); + } +} diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamInputFormat.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamInputFormat.java index a77c13762ca3b..77f4e041d5f09 100644 --- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamInputFormat.java +++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamInputFormat.java @@ -24,8 +24,12 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileSplit; @@ -58,8 +62,14 @@ public RecordReader createRecordReader(InputSplit genericSplit, context.progress(); // Open the file and seek to the start of the split - FileSystem fs = split.getPath().getFileSystem(conf); - FSDataInputStream in = fs.open(split.getPath()); + Path path = split.getPath(); + FileSystem fs = path.getFileSystem(conf); + // open the file + final FutureDataInputStreamBuilder builder = fs.openFile(path); + FutureIOSupport.propagateOptions(builder, conf, + MRJobConfig.INPUT_FILE_OPTION_PREFIX, + MRJobConfig.INPUT_FILE_MANDATORY_PREFIX); + FSDataInputStream in = FutureIOSupport.awaitFuture(builder.build()); // Factory dispatch based on available params.. Class readerClass; From e158c7836c5329fb2d30368cccffcbf73343e7b8 Mon Sep 17 00:00:00 2001 From: Anu Engineer Date: Mon, 22 Apr 2019 12:01:15 -0700 Subject: [PATCH 27/40] HADOOP-16026:Replace incorrect use of system property user.name. Contributed by Dinesh Chitlangia. 
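A minimal sketch of the pattern this patch applies (illustrative only, not
part of the diff below): derive the username from the Hadoop login rather
than the JVM-level user.name property, which names the OS account and can
differ from the Kerberos or proxy-user identity.

    import java.io.IOException;
    import org.apache.hadoop.security.UserGroupInformation;

    public class HomeDirUserSketch {
      // Illustrative only: resolve the username the way the patched
      // getHomeDirectory() implementations do.
      static String resolveUser() {
        try {
          return UserGroupInformation.getCurrentUser().getShortUserName();
        } catch (IOException e) {
          // fall back to the OS account only if the Hadoop login
          // cannot be resolved
          return System.getProperty("user.name");
        }
      }
    }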
--- .../org/apache/hadoop/fs/AbstractFileSystem.java | 13 +++++++++++-- .../main/java/org/apache/hadoop/fs/FileSystem.java | 10 +++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java index dc6cd2bc2b07f..6e82543ca850a 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java @@ -51,6 +51,7 @@ import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.SecurityUtil; +import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.util.LambdaUtils; import org.apache.hadoop.util.Progressable; @@ -456,8 +457,16 @@ public Path getInitialWorkingDirectory() { * @return current user's home directory. */ public Path getHomeDirectory() { - return new Path("/user/"+System.getProperty("user.name")).makeQualified( - getUri(), null); + String username; + try { + username = UserGroupInformation.getCurrentUser().getShortUserName(); + } catch(IOException ex) { + LOG.warn("Unable to get user name. Fall back to system property " + + "user.name", ex); + username = System.getProperty("user.name"); + } + return new Path("/user/" + username) + .makeQualified(getUri(), null); } /** diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java index 61fa43bdf6fa2..2c8b6954a791b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java @@ -2270,8 +2270,16 @@ public LocatedFileStatus next() throws IOException { * The default implementation returns {@code "/user/$USER/"}. */ public Path getHomeDirectory() { + String username; + try { + username = UserGroupInformation.getCurrentUser().getShortUserName(); + } catch(IOException ex) { + LOGGER.warn("Unable to get user name. Fall back to system property " + + "user.name", ex); + username = System.getProperty("user.name"); + } return this.makeQualified( - new Path(USER_HOME_PREFIX + "/" + System.getProperty("user.name"))); + new Path(USER_HOME_PREFIX + "/" + username)); } From 9726b46245ddae9a5c378b03271b1b4b174fcf13 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Wed, 25 Sep 2019 16:46:41 +0530 Subject: [PATCH 28/40] HADOOP-15691 Add PathCapabilities to FileSystem and FileContext. Contributed by Steve Loughran. This complements the StreamCapabilities Interface by allowing applications to probe for a specific path on a specific instance of a FileSystem client to offer a specific capability. This is intended to allow applications to determine * Whether a method is implemented before calling it and dealing with UnsupportedOperationException. * Whether a specific feature is believed to be available in the remote store. As well as a common set of capabilities defined in CommonPathCapabilities, file systems are free to add their own capabilities, prefixed with fs. + schema + . 
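As a hedged illustration of the intended calling pattern (not taken from the
patch itself, which only adds the probe): an application checks the
capability for a path before relying on an optional API, instead of catching
UnsupportedOperationException.

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.CommonPathCapabilities;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class CapabilityProbeSketch {
      // Illustrative only: probe a specific path on a specific FS instance.
      static boolean canUseEtagChecksums(Path path, Configuration conf)
          throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        // true only if this client, for this path, believes the feature is
        // available; neither permissions nor the remote store are checked.
        return fs.hasPathCapability(path, CommonPathCapabilities.FS_CHECKSUMS);
      }
    }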
The plan is to identify and document more capabilities -and for file systems which add new features, for a declaration of the availability of the feature to always be available. Note * The remote store is not expected to be checked for the feature; It is more a check of client API and the client's configuration/knowledge of the state of the remote system. * Permissions are not checked. Change-Id: I80bfebe94f4a8bdad8f3ac055495735b824968f5 --- .../apache/hadoop/fs/AbstractFileSystem.java | 16 +++++- .../hadoop/fs/DelegateToFileSystem.java | 7 +++ .../org/apache/hadoop/fs/FileContext.java | 23 ++++++++- .../apache/hadoop/fs/FilterFileSystem.java | 7 +++ .../java/org/apache/hadoop/fs/FilterFs.java | 5 ++ .../org/apache/hadoop/fs/HarFileSystem.java | 2 + .../hadoop/fs/contract/ContractTestUtils.java | 50 +++++++++++++++++-- .../hadoop/hdfs/DistributedFileSystem.java | 3 ++ .../hadoop/hdfs/web/WebHdfsFileSystem.java | 4 ++ .../fs/http/client/HttpFSFileSystem.java | 1 + .../apache/hadoop/fs/s3a/S3AFileSystem.java | 41 +++++++++++---- .../hadoop/fs/s3a/s3guard/S3GuardTool.java | 3 +- .../hadoop/fs/s3a/select/SelectConstants.java | 2 +- .../hadoop/fs/s3a/select/SelectTool.java | 2 +- .../apache/hadoop/fs/s3a/S3ATestUtils.java | 7 ++- .../s3guard/AbstractS3GuardToolTestBase.java | 2 +- .../hadoop/fs/s3a/select/ITestS3Select.java | 4 +- 17 files changed, 155 insertions(+), 24 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java index 6e82543ca850a..0453ca14537c3 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java @@ -60,6 +60,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; + /** * This class provides an interface for implementors of a Hadoop file system * (analogous to the VFS of Unix). Applications do not access this class; @@ -72,7 +74,7 @@ */ @InterfaceAudience.Public @InterfaceStability.Stable -public abstract class AbstractFileSystem { +public abstract class AbstractFileSystem implements PathCapabilities { static final Logger LOG = LoggerFactory.getLogger(AbstractFileSystem.class); /** Recording statistics per a file system class. */ @@ -1371,4 +1373,16 @@ public CompletableFuture openFileWithOptions(Path path, new CompletableFuture<>(), () -> open(path, bufferSize)); } + public boolean hasPathCapability(final Path path, + final String capability) + throws IOException { + switch (validatePathCapabilityArgs(makeQualified(path), capability)) { + case CommonPathCapabilities.FS_SYMLINKS: + // delegate to the existing supportsSymlinks() call. + return supportsSymlinks(); + default: + // the feature is not implemented. 
+ return false; + } + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java index 165c56c3d5c37..a8f294f379158 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java @@ -281,4 +281,11 @@ public CompletableFuture openFileWithOptions(Path path, int bufferSize) throws IOException { return fsImpl.openFileWithOptions(path, mandatoryKeys, options, bufferSize); } + + @Override + public boolean hasPathCapability(final Path path, + final String capability) + throws IOException { + return fsImpl.hasPathCapability(path, capability); + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java index f65074856bf3e..b2c1369a9c1fe 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java @@ -46,6 +46,8 @@ import org.apache.hadoop.fs.FileSystem.Statistics; import org.apache.hadoop.fs.Options.CreateOpts; import org.apache.hadoop.fs.impl.FutureDataInputStreamBuilderImpl; +import org.apache.hadoop.fs.impl.FsLinkResolution; +import org.apache.hadoop.fs.impl.PathCapabilitiesSupport; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsAction; @@ -68,6 +70,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; + /** * The FileContext class provides an interface for users of the Hadoop * file system. It exposes a number of file system operations, e.g. create, @@ -171,7 +175,7 @@ @InterfaceAudience.Public @InterfaceStability.Stable -public class FileContext { +public class FileContext implements PathCapabilities { public static final Logger LOG = LoggerFactory.getLogger(FileContext.class); /** @@ -2934,4 +2938,21 @@ public CompletableFuture next( }.resolve(FileContext.this, absF); } } + + /** + * Return the path capabilities of the bonded {@code AbstractFileSystem}. + * @param path path to query the capability of. + * @param capability string to query the stream support for. + * @return true iff the capability is supported under that FS. 
+ * @throws IOException path resolution or other IO failure + * @throws IllegalArgumentException invalid arguments + */ + public boolean hasPathCapability(Path path, String capability) + throws IOException { + validatePathCapabilityArgs(path, capability); + return FsLinkResolution.resolve(this, + fixRelativePart(path), + (fs, p) -> fs.hasPathCapability(p, capability)); + } + } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java index 99c18b6646cd6..fc72c9bccd4d4 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java @@ -725,4 +725,11 @@ protected CompletableFuture openFileWithOptions( return fs.openFileWithOptions(pathHandle, mandatoryKeys, options, bufferSize); } + + @Override + public boolean hasPathCapability(final Path path, final String capability) + throws IOException { + return fs.hasPathCapability(path, capability); + } + } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java index f5430d6026160..731a52a7b4137 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java @@ -446,4 +446,9 @@ public CompletableFuture openFileWithOptions( return myFs.openFileWithOptions(path, mandatoryKeys, options, bufferSize); } + public boolean hasPathCapability(final Path path, + final String capability) + throws IOException { + return myFs.hasPathCapability(path, capability); + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java index 7e12d0a11e953..009b55e77b8d5 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HarFileSystem.java @@ -39,6 +39,8 @@ import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; +import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; + /** * This is an implementation of the Hadoop Archive * Filesystem. 
This archive Filesystem has index files diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java index b4db3a5803ad8..c5ce46f292712 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java @@ -25,6 +25,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathCapabilities; import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.fs.StreamCapabilities; import org.apache.hadoop.io.IOUtils; @@ -1466,22 +1467,61 @@ public static void assertCapabilities( assertTrue("Stream should be instanceof StreamCapabilities", stream instanceof StreamCapabilities); - if (shouldHaveCapabilities!=null) { + StreamCapabilities source = (StreamCapabilities) stream; + if (shouldHaveCapabilities != null) { for (String shouldHaveCapability : shouldHaveCapabilities) { assertTrue("Should have capability: " + shouldHaveCapability, - ((StreamCapabilities) stream).hasCapability(shouldHaveCapability)); + source.hasCapability(shouldHaveCapability)); } } - if (shouldNotHaveCapabilities!=null) { + if (shouldNotHaveCapabilities != null) { for (String shouldNotHaveCapability : shouldNotHaveCapabilities) { assertFalse("Should not have capability: " + shouldNotHaveCapability, - ((StreamCapabilities) stream) - .hasCapability(shouldNotHaveCapability)); + source.hasCapability(shouldNotHaveCapability)); } } } + /** + * Custom assert to test {@link PathCapabilities}. + * + * @param source source (FS, FC, etc) + * @param path path to check + * @param capabilities The array of unexpected capabilities + */ + public static void assertHasPathCapabilities( + final PathCapabilities source, + final Path path, + final String...capabilities) throws IOException { + + for (String shouldHaveCapability: capabilities) { + assertTrue("Should have capability: " + shouldHaveCapability + + " under " + path, + source.hasPathCapability(path, shouldHaveCapability)); + } + } + + /** + * Custom assert to test that the named {@link PathCapabilities} + * are not supported. + * + * @param source source (FS, FC, etc) + * @param path path to check + * @param capabilities The array of unexpected capabilities + */ + public static void assertLacksPathCapabilities( + final PathCapabilities source, + final Path path, + final String...capabilities) throws IOException { + + for (String shouldHaveCapability: capabilities) { + assertFalse("Path must not support capability: " + shouldHaveCapability + + " under " + path, + source.hasPathCapability(path, shouldHaveCapability)); + } + } + /** * Function which calls {@code InputStream.read()} and * downgrades an IOE to a runtime exception. 
diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java index 5c1002e7a0350..923006c9001ee 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java @@ -31,6 +31,7 @@ import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.BlockStoragePolicySpi; import org.apache.hadoop.fs.CacheFlag; +import org.apache.hadoop.fs.CommonPathCapabilities; import org.apache.hadoop.fs.ContentSummary; import org.apache.hadoop.fs.CreateFlag; import org.apache.hadoop.fs.FSDataInputStream; @@ -122,6 +123,8 @@ import java.util.Map; import java.util.Optional; +import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; + /**************************************************************** * Implementation of the abstract FileSystem for the DFS system. * This object is the way end-user code interacts with a Hadoop diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/web/WebHdfsFileSystem.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/web/WebHdfsFileSystem.java index 1e58995c3a398..74c14486a93e3 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/web/WebHdfsFileSystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/web/WebHdfsFileSystem.java @@ -63,6 +63,7 @@ import org.apache.hadoop.crypto.key.KeyProviderTokenIssuer; import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.fs.CommonPathCapabilities; import org.apache.hadoop.fs.ContentSummary; import org.apache.hadoop.fs.CreateFlag; import org.apache.hadoop.fs.DelegationTokenRenewer; @@ -76,6 +77,7 @@ import org.apache.hadoop.fs.GlobalStorageStatistics; import org.apache.hadoop.fs.GlobalStorageStatistics.StorageStatisticsProvider; import org.apache.hadoop.fs.QuotaUsage; +import org.apache.hadoop.fs.PathCapabilities; import org.apache.hadoop.fs.StorageStatistics; import org.apache.hadoop.fs.StorageType; import org.apache.hadoop.fs.permission.FsCreateModes; @@ -135,6 +137,8 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; +import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; + /** A FileSystem for HDFS over the web. 
*/ public class WebHdfsFileSystem extends FileSystem implements DelegationTokenRenewer.Renewable, diff --git a/hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/client/HttpFSFileSystem.java b/hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/client/HttpFSFileSystem.java index d1fdaf11e0e38..d6534fcbd7bbd 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/client/HttpFSFileSystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs-httpfs/src/main/java/org/apache/hadoop/fs/http/client/HttpFSFileSystem.java @@ -86,6 +86,7 @@ import java.security.PrivilegedExceptionAction; import java.text.MessageFormat; import java.util.HashMap; +import java.util.Locale; import java.util.Map; import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index 031a80be1d718..7b046bef5162d 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -34,7 +34,6 @@ import java.util.EnumSet; import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Optional; import java.util.Set; @@ -85,6 +84,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonPathCapabilities; import org.apache.hadoop.fs.CreateFlag; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; @@ -132,6 +132,7 @@ import org.apache.hadoop.util.SemaphoredDelegatingExecutor; import static org.apache.hadoop.fs.impl.AbstractFSBuilderImpl.rejectUnknownMandatoryKeys; +import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; import static org.apache.hadoop.fs.s3a.Constants.*; import static org.apache.hadoop.fs.s3a.Invoker.*; import static org.apache.hadoop.fs.s3a.S3AUtils.*; @@ -3602,17 +3603,15 @@ public S3AInstrumentation.CommitterStatistics newCommitterStatistics() { return instrumentation.newCommitterStatistics(); } - /** - * Return the capabilities of this filesystem instance. - * @param capability string to query the stream support for. - * @return whether the FS instance has the capability. 
- */ + @SuppressWarnings("deprecation") @Override - public boolean hasCapability(String capability) { - - switch (capability.toLowerCase(Locale.ENGLISH)) { + public boolean hasPathCapability(final Path path, final String capability) + throws IOException { + final Path p = makeQualified(path); + switch (validatePathCapabilityArgs(p, capability)) { case CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER: + case CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER_OLD: // capability depends on FS configuration return isMagicCommitEnabled(); @@ -3620,7 +3619,31 @@ public boolean hasCapability(String capability) { // select is only supported if enabled return selectBinding.isEnabled(); + case CommonPathCapabilities.FS_CHECKSUMS: + // capability depends on FS configuration + return getConf().getBoolean(ETAG_CHECKSUM_ENABLED, + ETAG_CHECKSUM_ENABLED_DEFAULT); + default: + return super.hasPathCapability(p, capability); + } + } + + /** + * Return the capabilities of this filesystem instance. + * + * This has been supplanted by {@link #hasPathCapability(Path, String)}. + * @param capability string to query the stream support for. + * @return whether the FS instance has the capability. + */ + @Deprecated + @Override + public boolean hasCapability(String capability) { + try { + return hasPathCapability(workingDir, capability); + } catch (IOException ex) { + // should never happen, so log and downgrade. + LOG.debug("Ignoring exception on hasCapability({}})", capability, ex); return false; } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java index e4fd06436ae9a..9df912424c886 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java @@ -1157,7 +1157,8 @@ public int run(String[] args, PrintStream out) } else { println(out, "Filesystem %s is not using S3Guard", fsUri); } - boolean magic = fs.hasCapability( + boolean magic = fs.hasPathCapability( + new Path(s3Path), CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER); println(out, "The \"magic\" committer %s supported", magic ? "is" : "is not"); diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectConstants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectConstants.java index d74411d2f92ca..0e2bf914f83c5 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectConstants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectConstants.java @@ -50,7 +50,7 @@ private SelectConstants() { * Does the FS Support S3 Select? * Value: {@value}. */ - public static final String S3_SELECT_CAPABILITY = "s3a:fs.s3a.select.sql"; + public static final String S3_SELECT_CAPABILITY = "fs.s3a.capability.select.sql"; /** * Flag: is S3 select enabled? 
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java index 8c87694570334..c89cc287f2eaf 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java @@ -234,7 +234,7 @@ public int run(String[] args, PrintStream out) } setFilesystem((S3AFileSystem) fs); - if (!getFilesystem().hasCapability(S3_SELECT_CAPABILITY)) { + if (!getFilesystem().hasPathCapability(path, S3_SELECT_CAPABILITY)) { // capability disabled throw new ExitUtil.ExitException(EXIT_SERVICE_UNAVAILABLE, SELECT_IS_DISABLED + " for " + file); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java index e15c24aced88f..097b482d334a0 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java @@ -1168,9 +1168,12 @@ public static void skipDuringFaultInjection(S3AFileSystem fs) { * Skip a test if the FS isn't marked as supporting magic commits. * @param fs filesystem */ - public static void assumeMagicCommitEnabled(S3AFileSystem fs) { + public static void assumeMagicCommitEnabled(S3AFileSystem fs) + throws IOException { assume("Magic commit option disabled on " + fs, - fs.hasCapability(CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER)); + fs.hasPathCapability( + fs.getWorkingDirectory(), + CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER)); } /** diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java index 71e9975c7326c..9c88a62f29ca0 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java @@ -419,7 +419,7 @@ public void testProbeForMagic() throws Throwable { String name = fs.getUri().toString(); S3GuardTool.BucketInfo cmd = new S3GuardTool.BucketInfo( getConfiguration()); - if (fs.hasCapability( + if (fs.hasPathCapability(fs.getWorkingDirectory(), CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER)) { // if the FS is magic, expect this to work exec(cmd, S3GuardTool.BucketInfo.MAGIC_FLAG, name); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java index 5fe4e2bb6709c..f5b81a2c46964 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java @@ -102,9 +102,9 @@ public class ITestS3Select extends AbstractS3SelectTest { @Override public void setup() throws Exception { super.setup(); - Assume.assumeTrue("S3 Select is not enabled", - getFileSystem().hasCapability(S3_SELECT_CAPABILITY)); csvPath = path(getMethodName() + ".csv"); + Assume.assumeTrue("S3 Select is not enabled", + getFileSystem().hasPathCapability(csvPath, S3_SELECT_CAPABILITY)); selectConf = new Configuration(false); selectConf.setBoolean(SELECT_ERRORS_INCLUDE_SQL, true); 
createStandardCsvFile(getFileSystem(), csvPath, ALL_QUOTES); From b2f0c3ffc81c4e4410d0dcb7d27d8dabd64536eb Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Wed, 22 Jan 2020 04:01:51 +0530 Subject: [PATCH 29/40] HADOOP-16759. Filesystem openFile() builder to take a FileStatus param (#1761). Contributed by Steve Loughran * Enhanced builder + FS spec * s3a FS to use this to skip HEAD on open * and to use version/etag when opening the file works with S3AFileStatus FS and S3ALocatedFileStatus --- .../apache/hadoop/fs/AbstractFileSystem.java | 16 ++-- .../hadoop/fs/DelegateToFileSystem.java | 17 ++-- .../org/apache/hadoop/fs/FileContext.java | 12 ++- .../java/org/apache/hadoop/fs/FileSystem.java | 41 ++++---- .../apache/hadoop/fs/FilterFileSystem.java | 15 +-- .../java/org/apache/hadoop/fs/FilterFs.java | 9 +- .../fs/FutureDataInputStreamBuilder.java | 11 +++ .../FutureDataInputStreamBuilderImpl.java | 33 +++++-- .../hadoop/fs/impl/OpenFileParameters.java | 94 +++++++++++++++++++ .../site/markdown/filesystem/filesystem.md | 23 +++-- .../filesystem/fsdatainputstreambuilder.md | 41 ++++++++ .../fs/contract/AbstractContractOpenTest.java | 9 ++ .../hadoop/fs/s3a/select/ITestS3Select.java | 7 +- 13 files changed, 252 insertions(+), 76 deletions(-) create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/OpenFileParameters.java diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java index 0453ca14537c3..1df68b647c99a 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java @@ -31,7 +31,6 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; -import java.util.Set; import java.util.StringTokenizer; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; @@ -45,6 +44,7 @@ import org.apache.hadoop.fs.Options.CreateOpts; import org.apache.hadoop.fs.Options.Rename; import org.apache.hadoop.fs.impl.AbstractFSBuilderImpl; +import org.apache.hadoop.fs.impl.OpenFileParameters; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsAction; @@ -1355,22 +1355,20 @@ public boolean equals(Object other) { * setting up the expectation that the {@code get()} call * is needed to evaluate the result. * @param path path to the file - * @param mandatoryKeys set of options declared as mandatory. - * @param options options set during the build sequence. - * @param bufferSize buffer size + * @param parameters open file parameters from the builder. * @return a future which will evaluate to the opened file. * @throws IOException failure to resolve the link. 
* @throws IllegalArgumentException unknown mandatory key */ public CompletableFuture openFileWithOptions(Path path, - Set mandatoryKeys, - Configuration options, - int bufferSize) throws IOException { - AbstractFSBuilderImpl.rejectUnknownMandatoryKeys(mandatoryKeys, + final OpenFileParameters parameters) throws IOException { + AbstractFSBuilderImpl.rejectUnknownMandatoryKeys( + parameters.getMandatoryKeys(), Collections.emptySet(), "for " + path); return LambdaUtils.eval( - new CompletableFuture<>(), () -> open(path, bufferSize)); + new CompletableFuture<>(), () -> + open(path, parameters.getBufferSize())); } public boolean hasPathCapability(final Path path, diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java index a8f294f379158..3a139781e0372 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java @@ -24,13 +24,13 @@ import java.util.Arrays; import java.util.EnumSet; import java.util.List; -import java.util.Set; import java.util.concurrent.CompletableFuture; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Options.ChecksumOpt; +import org.apache.hadoop.fs.impl.OpenFileParameters; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.util.Progressable; @@ -266,20 +266,17 @@ public List> getDelegationTokens(String renewer) throws IOException { /** * Open a file by delegating to - * {@link FileSystem#openFileWithOptions(Path, Set, Configuration, int)}. + * {@link FileSystem#openFileWithOptions(Path, org.apache.hadoop.fs.impl.OpenFileParameters)}. * @param path path to the file - * @param mandatoryKeys set of options declared as mandatory. - * @param options options set during the build sequence. - * @param bufferSize buffer size - * @return a future which will evaluate to the opened file. + * @param parameters open file parameters from the builder. + * + * @return a future which will evaluate to the opened file.ControlAlpha * @throws IOException failure to resolve the link. 
* @throws IllegalArgumentException unknown mandatory key */ public CompletableFuture openFileWithOptions(Path path, - Set mandatoryKeys, - Configuration options, - int bufferSize) throws IOException { - return fsImpl.openFileWithOptions(path, mandatoryKeys, options, bufferSize); + final OpenFileParameters parameters) throws IOException { + return fsImpl.openFileWithOptions(path, parameters); } @Override diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java index b2c1369a9c1fe..df93e89750ee0 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java @@ -47,7 +47,7 @@ import org.apache.hadoop.fs.Options.CreateOpts; import org.apache.hadoop.fs.impl.FutureDataInputStreamBuilderImpl; import org.apache.hadoop.fs.impl.FsLinkResolution; -import org.apache.hadoop.fs.impl.PathCapabilitiesSupport; +import org.apache.hadoop.fs.impl.OpenFileParameters; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsAction; @@ -2924,16 +2924,18 @@ protected FSDataInputStreamBuilder( @Override public CompletableFuture build() throws IOException { final Path absF = fixRelativePart(getPath()); + OpenFileParameters parameters = new OpenFileParameters() + .withMandatoryKeys(getMandatoryKeys()) + .withOptions(getOptions()) + .withBufferSize(getBufferSize()) + .withStatus(getStatus()); return new FSLinkResolver>() { @Override public CompletableFuture next( final AbstractFileSystem fs, final Path p) throws IOException { - return fs.openFileWithOptions(p, - getMandatoryKeys(), - getOptions(), - getBufferSize()); + return fs.openFileWithOptions(p, parameters); } }.resolve(FileContext.this, absF); } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java index 2c8b6954a791b..95850eb0760d5 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java @@ -58,6 +58,7 @@ import org.apache.hadoop.fs.Options.Rename; import org.apache.hadoop.fs.impl.AbstractFSBuilderImpl; import org.apache.hadoop.fs.impl.FutureDataInputStreamBuilderImpl; +import org.apache.hadoop.fs.impl.OpenFileParameters; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsAction; @@ -4443,43 +4444,39 @@ public FutureDataInputStreamBuilder openFile(PathHandle pathHandle) * the action of opening the file should begin. * * The base implementation performs a blocking - * call to {@link #open(Path, int)}in this call; + * call to {@link #open(Path, int)} in this call; * the actual outcome is in the returned {@code CompletableFuture}. * This avoids having to create some thread pool, while still * setting up the expectation that the {@code get()} call * is needed to evaluate the result. * @param path path to the file - * @param mandatoryKeys set of options declared as mandatory. - * @param options options set during the build sequence. - * @param bufferSize buffer size + * @param parameters open file parameters from the builder. 
* @return a future which will evaluate to the opened file. * @throws IOException failure to resolve the link. * @throws IllegalArgumentException unknown mandatory key */ protected CompletableFuture openFileWithOptions( final Path path, - final Set mandatoryKeys, - final Configuration options, - final int bufferSize) throws IOException { - AbstractFSBuilderImpl.rejectUnknownMandatoryKeys(mandatoryKeys, + final OpenFileParameters parameters) throws IOException { + AbstractFSBuilderImpl.rejectUnknownMandatoryKeys( + parameters.getMandatoryKeys(), Collections.emptySet(), "for " + path); return LambdaUtils.eval( - new CompletableFuture<>(), () -> open(path, bufferSize)); + new CompletableFuture<>(), () -> + open(path, parameters.getBufferSize())); } /** * Execute the actual open file operation. * The base implementation performs a blocking - * call to {@link #open(Path, int)}in this call; + * call to {@link #open(Path, int)} in this call; * the actual outcome is in the returned {@code CompletableFuture}. * This avoids having to create some thread pool, while still * setting up the expectation that the {@code get()} call * is needed to evaluate the result. * @param pathHandle path to the file - * @param mandatoryKeys set of options declared as mandatory. - * @param options options set during the build sequence. - * @param bufferSize buffer size + * @param parameters open file parameters from the builder. * @return a future which will evaluate to the opened file. * @throws IOException failure to resolve the link. * @throws IllegalArgumentException unknown mandatory key @@ -4488,14 +4485,13 @@ protected CompletableFuture openFileWithOptions( */ protected CompletableFuture openFileWithOptions( final PathHandle pathHandle, - final Set mandatoryKeys, - final Configuration options, - final int bufferSize) throws IOException { - AbstractFSBuilderImpl.rejectUnknownMandatoryKeys(mandatoryKeys, + final OpenFileParameters parameters) throws IOException { + AbstractFSBuilderImpl.rejectUnknownMandatoryKeys( + parameters.getMandatoryKeys(), Collections.emptySet(), ""); CompletableFuture result = new CompletableFuture<>(); try { - result.complete(open(pathHandle, bufferSize)); + result.complete(open(pathHandle, parameters.getBufferSize())); } catch (UnsupportedOperationException tx) { // fail fast here throw tx; @@ -4551,12 +4547,17 @@ protected FSDataInputStreamBuilder( @Override public CompletableFuture build() throws IOException { Optional optionalPath = getOptionalPath(); + OpenFileParameters parameters = new OpenFileParameters() + .withMandatoryKeys(getMandatoryKeys()) + .withOptions(getOptions()) + .withBufferSize(getBufferSize()) + .withStatus(super.getStatus()); // explicit to avoid IDE warnings if(optionalPath.isPresent()) { return getFS().openFileWithOptions(optionalPath.get(), - getMandatoryKeys(), getOptions(), getBufferSize()); + parameters); } else { return getFS().openFileWithOptions(getPathHandle(), - getMandatoryKeys(), getOptions(), getBufferSize()); + parameters); } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java index fc72c9bccd4d4..ac0ca91f8cfc6 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java @@ -25,12 +25,12 @@ import java.util.EnumSet; import java.util.List; import java.util.Map; 
-import java.util.Set; import java.util.concurrent.CompletableFuture; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.impl.OpenFileParameters; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsAction; @@ -710,20 +710,15 @@ public FutureDataInputStreamBuilder openFile(final PathHandle pathHandle) @Override protected CompletableFuture openFileWithOptions( final Path path, - final Set mandatoryKeys, - final Configuration options, - final int bufferSize) throws IOException { - return fs.openFileWithOptions(path, mandatoryKeys, options, bufferSize); + final OpenFileParameters parameters) throws IOException { + return fs.openFileWithOptions(path, parameters); } @Override protected CompletableFuture openFileWithOptions( final PathHandle pathHandle, - final Set mandatoryKeys, - final Configuration options, - final int bufferSize) throws IOException { - return fs.openFileWithOptions(pathHandle, mandatoryKeys, options, - bufferSize); + final OpenFileParameters parameters) throws IOException { + return fs.openFileWithOptions(pathHandle, parameters); } @Override diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java index 731a52a7b4137..e197506edc88b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java @@ -26,13 +26,12 @@ import java.util.EnumSet; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.concurrent.CompletableFuture; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem.Statistics; +import org.apache.hadoop.fs.impl.OpenFileParameters; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsAction; @@ -440,10 +439,8 @@ public Collection getAllStoragePolicies() @Override public CompletableFuture openFileWithOptions( final Path path, - final Set mandatoryKeys, - final Configuration options, - final int bufferSize) throws IOException { - return myFs.openFileWithOptions(path, mandatoryKeys, options, bufferSize); + final OpenFileParameters parameters) throws IOException { + return myFs.openFileWithOptions(path, parameters); } public boolean hasPathCapability(final Path path, diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java index 774d30927df2c..27a522e593001 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java @@ -47,4 +47,15 @@ public interface FutureDataInputStreamBuilder CompletableFuture build() throws IllegalArgumentException, UnsupportedOperationException, IOException; + + /** + * A FileStatus may be provided to the open request. + * It is up to the implementation whether to use this or not. 
+ * @param status status. + * @return the builder. + */ + default FutureDataInputStreamBuilder withFileStatus(FileStatus status) { + return this; + } + } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java index 2aa4a5d95fcc7..24a8d49747fe6 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java @@ -26,12 +26,13 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FutureDataInputStreamBuilder; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathHandle; -import static com.google.common.base.Preconditions.checkNotNull; +import static java.util.Objects.requireNonNull; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY; @@ -60,6 +61,12 @@ public abstract class FutureDataInputStreamBuilderImpl private int bufferSize; + /** + * File status passed in through a {@link #withFileStatus(FileStatus)} + * call; null otherwise. + */ + private FileStatus status; + /** * Construct from a {@link FileContext}. * @@ -69,8 +76,8 @@ public abstract class FutureDataInputStreamBuilderImpl */ protected FutureDataInputStreamBuilderImpl(@Nonnull FileContext fc, @Nonnull Path path) throws IOException { - super(checkNotNull(path)); - checkNotNull(fc); + super(requireNonNull(path, "path")); + requireNonNull(fc, "file context"); this.fileSystem = null; bufferSize = IO_FILE_BUFFER_SIZE_DEFAULT; } @@ -82,8 +89,8 @@ protected FutureDataInputStreamBuilderImpl(@Nonnull FileContext fc, */ protected FutureDataInputStreamBuilderImpl(@Nonnull FileSystem fileSystem, @Nonnull Path path) { - super(checkNotNull(path)); - this.fileSystem = checkNotNull(fileSystem); + super(requireNonNull(path, "path")); + this.fileSystem = requireNonNull(fileSystem, "fileSystem"); initFromFS(); } @@ -108,7 +115,7 @@ private void initFromFS() { } protected FileSystem getFS() { - checkNotNull(fileSystem); + requireNonNull(fileSystem, "fileSystem"); return fileSystem; } @@ -138,4 +145,18 @@ public FutureDataInputStreamBuilder builder() { public FutureDataInputStreamBuilder getThisBuilder() { return this; } + + @Override + public FutureDataInputStreamBuilder withFileStatus(FileStatus st) { + this.status = requireNonNull(st, "status"); + return this; + } + + /** + * Get any status set in {@link #withFileStatus(FileStatus)}. + * @return a status value or null. + */ + protected FileStatus getStatus() { + return status; + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/OpenFileParameters.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/OpenFileParameters.java new file mode 100644 index 0000000000000..77b4ff52696a3 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/OpenFileParameters.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.impl; + +import java.util.Set; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; + +import static java.util.Objects.requireNonNull; + +/** + * All the parameters from the openFile builder for the + * {@code openFileWithOptions} commands. + * + * If/when new attributes added to the builder, this class will be extended. + */ +public class OpenFileParameters { + + /** + * Set of options declared as mandatory. + */ + private Set mandatoryKeys; + + /** + * Options set during the build sequence. + */ + private Configuration options; + + /** + * Buffer size. + */ + private int bufferSize; + + /** + * Optional file status. + */ + private FileStatus status; + + public OpenFileParameters() { + } + + public OpenFileParameters withMandatoryKeys(final Set keys) { + this.mandatoryKeys = requireNonNull(keys); + return this; + } + + public OpenFileParameters withOptions(final Configuration opts) { + this.options = requireNonNull(opts); + return this; + } + + public OpenFileParameters withBufferSize(final int size) { + this.bufferSize = size; + return this; + } + + public OpenFileParameters withStatus(final FileStatus st) { + this.status = st; + return this; + } + + public Set getMandatoryKeys() { + return mandatoryKeys; + } + + public Configuration getOptions() { + return options; + } + + public int getBufferSize() { + return bufferSize; + } + + public FileStatus getStatus() { + return status; + } +} diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md index 7b356121e1fb1..6ea9414f0634e 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md @@ -739,24 +739,29 @@ exists in the metadata, but no copies of any its blocks can be located; Creates a [`FSDataInputStreamBuilder`](fsdatainputstreambuilder.html) to construct a operation to open the file at `path` for reading. - When `build()` is invoked on the returned `FSDataInputStreamBuilder` instance, the builder parameters are verified and -`openFileWithOptions(Path, Set, Configuration, int)` invoked. +`openFileWithOptions(Path, OpenFileParameters)` invoked. This (protected) operation returns a `CompletableFuture` which, when its `get()` method is called, either returns an input stream of the contents of opened file, or raises an exception. -The base implementation of the `openFileWithOptions(PathHandle, Set, Configuration, int)` +The base implementation of the `openFileWithOptions(PathHandle, OpenFileParameters)` ultimately invokes `open(Path, int)`. 
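To make that invocation concrete from the caller's side, here is a minimal, hypothetical client sketch; it is not part of this patch, and the command-line path argument and error handling are illustrative assumptions only.

```java
import java.io.IOException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenFileExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]);                 // file to read: illustrative
    FileSystem fs = path.getFileSystem(conf);

    // build() schedules the open; the base implementation simply wraps a
    // blocking open(path, bufferSize) call in a CompletableFuture.
    CompletableFuture<FSDataInputStream> future = fs.openFile(path).build();

    try (FSDataInputStream in = future.get()) {
      // with a lazy-open implementation, a missing file might only be
      // reported here, or on the first read of actual data
      byte[] buffer = new byte[4096];
      int bytesRead = in.read(buffer);
      System.out.println("read " + bytesRead + " bytes from " + path);
    } catch (ExecutionException e) {
      // IOExceptions raised inside the future arrive wrapped; unwrap them
      Throwable cause = e.getCause();
      if (cause instanceof IOException) {
        throw (IOException) cause;
      }
      throw e;
    }
  }
}
```

Depending on the filesystem, a nonexistent or unreadable file may be reported from `build()`, from `get()`, or from the first `read()`, which is exactly the latitude the specification text above describes.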
Thus the chain `openFile(path).build().get()` has the same preconditions and postconditions as `open(Path p, int bufferSize)` +However, there is one difference which implementations are free to +take advantage of: + +The returned stream MAY implement a lazy open where file non-existence or +access permission failures may not surface until the first `read()` of the +actual data. -The `openFile()` operation may check the state of the filesystem during this -call, but as the state of the filesystem may change betwen this call and +The `openFile()` operation may check the state of the filesystem during its +invocation, but as the state of the filesystem may change betwen this call and the actual `build()` and `get()` operations, this file-specific preconditions (file exists, file is readable, etc) MUST NOT be checked here. @@ -787,6 +792,10 @@ It SHOULD be possible to always open a file without specifying any options, so as to present a consistent model to users. However, an implementation MAY opt to require one or more mandatory options to be set. +The returned stream may perform "lazy" evaluation of file access. This is +relevant for object stores where the probes for existence are expensive, and, +even with an asynchronous open, may be considered needless. + ### `FSDataInputStreamBuilder openFile(PathHandle)` Creates a `FSDataInputStreamBuilder` to build an operation to open a file. @@ -795,13 +804,13 @@ to construct a operation to open the file identified by the given `PathHandle` f When `build()` is invoked on the returned `FSDataInputStreamBuilder` instance, the builder parameters are verified and -`openFileWithOptions(PathHandle, Set, Configuration, int)` invoked. +`openFileWithOptions(PathHandle, OpenFileParameters)` invoked. This (protected) operation returns a `CompletableFuture` which, when its `get()` method is called, either returns an input stream of the contents of opened file, or raises an exception. -The base implementation of the `openFileWithOptions(Path,PathHandle, Set, Configuration, int)` method +The base implementation of the `openFileWithOptions(PathHandle, OpenFileParameters)` method returns a future which invokes `open(Path, int)`. Thus the chain `openFile(pathhandle).build().get()` has the same preconditions diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstreambuilder.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstreambuilder.md index f1beed862cdbf..9546012b6d6e9 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstreambuilder.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstreambuilder.md @@ -43,6 +43,31 @@ path validation. Set the size of the buffer to be used. +### `FSDataInputStreamBuilder withFileStatus(FileStatus status)` + +A `FileStatus` instance which refers to the file being opened. + +This MAY be used by implementations to short-circuit checks for the file, +So potentially saving on remote calls especially to object stores. + +Requirements: + +* `status != null` +* `status.getPath()` == the resolved path of the file being opened. + +The path validation MUST take place if the store uses the `FileStatus` when +it opens files, and MAY be performed otherwise. The validation +SHOULD be postponed until the `build()` operation. + +This operation should be considered a hint to the filesystem. 
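As a complement to the builder example further on, the following hypothetical sketch (not part of this patch) shows the intended use of the hint: reuse the `FileStatus` entries returned by a directory listing when opening each file, so that a store which trusts the status can skip its own existence probe. The directory argument and the printing are illustrative only.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WithFileStatusExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path dir = new Path(args[0]);                  // directory to scan: illustrative
    FileSystem fs = dir.getFileSystem(conf);

    for (FileStatus status : fs.listStatus(dir)) {
      if (!status.isFile()) {
        continue;                                  // only open plain files
      }
      // status.getPath() is already the resolved path, as required above;
      // an implementation may use the supplied status to skip its own
      // HEAD/getFileStatus call when opening the file
      try (FSDataInputStream in = fs.openFile(status.getPath())
          .withFileStatus(status)
          .build()
          .get()) {
        System.out.println(status.getPath() + ": first byte = " + in.read());
      }
    }
  }
}
```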
+ +If a filesystem implementation extends the `FileStatus` returned in its +implementation MAY use this information when opening the file. + +This is relevant with those stores which return version/etag information, +including the S3A and ABFS connectors -they MAY use this to guarantee that +the file they opened is exactly the one returned in the listing. + ### Set optional or mandatory parameters FSDataInputStreamBuilder opt(String key, ...) @@ -56,6 +81,7 @@ of `FileSystem`. out = fs.openFile(path) .opt("fs.s3a.experimental.fadvise", "random") .must("fs.s3a.readahead.range", 256 * 1024) + .withFileStatus(statusFromListing) .build() .get(); ``` @@ -76,6 +102,21 @@ builder methods (i.e., `bufferSize()`) and `opt()`/`must()` is as follows: > The last option specified defines the value and its optional/mandatory state. +If the `FileStatus` option passed in `withFileStatus()` is used, implementations +MUST accept all subclasses of `FileStatus`, including `LocatedFileStatus`, +rather than just any FS-specific subclass implemented by the implementation +(e.g `S3AFileStatus`). They MAY simply ignore those which are not the +custom subclasses. + +This is critical to ensure safe use of the feature: directory listing/ +status serialization/deserialization can result result in the `withFileStatus()` +argumennt not being the custom subclass returned by the Filesystem instance's +own `getFileStatus()`, `listFiles()`, `listLocatedStatus()` calls, etc. + +In such a situation the implementations must: + +1. Validate the path (always). +1. Use the status/convert to the custom type, *or* simply discard it. ## Builder interface diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java index b6e94a664165e..a43053180fbf8 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java @@ -281,6 +281,7 @@ public void testOpenFileApplyRead() throws Throwable { createFile(fs, path, true, dataset(len, 0x40, 0x80)); CompletableFuture readAllBytes = fs.openFile(path) + .withFileStatus(fs.getFileStatus(path)) .build() .thenApply(ContractTestUtils::readStream); assertEquals("Wrong number of bytes read value", @@ -302,4 +303,12 @@ public void testOpenFileApplyAsyncRead() throws Throwable { accepted.get()); } + @Test + public void testOpenFileNullStatus() throws Throwable { + describe("use openFile() with a null status"); + Path path = path("testOpenFileNullStatus"); + intercept(NullPointerException.class, + () -> getFileSystem().openFile(path).withFileStatus(null)); + } + } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java index f5b81a2c46964..3dc2c6125762d 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java @@ -256,6 +256,7 @@ public void testSelectEmptyFile() throws Throwable { ContractTestUtils.touch(fs, path); parseToLines(fs.openFile(path) .must(SELECT_SQL, SELECT_EVERYTHING) + .withFileStatus(fs.getFileStatus(path)) .build() .get(), 0); @@ -548,14 +549,14 @@ public void 
testSelectDirectoryFails() throws Throwable { FutureDataInputStreamBuilder builder = getFileSystem().openFile(dir) .must(SELECT_SQL, SELECT_ODD_ENTRIES); - interceptFuture(PathIOException.class, + interceptFuture(FileNotFoundException.class, "", builder.build()); // try the parent builder = getFileSystem().openFile(dir.getParent()) .must(SELECT_SQL, SELECT_ODD_ENTRIES); - interceptFuture(PathIOException.class, + interceptFuture(FileNotFoundException.class, "", builder.build()); } @@ -565,7 +566,7 @@ public void testSelectRootFails() throws Throwable { FutureDataInputStreamBuilder builder = getFileSystem().openFile(path("/")) .must(SELECT_SQL, SELECT_ODD_ENTRIES); - interceptFuture(PathIOException.class, + interceptFuture(FileNotFoundException.class, "", builder.build()); } From 3a39f91a5d42538251077eac1491f177f3886bab Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Sun, 24 Apr 2022 21:33:59 +0530 Subject: [PATCH 30/40] HADOOP-16202. Enhanced openFile(): hadoop-common changes. (#2584/1) This defines standard option and values for the openFile() builder API for opening a file: fs.option.openfile.read.policy A list of the desired read policy, in preferred order. standard values are adaptive, default, random, sequential, vector, whole-file fs.option.openfile.length How long the file is. fs.option.openfile.split.start start of a task's split fs.option.openfile.split.end end of a task's split These can be used by filesystem connectors to optimize their reading of the source file, including but not limited to * skipping existence/length probes when opening a file * choosing a policy for prefetching/caching data The hadoop shell commands which read files all declare "whole-file" and "sequential", as appropriate. Contributed by Steve Loughran. Change-Id: Ia290f79ea7973ce8713d4f90f1315b24d7a23da1 --- .../org/apache/hadoop/fs/AvroFSInput.java | 11 +- .../java/org/apache/hadoop/fs/FSBuilder.java | 14 + .../org/apache/hadoop/fs/FileContext.java | 18 +- .../fs/FutureDataInputStreamBuilder.java | 8 +- .../java/org/apache/hadoop/fs/Options.java | 119 ++++ .../hadoop/fs/impl/AbstractFSBuilderImpl.java | 38 +- .../FutureDataInputStreamBuilderImpl.java | 8 +- .../hadoop/fs/impl/OpenFileParameters.java | 13 + .../hadoop/fs/impl/WrappedIOException.java | 14 +- .../apache/hadoop/fs/shell/CopyCommands.java | 3 +- .../org/apache/hadoop/fs/shell/Display.java | 3 +- .../java/org/apache/hadoop/fs/shell/Head.java | 8 +- .../org/apache/hadoop/fs/shell/PathData.java | 37 + .../java/org/apache/hadoop/fs/shell/Tail.java | 15 +- .../fs/statistics/StoreStatisticNames.java | 346 ++++++++++ .../fs/statistics/StreamStatisticNames.java | 309 +++++++++ .../statistics/impl/IOStatisticsBinding.java | 632 ++++++++++++++++++ .../org/apache/hadoop/io/SequenceFile.java | 14 +- .../hadoop/util/functional/FutureIO.java | 278 ++++++++ .../site/markdown/filesystem/filesystem.md | 92 +-- .../src/site/markdown/filesystem/index.md | 2 + .../src/site/markdown/filesystem/openfile.md | 122 ++++ .../fs/contract/AbstractContractOpenTest.java | 80 ++- .../hadoop/fs/contract/ContractTestUtils.java | 13 +- .../fs/statistics/IOStatisticAssertions.java | 548 +++++++++++++++ .../fs/statistics/TestDurationTracking.java | 360 ++++++++++ 26 files changed, 2968 insertions(+), 137 deletions(-) create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StoreStatisticNames.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StreamStatisticNames.java create 
mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsBinding.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/FutureIO.java create mode 100644 hadoop-common-project/hadoop-common/src/site/markdown/filesystem/openfile.md create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/IOStatisticAssertions.java create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestDurationTracking.java diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AvroFSInput.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AvroFSInput.java index 020276929af7f..8cfdb189a3a43 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AvroFSInput.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AvroFSInput.java @@ -25,6 +25,10 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL; +import static org.apache.hadoop.util.functional.FutureIO.awaitFuture; + /** Adapts an {@link FSDataInputStream} to Avro's SeekableInput interface. */ @InterfaceAudience.Public @InterfaceStability.Stable @@ -42,7 +46,12 @@ public AvroFSInput(final FSDataInputStream in, final long len) { public AvroFSInput(final FileContext fc, final Path p) throws IOException { FileStatus status = fc.getFileStatus(p); this.len = status.getLen(); - this.stream = fc.open(p); + this.stream = awaitFuture(fc.openFile(p) + .opt(FS_OPTION_OPENFILE_READ_POLICY, + FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL) + .withFileStatus(status) + .build()); + fc.open(p); } @Override diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSBuilder.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSBuilder.java index b7757a62e28ad..a4c7254cfeb3c 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSBuilder.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSBuilder.java @@ -61,6 +61,13 @@ public interface FSBuilder> { */ B opt(@Nonnull String key, float value); + /** + * Set optional long parameter for the Builder. + * + * @see #opt(String, String) + */ + B opt(@Nonnull String key, long value); + /** * Set optional double parameter for the Builder. * @@ -104,6 +111,13 @@ public interface FSBuilder> { */ B must(@Nonnull String key, float value); + /** + * Set mandatory long option. + * + * @see #must(String, String) + */ + B must(@Nonnull String key, long value); + /** * Set mandatory double option. 
* diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java index df93e89750ee0..64a347f2b8692 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java @@ -70,7 +70,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_BUFFER_SIZE; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_LENGTH; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY_WHOLE_FILE; import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; +import static org.apache.hadoop.util.functional.FutureIO.awaitFuture; /** * The FileContext class provides an interface for users of the Hadoop @@ -2194,7 +2199,12 @@ public boolean copy(final Path src, final Path dst, boolean deleteSource, EnumSet createFlag = overwrite ? EnumSet.of( CreateFlag.CREATE, CreateFlag.OVERWRITE) : EnumSet.of(CreateFlag.CREATE); - InputStream in = open(qSrc); + InputStream in = awaitFuture(openFile(qSrc) + .opt(FS_OPTION_OPENFILE_READ_POLICY, + FS_OPTION_OPENFILE_READ_POLICY_WHOLE_FILE) + .opt(FS_OPTION_OPENFILE_LENGTH, + fs.getLen()) // file length hint for object stores + .build()); try (OutputStream out = create(qDst, createFlag)) { IOUtils.copyBytes(in, out, conf, true); } finally { @@ -2926,9 +2936,11 @@ public CompletableFuture build() throws IOException { final Path absF = fixRelativePart(getPath()); OpenFileParameters parameters = new OpenFileParameters() .withMandatoryKeys(getMandatoryKeys()) + .withOptionalKeys(getOptionalKeys()) .withOptions(getOptions()) - .withBufferSize(getBufferSize()) - .withStatus(getStatus()); + .withStatus(getStatus()) + .withBufferSize( + getOptions().getInt(FS_OPTION_OPENFILE_BUFFER_SIZE, getBufferSize())); return new FSLinkResolver>() { @Override public CompletableFuture next( diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java index 27a522e593001..e7f441a75d3c8 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.fs; +import javax.annotation.Nullable; import java.io.IOException; import java.util.concurrent.CompletableFuture; @@ -34,7 +35,7 @@ * options accordingly, for example: * * If the option is not related to the file system, the option will be ignored. - * If the option is must, but not supported by the file system, a + * If the option is must, but not supported/known by the file system, an * {@link IllegalArgumentException} will be thrown. * */ @@ -51,10 +52,11 @@ CompletableFuture build() /** * A FileStatus may be provided to the open request. * It is up to the implementation whether to use this or not. - * @param status status. + * @param status status: may be null * @return the builder. 
*/ - default FutureDataInputStreamBuilder withFileStatus(FileStatus status) { + default FutureDataInputStreamBuilder withFileStatus( + @Nullable FileStatus status) { return this; } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Options.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Options.java index 75bc12df8fdcf..9b457272fcb50 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Options.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Options.java @@ -17,9 +17,13 @@ */ package org.apache.hadoop.fs; +import java.util.Collections; import java.util.Optional; +import java.util.Set; import java.util.function.Function; import java.util.function.BiFunction; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; @@ -518,4 +522,119 @@ public enum ChecksumCombineMode { MD5MD5CRC, // MD5 of block checksums, which are MD5 over chunk CRCs COMPOSITE_CRC // Block/chunk-independent composite CRC } + + /** + * The standard {@code openFile()} options. + */ + @InterfaceAudience.Public + @InterfaceStability.Evolving + public static final class OpenFileOptions { + + private OpenFileOptions() { + } + + /** + * Prefix for all standard filesystem options: {@value}. + */ + private static final String FILESYSTEM_OPTION = "fs.option."; + + /** + * Prefix for all openFile options: {@value}. + */ + public static final String FS_OPTION_OPENFILE = + FILESYSTEM_OPTION + "openfile."; + + /** + * OpenFile option for file length: {@value}. + */ + public static final String FS_OPTION_OPENFILE_LENGTH = + FS_OPTION_OPENFILE + "length"; + + /** + * OpenFile option for split start: {@value}. + */ + public static final String FS_OPTION_OPENFILE_SPLIT_START = + FS_OPTION_OPENFILE + "split.start"; + + /** + * OpenFile option for split end: {@value}. + */ + public static final String FS_OPTION_OPENFILE_SPLIT_END = + FS_OPTION_OPENFILE + "split.end"; + + /** + * OpenFile option for buffer size: {@value}. + */ + public static final String FS_OPTION_OPENFILE_BUFFER_SIZE = + FS_OPTION_OPENFILE + "buffer.size"; + + /** + * OpenFile option for read policies: {@value}. + */ + public static final String FS_OPTION_OPENFILE_READ_POLICY = + FS_OPTION_OPENFILE + "read.policy"; + + /** + * Set of standard options which openFile implementations + * MUST recognize, even if they ignore the actual values. + */ + public static final Set FS_OPTION_OPENFILE_STANDARD_OPTIONS = + Collections.unmodifiableSet(Stream.of( + FS_OPTION_OPENFILE_BUFFER_SIZE, + FS_OPTION_OPENFILE_READ_POLICY, + FS_OPTION_OPENFILE_LENGTH, + FS_OPTION_OPENFILE_SPLIT_START, + FS_OPTION_OPENFILE_SPLIT_END) + .collect(Collectors.toSet())); + + /** + * Read policy for adaptive IO: {@value}. + */ + public static final String FS_OPTION_OPENFILE_READ_POLICY_ADAPTIVE = + "adaptive"; + + /** + * Read policy {@value} -whateve the implementation does by default. + */ + public static final String FS_OPTION_OPENFILE_READ_POLICY_DEFAULT = + "default"; + + /** + * Read policy for random IO: {@value}. + */ + public static final String FS_OPTION_OPENFILE_READ_POLICY_RANDOM = + "random"; + + /** + * Read policy for sequential IO: {@value}. + */ + public static final String FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL = + "sequential"; + + /** + * Vectored IO API to be used: {@value}. 
+ */ + public static final String FS_OPTION_OPENFILE_READ_POLICY_VECTOR = + "vector"; + + /** + * Whole file to be read, end-to-end: {@value}. + */ + public static final String FS_OPTION_OPENFILE_READ_POLICY_WHOLE_FILE = + "whole-file"; + + /** + * All the current read policies as a set. + */ + public static final Set FS_OPTION_OPENFILE_READ_POLICIES = + Collections.unmodifiableSet(Stream.of( + FS_OPTION_OPENFILE_READ_POLICY_ADAPTIVE, + FS_OPTION_OPENFILE_READ_POLICY_DEFAULT, + FS_OPTION_OPENFILE_READ_POLICY_RANDOM, + FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL, + FS_OPTION_OPENFILE_READ_POLICY_VECTOR, + FS_OPTION_OPENFILE_READ_POLICY_WHOLE_FILE) + .collect(Collectors.toSet())); + + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AbstractFSBuilderImpl.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AbstractFSBuilderImpl.java index 5fc92e97be76c..8e5d65eba2e94 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AbstractFSBuilderImpl.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AbstractFSBuilderImpl.java @@ -46,7 +46,7 @@ * * .opt("foofs:option.a", true) * .opt("foofs:option.b", "value") - * .opt("barfs:cache", true) + * .opt("fs.s3a.open.option.etag", "9fe4c37c25b") * .must("foofs:cache", true) * .must("barfs:cache-size", 256 * 1024 * 1024) * .build(); @@ -88,6 +88,9 @@ /** Keep track of the keys for mandatory options. */ private final Set mandatoryKeys = new HashSet<>(); + /** Keep track of the optional keys. */ + private final Set optionalKeys = new HashSet<>(); + /** * Constructor with both optional path and path handle. * Either or both argument may be empty, but it is an error for @@ -163,6 +166,7 @@ public PathHandle getPathHandle() { @Override public B opt(@Nonnull final String key, @Nonnull final String value) { mandatoryKeys.remove(key); + optionalKeys.add(key); options.set(key, value); return getThisBuilder(); } @@ -175,6 +179,7 @@ public B opt(@Nonnull final String key, @Nonnull final String value) { @Override public B opt(@Nonnull final String key, boolean value) { mandatoryKeys.remove(key); + optionalKeys.add(key); options.setBoolean(key, value); return getThisBuilder(); } @@ -187,10 +192,19 @@ public B opt(@Nonnull final String key, boolean value) { @Override public B opt(@Nonnull final String key, int value) { mandatoryKeys.remove(key); + optionalKeys.add(key); options.setInt(key, value); return getThisBuilder(); } + @Override + public B opt(@Nonnull final String key, final long value) { + mandatoryKeys.remove(key); + optionalKeys.add(key); + options.setLong(key, value); + return getThisBuilder(); + } + /** * Set optional float parameter for the Builder. * @@ -199,6 +213,7 @@ public B opt(@Nonnull final String key, int value) { @Override public B opt(@Nonnull final String key, float value) { mandatoryKeys.remove(key); + optionalKeys.add(key); options.setFloat(key, value); return getThisBuilder(); } @@ -211,6 +226,7 @@ public B opt(@Nonnull final String key, float value) { @Override public B opt(@Nonnull final String key, double value) { mandatoryKeys.remove(key); + optionalKeys.add(key); options.setDouble(key, value); return getThisBuilder(); } @@ -223,6 +239,7 @@ public B opt(@Nonnull final String key, double value) { @Override public B opt(@Nonnull final String key, @Nonnull final String... 
values) { mandatoryKeys.remove(key); + optionalKeys.add(key); options.setStrings(key, values); return getThisBuilder(); } @@ -248,6 +265,7 @@ public B must(@Nonnull final String key, @Nonnull final String value) { @Override public B must(@Nonnull final String key, boolean value) { mandatoryKeys.add(key); + optionalKeys.remove(key); options.setBoolean(key, value); return getThisBuilder(); } @@ -260,10 +278,19 @@ public B must(@Nonnull final String key, boolean value) { @Override public B must(@Nonnull final String key, int value) { mandatoryKeys.add(key); + optionalKeys.remove(key); options.setInt(key, value); return getThisBuilder(); } + @Override + public B must(@Nonnull final String key, final long value) { + mandatoryKeys.add(key); + optionalKeys.remove(key); + options.setLong(key, value); + return getThisBuilder(); + } + /** * Set mandatory float option. * @@ -272,6 +299,7 @@ public B must(@Nonnull final String key, int value) { @Override public B must(@Nonnull final String key, float value) { mandatoryKeys.add(key); + optionalKeys.remove(key); options.setFloat(key, value); return getThisBuilder(); } @@ -284,6 +312,7 @@ public B must(@Nonnull final String key, float value) { @Override public B must(@Nonnull final String key, double value) { mandatoryKeys.add(key); + optionalKeys.remove(key); options.setDouble(key, value); return getThisBuilder(); } @@ -296,6 +325,7 @@ public B must(@Nonnull final String key, double value) { @Override public B must(@Nonnull final String key, @Nonnull final String... values) { mandatoryKeys.add(key); + optionalKeys.remove(key); options.setStrings(key, values); return getThisBuilder(); } @@ -314,6 +344,12 @@ public Configuration getOptions() { public Set getMandatoryKeys() { return Collections.unmodifiableSet(mandatoryKeys); } + /** + * Get all the keys that are set as optional keys. + */ + public Set getOptionalKeys() { + return Collections.unmodifiableSet(optionalKeys); + } /** * Reject a configuration if one or more mandatory keys are diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java index 24a8d49747fe6..70e39de7388c3 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java @@ -19,6 +19,7 @@ package org.apache.hadoop.fs.impl; import javax.annotation.Nonnull; +import javax.annotation.Nullable; import java.io.IOException; import java.util.concurrent.CompletableFuture; @@ -47,7 +48,7 @@ * options accordingly, for example: * * If the option is not related to the file system, the option will be ignored. - * If the option is must, but not supported by the file system, a + * If the option is must, but not supported/known by the file system, an * {@link IllegalArgumentException} will be thrown. 
* */ @@ -147,8 +148,9 @@ public FutureDataInputStreamBuilder getThisBuilder() { } @Override - public FutureDataInputStreamBuilder withFileStatus(FileStatus st) { - this.status = requireNonNull(st, "status"); + public FutureDataInputStreamBuilder withFileStatus( + @Nullable FileStatus st) { + this.status = st; return this; } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/OpenFileParameters.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/OpenFileParameters.java index 77b4ff52696a3..a19c5faff4d90 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/OpenFileParameters.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/OpenFileParameters.java @@ -38,6 +38,9 @@ public class OpenFileParameters { */ private Set mandatoryKeys; + /** The optional keys. */ + private Set optionalKeys; + /** * Options set during the build sequence. */ @@ -61,6 +64,11 @@ public OpenFileParameters withMandatoryKeys(final Set keys) { return this; } + public OpenFileParameters withOptionalKeys(final Set keys) { + this.optionalKeys = requireNonNull(keys); + return this; + } + public OpenFileParameters withOptions(final Configuration opts) { this.options = requireNonNull(opts); return this; @@ -80,6 +88,10 @@ public Set getMandatoryKeys() { return mandatoryKeys; } + public Set getOptionalKeys() { + return optionalKeys; + } + public Configuration getOptions() { return options; } @@ -91,4 +103,5 @@ public int getBufferSize() { public FileStatus getStatus() { return status; } + } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/WrappedIOException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/WrappedIOException.java index 1de1ecb785368..d1a8454a3c569 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/WrappedIOException.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/WrappedIOException.java @@ -19,7 +19,7 @@ package org.apache.hadoop.fs.impl; import java.io.IOException; -import java.util.concurrent.ExecutionException; +import java.io.UncheckedIOException; import com.google.common.base.Preconditions; @@ -27,16 +27,16 @@ import org.apache.hadoop.classification.InterfaceStability; /** - * A wrapper for an IOException which - * {@link FutureIOSupport#raiseInnerCause(ExecutionException)} knows to - * always extract the exception. + * A wrapper for an IOException. * * The constructor signature guarantees the cause will be an IOException, * and as it checks for a null-argument, non-null. + * @deprecated use the {@code UncheckedIOException} directly.] 
*/ +@Deprecated @InterfaceAudience.Private @InterfaceStability.Unstable -public class WrappedIOException extends RuntimeException { +public class WrappedIOException extends UncheckedIOException { private static final long serialVersionUID = 2510210974235779294L; @@ -49,8 +49,4 @@ public WrappedIOException(final IOException cause) { super(Preconditions.checkNotNull(cause)); } - @Override - public synchronized IOException getCause() { - return (IOException) super.getCause(); - } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CopyCommands.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CopyCommands.java index 7b2e83543e96c..218a343d5f452 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CopyCommands.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CopyCommands.java @@ -97,7 +97,8 @@ protected void processArguments(LinkedList items) try { for (PathData src : srcs) { if (src.stat.getLen() != 0) { - try (FSDataInputStream in = src.fs.open(src.path)) { + // Always do sequential reads. + try (FSDataInputStream in = src.openForSequentialIO()) { IOUtils.copyBytes(in, out, getConf(), false); writeDelimiter(out); } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Display.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Display.java index 0e412265cd98f..66d2df4e4875c 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Display.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Display.java @@ -105,7 +105,8 @@ private void printToStdout(InputStream in) throws IOException { } protected InputStream getInputStream(PathData item) throws IOException { - return item.fs.open(item.path); + // Always do sequential reads; + return item.openForSequentialIO(); } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Head.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Head.java index 2280225b5ae32..7242f261801d6 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Head.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Head.java @@ -28,6 +28,8 @@ import java.util.LinkedList; import java.util.List; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL; + /** * Show the first 1KB of the file. 
*/ @@ -68,11 +70,9 @@ protected void processPath(PathData item) throws IOException { } private void dumpToOffset(PathData item) throws IOException { - FSDataInputStream in = item.fs.open(item.path); - try { + try (FSDataInputStream in = item.openFile( + FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL)) { IOUtils.copyBytes(in, System.out, endingOffset, false); - } finally { - in.close(); } } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/PathData.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/PathData.java index adf17df2db84a..140e86b15f656 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/PathData.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/PathData.java @@ -29,6 +29,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocalFileSystem; @@ -39,6 +40,12 @@ import org.apache.hadoop.fs.PathNotFoundException; import org.apache.hadoop.fs.RemoteIterator; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_LENGTH; +import static org.apache.hadoop.util.functional.FutureIO.awaitFuture; +import static org.apache.hadoop.util.functional.RemoteIterators.mappingRemoteIterator; + /** * Encapsulates a Path (path), its FileStatus (stat), and its FileSystem (fs). * PathData ensures that the returned path string will be the same as the @@ -611,4 +618,34 @@ public boolean equals(Object o) { public int hashCode() { return path.hashCode(); } + + + /** + * Open a file for sequential IO. + *

    + * This uses FileSystem.openFile() to request sequential IO; + * the file status is also passed in. + * Filesystems may use to optimize their IO. + * @return an input stream + * @throws IOException failure + */ + protected FSDataInputStream openForSequentialIO() + throws IOException { + return openFile(FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL); + } + + /** + * Open a file. + * @param policy fadvise policy. + * @return an input stream + * @throws IOException failure + */ + protected FSDataInputStream openFile(final String policy) throws IOException { + return awaitFuture(fs.openFile(path) + .opt(FS_OPTION_OPENFILE_READ_POLICY, + policy) + .opt(FS_OPTION_OPENFILE_LENGTH, + stat.getLen()) // file length hint for object stores + .build()); + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Tail.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Tail.java index 8a75a60f435ca..585ac33601ee9 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Tail.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Tail.java @@ -30,6 +30,12 @@ import com.google.common.annotations.VisibleForTesting; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL; + +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL; + +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL; + /** * Get a listing of all files in that match the file patterns. */ @@ -107,16 +113,15 @@ private long dumpFromOffset(PathData item, long offset) throws IOException { if (offset < 0) { offset = Math.max(fileSize + offset, 0); } - - FSDataInputStream in = item.fs.open(item.path); - try { + // Always do sequential reads. + try (FSDataInputStream in = item.openFile( + FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL)) { in.seek(offset); // use conf so the system configured io block size is used IOUtils.copyBytes(in, System.out, getConf(), false); offset = in.getPos(); - } finally { - in.close(); } return offset; } + } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StoreStatisticNames.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StoreStatisticNames.java new file mode 100644 index 0000000000000..f30504acdd02a --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StoreStatisticNames.java @@ -0,0 +1,346 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.statistics; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Common statistic names for object store operations.. + *

+ * When adding new common statistic name constants, please make them unique.
+ * By convention:
+ * <ol>
+ *   <li>the name of the constants are uppercase, words separated by
+ *   underscores.</li>
+ *   <li>the value of the constants are lowercase of the constant names.</li>
+ * </ol>
    + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public final class StoreStatisticNames { + + /** {@value}. */ + public static final String OP_APPEND = "op_append"; + + /** {@value}. */ + public static final String OP_COPY_FROM_LOCAL_FILE = + "op_copy_from_local_file"; + + /** {@value}. */ + public static final String OP_CREATE = "op_create"; + + /** {@value}. */ + public static final String OP_CREATE_NON_RECURSIVE = + "op_create_non_recursive"; + + /** {@value}. */ + public static final String OP_DELETE = "op_delete"; + + /** {@value}. */ + public static final String OP_EXISTS = "op_exists"; + + /** {@value}. */ + public static final String OP_GET_CONTENT_SUMMARY = + "op_get_content_summary"; + + /** {@value}. */ + public static final String OP_GET_DELEGATION_TOKEN = + "op_get_delegation_token"; + + /** {@value}. */ + public static final String OP_GET_FILE_CHECKSUM = + "op_get_file_checksum"; + + /** {@value}. */ + public static final String OP_GET_FILE_STATUS = "op_get_file_status"; + + /** {@value}. */ + public static final String OP_GET_STATUS = "op_get_status"; + + /** {@value}. */ + public static final String OP_GLOB_STATUS = "op_glob_status"; + + /** {@value}. */ + public static final String OP_IS_FILE = "op_is_file"; + + /** {@value}. */ + public static final String OP_IS_DIRECTORY = "op_is_directory"; + + /** {@value}. */ + public static final String OP_LIST_FILES = "op_list_files"; + + /** {@value}. */ + public static final String OP_LIST_LOCATED_STATUS = + "op_list_located_status"; + + /** {@value}. */ + public static final String OP_LIST_STATUS = "op_list_status"; + + /** {@value}. */ + public static final String OP_MKDIRS = "op_mkdirs"; + + /** {@value}. */ + public static final String OP_MODIFY_ACL_ENTRIES = "op_modify_acl_entries"; + + /** {@value}. */ + public static final String OP_OPEN = "op_open"; + + /** Call to openFile() {@value}. */ + public static final String OP_OPENFILE = "op_openfile"; + + /** {@value}. */ + public static final String OP_REMOVE_ACL = "op_remove_acl"; + + /** {@value}. */ + public static final String OP_REMOVE_ACL_ENTRIES = "op_remove_acl_entries"; + + /** {@value}. */ + public static final String OP_REMOVE_DEFAULT_ACL = "op_remove_default_acl"; + + /** {@value}. */ + public static final String OP_RENAME = "op_rename"; + + /** {@value}. */ + public static final String OP_SET_ACL = "op_set_acl"; + + /** {@value}. */ + public static final String OP_SET_OWNER = "op_set_owner"; + + /** {@value}. */ + public static final String OP_SET_PERMISSION = "op_set_permission"; + + /** {@value}. */ + public static final String OP_SET_TIMES = "op_set_times"; + + /** {@value}. */ + public static final String OP_TRUNCATE = "op_truncate"; + + /** {@value}. */ + public static final String DELEGATION_TOKENS_ISSUED + = "delegation_tokens_issued"; + + /** Requests throttled and retried: {@value}. */ + public static final String STORE_IO_THROTTLED + = "store_io_throttled"; + + /** Requests made of a store: {@value}. */ + public static final String STORE_IO_REQUEST + = "store_io_request"; + + /** + * IO retried: {@value}. + */ + public static final String STORE_IO_RETRY + = "store_io_retry"; + + /** + * A store's equivalent of a paged LIST request was initiated: {@value}. + */ + public static final String OBJECT_LIST_REQUEST + = "object_list_request"; + + /** + * Number of continued object listings made. + * Value :{@value}. 
+ */ + public static final String OBJECT_CONTINUE_LIST_REQUEST = + "object_continue_list_request"; + + /** + * A bulk DELETE request was made: {@value}. + * A separate statistic from {@link #OBJECT_DELETE_REQUEST} + * so that metrics on duration of the operations can + * be distinguished. + */ + public static final String OBJECT_BULK_DELETE_REQUEST + = "object_bulk_delete_request"; + + /** + * A store's equivalent of a DELETE request was made: {@value}. + * This may be an HTTP DELETE verb, or it may be some custom + * operation which takes a list of objects to delete. + */ + public static final String OBJECT_DELETE_REQUEST + = "object_delete_request"; + + /** + * The count of objects deleted in delete requests. + */ + public static final String OBJECT_DELETE_OBJECTS + = "object_delete_objects"; + + /** + * Object multipart upload initiated. + * Value :{@value}. + */ + public static final String OBJECT_MULTIPART_UPLOAD_INITIATED = + "object_multipart_initiated"; + + /** + * Object multipart upload aborted. + * Value :{@value}. + */ + public static final String OBJECT_MULTIPART_UPLOAD_ABORTED = + "object_multipart_aborted"; + + /** + * Object put/multipart upload count. + * Value :{@value}. + */ + public static final String OBJECT_PUT_REQUEST = + "object_put_request"; + + /** + * Object put/multipart upload completed count. + * Value :{@value}. + */ + public static final String OBJECT_PUT_REQUEST_COMPLETED = + "object_put_request_completed"; + + /** + * Current number of active put requests. + * Value :{@value}. + */ + public static final String OBJECT_PUT_REQUEST_ACTIVE = + "object_put_request_active"; + + /** + * number of bytes uploaded. + * Value :{@value}. + */ + public static final String OBJECT_PUT_BYTES = + "object_put_bytes"; + + /** + * number of bytes queued for upload/being actively uploaded. + * Value :{@value}. + */ + public static final String OBJECT_PUT_BYTES_PENDING = + "object_put_bytes_pending"; + + /** + * Count of S3 Select (or similar) requests issued. + * Value :{@value}. + */ + public static final String OBJECT_SELECT_REQUESTS = + "object_select_requests"; + + /** + * Suffix to use for a minimum value when + * the same key is shared across min/mean/max + * statistics. + * + * Value {@value}. + */ + public static final String SUFFIX_MIN = ".min"; + + /** + * Suffix to use for a maximum value when + * the same key is shared across max/mean/max + * statistics. + * + * Value {@value}. + */ + public static final String SUFFIX_MAX = ".max"; + + /** + * Suffix to use for a mean statistic value when + * the same key is shared across mean/mean/max + * statistics. + * + * Value {@value}. + */ + public static final String SUFFIX_MEAN = ".mean"; + + /** + * String to add to counters and other stats to track failures. + * This comes before the .min/.mean//max suffixes. + * + * Value {@value}. + */ + public static final String SUFFIX_FAILURES = ".failures"; + + /** + * The name of the statistic collected for executor acquisition if + * a duration tracker factory is passed in to the constructor. + * {@value}. + */ + public static final String ACTION_EXECUTOR_ACQUIRED = + "action_executor_acquired"; + + /** + * A file was opened: {@value}. + */ + public static final String ACTION_FILE_OPENED + = "action_file_opened"; + + /** + * An HTTP HEAD request was made: {@value}. + */ + public static final String ACTION_HTTP_HEAD_REQUEST + = "action_http_head_request"; + + /** + * An HTTP GET request was made: {@value}. 
+ */ + public static final String ACTION_HTTP_GET_REQUEST + = "action_http_get_request"; + + /** + * An HTTP HEAD request was made: {@value}. + */ + public static final String OBJECT_METADATA_REQUESTS + = "object_metadata_request"; + + public static final String OBJECT_COPY_REQUESTS + = "object_copy_requests"; + + public static final String STORE_IO_THROTTLE_RATE + = "store_io_throttle_rate"; + + public static final String DELEGATION_TOKEN_ISSUED + = "delegation_token_issued"; + + public static final String MULTIPART_UPLOAD_INSTANTIATED + = "multipart_instantiated"; + + public static final String MULTIPART_UPLOAD_PART_PUT + = "multipart_upload_part_put"; + + public static final String MULTIPART_UPLOAD_PART_PUT_BYTES + = "multipart_upload_part_put_bytes"; + + public static final String MULTIPART_UPLOAD_ABORTED + = "multipart_upload_aborted"; + + public static final String MULTIPART_UPLOAD_ABORT_UNDER_PATH_INVOKED + = "multipart_upload_abort_under_path_invoked"; + + public static final String MULTIPART_UPLOAD_COMPLETED + = "multipart_upload_completed"; + + public static final String MULTIPART_UPLOAD_STARTED + = "multipart_upload_started"; + + private StoreStatisticNames() { + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StreamStatisticNames.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StreamStatisticNames.java new file mode 100644 index 0000000000000..6ced15dad221a --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StreamStatisticNames.java @@ -0,0 +1,309 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * These are common statistic names. + *

+ * When adding new common statistic name constants, please make them unique.
+ * By convention, they are implicitly unique:
+ * <ul>
+ *   <li>The names of the constants are uppercase, with words separated by
+ *   underscores.</li>
+ *   <li>The values of the constants are the lowercase form of the constant
+ *   names.</li>
+ * </ul>
    + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public final class StreamStatisticNames { + + /** + * Count of times the TCP stream was aborted. + * Value: {@value}. + */ + public static final String STREAM_READ_ABORTED = "stream_aborted"; + + /** + * Bytes read from an input stream in read() calls. + * Does not include bytes read and then discarded in seek/close etc. + * These are the bytes returned to the caller. + * Value: {@value}. + */ + public static final String STREAM_READ_BYTES + = "stream_read_bytes"; + + /** + * Count of bytes discarded by aborting an input stream . + * Value: {@value}. + */ + public static final String STREAM_READ_BYTES_DISCARDED_ABORT + = "stream_read_bytes_discarded_in_abort"; + + /** + * Count of bytes read and discarded when closing an input stream. + * Value: {@value}. + */ + public static final String STREAM_READ_BYTES_DISCARDED_CLOSE + = "stream_read_bytes_discarded_in_close"; + + /** + * Count of times the TCP stream was closed. + * Value: {@value}. + */ + public static final String STREAM_READ_CLOSED = "stream_read_closed"; + + /** + * Total count of times an attempt to close an input stream was made. + * Value: {@value}. + */ + public static final String STREAM_READ_CLOSE_OPERATIONS + = "stream_read_close_operations"; + + /** + * Total count of times an input stream to was opened. + * For object stores, that means the count a GET request was initiated. + * Value: {@value}. + */ + public static final String STREAM_READ_OPENED = "stream_read_opened"; + + /** + * Count of exceptions raised during input stream reads. + * Value: {@value}. + */ + public static final String STREAM_READ_EXCEPTIONS = + "stream_read_exceptions"; + + /** + * Count of readFully() operations in an input stream. + * Value: {@value}. + */ + public static final String STREAM_READ_FULLY_OPERATIONS + = "stream_read_fully_operations"; + + /** + * Count of read() operations in an input stream. + * Value: {@value}. + */ + public static final String STREAM_READ_OPERATIONS = + "stream_read_operations"; + + /** + * Count of incomplete read() operations in an input stream, + * that is, when the bytes returned were less than that requested. + * Value: {@value}. + */ + public static final String STREAM_READ_OPERATIONS_INCOMPLETE + = "stream_read_operations_incomplete"; + + /** + * count/duration of aborting a remote stream during stream IO + * IO. + * Value: {@value}. + */ + public static final String STREAM_READ_REMOTE_STREAM_ABORTED + = "stream_read_remote_stream_aborted"; + + /** + * count/duration of closing a remote stream, + * possibly including draining the stream to recycle + * the HTTP connection. + * Value: {@value}. + */ + public static final String STREAM_READ_REMOTE_STREAM_DRAINED + = "stream_read_remote_stream_drain"; + + /** + * Count of version mismatches encountered while reading an input stream. + * Value: {@value}. + */ + public static final String STREAM_READ_VERSION_MISMATCHES + = "stream_read_version_mismatches"; + + /** + * Count of executed seek operations which went backwards in a stream. + * Value: {@value}. + */ + public static final String STREAM_READ_SEEK_BACKWARD_OPERATIONS = + "stream_read_seek_backward_operations"; + + /** + * Count of bytes moved backwards during seek operations + * in an input stream. + * Value: {@value}. + */ + public static final String STREAM_READ_SEEK_BYTES_BACKWARDS + = "stream_read_bytes_backwards_on_seek"; + + /** + * Count of bytes read and discarded during seek() in an input stream. + * Value: {@value}. 
+ */ + public static final String STREAM_READ_SEEK_BYTES_DISCARDED = + "stream_read_seek_bytes_discarded"; + + /** + * Count of bytes skipped during forward seek operations. + * Value: {@value}. + */ + public static final String STREAM_READ_SEEK_BYTES_SKIPPED + = "stream_read_seek_bytes_skipped"; + + /** + * Count of executed seek operations which went forward in + * an input stream. + * Value: {@value}. + */ + public static final String STREAM_READ_SEEK_FORWARD_OPERATIONS + = "stream_read_seek_forward_operations"; + + /** + * Count of times the seek policy was dynamically changed + * in an input stream. + * Value: {@value}. + */ + public static final String STREAM_READ_SEEK_POLICY_CHANGED = + "stream_read_seek_policy_changed"; + + /** + * Count of seek operations in an input stream. + * Value: {@value}. + */ + public static final String STREAM_READ_SEEK_OPERATIONS = + "stream_read_seek_operations"; + + /** + * Count of {@code InputStream.skip()} calls. + * Value: {@value}. + */ + public static final String STREAM_READ_SKIP_OPERATIONS = + "stream_read_skip_operations"; + + /** + * Count bytes skipped in {@code InputStream.skip()} calls. + * Value: {@value}. + */ + public static final String STREAM_READ_SKIP_BYTES = + "stream_read_skip_bytes"; + + /** + * Total count of bytes read from an input stream. + * Value: {@value}. + */ + public static final String STREAM_READ_TOTAL_BYTES + = "stream_read_total_bytes"; + + /** + * Count of calls of {@code CanUnbuffer.unbuffer()}. + * Value: {@value}. + */ + public static final String STREAM_READ_UNBUFFERED + = "stream_read_unbuffered"; + + /** + * "Count of stream write failures reported. + * Value: {@value}. + */ + public static final String STREAM_WRITE_EXCEPTIONS = + "stream_write_exceptions"; + + /** + * Count of failures when finalizing a multipart upload: + * {@value}. + */ + public static final String STREAM_WRITE_EXCEPTIONS_COMPLETING_UPLOADS = + "stream_write_exceptions_completing_upload"; + + /** + * Count of block/partition uploads complete. + * Value: {@value}. + */ + public static final String STREAM_WRITE_BLOCK_UPLOADS + = "stream_write_block_uploads"; + + /** + * Count of number of block uploads aborted. + * Value: {@value}. + */ + public static final String STREAM_WRITE_BLOCK_UPLOADS_ABORTED + = "stream_write_block_uploads_aborted"; + + /** + * Count of block/partition uploads active. + * Value: {@value}. + */ + public static final String STREAM_WRITE_BLOCK_UPLOADS_ACTIVE + = "stream_write_block_uploads_active"; + + /** + * Gauge of data queued to be written. + * Value: {@value}. + */ + public static final String STREAM_WRITE_BLOCK_UPLOADS_BYTES_PENDING = + "stream_write_block_uploads_data_pending"; + + /** + * Count of number of block uploads committed. + * Value: {@value}. + */ + public static final String STREAM_WRITE_BLOCK_UPLOADS_COMMITTED + = "stream_write_block_uploads_committed"; + + /** + * Gauge of block/partitions uploads queued to be written. + * Value: {@value}. + */ + public static final String STREAM_WRITE_BLOCK_UPLOADS_PENDING + = "stream_write_block_uploads_pending"; + + + /** + * "Count of bytes written to output stream including all not yet uploaded. + * {@value}. + */ + public static final String STREAM_WRITE_BYTES + = "stream_write_bytes"; + + /** + * Count of total time taken for uploads to complete. + * {@value}. + */ + public static final String STREAM_WRITE_TOTAL_TIME + = "stream_write_total_time"; + + /** + * Total queue duration of all block uploads. + * {@value}. 
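Editorial sketch (not part of the patch): looking up one of the counters named above on a stream that publishes statistics. The IOStatisticsSource.getIOStatistics() accessor and the IOStatistics.counters() map are assumed from the wider IOStatistics API used elsewhere in this series; the class name is invented, and the instanceof check keeps the helper safe on streams which do not publish statistics.

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.statistics.IOStatistics;
import org.apache.hadoop.fs.statistics.IOStatisticsSource;
import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_BYTES;

public final class StreamStatsExample {

  // Returns the stream_read_bytes counter, or -1 if the stream does not
  // publish IOStatistics or the counter is absent.
  public static long bytesRead(FSDataInputStream in) {
    if (in instanceof IOStatisticsSource) {
      IOStatistics stats = ((IOStatisticsSource) in).getIOStatistics();
      if (stats != null) {
        return stats.counters().getOrDefault(STREAM_READ_BYTES, -1L);
      }
    }
    return -1L;
  }

  private StreamStatsExample() {
  }
}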
+ */ + public static final String STREAM_WRITE_QUEUE_DURATION + = "stream_write_queue_duration"; + + public static final String STREAM_WRITE_TOTAL_DATA + = "stream_write_total_data"; + + private StreamStatisticNames() { + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsBinding.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsBinding.java new file mode 100644 index 0000000000000..6106d3ba0f45d --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsBinding.java @@ -0,0 +1,632 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.io.Serializable; +import java.util.Iterator; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.BiFunction; +import java.util.function.Function; + +import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting; + +import org.apache.hadoop.fs.StorageStatistics; +import org.apache.hadoop.fs.statistics.DurationTracker; +import org.apache.hadoop.fs.statistics.DurationTrackerFactory; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; +import org.apache.hadoop.fs.statistics.MeanStatistic; +import org.apache.hadoop.util.functional.CallableRaisingIOE; +import org.apache.hadoop.util.functional.ConsumerRaisingIOE; +import org.apache.hadoop.util.functional.FunctionRaisingIOE; +import org.apache.hadoop.util.functional.InvocationRaisingIOE; + +import static org.apache.hadoop.fs.statistics.IOStatistics.MIN_UNSET_VALUE; +import static org.apache.hadoop.fs.statistics.impl.StubDurationTracker.STUB_DURATION_TRACKER; + +/** + * Support for implementing IOStatistics interfaces. + */ +public final class IOStatisticsBinding { + + /** Pattern used for each entry. */ + public static final String ENTRY_PATTERN = "(%s=%s)"; + + /** String to return when a source is null. */ + @VisibleForTesting + public static final String NULL_SOURCE = "()"; + + private IOStatisticsBinding() { + } + + /** + * Create IOStatistics from a storage statistics instance. + * + * This will be updated as the storage statistics change. + * @param storageStatistics source data. + * @return an IO statistics source. 
+ */ + public static IOStatistics fromStorageStatistics( + StorageStatistics storageStatistics) { + DynamicIOStatisticsBuilder builder = dynamicIOStatistics(); + Iterator it = storageStatistics + .getLongStatistics(); + while (it.hasNext()) { + StorageStatistics.LongStatistic next = it.next(); + builder.withLongFunctionCounter(next.getName(), + k -> storageStatistics.getLong(k)); + } + return builder.build(); + } + + /** + * Create a builder for dynamic IO Statistics. + * @return a builder to be completed. + */ + public static DynamicIOStatisticsBuilder dynamicIOStatistics() { + return new DynamicIOStatisticsBuilder(); + } + + /** + * Get the shared instance of the immutable empty statistics + * object. + * @return an empty statistics object. + */ + public static IOStatistics emptyStatistics() { + return EmptyIOStatistics.getInstance(); + } + + /** + * Take an IOStatistics instance and wrap it in a source. + * @param statistics statistics. + * @return a source which will return the values + */ + public static IOStatisticsSource wrap(IOStatistics statistics) { + return new SourceWrappedStatistics(statistics); + } + + /** + * Create a builder for an {@link IOStatisticsStore}. + * + * @return a builder instance. + */ + public static IOStatisticsStoreBuilder iostatisticsStore() { + return new IOStatisticsStoreBuilderImpl(); + } + + /** + * Convert an entry to the string format used in logging. + * + * @param entry entry to evaluate + * @param entry type + * @return formatted string + */ + public static String entryToString( + final Map.Entry entry) { + return entryToString(entry.getKey(), entry.getValue()); + } + + /** + * Convert entry values to the string format used in logging. + * + * @param name statistic name + * @param value stat value + * @return formatted string + */ + public static String entryToString( + final String name, final E value) { + return String.format( + ENTRY_PATTERN, + name, + value); + } + + /** + * Copy into the dest map all the source entries. + * The destination is cleared first. + * @param entry type + * @param dest destination of the copy + * @param source source + * @param copyFn function to copy entries + * @return the destination. + */ + private static Map copyMap( + Map dest, + Map source, + Function copyFn) { + // we have to clone the values so that they aren't + // bound to the original values + dest.clear(); + source.entrySet() + .forEach(entry -> + dest.put(entry.getKey(), copyFn.apply(entry.getValue()))); + return dest; + } + + /** + * A passthrough copy operation suitable for immutable + * types, including numbers. + * @param src source object + * @return the source object + */ + public static E passthroughFn(E src) { + return src; + } + + /** + * Take a snapshot of a supplied map, where the copy option simply + * uses the existing value. + * + * For this to be safe, the map must refer to immutable objects. + * @param source source map + * @param type of values. + * @return a new map referencing the same values. + */ + public static Map snapshotMap( + Map source) { + return snapshotMap(source, + IOStatisticsBinding::passthroughFn); + } + + /** + * Take a snapshot of a supplied map, using the copy function + * to replicate the source values. + * @param source source map + * @param copyFn function to copy the value + * @param type of values. + * @return a concurrent hash map referencing the same values. 
+ */ + public static + ConcurrentHashMap snapshotMap( + Map source, + Function copyFn) { + ConcurrentHashMap dest = new ConcurrentHashMap<>(); + copyMap(dest, source, copyFn); + return dest; + } + + /** + * Aggregate two maps so that the destination. + * @param type of values + * @param dest destination map. + * @param other other map + * @param aggregateFn function to aggregate the values. + * @param copyFn function to copy the value + */ + public static void aggregateMaps( + Map dest, + Map other, + BiFunction aggregateFn, + Function copyFn) { + // scan through the other hand map; copy + // any values not in the left map, + // aggregate those for which there is already + // an entry + other.entrySet().forEach(entry -> { + String key = entry.getKey(); + E rVal = entry.getValue(); + E lVal = dest.get(key); + if (lVal == null) { + dest.put(key, copyFn.apply(rVal)); + } else { + dest.put(key, aggregateFn.apply(lVal, rVal)); + } + }); + } + + /** + * Aggregate two counters. + * @param l left value + * @param r right value + * @return the aggregate value + */ + public static Long aggregateCounters(Long l, Long r) { + return Math.max(l, 0) + Math.max(r, 0); + } + + /** + * Add two gauges. + * @param l left value + * @param r right value + * @return aggregate value + */ + public static Long aggregateGauges(Long l, Long r) { + return l + r; + } + + + /** + * Aggregate two minimum values. + * @param l left + * @param r right + * @return the new minimum. + */ + public static Long aggregateMinimums(Long l, Long r) { + if (l == MIN_UNSET_VALUE) { + return r; + } else if (r == MIN_UNSET_VALUE) { + return l; + } else { + return Math.min(l, r); + } + } + + /** + * Aggregate two maximum values. + * @param l left + * @param r right + * @return the new minimum. + */ + public static Long aggregateMaximums(Long l, Long r) { + if (l == MIN_UNSET_VALUE) { + return r; + } else if (r == MIN_UNSET_VALUE) { + return l; + } else { + return Math.max(l, r); + } + } + + /** + * Aggregate the mean statistics. + * This returns a new instance. + * @param l left value + * @param r right value + * @return aggregate value + */ + public static MeanStatistic aggregateMeanStatistics( + MeanStatistic l, MeanStatistic r) { + MeanStatistic res = l.copy(); + res.add(r); + return res; + } + + /** + * Update a maximum value tracked in an atomic long. + * This is thread safe -it uses compareAndSet to ensure + * that Thread T1 whose sample is greater than the current + * value never overwrites an update from thread T2 whose + * sample was also higher -and which completed first. + * @param dest destination for all changes. + * @param sample sample to update. + */ + public static void maybeUpdateMaximum(AtomicLong dest, long sample) { + boolean done; + do { + long current = dest.get(); + if (sample > current) { + done = dest.compareAndSet(current, sample); + } else { + done = true; + } + } while (!done); + } + + /** + * Update a maximum value tracked in an atomic long. + * This is thread safe -it uses compareAndSet to ensure + * that Thread T1 whose sample is greater than the current + * value never overwrites an update from thread T2 whose + * sample was also higher -and which completed first. + * @param dest destination for all changes. + * @param sample sample to update. 
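A short sketch (illustrative only, class name invented) of the aggregation rules implemented by the helpers above: aggregateCounters() clamps negative samples to zero before adding, and maybeUpdateMaximum() only ever raises the tracked value.

import java.util.concurrent.atomic.AtomicLong;
import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.aggregateCounters;
import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.maybeUpdateMaximum;

public final class AggregationExample {

  public static void main(String[] args) {
    // 7: the negative sample is treated as zero, not subtracted
    System.out.println(aggregateCounters(7L, -3L));

    AtomicLong max = new AtomicLong(0);
    maybeUpdateMaximum(max, 42L);   // max becomes 42
    maybeUpdateMaximum(max, 17L);   // still 42: lower samples are ignored
    System.out.println(max.get());
  }

  private AggregationExample() {
  }
}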
+ */ + public static void maybeUpdateMinimum(AtomicLong dest, long sample) { + boolean done; + do { + long current = dest.get(); + if (current == MIN_UNSET_VALUE || sample < current) { + done = dest.compareAndSet(current, sample); + } else { + done = true; + } + } while (!done); + } + + /** + * Given an IOException raising function/lambda expression, + * return a new one which wraps the inner and tracks + * the duration of the operation, including whether + * it passes/fails. + * @param factory factory of duration trackers + * @param statistic statistic key + * @param inputFn input function + * @param type of argument to the input function. + * @param return type. + * @return a new function which tracks duration and failure. + */ + public static FunctionRaisingIOE trackFunctionDuration( + @Nullable DurationTrackerFactory factory, + String statistic, + FunctionRaisingIOE inputFn) { + return (x) -> { + // create the tracker outside try-with-resources so + // that failures can be set in the catcher. + DurationTracker tracker = createTracker(factory, statistic); + try { + // exec the input function and return its value + return inputFn.apply(x); + } catch (IOException | RuntimeException e) { + // input function failed: note it + tracker.failed(); + // and rethrow + throw e; + } finally { + // update the tracker. + // this is called after the catch() call would have + // set the failed flag. + tracker.close(); + } + }; + } + + /** + * Given a java function/lambda expression, + * return a new one which wraps the inner and tracks + * the duration of the operation, including whether + * it passes/fails. + * @param factory factory of duration trackers + * @param statistic statistic key + * @param inputFn input function + * @param type of argument to the input function. + * @param return type. + * @return a new function which tracks duration and failure. + */ + public static Function trackJavaFunctionDuration( + @Nullable DurationTrackerFactory factory, + String statistic, + Function inputFn) { + return (x) -> { + // create the tracker outside try-with-resources so + // that failures can be set in the catcher. + DurationTracker tracker = createTracker(factory, statistic); + try { + // exec the input function and return its value + return inputFn.apply(x); + } catch (RuntimeException e) { + // input function failed: note it + tracker.failed(); + // and rethrow + throw e; + } finally { + // update the tracker. + // this is called after the catch() call would have + // set the failed flag. + tracker.close(); + } + }; + } + + /** + * Given an IOException raising callable/lambda expression, + * execute it and update the relevant statistic. + * @param factory factory of duration trackers + * @param statistic statistic key + * @param input input callable. + * @param return type. + * @return the result of the operation. + */ + public static B trackDuration( + DurationTrackerFactory factory, + String statistic, + CallableRaisingIOE input) throws IOException { + return trackDurationOfOperation(factory, statistic, input).apply(); + } + + /** + * Given an IOException raising callable/lambda expression, + * execute it and update the relevant statistic. + * @param factory factory of duration trackers + * @param statistic statistic key + * @param input input callable. 
+ */ + public static void trackDurationOfInvocation( + DurationTrackerFactory factory, + String statistic, + InvocationRaisingIOE input) throws IOException { + + // create the tracker outside try-with-resources so + // that failures can be set in the catcher. + DurationTracker tracker = createTracker(factory, statistic); + try { + // exec the input function and return its value + input.apply(); + } catch (IOException | RuntimeException e) { + // input function failed: note it + tracker.failed(); + // and rethrow + throw e; + } finally { + // update the tracker. + // this is called after the catch() call would have + // set the failed flag. + tracker.close(); + } + } + + /** + * Given an IOException raising callable/lambda expression, + * return a new one which wraps the inner and tracks + * the duration of the operation, including whether + * it passes/fails. + * @param factory factory of duration trackers + * @param statistic statistic key + * @param input input callable. + * @param return type. + * @return a new callable which tracks duration and failure. + */ + public static CallableRaisingIOE trackDurationOfOperation( + @Nullable DurationTrackerFactory factory, + String statistic, + CallableRaisingIOE input) { + return () -> { + // create the tracker outside try-with-resources so + // that failures can be set in the catcher. + DurationTracker tracker = createTracker(factory, statistic); + return invokeTrackingDuration(tracker, input); + }; + } + + /** + * Given an IOException raising callable/lambda expression, + * execute it, updating the tracker on success/failure. + * @param tracker duration tracker. + * @param input input callable. + * @param return type. + * @return the result of the invocation + * @throws IOException on failure. + */ + public static B invokeTrackingDuration( + final DurationTracker tracker, + final CallableRaisingIOE input) + throws IOException { + try { + // exec the input function and return its value + return input.apply(); + } catch (IOException | RuntimeException e) { + // input function failed: note it + tracker.failed(); + // and rethrow + throw e; + } finally { + // update the tracker. + // this is called after the catch() call would have + // set the failed flag. + tracker.close(); + } + } + + /** + * Given an IOException raising Consumer, + * return a new one which wraps the inner and tracks + * the duration of the operation, including whether + * it passes/fails. + * @param factory factory of duration trackers + * @param statistic statistic key + * @param input input callable. + * @param return type. + * @return a new consumer which tracks duration and failure. + */ + public static ConsumerRaisingIOE trackDurationConsumer( + @Nullable DurationTrackerFactory factory, + String statistic, + ConsumerRaisingIOE input) { + return (B t) -> { + // create the tracker outside try-with-resources so + // that failures can be set in the catcher. + DurationTracker tracker = createTracker(factory, statistic); + try { + // exec the input function and return its value + input.accept(t); + } catch (IOException | RuntimeException e) { + // input function failed: note it + tracker.failed(); + // and rethrow + throw e; + } finally { + // update the tracker. + // this is called after the catch() call would have + // set the failed flag. + tracker.close(); + } + }; + } + + /** + * Given a callable/lambda expression, + * return a new one which wraps the inner and tracks + * the duration of the operation, including whether + * it passes/fails. 
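A minimal usage sketch (not part of the patch; the class name is invented) for the duration-tracking wrappers above. OP_GET_FILE_STATUS comes from StoreStatisticNames; passing a null DurationTrackerFactory falls back to the stub tracker, so the wrapper degrades to a plain call when no statistics store is wired in.

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.statistics.DurationTrackerFactory;
import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_GET_FILE_STATUS;
import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration;

public final class TrackDurationExample {

  // Time a getFileStatus() call against the op_get_file_status statistic.
  public static FileStatus timedGetFileStatus(
      DurationTrackerFactory factory,   // may be null: stub tracking
      FileSystem fs,
      Path path) throws IOException {
    return trackDuration(factory, OP_GET_FILE_STATUS,
        () -> fs.getFileStatus(path));
  }

  private TrackDurationExample() {
  }
}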
+ * @param factory factory of duration trackers + * @param statistic statistic key + * @param input input callable. + * @param return type. + * @return a new callable which tracks duration and failure. + */ + public static Callable trackDurationOfCallable( + @Nullable DurationTrackerFactory factory, + String statistic, + Callable input) { + return () -> { + // create the tracker outside try-with-resources so + // that failures can be set in the catcher. + DurationTracker tracker = createTracker(factory, statistic); + try { + // exec the input function and return its value + return input.call(); + } catch (RuntimeException e) { + // input function failed: note it + tracker.failed(); + // and rethrow + throw e; + } finally { + // update the tracker. + // this is called after any catch() call will have + // set the failed flag. + tracker.close(); + } + }; + } + + /** + * Create the tracker. If the factory is null, a stub + * tracker is returned. + * @param factory tracker factory + * @param statistic statistic to track + * @return a duration tracker. + */ + private static DurationTracker createTracker( + @Nullable final DurationTrackerFactory factory, + final String statistic) { + return factory != null + ? factory.trackDuration(statistic) + : STUB_DURATION_TRACKER; + } + + /** + * Create a DurationTrackerFactory which aggregates the tracking + * of two other factories. + * @param first first tracker factory + * @param second second tracker factory + * @return a factory + */ + public static DurationTrackerFactory pairedTrackerFactory( + final DurationTrackerFactory first, + final DurationTrackerFactory second) { + return new PairedDurationTrackerFactory(first, second); + } + + /** + * Publish the IOStatistics as a set of storage statistics. + * This is dynamic. + * @param name storage statistics name. + * @param scheme FS scheme; may be null. + * @param source IOStatistics source. + * @return a dynamic storage statistics object. 
+ */ + public static StorageStatistics publishAsStorageStatistics( + String name, String scheme, IOStatistics source) { + return new StorageStatisticsFromIOStatistics(name, scheme, source); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SequenceFile.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SequenceFile.java index 9afa621892bf7..8e6e8af2af60c 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SequenceFile.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/SequenceFile.java @@ -59,6 +59,11 @@ import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_SEQFILE_COMPRESS_BLOCKSIZE_KEY; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_SKIP_CHECKSUM_ERRORS_DEFAULT; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_SKIP_CHECKSUM_ERRORS_KEY; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_BUFFER_SIZE; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_LENGTH; +import static org.apache.hadoop.util.functional.FutureIO.awaitFuture; /** * SequenceFiles are flat files consisting of binary key/value @@ -1942,7 +1947,14 @@ private void initialize(Path filename, FSDataInputStream in, */ protected FSDataInputStream openFile(FileSystem fs, Path file, int bufferSize, long length) throws IOException { - return fs.open(file, bufferSize); + FutureDataInputStreamBuilder builder = fs.openFile(file) + .opt(FS_OPTION_OPENFILE_READ_POLICY, + FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL) + .opt(FS_OPTION_OPENFILE_BUFFER_SIZE, bufferSize); + if (length >= 0) { + builder.opt(FS_OPTION_OPENFILE_LENGTH, length); + } + return awaitFuture(builder.build()); } /** diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/FutureIO.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/FutureIO.java new file mode 100644 index 0000000000000..c3fda19d8d73b --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/FutureIO.java @@ -0,0 +1,278 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.util.functional; + +import java.io.IOException; +import java.io.InterruptedIOException; +import java.io.UncheckedIOException; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CompletionException; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSBuilder; + +/** + * Future IO Helper methods. + *

    + * Contains methods promoted from + * {@link org.apache.hadoop.fs.impl.FutureIOSupport} because they + * are a key part of integrating async IO in application code. + *

    + *

    + * One key feature is that the {@link #awaitFuture(Future)} and + * {@link #awaitFuture(Future, long, TimeUnit)} calls will + * extract and rethrow exceptions raised in the future's execution, + * including extracting the inner IOException of any + * {@code UncheckedIOException} raised in the future. + * This makes it somewhat easier to execute IOException-raising + * code inside futures. + *

    + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public final class FutureIO { + + private FutureIO() { + } + + /** + * Given a future, evaluate it. + *

    + * Any exception generated in the future is + * extracted and rethrown. + *

    + * @param future future to evaluate + * @param type of the result. + * @return the result, if all went well. + * @throws InterruptedIOException future was interrupted + * @throws IOException if something went wrong + * @throws RuntimeException any nested RTE thrown + */ + public static T awaitFuture(final Future future) + throws InterruptedIOException, IOException, RuntimeException { + try { + return future.get(); + } catch (InterruptedException e) { + throw (InterruptedIOException) new InterruptedIOException(e.toString()) + .initCause(e); + } catch (ExecutionException e) { + return raiseInnerCause(e); + } + } + + /** + * Given a future, evaluate it. + *

    + * Any exception generated in the future is + * extracted and rethrown. + *

    + * @param future future to evaluate + * @param timeout timeout to wait + * @param unit time unit. + * @param type of the result. + * @return the result, if all went well. + * @throws InterruptedIOException future was interrupted + * @throws IOException if something went wrong + * @throws RuntimeException any nested RTE thrown + * @throws TimeoutException the future timed out. + */ + public static T awaitFuture(final Future future, + final long timeout, + final TimeUnit unit) + throws InterruptedIOException, IOException, RuntimeException, + TimeoutException { + try { + return future.get(timeout, unit); + } catch (InterruptedException e) { + throw (InterruptedIOException) new InterruptedIOException(e.toString()) + .initCause(e); + } catch (ExecutionException e) { + return raiseInnerCause(e); + } + } + + /** + * From the inner cause of an execution exception, extract the inner cause + * if it is an IOE or RTE. + * This will always raise an exception, either the inner IOException, + * an inner RuntimeException, or a new IOException wrapping the raised + * exception. + * + * @param e exception. + * @param type of return value. + * @return nothing, ever. + * @throws IOException either the inner IOException, or a wrapper around + * any non-Runtime-Exception + * @throws RuntimeException if that is the inner cause. + */ + public static T raiseInnerCause(final ExecutionException e) + throws IOException { + throw unwrapInnerException(e); + } + + /** + * Extract the cause of a completion failure and rethrow it if an IOE + * or RTE. + * @param e exception. + * @param type of return value. + * @return nothing, ever. + * @throws IOException either the inner IOException, or a wrapper around + * any non-Runtime-Exception + * @throws RuntimeException if that is the inner cause. + */ + public static T raiseInnerCause(final CompletionException e) + throws IOException { + throw unwrapInnerException(e); + } + + /** + * From the inner cause of an execution exception, extract the inner cause + * to an IOException, raising RuntimeExceptions and Errors immediately. + *
      + *
+ * <ol>
+ *   <li>If it is an IOE: Return.</li>
+ *   <li>If it is a {@link UncheckedIOException}: return the cause</li>
+ *   <li>Completion/Execution Exceptions: extract and repeat</li>
+ *   <li>If it is an RTE or Error: throw.</li>
+ *   <li>Any other type: wrap in an IOE</li>
+ * </ol>
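For example (editorial sketch, not part of the patch; the class name is invented): because awaitFuture() applies the unwrapping rules listed above, a failure inside the future surfaces as its original IOException rather than an ExecutionException. Combined with the openFile() builder, that gives the familiar exception behaviour of a plain open().

import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import static org.apache.hadoop.util.functional.FutureIO.awaitFuture;

public final class AwaitFutureExample {

  // Read the first 8 bytes of a file opened through the openFile() builder.
  public static byte[] readHeader(FileSystem fs, Path path) throws IOException {
    // awaitFuture() extracts the inner cause of any ExecutionException, so a
    // missing file typically surfaces here as a plain FileNotFoundException;
    // lazy-open stores may defer that failure to the first read instead.
    try (FSDataInputStream in = awaitFuture(fs.openFile(path).build())) {
      byte[] header = new byte[8];
      in.readFully(0, header);
      return header;
    }
  }

  private AwaitFutureExample() {
  }
}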
    + * + * Recursively handles wrapped Execution and Completion Exceptions in + * case something very complicated has happened. + * @param e exception. + * @return an IOException extracted or built from the cause. + * @throws RuntimeException if that is the inner cause. + * @throws Error if that is the inner cause. + */ + @SuppressWarnings("ChainOfInstanceofChecks") + public static IOException unwrapInnerException(final Throwable e) { + Throwable cause = e.getCause(); + if (cause instanceof IOException) { + return (IOException) cause; + } else if (cause instanceof UncheckedIOException) { + // this is always an IOException + return ((UncheckedIOException) cause).getCause(); + } else if (cause instanceof CompletionException) { + return unwrapInnerException(cause); + } else if (cause instanceof ExecutionException) { + return unwrapInnerException(cause); + } else if (cause instanceof RuntimeException) { + throw (RuntimeException) cause; + } else if (cause instanceof Error) { + throw (Error) cause; + } else if (cause != null) { + // other type: wrap with a new IOE + return new IOException(cause); + } else { + // this only happens if there was no cause. + return new IOException(e); + } + } + + /** + * Propagate options to any builder, converting everything with the + * prefix to an option where, if there were 2+ dot-separated elements, + * it is converted to a schema. + * See {@link #propagateOptions(FSBuilder, Configuration, String, boolean)}. + * @param builder builder to modify + * @param conf configuration to read + * @param optionalPrefix prefix for optional settings + * @param mandatoryPrefix prefix for mandatory settings + * @param type of result + * @param type of builder + * @return the builder passed in. + */ + public static > + FSBuilder propagateOptions( + final FSBuilder builder, + final Configuration conf, + final String optionalPrefix, + final String mandatoryPrefix) { + propagateOptions(builder, conf, + optionalPrefix, false); + propagateOptions(builder, conf, + mandatoryPrefix, true); + return builder; + } + + /** + * Propagate options to any builder, converting everything with the + * prefix to an option where, if there were 2+ dot-separated elements, + * it is converted to a schema. + *
+   * <pre>
+   *   fs.example.s3a.option becomes "s3a.option"
+   *   fs.example.fs.io.policy becomes "fs.io.policy"
+   *   fs.example.something becomes "something"
+   * </pre>
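A brief sketch (illustrative only; the class name is invented) of the propagation described above: every configuration key under a chosen prefix is copied onto an openFile() builder, here as optional settings so unknown options are ignored by stores that do not recognise them.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FutureDataInputStreamBuilder;
import org.apache.hadoop.fs.Path;
import static org.apache.hadoop.util.functional.FutureIO.propagateOptions;

public final class PropagateOptionsExample {

  // Copy every "fs.example.X" entry in conf onto the builder as opt("X", value).
  public static FutureDataInputStreamBuilder builderWithOptions(
      FileSystem fs, Path path, Configuration conf) {
    FutureDataInputStreamBuilder builder = fs.openFile(path);
    propagateOptions(builder, conf, "fs.example", false);
    return builder;
  }

  private PropagateOptionsExample() {
  }
}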
    + * @param builder builder to modify + * @param conf configuration to read + * @param prefix prefix to scan/strip + * @param mandatory are the options to be mandatory or optional? + */ + public static void propagateOptions( + final FSBuilder builder, + final Configuration conf, + final String prefix, + final boolean mandatory) { + + final String p = prefix.endsWith(".") ? prefix : (prefix + "."); + final Map propsWithPrefix = conf.getPropsWithPrefix(p); + for (Map.Entry entry : propsWithPrefix.entrySet()) { + // change the schema off each entry + String key = entry.getKey(); + String val = entry.getValue(); + if (mandatory) { + builder.must(key, val); + } else { + builder.opt(key, val); + } + } + } + + /** + * Evaluate a CallableRaisingIOE in the current thread, + * converting IOEs to RTEs and propagating. + * @param callable callable to invoke + * @param Return type. + * @return the evaluated result. + * @throws UnsupportedOperationException fail fast if unsupported + * @throws IllegalArgumentException invalid argument + */ + public static CompletableFuture eval( + CallableRaisingIOE callable) { + CompletableFuture result = new CompletableFuture<>(); + try { + result.complete(callable.apply()); + } catch (UnsupportedOperationException | IllegalArgumentException tx) { + // fail fast here + throw tx; + } catch (Throwable tx) { + // fail lazily here to ensure callers expect all File IO operations to + // surface later + result.completeExceptionally(tx); + } + return result; + } +} diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md index 6ea9414f0634e..eac33702f011c 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md @@ -736,97 +736,11 @@ exists in the metadata, but no copies of any its blocks can be located; ### `FSDataInputStreamBuilder openFile(Path path)` -Creates a [`FSDataInputStreamBuilder`](fsdatainputstreambuilder.html) -to construct a operation to open the file at `path` for reading. - -When `build()` is invoked on the returned `FSDataInputStreamBuilder` instance, -the builder parameters are verified and -`openFileWithOptions(Path, OpenFileParameters)` invoked. - -This (protected) operation returns a `CompletableFuture` -which, when its `get()` method is called, either returns an input -stream of the contents of opened file, or raises an exception. - -The base implementation of the `openFileWithOptions(PathHandle, OpenFileParameters)` -ultimately invokes `open(Path, int)`. - -Thus the chain `openFile(path).build().get()` has the same preconditions -and postconditions as `open(Path p, int bufferSize)` - -However, there is one difference which implementations are free to -take advantage of: - -The returned stream MAY implement a lazy open where file non-existence or -access permission failures may not surface until the first `read()` of the -actual data. - -The `openFile()` operation may check the state of the filesystem during its -invocation, but as the state of the filesystem may change betwen this call and -the actual `build()` and `get()` operations, this file-specific -preconditions (file exists, file is readable, etc) MUST NOT be checked here. 
- -FileSystem implementations which do not implement `open(Path, int)` -MAY postpone raising an `UnsupportedOperationException` until either the -`FSDataInputStreamBuilder.build()` or the subsequent `get()` call, -else they MAY fail fast in the `openFile()` call. - -### Implementors notes - -The base implementation of `openFileWithOptions()` actually executes -the `open(path)` operation synchronously, yet still returns the result -or any failures in the `CompletableFuture<>`, so as to ensure that users -code expecting this. - -Any filesystem where the time to open a file may be significant SHOULD -execute it asynchronously by submitting the operation in some executor/thread -pool. This is particularly recommended for object stores and other filesystems -likely to be accessed over long-haul connections. - -Arbitrary filesystem-specific options MAY be supported; these MUST -be prefixed with either the filesystem schema, e.g. `hdfs.` -or in the "fs.SCHEMA" format as normal configuration settings `fs.hdfs`). The -latter style allows the same configuration option to be used for both -filesystem configuration and file-specific configuration. - -It SHOULD be possible to always open a file without specifying any options, -so as to present a consistent model to users. However, an implementation MAY -opt to require one or more mandatory options to be set. - -The returned stream may perform "lazy" evaluation of file access. This is -relevant for object stores where the probes for existence are expensive, and, -even with an asynchronous open, may be considered needless. - -### `FSDataInputStreamBuilder openFile(PathHandle)` - -Creates a `FSDataInputStreamBuilder` to build an operation to open a file. -Creates a [`FSDataInputStreamBuilder`](fsdatainputstreambuilder.html) -to construct a operation to open the file identified by the given `PathHandle` for reading. - -When `build()` is invoked on the returned `FSDataInputStreamBuilder` instance, -the builder parameters are verified and -`openFileWithOptions(PathHandle, OpenFileParameters)` invoked. - -This (protected) operation returns a `CompletableFuture` -which, when its `get()` method is called, either returns an input -stream of the contents of opened file, or raises an exception. +See [openFile()](openfile.html). -The base implementation of the `openFileWithOptions(PathHandle, OpenFileParameters)` method -returns a future which invokes `open(Path, int)`. - -Thus the chain `openFile(pathhandle).build().get()` has the same preconditions -and postconditions as `open(Pathhandle, int)` - -As with `FSDataInputStreamBuilder openFile(PathHandle)`, the `openFile()` -call must not be where path-specific preconditions are checked -that -is postponed to the `build()` and `get()` calls. - -FileSystem implementations which do not implement `open(PathHandle handle, int bufferSize)` -MAY postpone raising an `UnsupportedOperationException` until either the -`FSDataInputStreamBuilder.build()` or the subsequent `get()` call, -else they MAY fail fast in the `openFile()` call. +### `FSDataInputStreamBuilder openFile(PathHandle)` -The base implementation raises this exception in the `build()` operation; -other implementations SHOULD copy this. +See [openFile()](openfile.html). ### `PathHandle getPathHandle(FileStatus stat, HandleOpt... 
options)` diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/index.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/index.md index df538ee6cf96b..7e68896577658 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/index.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/index.md @@ -38,3 +38,5 @@ HDFS as these are commonly expected by Hadoop client applications. 2. [Testing with the Filesystem specification](testing.html) 2. [Extending the specification and its tests](extending.html) 1. [Uploading a file using Multiple Parts](multipartuploader.html) +1. [IOStatistics](iostatistics.html) +1. [openFile()](openfile.html). diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/openfile.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/openfile.md new file mode 100644 index 0000000000000..afb3245c5105f --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/openfile.md @@ -0,0 +1,122 @@ + + +# `FileSystem.openFile()`/`FileContext.openFile()` + +This is a method provided by both FileSystem and FileContext for +advanced file opening options and, where implemented, +an asynchrounous/lazy opening of a file. + +Creates a builder to open a file, supporting options +both standard and filesystem specific. The return +value of the `build()` call is a `Future`, +which must be waited on. The file opening may be +asynchronous, and it may actually be postponed (including +permission/existence checks) until reads are actually +performed. + +This API call was added to `FileSystem` and `FileContext` in +Hadoop 3.3.0; it was tuned in Hadoop 3.3.1 as follows. + +* Added `opt(key, long)` and `must(key, long)`. +* Declared that `withFileStatus(null)` is allowed. +* Declared that `withFileStatus(status)` only checks + the filename of the path, not the full path. + This is needed to support passthrough/mounted filesystems. +* Added standard option keys. + +###
    `FutureDataInputStreamBuilder openFile(Path path)` + +Creates a [`FutureDataInputStreamBuilder`](fsdatainputstreambuilder.html) +to construct a operation to open the file at `path` for reading. + +When `build()` is invoked on the returned `FutureDataInputStreamBuilder` instance, +the builder parameters are verified and +`FileSystem.openFileWithOptions(Path, OpenFileParameters)` or +`AbstractFileSystem.openFileWithOptions(Path, OpenFileParameters)` invoked. + +These protected methods returns a `CompletableFuture` +which, when its `get()` method is called, either returns an input +stream of the contents of opened file, or raises an exception. + +The base implementation of the `FileSystem.openFileWithOptions(PathHandle, OpenFileParameters)` +ultimately invokes `FileSystem.open(Path, int)`. + +Thus the chain `FileSystem.openFile(path).build().get()` has the same preconditions +and postconditions as `FileSystem.open(Path p, int bufferSize)` + +However, there is one difference which implementations are free to +take advantage of: + +The returned stream MAY implement a lazy open where file non-existence or +access permission failures may not surface until the first `read()` of the +actual data. + +This saves network IO on object stores. + +The `openFile()` operation MAY check the state of the filesystem during its +invocation, but as the state of the filesystem may change between this call and +the actual `build()` and `get()` operations, this file-specific +preconditions (file exists, file is readable, etc) MUST NOT be checked here. + +FileSystem implementations which do not implement `open(Path, int)` +MAY postpone raising an `UnsupportedOperationException` until either the +`FutureDataInputStreamBuilder.build()` or the subsequent `get()` call, +else they MAY fail fast in the `openFile()` call. + +Consult [`FutureDataInputStreamBuilder`](fsdatainputstreambuilder.html) for details +on how to use the builder, and for standard options which may be passed in. + +### `FutureDataInputStreamBuilder openFile(PathHandle)` + +Creates a [`FutureDataInputStreamBuilder`](fsdatainputstreambuilder.html) +to construct a operation to open the file identified by the given `PathHandle` for reading. + +If implemented by a filesystem, the semantics of [`openFile(Path)`](#openfile_path_) +Thus the chain `openFile(pathhandle).build().get()` has the same preconditions and postconditions +as `open(Pathhandle, int)` + +FileSystem implementations which do not implement `open(PathHandle handle, int bufferSize)` +MAY postpone raising an `UnsupportedOperationException` until either the +`FutureDataInputStreamBuilder.build()` or the subsequent `get()` call, else they MAY fail fast in +the `openFile(PathHandle)` call. + +The base implementation raises this exception in the `build()` operation; other implementations +SHOULD copy this. + +### Implementors notes + +The base implementation of `openFileWithOptions()` actually executes +the `open(path)` operation synchronously, yet still returns the result +or any failures in the `CompletableFuture<>`, so as to provide a consistent +lifecycle across all filesystems. + +Any filesystem client where the time to open a file may be significant SHOULD +execute it asynchronously by submitting the operation in some executor/thread +pool. This is particularly recommended for object stores and other filesystems +likely to be accessed over long-haul connections. + +Arbitrary filesystem-specific options MAY be supported; these MUST +be prefixed with either the filesystem schema, e.g. 
`hdfs.` +or in the `fs.SCHEMA` format as normal configuration settings `fs.hdfs`. The +latter style allows the same configuration option to be used for both +filesystem configuration and file-specific configuration. + +It SHOULD be possible to always open a file without specifying any options, +so as to present a consistent model to users. However, an implementation MAY +opt to require one or more mandatory options to be set. + +The returned stream may perform "lazy" evaluation of file access. This is +relevant for object stores where the probes for existence are expensive, and, +even with an asynchronous open, may be considered needless. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java index a43053180fbf8..25bfe082b01f6 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java @@ -30,14 +30,18 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FutureDataInputStreamBuilder; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.io.IOUtils; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_BUFFER_SIZE; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY; +import static org.apache.hadoop.fs.Options.OpenFileOptions.FS_OPTION_OPENFILE_LENGTH; +import static org.apache.hadoop.fs.contract.ContractTestUtils.compareByteArrays; import static org.apache.hadoop.fs.contract.ContractTestUtils.createFile; import static org.apache.hadoop.fs.contract.ContractTestUtils.dataset; import static org.apache.hadoop.fs.contract.ContractTestUtils.touch; import static org.apache.hadoop.test.LambdaTestUtils.intercept; import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture; +import static org.apache.hadoop.util.functional.FutureIO.awaitFuture; import org.junit.Test; @@ -232,7 +236,7 @@ public void testAwaitFutureFailToFNFE() throws Throwable { getFileSystem().openFile(path("testAwaitFutureFailToFNFE")) .opt("fs.test.something", true); intercept(FileNotFoundException.class, - () -> FutureIOSupport.awaitFuture(builder.build())); + () -> awaitFuture(builder.build())); } @Test @@ -242,7 +246,7 @@ public void testAwaitFutureTimeoutFailToFNFE() throws Throwable { getFileSystem().openFile(path("testAwaitFutureFailToFNFE")) .opt("fs.test.something", true); intercept(FileNotFoundException.class, - () -> FutureIOSupport.awaitFuture(builder.build(), + () -> awaitFuture(builder.build(), 10, TimeUnit.DAYS)); } @@ -250,7 +254,7 @@ public void testAwaitFutureTimeoutFailToFNFE() throws Throwable { public void testOpenFileExceptionallyTranslating() throws Throwable { describe("openFile missing file chains into exceptionally()"); CompletableFuture f = getFileSystem() - .openFile(path("testOpenFileUnknownOption")).build(); + .openFile(path("testOpenFileExceptionallyTranslating")).build(); interceptFuture(RuntimeException.class, "exceptionally", f.exceptionally(ex -> { @@ -262,11 +266,12 @@ public void testOpenFileExceptionallyTranslating() throws Throwable { public void testChainedFailureAwaitFuture() throws Throwable { describe("await Future handles chained failures"); CompletableFuture f = getFileSystem() - 
.openFile(path("testOpenFileUnknownOption")) + .openFile(path("testChainedFailureAwaitFuture")) + .withFileStatus(null) .build(); intercept(RuntimeException.class, "exceptionally", - () -> FutureIOSupport.awaitFuture( + () -> awaitFuture( f.exceptionally(ex -> { throw new RuntimeException("exceptionally", ex); }))); @@ -280,13 +285,34 @@ public void testOpenFileApplyRead() throws Throwable { int len = 4096; createFile(fs, path, true, dataset(len, 0x40, 0x80)); + FileStatus st = fs.getFileStatus(path); CompletableFuture readAllBytes = fs.openFile(path) - .withFileStatus(fs.getFileStatus(path)) + .withFileStatus(st) .build() .thenApply(ContractTestUtils::readStream); assertEquals("Wrong number of bytes read value", len, (long) readAllBytes.get()); + // now reattempt with a new FileStatus and a different path + // other than the final name element + // implementations MUST use path in openFile() call + FileStatus st2 = new FileStatus( + len, false, + st.getReplication(), + st.getBlockSize(), + st.getModificationTime(), + st.getAccessTime(), + st.getPermission(), + st.getOwner(), + st.getGroup(), + new Path("gopher:///localhost:/" + path.getName())); + assertEquals("Wrong number of bytes read value", + len, + (long) fs.openFile(path) + .withFileStatus(st2) + .build() + .thenApply(ContractTestUtils::readStream) + .get()); } @Test @@ -298,17 +324,47 @@ public void testOpenFileApplyAsyncRead() throws Throwable { dataset(4, 0x40, 0x80)); CompletableFuture future = fs.openFile(path).build(); AtomicBoolean accepted = new AtomicBoolean(false); - future.thenAcceptAsync(i -> accepted.set(true)).get(); + future.thenApply(stream -> { + accepted.set(true); + return stream; + }).get().close(); assertTrue("async accept operation not invoked", accepted.get()); } + /** + * Open a file with a null status, and the length + * passed in as an opt() option (along with sequential IO). + * The file is opened, the data read, and it must match + * the source data. + * opt() is used so that integration testing with external + * filesystem connectors will downgrade if the option is not + * recognized. + */ @Test - public void testOpenFileNullStatus() throws Throwable { - describe("use openFile() with a null status"); + public void testOpenFileNullStatusButFileLength() throws Throwable { + describe("use openFile() with a null status and expect the status to be" + + " ignored. 
block size, fadvise and length are passed in as" + + " opt() options"); Path path = path("testOpenFileNullStatus"); - intercept(NullPointerException.class, - () -> getFileSystem().openFile(path).withFileStatus(null)); + FileSystem fs = getFileSystem(); + int len = 4; + byte[] result = new byte[len]; + byte[] dataset = dataset(len, 0x40, 0x80); + createFile(fs, path, true, + dataset); + CompletableFuture future = fs.openFile(path) + .withFileStatus(null) + .opt(FS_OPTION_OPENFILE_READ_POLICY, + "unknown, sequential, random") + .opt(FS_OPTION_OPENFILE_BUFFER_SIZE, 32768) + .opt(FS_OPTION_OPENFILE_LENGTH, len) + .build(); + + try (FSDataInputStream in = future.get()) { + in.readFully(result); + } + compareByteArrays(dataset, result, len); } } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java index c5ce46f292712..fa00c9b37af2c 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java @@ -1539,17 +1539,22 @@ public static int read(InputStream in) { /** * Read a whole stream; downgrades an IOE to a runtime exception. + * Closes the stream afterwards. * @param in input * @return the number of bytes read. * @throws AssertionError on any IOException */ public static long readStream(InputStream in) { - long count = 0; + try { + long count = 0; - while (read(in) >= 0) { - count++; + while (read(in) >= 0) { + count++; + } + return count; + } finally { + IOUtils.cleanupWithLogger(LOG, in); } - return count; } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/IOStatisticAssertions.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/IOStatisticAssertions.java new file mode 100644 index 0000000000000..755599f0c390c --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/IOStatisticAssertions.java @@ -0,0 +1,548 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
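[Editor's note] The new testOpenFileNullStatusButFileLength test above captures a pattern object-store clients can use directly: when no FileStatus is available, the expected length and a read policy can still be hinted through opt(), and unrecognized keys are simply ignored. A minimal sketch, assuming an existing FileSystem `fs`, a Path `path`, an int `len`, and an enclosing method that throws IOException:

    CompletableFuture<FSDataInputStream> future = fs.openFile(path)
        .withFileStatus(null)                               // no status probe
        .opt(FS_OPTION_OPENFILE_READ_POLICY, "sequential")  // best effort
        .opt(FS_OPTION_OPENFILE_LENGTH, len)                // avoid a length lookup
        .build();
    try (FSDataInputStream in = awaitFuture(future)) {
      byte[] data = new byte[len];
      in.readFully(data);
    }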
+ */ + +package org.apache.hadoop.fs.statistics; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.ObjectStreamClass; +import java.io.Serializable; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.assertj.core.api.AbstractLongAssert; +import org.assertj.core.api.ObjectAssert; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_MAX; +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_MIN; +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Assertions and any other support for IOStatistics testing. + * If used downstream: know it is unstable. + */ + +@InterfaceAudience.Private +@InterfaceStability.Unstable +public final class IOStatisticAssertions { + + private static final String COUNTER = "Counter"; + + private static final String GAUGE = "Gauge"; + + private static final String MINIMUM = "Minimum"; + + private static final String MAXIMUM = "Maxiumum"; + + private static final String MEAN = "Mean"; + + private IOStatisticAssertions() { + } + + /** + * Get a required counter statistic. + * @param stats statistics source + * @param key statistic key + * @return the value + */ + public static long lookupCounterStatistic( + final IOStatistics stats, + final String key) { + return lookupStatistic(COUNTER, key, + verifyStatisticsNotNull(stats).counters()); + } + + /** + * Given an IOStatistics instance, verify it is not null, + * and return the value for continued use in a test. + * @param stats statistics source. + * @param type of statistics + * @return the value passed in. + */ + public static T + verifyStatisticsNotNull(final T stats) { + assertThat(stats) + .describedAs("IO Statistics reference") + .isNotNull(); + return stats; + } + + /** + * Get a required gauge statistic. + * @param stats statistics source + * @param key statistic key + * @return the value + */ + public static long lookupGaugeStatistic( + final IOStatistics stats, + final String key) { + return lookupStatistic(GAUGE, key, + verifyStatisticsNotNull(stats).gauges()); + } + + /** + * Get a required maximum statistic. + * @param stats statistics source + * @param key statistic key + * @return the value + */ + public static long lookupMaximumStatistic( + final IOStatistics stats, + final String key) { + return lookupStatistic(MAXIMUM, key, + verifyStatisticsNotNull(stats).maximums()); + } + + /** + * Get a required minimum statistic. + * @param stats statistics source + * @param key statistic key + * @return the value + */ + public static long lookupMinimumStatistic( + final IOStatistics stats, + final String key) { + return lookupStatistic(MINIMUM, key, + verifyStatisticsNotNull(stats).minimums()); + } + + /** + * Get a required mean statistic. + * @param stats statistics source + * @param key statistic key + * @return the value + */ + public static MeanStatistic lookupMeanStatistic( + final IOStatistics stats, + final String key) { + return lookupStatistic(MEAN, key, + verifyStatisticsNotNull(stats).meanStatistics()); + } + + /** + * Get a required counter statistic. 
+ * @param type of map element + * @param type type for error text + * @param key statistic key + * @param map map to probe + * @return the value + */ + private static E lookupStatistic( + final String type, + final String key, + final Map map) { + final E statistic = map.get(key); + assertThat(statistic) + .describedAs("%s named %s", type, key) + .isNotNull(); + return statistic; + } + + /** + * Assert that a counter has an expected value. + * @param stats statistics source + * @param key statistic key + * @param value expected value. + * @return the value (which always equals the expected value) + */ + public static long verifyStatisticCounterValue( + final IOStatistics stats, + final String key, + final long value) { + return verifyStatisticValue(COUNTER, key, + verifyStatisticsNotNull(stats).counters(), value); + } + + /** + * Assert that a gauge has an expected value. + * @param stats statistics source + * @param key statistic key + * @param value expected value. + * @return the value (which always equals the expected value) + */ + public static long verifyStatisticGaugeValue( + final IOStatistics stats, + final String key, + final long value) { + return verifyStatisticValue(GAUGE, key, + verifyStatisticsNotNull(stats).gauges(), value); + } + + /** + * Assert that a maximum has an expected value. + * @param stats statistics source + * @param key statistic key + * @param value expected value. + * @return the value (which always equals the expected value) + */ + public static long verifyStatisticMaximumValue( + final IOStatistics stats, + final String key, + final long value) { + return verifyStatisticValue(MAXIMUM, key, + verifyStatisticsNotNull(stats).maximums(), value); + } + + /** + * Assert that a minimum has an expected value. + * @param stats statistics source + * @param key statistic key + * @param value expected value. + * @return the value (which always equals the expected value) + */ + public static long verifyStatisticMinimumValue( + final IOStatistics stats, + final String key, + final long value) { + return verifyStatisticValue(MINIMUM, key, + verifyStatisticsNotNull(stats).minimums(), value); + } + + /** + * Assert that a mean has an expected value. + * @param stats statistics source + * @param key statistic key + * @param value expected value. + * @return the value (which always equals the expected value) + */ + public static MeanStatistic verifyStatisticMeanValue( + final IOStatistics stats, + final String key, + final MeanStatistic value) { + return verifyStatisticValue(MEAN, key, + verifyStatisticsNotNull(stats).meanStatistics(), value); + } + + /** + * Assert that a given statistic has an expected value. + * @param type type for error text + * @param key statistic key + * @param map map to look up + * @param value expected value. + * @param type of map element + * @return the value (which always equals the expected value) + */ + private static E verifyStatisticValue( + final String type, + final String key, + final Map map, + final E value) { + final E statistic = lookupStatistic(type, key, map); + assertThat(statistic) + .describedAs("%s named %s with expected value %s", type, + key, value) + .isEqualTo(value); + return statistic; + } + + + /** + * Assert that a given statistic has an expected value. 
+ * @param type of map element + * @param type type for error text + * @param key statistic key + * @param map map to look up + * @return an ongoing assertion + */ + private static ObjectAssert assertThatStatistic( + final String type, + final String key, + final Map map) { + final E statistic = lookupStatistic(type, key, map); + return assertThat(statistic) + .describedAs("%s named %s", type, key); + } + + /** + * Assert that a given statistic has an expected value. + * @param type of map element + * @param type type for error text + * @param key statistic key + * @param map map to look up + * @return an ongoing assertion + */ + private static AbstractLongAssert assertThatStatisticLong( + final String type, + final String key, + final Map map) { + final long statistic = lookupStatistic(type, key, map); + return assertThat(statistic) + .describedAs("%s named %s", type, key); + } + + /** + * Start an assertion chain on + * a required counter statistic. + * @param stats statistics source + * @param key statistic key + * @return an ongoing assertion + */ + public static AbstractLongAssert assertThatStatisticCounter( + final IOStatistics stats, + final String key) { + return assertThatStatisticLong(COUNTER, key, + verifyStatisticsNotNull(stats).counters()); + } + + /** + * Start an assertion chain on + * a required gauge statistic. + * @param stats statistics source + * @param key statistic key + * @return an ongoing assertion + */ + public static AbstractLongAssert assertThatStatisticGauge( + final IOStatistics stats, + final String key) { + return assertThatStatisticLong(GAUGE, key, + verifyStatisticsNotNull(stats).gauges()); + } + + /** + * Start an assertion chain on + * a required minimum statistic. + * @param stats statistics source + * @param key statistic key + * @return an ongoing assertion + */ + public static AbstractLongAssert assertThatStatisticMinimum( + final IOStatistics stats, + final String key) { + return assertThatStatisticLong(MINIMUM, key, + verifyStatisticsNotNull(stats).minimums()); + } + + /** + * Start an assertion chain on + * a required maximum statistic. + * @param stats statistics source + * @param key statistic key + * @return an ongoing assertion + */ + public static AbstractLongAssert assertThatStatisticMaximum( + final IOStatistics stats, + final String key) { + return assertThatStatisticLong(MAXIMUM, key, + verifyStatisticsNotNull(stats).maximums()); + } + + /** + * Assert that a duration is within a given minimum/maximum range. + * @param stats statistics source + * @param key statistic key without any suffix + * @param min minimum statistic must be equal to or greater than this. + * @param max maximum statistic must be equal to or less than this. + */ + public static void assertDurationRange( + final IOStatistics stats, + final String key, + final long min, + final long max) { + assertThatStatisticMinimum(stats, key + SUFFIX_MIN) + .isGreaterThanOrEqualTo(min); + assertThatStatisticMaximum(stats, key + SUFFIX_MAX) + .isLessThanOrEqualTo(max); + } + + /** + * Start an assertion chain on + * a required mean statistic. + * @param stats statistics source + * @param key statistic key + * @return an ongoing assertion + */ + public static ObjectAssert assertThatStatisticMean( + final IOStatistics stats, + final String key) { + return assertThatStatistic(MEAN, key, + verifyStatisticsNotNull(stats).meanStatistics()); + } + + /** + * Start an assertion chain on + * a required mean statistic with the initial validation on the + * sample count and sum. 
+ * @param stats statistics source + * @param key statistic key + * @return an ongoing assertion + */ + public static ObjectAssert assertThatStatisticMeanMatches( + final IOStatistics stats, + final String key, + final long samples, + final long sum) { + return assertThatStatisticMean(stats, key) + .matches(p -> (p.getSamples() == samples), + "samples == " + samples) + .matches(p -> (p.getSum() == sum), + "sum == " + sum); + } + + /** + * Assert that a given counter statistic is untracked. + * @param stats statistics source + * @param type type for error text + * @param key statistic key + * @param map map to probe + */ + private static void assertUntracked(final IOStatistics stats, + final String type, + final String key, + final Map map) { + assertThat(map.containsKey(key)) + .describedAs("%s %s is tracked in %s", type, key, stats) + .isFalse(); + } + + /** + * Assert that a given counter statistic is untracked. + * @param stats statistics source + * @param type type for error text + * @param key statistic key + * @param map map to probe + */ + private static void assertTracked(final IOStatistics stats, + final String type, + final String key, + final Map map) { + assertThat(map.containsKey(key)) + .describedAs("%s %s is not tracked in %s", type, key, stats) + .isTrue(); + } + + /** + * Assert that a given statistic is tracked. + * @param stats statistics source + * @param key statistic key + */ + public static void assertStatisticCounterIsTracked( + final IOStatistics stats, + final String key) { + assertTracked(stats, COUNTER, key, + verifyStatisticsNotNull(stats).counters()); + } + + /** + * Assert that a given counter statistic is untracked. + * @param stats statistics source + * @param key statistic key + */ + public static void assertStatisticCounterIsUntracked( + final IOStatistics stats, + final String key) { + assertUntracked(stats, COUNTER, key, + verifyStatisticsNotNull(stats).counters()); + } + + /** + * Assert that an object is a statistics source and that the + * statistics is not null. + * @param source source object. + */ + public static void assertIsStatisticsSource(Object source) { + assertThat(source) + .describedAs("Object %s", source) + .isInstanceOf(IOStatisticsSource.class) + .extracting(o -> ((IOStatisticsSource) o).getIOStatistics()) + .isNotNull(); + } + + /** + * Query the source for the statistics; fails if the statistics + * returned are null or the class does not implement the API. + * @param source source object. + * @return the statistics it provides. + */ + public static IOStatistics extractStatistics(Object source) { + assertThat(source) + .describedAs("Object %s", source) + .isInstanceOf(IOStatisticsSource.class); + IOStatisticsSource ios = (IOStatisticsSource) source; + return extractStatistics(ios); + } + + /** + * Get the non-null statistics. + * @param ioStatisticsSource source + * @return the statistics, guaranteed to be non null + */ + private static IOStatistics extractStatistics( + final IOStatisticsSource ioStatisticsSource) { + IOStatistics statistics = ioStatisticsSource.getIOStatistics(); + assertThat(statistics) + .describedAs("Statistics from %s", ioStatisticsSource) + .isNotNull(); + return statistics; + } + + /** + * Perform a serialization round trip on a statistics instance. + * @param stat statistic + * @return the deserialized version. 
+ */ + public static IOStatistics statisticsJavaRoundTrip(final IOStatistics stat) + throws IOException, ClassNotFoundException { + assertThat(stat).isInstanceOf(Serializable.class); + ByteArrayOutputStream baos = new ByteArrayOutputStream(1024); + try (ObjectOutputStream oos = new ObjectOutputStream(baos)) { + oos.writeObject(stat); + } + ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); + IOStatistics deser; + try (ObjectInputStream ois = new RestrictedInput(bais, + IOStatisticsSnapshot.requiredSerializationClasses())) { + deser = (IOStatistics) ois.readObject(); + } + return deser; + } + + private static final class RestrictedInput extends ObjectInputStream { + + private final List allowedClasses; + + private RestrictedInput(final InputStream in, + final List allowedClasses) throws IOException { + + super(in); + this.allowedClasses = allowedClasses.stream() + .map(Class::getName) + .collect(Collectors.toList()); + } + + @Override + protected Class resolveClass(final ObjectStreamClass desc) + throws IOException, ClassNotFoundException { + final String classname = desc.getName(); + if (!allowedClasses.contains(classname)) { + throw new ClassNotFoundException("Class " + classname + + " Not in list of allowed classes"); + } + + return super.resolveClass(desc); + } + } + +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestDurationTracking.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestDurationTracking.java new file mode 100644 index 0000000000000..cfde1583e2c21 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestDurationTracking.java @@ -0,0 +1,360 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
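[Editor's note] Taken together, the helpers in IOStatisticAssertions are intended to be statically imported by tests. A short sketch of typical use follows; the stream variable `in` and the counter names are illustrative rather than part of this patch.

    import org.apache.hadoop.fs.statistics.IOStatistics;
    import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertThatStatisticCounter;
    import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.extractStatistics;
    import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticCounterValue;

    // fails the test if "in" does not implement IOStatisticsSource
    // or its statistics are null
    IOStatistics stats = extractStatistics(in);
    // exact-value check on one counter
    verifyStatisticCounterValue(stats, "stream_read_bytes", 1024L);
    // open-ended AssertJ chain on another counter
    assertThatStatisticCounter(stats, "stream_read_operations")
        .isGreaterThanOrEqualTo(1);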
+ */ + +package org.apache.hadoop.fs.statistics; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore; +import org.apache.hadoop.test.AbstractHadoopTestBase; +import org.apache.hadoop.util.functional.FunctionRaisingIOE; +import org.apache.hadoop.util.functional.FutureIO; + +import static org.apache.hadoop.fs.statistics.DurationStatisticSummary.fetchDurationSummary; +import static org.apache.hadoop.fs.statistics.DurationStatisticSummary.fetchSuccessSummary; +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.*; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.*; +import static org.apache.hadoop.fs.statistics.impl.StubDurationTrackerFactory.STUB_DURATION_TRACKER_FACTORY; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Test the IOStatistic DurationTracker logic. + */ +public class TestDurationTracking extends AbstractHadoopTestBase { + + private static final Logger LOG = + LoggerFactory.getLogger(TestDurationTracking.class); + + private static final String REQUESTS = "requests"; + + public static final String UNKNOWN = "unknown"; + + private IOStatisticsStore stats; + + private final AtomicInteger invocationCounter = new AtomicInteger(0); + + @Before + public void setup() { + stats = iostatisticsStore() + .withDurationTracking(REQUESTS) + .build(); + } + + @After + public void teardown() { + LOG.info("stats {}", stats); + } + + /** + * Duration tracking. + */ + @Test + public void testDurationTryWithResources() throws Throwable { + DurationTracker tracker = + stats.trackDuration(REQUESTS); + verifyStatisticCounterValue(stats, REQUESTS, 1L); + sleep(); + tracker.close(); + try (DurationTracker ignored = + stats.trackDuration(REQUESTS)) { + sleep(); + } + LOG.info("Statistics: {}", stats); + DurationStatisticSummary summary = fetchSuccessSummary(stats, REQUESTS); + assertSummaryValues(summary, 2, 1, 1); + assertSummaryMean(summary, 2, 0); + } + + /** + * A little sleep method; exceptions are swallowed. + * Increments {@link #invocationCounter}. + * Increments {@inheritDoc #atomicCounter}. + */ + public void sleep() { + sleepf(10); + } + + /** + * A little sleep function; exceptions are swallowed. + * Increments {@link #invocationCounter}. + */ + protected int sleepf(final int millis) { + invocationCounter.incrementAndGet(); + try { + Thread.sleep(millis); + } catch (InterruptedException ignored) { + } + return millis; + } + + /** + * Assert that the sleep counter has been invoked + * the expected number of times. + * @param expected expected value + */ + private void assertCounterValue(final int expected) { + assertThat(invocationCounter.get()) + .describedAs("Sleep invocation Counter") + .isEqualTo(expected); + } + + /** + * Test that a function raising an IOE can be wrapped. 
+ */ + @Test + public void testDurationFunctionIOE() throws Throwable { + FunctionRaisingIOE fn = + trackFunctionDuration(stats, REQUESTS, + (Integer x) -> invocationCounter.getAndSet(x)); + assertThat(fn.apply(1)).isEqualTo(0); + assertCounterValue(1); + assertSummaryValues( + fetchSuccessSummary(stats, REQUESTS), + 1, 0, 0); + } + + /** + * Trigger a failure and verify its the failure statistics + * which go up. + */ + @Test + public void testDurationFunctionIOEFailure() throws Throwable { + FunctionRaisingIOE fn = + trackFunctionDuration(stats, REQUESTS, + (Integer x) -> { + sleep(); + return 100 / x; + }); + intercept(ArithmeticException.class, + () -> fn.apply(0)); + assertSummaryValues( + fetchSuccessSummary(stats, REQUESTS), + 1, -1, -1); + + DurationStatisticSummary failures = fetchDurationSummary(stats, REQUESTS, + false); + assertSummaryValues(failures, 1, 0, 0); + assertSummaryMean(failures, 1, 0); + } + + /** + * Trigger a failure and verify its the failure statistics + * which go up. + */ + @Test + public void testDurationJavaFunctionFailure() throws Throwable { + Function fn = + trackJavaFunctionDuration(stats, REQUESTS, + (Integer x) -> { + return 100 / x; + }); + intercept(ArithmeticException.class, + () -> fn.apply(0)); + assertSummaryValues( + fetchSuccessSummary(stats, REQUESTS), + 1, -1, -1); + + DurationStatisticSummary failures = fetchDurationSummary(stats, REQUESTS, + false); + assertSummaryValues(failures, 1, 0, 0); + } + + /** + * Test trackDurationOfCallable. + */ + @Test + public void testCallableDuration() throws Throwable { + // call the operation + assertThat( + trackDurationOfCallable(stats, REQUESTS, () -> sleepf(100)).call()) + .isEqualTo(100); + DurationStatisticSummary summary = fetchSuccessSummary(stats, REQUESTS); + assertSummaryValues(summary, 1, 0, 0); + assertSummaryMean(summary, 1, 0); + } + + /** + * Callable raising an RTE after a sleep; failure + * stats will be updated and the execution count will be + * 1. + */ + @Test + public void testCallableFailureDuration() throws Throwable { + + intercept(RuntimeException.class, + trackDurationOfCallable(stats, REQUESTS, () -> { + sleepf(100); + throw new RuntimeException("oops"); + })); + assertCounterValue(1); + assertSummaryValues( + fetchSuccessSummary(stats, REQUESTS), + 1, -1, -1); + + assertSummaryValues(fetchDurationSummary(stats, REQUESTS, false), + 1, 0, 0); + } + + /** + * Duration of the successful execution of a InvocationRaisingIOE. + */ + @Test + public void testInvocationDuration() throws Throwable { + // call the operation + trackDurationOfInvocation(stats, REQUESTS, () -> { + sleepf(100); + }); + assertCounterValue(1); + DurationStatisticSummary summary = fetchSuccessSummary(stats, REQUESTS); + assertSummaryValues(summary, 1, 0, 0); + assertSummaryMean(summary, 1, 0); + } + + /** + * Duration of the successful execution of a CallableRaisingIOE. + */ + @Test + public void testCallableIOEDuration() throws Throwable { + // call the operation + assertThat( + trackDuration(stats, REQUESTS, () -> sleepf(100))) + .isEqualTo(100); + DurationStatisticSummary summary = fetchSuccessSummary(stats, REQUESTS); + assertSummaryValues(summary, 1, 0, 0); + assertSummaryMean(summary, 1, 0); + } + + /** + * Track the duration of an IOE raising callable which fails. 
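[Editor's note] Outside the test class, the two tracking styles exercised above would look roughly like this. The statistic name and the wrapped operation are hypothetical; `copyData()` is assumed to return a byte count and may throw IOException, as may the enclosing method.

    import org.apache.hadoop.fs.statistics.DurationTracker;
    import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore;
    import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.iostatisticsStore;
    import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration;

    IOStatisticsStore store = iostatisticsStore()
        .withDurationTracking("op_copy")           // hypothetical statistic name
        .build();

    // style 1: explicit try-with-resources; close() records the duration
    try (DurationTracker ignored = store.trackDuration("op_copy")) {
      copyData();                                  // hypothetical operation
    }

    // style 2: wrap an IOException-raising callable; failures update the
    // failure statistics rather than the success ones
    long copied = trackDuration(store, "op_copy", () -> copyData());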
+ */ + @Test + public void testCallableIOEFailureDuration() throws Throwable { + intercept(IOException.class, + () -> + trackDuration(stats, REQUESTS, () -> { + sleepf(100); + throw new IOException("oops"); + })); + assertSummaryValues( + fetchSuccessSummary(stats, REQUESTS), + 1, -1, -1); + + assertSummaryValues(fetchDurationSummary(stats, REQUESTS, false), + 1, 0, 0); + } + + + /** + * Track the duration of an IOE raising callable which fails. + */ + @Test + public void testDurationThroughEval() throws Throwable { + CompletableFuture eval = FutureIO.eval( + trackDurationOfOperation(stats, REQUESTS, () -> { + sleepf(100); + throw new FileNotFoundException("oops"); + })); + intercept(FileNotFoundException.class, "oops", () -> + FutureIO.awaitFuture(eval)); + assertSummaryValues(fetchDurationSummary(stats, REQUESTS, false), + 1, 0, 0); + } + + /** + * It's OK to track a duration against an unknown statistic. + */ + @Test + public void testUnknownDuration() throws Throwable { + trackDurationOfCallable(stats, UNKNOWN, () -> sleepf(1)).call(); + DurationStatisticSummary summary = fetchSuccessSummary(stats, UNKNOWN); + assertSummaryValues(summary, 0, -1, -1); + assertThat(summary.getMean()).isNull(); + } + + /** + * The stub duration tracker factory can be supplied as an input. + */ + @Test + public void testTrackDurationWithStubFactory() throws Throwable { + trackDuration(STUB_DURATION_TRACKER_FACTORY, UNKNOWN, () -> sleepf(1)); + } + + /** + * Make sure the tracker returned from the stub factory + * follows the basic lifecycle. + */ + @Test + public void testStubDurationLifecycle() throws Throwable { + DurationTracker tracker = STUB_DURATION_TRACKER_FACTORY + .trackDuration("k", 1); + tracker.failed(); + tracker.close(); + tracker.close(); + } + + /** + * Assert that a statistics summary has the specific values. + * @param summary summary data + * @param count count -must match exactly. + * @param minBase minimum value for the minimum field (inclusive) + * @param maxBase minimum value for the maximum field (inclusive) + */ + protected void assertSummaryValues( + final DurationStatisticSummary summary, + final int count, + final int minBase, + final int maxBase) { + assertThat(summary) + .matches(s -> s.getCount() == count, "Count value") + .matches(s -> s.getMax() >= maxBase, "Max value") + .matches(s -> s.getMin() >= minBase, "Min value"); + } + + /** + * Assert that at a summary has a matching mean value. + * @param summary summary data. + * @param expectedSampleCount sample count -which must match + * @param meanGreaterThan the mean must be greater than this value. + */ + protected void assertSummaryMean( + final DurationStatisticSummary summary, + final int expectedSampleCount, + final double meanGreaterThan) { + String description = "mean of " + summary; + assertThat(summary.getMean()) + .describedAs(description) + .isNotNull(); + assertThat(summary.getMean().getSamples()) + .describedAs(description) + .isEqualTo(expectedSampleCount); + assertThat(summary.getMean().mean()) + .describedAs(description) + .isGreaterThan(meanGreaterThan); + } +} From 58db33bc62b1cf5b605ecff3c8c75b72e5f9a907 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Thu, 31 Dec 2020 17:22:42 +0530 Subject: [PATCH 31/40] HADOOP-17450. Add Public IOStatistics API. (#2577) This is the API and implementation classes of HADOOP-16830, which allows callers to query IO object instances (filesystems, streams, remote iterators, ...) 
and other classes for statistics on their I/O Usage: operation count and min/max/mean durations. New Packages org.apache.hadoop.fs.statistics. Public API, including: IOStatisticsSource IOStatistics IOStatisticsSnapshot (seralizable to java objects and json) +helper classes for logging and integration BufferedIOStatisticsInputStream implements IOStatisticsSource and StreamCapabilities BufferedIOStatisticsOutputStream implements IOStatisticsSource, Syncable and StreamCapabilities org.apache.hadoop.fs.statistics.impl Implementation classes for internal use. org.apache.hadoop.util.functional functional programming support for RemoteIterators and other operations which raise IOEs; all wrapper classes implement and propagate IOStatisticsSource Contributed by Steve Loughran. Change-Id: If56e8db2981613ff689c39239135e44feb25f78e --- .../hadoop/crypto/CryptoOutputStream.java | 11 +- .../hadoop/fs/BufferedFSInputStream.java | 29 +- .../apache/hadoop/fs/ChecksumFileSystem.java | 42 +- .../apache/hadoop/fs/FSDataOutputStream.java | 17 +- .../org/apache/hadoop/fs/FSInputStream.java | 22 + .../apache/hadoop/fs/RawLocalFileSystem.java | 96 ++- .../apache/hadoop/fs/StreamCapabilities.java | 5 + .../BufferedIOStatisticsInputStream.java | 85 +++ .../BufferedIOStatisticsOutputStream.java | 157 ++++ .../statistics/DurationStatisticSummary.java | 154 ++++ .../hadoop/fs/statistics/DurationTracker.java | 54 ++ .../fs/statistics/DurationTrackerFactory.java | 57 ++ .../hadoop/fs/statistics/IOStatistics.java | 78 ++ .../fs/statistics/IOStatisticsAggregator.java | 48 ++ .../fs/statistics/IOStatisticsLogging.java | 301 ++++++++ .../fs/statistics/IOStatisticsSnapshot.java | 285 +++++++ .../fs/statistics/IOStatisticsSource.java | 47 ++ .../fs/statistics/IOStatisticsSupport.java | 107 +++ .../hadoop/fs/statistics/MeanStatistic.java | 290 ++++++++ .../impl/AbstractIOStatisticsImpl.java | 30 + .../statistics/impl/DynamicIOStatistics.java | 132 ++++ .../impl/DynamicIOStatisticsBuilder.java | 248 +++++++ .../fs/statistics/impl/EmptyIOStatistics.java | 74 ++ .../impl/EvaluatingStatisticsMap.java | 202 +++++ .../fs/statistics/impl/IOStatisticsStore.java | 258 +++++++ .../impl/IOStatisticsStoreBuilder.java | 75 ++ .../impl/IOStatisticsStoreBuilderImpl.java | 100 +++ .../impl/IOStatisticsStoreImpl.java | 469 ++++++++++++ .../impl/PairedDurationTrackerFactory.java | 93 +++ .../impl/SourceWrappedStatistics.java | 44 ++ .../impl/StatisticDurationTracker.java | 106 +++ .../StorageStatisticsFromIOStatistics.java | 98 +++ .../statistics/impl/StubDurationTracker.java | 51 ++ .../impl/StubDurationTrackerFactory.java | 44 ++ .../statistics/impl/WrappedIOStatistics.java | 108 +++ .../fs/statistics/impl/package-info.java | 31 + .../hadoop/fs/statistics/package-info.java | 134 ++++ .../io/compress/CompressionInputStream.java | 18 +- .../io/compress/CompressionOutputStream.java | 14 +- .../org/apache/hadoop/util/LineReader.java | 16 +- .../util/functional/BiFunctionRaisingIOE.java | 40 + .../util/functional/CallableRaisingIOE.java | 36 + .../util/functional/ConsumerRaisingIOE.java | 51 ++ .../util/functional/FunctionRaisingIOE.java | 38 + .../util/functional/InvocationRaisingIOE.java | 42 ++ .../util/functional/RemoteIterators.java | 698 ++++++++++++++++++ .../hadoop/util/functional/package-info.java | 41 + .../site/markdown/filesystem/iostatistics.md | 432 +++++++++++ ...bstractContractStreamIOStatisticsTest.java | 313 ++++++++ ...TestLocalFSContractStreamIOStatistics.java | 80 ++ .../statistics/TestDynamicIOStatistics.java | 311 ++++++++ 
.../fs/statistics/TestEmptyIOStatistics.java | 110 +++ .../statistics/TestIOStatisticsSnapshot.java | 147 ++++ .../fs/statistics/TestIOStatisticsStore.java | 177 +++++ .../fs/statistics/TestMeanStatistic.java | 219 ++++++ .../util/functional/TestRemoteIterators.java | 469 ++++++++++++ 56 files changed, 7417 insertions(+), 17 deletions(-) create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/BufferedIOStatisticsInputStream.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/BufferedIOStatisticsOutputStream.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/DurationStatisticSummary.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/DurationTracker.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/DurationTrackerFactory.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatistics.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsAggregator.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsLogging.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsSnapshot.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsSource.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsSupport.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/MeanStatistic.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/AbstractIOStatisticsImpl.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/DynamicIOStatistics.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/DynamicIOStatisticsBuilder.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/EmptyIOStatistics.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/EvaluatingStatisticsMap.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStore.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStoreBuilder.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStoreBuilderImpl.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStoreImpl.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/PairedDurationTrackerFactory.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/SourceWrappedStatistics.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StatisticDurationTracker.java create mode 100644 
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StorageStatisticsFromIOStatistics.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StubDurationTracker.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StubDurationTrackerFactory.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/WrappedIOStatistics.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/package-info.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/package-info.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/BiFunctionRaisingIOE.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/CallableRaisingIOE.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/ConsumerRaisingIOE.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/FunctionRaisingIOE.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/InvocationRaisingIOE.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/RemoteIterators.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/package-info.java create mode 100644 hadoop-common-project/hadoop-common/src/site/markdown/filesystem/iostatistics.md create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractStreamIOStatisticsTest.java create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSContractStreamIOStatistics.java create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestDynamicIOStatistics.java create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestEmptyIOStatistics.java create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestIOStatisticsSnapshot.java create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestIOStatisticsStore.java create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestMeanStatistic.java create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/functional/TestRemoteIterators.java diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoOutputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoOutputStream.java index 8d11043937612..829f205e22eb2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoOutputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoOutputStream.java @@ -28,9 +28,13 @@ import org.apache.hadoop.fs.CanSetDropBehind; import org.apache.hadoop.fs.StreamCapabilities; import org.apache.hadoop.fs.Syncable; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; import 
com.google.common.base.Preconditions; +import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics; + /** * CryptoOutputStream encrypts data. It is not thread-safe. AES CTR mode is * required in order to ensure that the plain text and cipher text have a 1:1 @@ -48,7 +52,7 @@ @InterfaceAudience.Private @InterfaceStability.Evolving public class CryptoOutputStream extends FilterOutputStream implements - Syncable, CanSetDropBehind, StreamCapabilities { + Syncable, CanSetDropBehind, StreamCapabilities, IOStatisticsSource { private final byte[] oneByteBuf = new byte[1]; private final CryptoCodec codec; private final Encryptor encryptor; @@ -313,4 +317,9 @@ public boolean hasCapability(String capability) { } return false; } + + @Override + public IOStatistics getIOStatistics() { + return retrieveIOStatistics(out); + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java index 973b136bb3ab2..0c5b4f0d3745a 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/BufferedFSInputStream.java @@ -24,6 +24,10 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; + +import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics; /** @@ -33,7 +37,8 @@ @InterfaceAudience.Private @InterfaceStability.Unstable public class BufferedFSInputStream extends BufferedInputStream -implements Seekable, PositionedReadable, HasFileDescriptor { + implements Seekable, PositionedReadable, HasFileDescriptor, + IOStatisticsSource, StreamCapabilities { /** * Creates a BufferedFSInputStream * with the specified buffer size, @@ -126,4 +131,26 @@ public FileDescriptor getFileDescriptor() throws IOException { return null; } } + + /** + * If the inner stream supports {@link StreamCapabilities}, + * forward the probe to it. + * Otherwise: return false. + * + * @param capability string to query the stream support for. + * @return true if a capability is known to be supported. 
+ */ + @Override + public boolean hasCapability(final String capability) { + if (in instanceof StreamCapabilities) { + return ((StreamCapabilities) in).hasCapability(capability); + } else { + return false; + } + } + + @Override + public IOStatistics getIOStatistics() { + return retrieveIOStatistics(in); + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java index d5401308adc93..882f1d951dfcd 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/ChecksumFileSystem.java @@ -32,6 +32,9 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; +import org.apache.hadoop.fs.statistics.IOStatisticsSupport; import org.apache.hadoop.util.DataChecksum; import org.apache.hadoop.util.Progressable; @@ -127,7 +130,8 @@ private int getSumBufferSize(int bytesPerSum, int bufferSize) { * For open()'s FSInputStream * It verifies that data matches checksums. *******************************************************/ - private static class ChecksumFSInputChecker extends FSInputChecker { + private static class ChecksumFSInputChecker extends FSInputChecker implements + IOStatisticsSource { private ChecksumFileSystem fs; private FSDataInputStream datas; private FSDataInputStream sums; @@ -263,6 +267,17 @@ protected int readChunk(long pos, byte[] buf, int offset, int len, } return nread; } + + /** + * Get the IO Statistics of the nested stream, falling back to + * null if the stream does not implement the interface + * {@link IOStatisticsSource}. + * @return an IOStatistics instance or null + */ + @Override + public IOStatistics getIOStatistics() { + return IOStatisticsSupport.retrieveIOStatistics(datas); + } } private static class FSDataBoundedInputStream extends FSDataInputStream { @@ -382,7 +397,8 @@ public static long getChecksumLength(long size, int bytesPerSum) { /** This class provides an output stream for a checksummed file. * It generates checksums for data. */ - private static class ChecksumFSOutputSummer extends FSOutputSummer { + private static class ChecksumFSOutputSummer extends FSOutputSummer + implements IOStatisticsSource, StreamCapabilities { private FSDataOutputStream datas; private FSDataOutputStream sums; private static final float CHKSUM_AS_FRACTION = 0.01f; @@ -436,6 +452,28 @@ protected void checkClosed() throws IOException { throw new ClosedChannelException(); } } + + /** + * Get the IO Statistics of the nested stream, falling back to + * null if the stream does not implement the interface + * {@link IOStatisticsSource}. + * @return an IOStatistics instance or null + */ + @Override + public IOStatistics getIOStatistics() { + return IOStatisticsSupport.retrieveIOStatistics(datas); + } + + /** + * Probe the inner stream for a capability. + * + * @param capability string to query the stream support for. + * @return true if a capability is known to be supported. 
+ */ + @Override + public boolean hasCapability(final String capability) { + return datas.hasCapability(capability); + } } @Override diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStream.java index 5970373a9f31a..7ec4067b9800d 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStream.java @@ -24,13 +24,17 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; +import org.apache.hadoop.fs.statistics.IOStatisticsSupport; /** Utility that wraps a {@link OutputStream} in a {@link DataOutputStream}. */ @InterfaceAudience.Public @InterfaceStability.Stable public class FSDataOutputStream extends DataOutputStream - implements Syncable, CanSetDropBehind, StreamCapabilities { + implements Syncable, CanSetDropBehind, StreamCapabilities, + IOStatisticsSource { private final OutputStream wrappedStream; private static class PositionCache extends FilterOutputStream { @@ -155,4 +159,15 @@ public void setDropBehind(Boolean dropBehind) throws IOException { "not support setting the drop-behind caching setting."); } } + + /** + * Get the IO Statistics of the nested stream, falling back to + * empty statistics if the stream does not implement the interface + * {@link IOStatisticsSource}. + * @return an IOStatistics instance. + */ + @Override + public IOStatistics getIOStatistics() { + return IOStatisticsSupport.retrieveIOStatistics(wrappedStream); + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSInputStream.java index 672ab15f16c3b..373120ce3078f 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSInputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSInputStream.java @@ -24,6 +24,9 @@ import com.google.common.base.Preconditions; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.statistics.IOStatisticsLogging; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -134,4 +137,23 @@ public void readFully(long position, byte[] buffer) throws IOException { readFully(position, buffer, 0, buffer.length); } + + /** + * toString method returns the superclass toString, but if the subclass + * implements {@link IOStatisticsSource} then those statistics are + * extracted and included in the output. + * That is: statistics of subclasses are automatically reported. + * @return a string value. 
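[Editor's note] On the consumer side, the pattern these wrapper changes enable is to ask any stream for its statistics after use. A sketch, where the path is illustrative and `LOG` is an SLF4J logger assumed to exist in the calling class:

    import org.apache.hadoop.fs.statistics.IOStatistics;
    import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsSourceToString;
    import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics;

    try (FSDataInputStream in = fs.open(new Path("/tmp/data"))) {
      in.read(new byte[4096]);
      // null if the stream is not an IOStatisticsSource
      IOStatistics stats = retrieveIOStatistics(in);
      // lazy evaluation: the string is only built if the log level is enabled
      LOG.info("stream statistics {}", ioStatisticsSourceToString(in));
    }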
+ */ + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(super.toString()); + sb.append('{'); + if (this instanceof IOStatisticsSource) { + sb.append(IOStatisticsLogging.ioStatisticsSourceToString( + (IOStatisticsSource) this)); + } + sb.append('}'); + return sb.toString(); + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java index cf2210575da15..d837fd62ccb24 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java @@ -40,6 +40,7 @@ import java.nio.file.attribute.FileTime; import java.util.Arrays; import java.util.EnumSet; +import java.util.Locale; import java.util.Optional; import java.util.StringTokenizer; @@ -47,6 +48,10 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; +import org.apache.hadoop.fs.statistics.BufferedIOStatisticsOutputStream; +import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.nativeio.NativeIO; import org.apache.hadoop.util.Progressable; @@ -54,6 +59,14 @@ import org.apache.hadoop.util.StringUtils; import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_BYTES; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_EXCEPTIONS; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_SEEK_OPERATIONS; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_SKIP_BYTES; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_SKIP_OPERATIONS; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_WRITE_BYTES; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_WRITE_EXCEPTIONS; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.iostatisticsStore; /**************************************************************** * Implement the FileSystem API for the raw local filesystem. @@ -105,10 +118,23 @@ public void initialize(URI uri, Configuration conf) throws IOException { /******************************************************* * For open()'s FSInputStream. *******************************************************/ - class LocalFSFileInputStream extends FSInputStream implements HasFileDescriptor { + class LocalFSFileInputStream extends FSInputStream implements + HasFileDescriptor, IOStatisticsSource, StreamCapabilities { private FileInputStream fis; private long position; + /** + * Minimal set of counters. 
+ */ + private final IOStatisticsStore ioStatistics = iostatisticsStore() + .withCounters( + STREAM_READ_BYTES, + STREAM_READ_EXCEPTIONS, + STREAM_READ_SEEK_OPERATIONS, + STREAM_READ_SKIP_OPERATIONS, + STREAM_READ_SKIP_BYTES) + .build(); + public LocalFSFileInputStream(Path f) throws IOException { fis = new FileInputStream(pathToFile(f)); } @@ -150,9 +176,11 @@ public int read() throws IOException { if (value >= 0) { this.position++; statistics.incrementBytesRead(1); + ioStatistics.incrementCounter(STREAM_READ_BYTES); } return value; } catch (IOException e) { // unexpected exception + ioStatistics.incrementCounter(STREAM_READ_EXCEPTIONS); throw new FSError(e); // assume native fs error } } @@ -166,9 +194,11 @@ public int read(byte[] b, int off, int len) throws IOException { if (value > 0) { this.position += value; statistics.incrementBytesRead(value); + ioStatistics.incrementCounter(STREAM_READ_BYTES, value); } return value; } catch (IOException e) { // unexpected exception + ioStatistics.incrementCounter(STREAM_READ_EXCEPTIONS); throw new FSError(e); // assume native fs error } } @@ -187,18 +217,22 @@ public int read(long position, byte[] b, int off, int len) int value = fis.getChannel().read(bb, position); if (value > 0) { statistics.incrementBytesRead(value); + ioStatistics.incrementCounter(STREAM_READ_BYTES, value); } return value; } catch (IOException e) { + ioStatistics.incrementCounter(STREAM_READ_EXCEPTIONS); throw new FSError(e); } } @Override public long skip(long n) throws IOException { + ioStatistics.incrementCounter(STREAM_READ_SKIP_OPERATIONS); long value = fis.skip(n); if (value > 0) { this.position += value; + ioStatistics.incrementCounter(STREAM_READ_SKIP_BYTES, value); } return value; } @@ -207,6 +241,23 @@ public long skip(long n) throws IOException { public FileDescriptor getFileDescriptor() throws IOException { return fis.getFD(); } + + @Override + public boolean hasCapability(String capability) { + // a bit inefficient, but intended to make it easier to add + // new capabilities. + switch (capability.toLowerCase(Locale.ENGLISH)) { + case StreamCapabilities.IOSTATISTICS: + return true; + default: + return false; + } + } + + @Override + public IOStatistics getIOStatistics() { + return ioStatistics; + } } @Override @@ -231,9 +282,19 @@ public FSDataInputStream open(PathHandle fd, int bufferSize) /********************************************************* * For create()'s FSOutputStream. *********************************************************/ - class LocalFSFileOutputStream extends OutputStream { + final class LocalFSFileOutputStream extends OutputStream implements + IOStatisticsSource, StreamCapabilities { private FileOutputStream fos; - + + /** + * Minimal set of counters. 
+ */ + private final IOStatisticsStore ioStatistics = iostatisticsStore() + .withCounters( + STREAM_WRITE_BYTES, + STREAM_WRITE_EXCEPTIONS) + .build(); + private LocalFSFileOutputStream(Path f, boolean append, FsPermission permission) throws IOException { File file = pathToFile(f); @@ -273,7 +334,9 @@ private LocalFSFileOutputStream(Path f, boolean append, public void write(byte[] b, int off, int len) throws IOException { try { fos.write(b, off, len); + ioStatistics.incrementCounter(STREAM_WRITE_BYTES, len); } catch (IOException e) { // unexpected exception + ioStatistics.incrementCounter(STREAM_WRITE_EXCEPTIONS); throw new FSError(e); // assume native fs error } } @@ -282,10 +345,29 @@ public void write(byte[] b, int off, int len) throws IOException { public void write(int b) throws IOException { try { fos.write(b); + ioStatistics.incrementCounter(STREAM_WRITE_BYTES); } catch (IOException e) { // unexpected exception + ioStatistics.incrementCounter(STREAM_WRITE_EXCEPTIONS); throw new FSError(e); // assume native fs error } } + + @Override + public boolean hasCapability(String capability) { + // a bit inefficient, but intended to make it easier to add + // new capabilities. + switch (capability.toLowerCase(Locale.ENGLISH)) { + case StreamCapabilities.IOSTATISTICS: + return true; + default: + return false; + } + } + + @Override + public IOStatistics getIOStatistics() { + return ioStatistics; + } } @Override @@ -318,8 +400,8 @@ private FSDataOutputStream create(Path f, boolean overwrite, if (parent != null && !mkdirs(parent)) { throw new IOException("Mkdirs failed to create " + parent.toString()); } - return new FSDataOutputStream(new BufferedOutputStream( - createOutputStreamWithMode(f, false, permission), bufferSize), + return new FSDataOutputStream(new BufferedIOStatisticsOutputStream( + createOutputStreamWithMode(f, false, permission), bufferSize, true), statistics); } @@ -340,8 +422,8 @@ public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, if (exists(f) && !flags.contains(CreateFlag.OVERWRITE)) { throw new FileAlreadyExistsException("File already exists: " + f); } - return new FSDataOutputStream(new BufferedOutputStream( - createOutputStreamWithMode(f, false, permission), bufferSize), + return new FSDataOutputStream(new BufferedIOStatisticsOutputStream( + createOutputStreamWithMode(f, false, permission), bufferSize, true), statistics); } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java index 9d4b6fe7bc2ae..cb129057ce74e 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StreamCapabilities.java @@ -70,6 +70,11 @@ public interface StreamCapabilities { */ String PREADBYTEBUFFER = "in:preadbytebuffer"; + /** + * IOStatisticsSource API. + */ + String IOSTATISTICS = "iostatistics"; + /** * Capabilities that a stream can support and be queried for. 
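[Editor's note] With the new IOSTATISTICS constant in place, callers can probe a stream before attempting to collect statistics from it. A minimal sketch, assuming `out` is an FSDataOutputStream:

    if (out.hasCapability(StreamCapabilities.IOSTATISTICS)) {
      // the capability indicates the stream acts as an IOStatisticsSource
      IOStatistics stats = ((IOStatisticsSource) out).getIOStatistics();
    }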
*/ diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/BufferedIOStatisticsInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/BufferedIOStatisticsInputStream.java new file mode 100644 index 0000000000000..bdc432570542b --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/BufferedIOStatisticsInputStream.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import java.io.BufferedInputStream; +import java.io.InputStream; + +import org.apache.hadoop.fs.StreamCapabilities; + +import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics; + +/** + * An extension of {@code BufferedInputStream} which implements + * {@link IOStatisticsSource} and forwards requests for the + * {@link IOStatistics} to the wrapped stream. + * + * This should be used when any input stream needs buffering while + * allowing the inner stream to be a source of statistics. + * + * It also implements {@link StreamCapabilities} and forwards the probe + * to the inner stream, if possible. + */ +public class BufferedIOStatisticsInputStream + extends BufferedInputStream + implements IOStatisticsSource, StreamCapabilities { + + /** + * Buffer an input stream with the default buffer size of 8k. + * @param in input stream + */ + public BufferedIOStatisticsInputStream(final InputStream in) { + super(in); + } + + /** + * Buffer an input stream with the chosen buffer size. + * @param in input stream + * @param size buffer size + */ + public BufferedIOStatisticsInputStream(final InputStream in, final int size) { + super(in, size); + } + + /** + * Return any IOStatistics offered by the inner stream. + * @return inner IOStatistics or null + */ + @Override + public IOStatistics getIOStatistics() { + return retrieveIOStatistics(in); + } + + /** + * If the inner stream supports {@link StreamCapabilities}, + * forward the probe to it. + * Otherwise: return false. + * + * @param capability string to query the stream support for. + * @return true if a capability is known to be supported. 
+ */ + @Override + public boolean hasCapability(final String capability) { + if (in instanceof StreamCapabilities) { + return ((StreamCapabilities) in).hasCapability(capability); + } else { + return false; + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/BufferedIOStatisticsOutputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/BufferedIOStatisticsOutputStream.java new file mode 100644 index 0000000000000..88e73a0629b1d --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/BufferedIOStatisticsOutputStream.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.io.OutputStream; + +import org.apache.hadoop.fs.StreamCapabilities; +import org.apache.hadoop.fs.Syncable; + +import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics; + +/** + * An extension of {@code BufferedOutputStream} which implements + * {@link IOStatisticsSource} and forwards requests for the + * {@link IOStatistics} to the wrapped stream. + * + * This should be used when any output stream needs buffering while + * allowing the inner stream to be a source of statistics. + * + * It also implements {@link StreamCapabilities} + * and {@link Syncable} and forwards to to the inner stream, + * if possible. + */ +public class BufferedIOStatisticsOutputStream + extends BufferedOutputStream + implements IOStatisticsSource, Syncable, StreamCapabilities { + + /** + * Should calls to Syncable downgrade to flush if the underlying + * stream does not support it? + * While that breaks a core contract requirement of Syncable: + * "Sync.sync() guarantees durability", downgrading is + * the default behavior of FsDataOutputStream. + */ + private final boolean downgradeSyncable; + + /** + * Construct with default buffer size. + * @param out output stream to buffer + * @param downgradeSyncable should Syncable calls downgrade? + */ + public BufferedIOStatisticsOutputStream( + final OutputStream out, + final boolean downgradeSyncable) { + super(out); + this.downgradeSyncable = downgradeSyncable; + } + + /** + * Construct with custom buffer size. + * + * @param out output stream to buffer + * @param size buffer. + * @param downgradeSyncable should Syncable calls downgrade? + */ + public BufferedIOStatisticsOutputStream( + final OutputStream out, + final int size, + final boolean downgradeSyncable) { + super(out, size); + this.downgradeSyncable = downgradeSyncable; + } + + /** + * Ask the inner stream for their IOStatistics. 
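The value of the input-side wrapper defined above is that buffering no longer hides the inner stream's statistics or capability probes. A small sketch using a plain FileInputStream as the stand-in inner stream, purely to show that both probes degrade gracefully when the wrapped stream offers nothing:

    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    import org.apache.hadoop.fs.StreamCapabilities;
    import org.apache.hadoop.fs.statistics.BufferedIOStatisticsInputStream;

    public class BufferedStatsInputDemo {
      public static void main(String[] args) throws IOException {
        InputStream raw = new FileInputStream(args[0]);
        // the second constructor takes an explicit buffer size; the first uses 8k
        try (BufferedIOStatisticsInputStream in =
                 new BufferedIOStatisticsInputStream(raw, 64 * 1024)) {
          // both calls forward to the wrapped stream; a FileInputStream supports
          // neither, so they return false and null rather than throwing
          System.out.println("iostatistics capability: "
              + in.hasCapability(StreamCapabilities.IOSTATISTICS));
          System.out.println("statistics: " + in.getIOStatistics());
        }
      }
    }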
+ * @return any IOStatistics offered by the inner stream. + */ + @Override + public IOStatistics getIOStatistics() { + return retrieveIOStatistics(out); + } + + /** + * If the inner stream supports {@link StreamCapabilities}, + * forward the probe to it. + * Otherwise: return false. + * + * @param capability string to query the stream support for. + * @return true if a capability is known to be supported. + */ + @Override + public boolean hasCapability(final String capability) { + if (out instanceof StreamCapabilities) { + return ((StreamCapabilities) out).hasCapability(capability); + } else { + return false; + } + } + + /** + * If the inner stream is Syncable, flush the buffer and then + * invoke the inner stream's hflush() operation. + * + * Otherwise: throw an exception, unless the stream was constructed with + * {@link #downgradeSyncable} set to true, in which case the stream + * is just flushed. + * @throws IOException IO Problem + * @throws UnsupportedOperationException if the inner class is not syncable + */ + @Override + public void hflush() throws IOException { + if (out instanceof Syncable) { + flush(); + ((Syncable) out).hflush(); + } else { + if (!downgradeSyncable) { + throw new UnsupportedOperationException("hflush not supported by " + + out); + } else { + flush(); + } + } + } + + /** + * If the inner stream is Syncable, flush the buffer and then + * invoke the inner stream's hsync() operation. + * + * Otherwise: throw an exception, unless the stream was constructed with + * {@link #downgradeSyncable} set to true, in which case the stream + * is just flushed. + * @throws IOException IO Problem + * @throws UnsupportedOperationException if the inner class is not syncable + */ + @Override + public void hsync() throws IOException { + if (out instanceof Syncable) { + flush(); + ((Syncable) out).hsync(); + } else { + if (!downgradeSyncable) { + throw new UnsupportedOperationException("hsync not supported by " + + out); + } else { + flush(); + } + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/DurationStatisticSummary.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/DurationStatisticSummary.java new file mode 100644 index 0000000000000..e1335d77d792a --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/DurationStatisticSummary.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.statistics; + +import javax.annotation.Nullable; +import java.io.Serializable; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_FAILURES; +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_MAX; +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_MEAN; +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_MIN; + +/** + * Summary of duration tracking statistics + * as extracted from an IOStatistics instance. + *
    + * This is for reporting and testing. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public final class DurationStatisticSummary implements Serializable { + + private static final long serialVersionUID = 6776381340896518486L; + + /** Statistic key. */ + private final String key; + + /** Are these success or failure statistics. */ + private final boolean success; + + /** Count of operation invocations. */ + private final long count; + + /** Max duration; -1 if unknown. */ + private final long max; + + /** Min duration; -1 if unknown. */ + private final long min; + + /** Mean duration -may be null. */ + private final MeanStatistic mean; + + /** + * Constructor. + * @param key Statistic key. + * @param success Are these success or failure statistics. + * @param count Count of operation invocations. + * @param max Max duration; -1 if unknown. + * @param min Min duration; -1 if unknown. + * @param mean Mean duration -may be null. (will be cloned) + */ + public DurationStatisticSummary(final String key, + final boolean success, + final long count, + final long max, + final long min, + @Nullable final MeanStatistic mean) { + this.key = key; + this.success = success; + this.count = count; + this.max = max; + this.min = min; + this.mean = mean == null ? null : mean.clone(); + } + + public String getKey() { + return key; + } + + public boolean isSuccess() { + return success; + } + + public long getCount() { + return count; + } + + public long getMax() { + return max; + } + + public long getMin() { + return min; + } + + public MeanStatistic getMean() { + return mean; + } + + @Override + public String toString() { + return "DurationStatisticSummary{" + + "key='" + key + '\'' + + ", success=" + success + + ", counter=" + count + + ", max=" + max + + ", mean=" + mean + + '}'; + } + + /** + * Fetch the duration timing summary of success or failure operations + * from an IO Statistics source. + * If the duration key is unknown, the summary will be incomplete. + * @param source source of data + * @param key duration statistic key + * @param success fetch success statistics, or if false, failure stats. + * @return a summary of the statistics. + */ + public static DurationStatisticSummary fetchDurationSummary( + IOStatistics source, + String key, + boolean success) { + String fullkey = success ? key : key + SUFFIX_FAILURES; + return new DurationStatisticSummary(key, success, + source.counters().getOrDefault(fullkey, 0L), + source.maximums().getOrDefault(fullkey + SUFFIX_MAX, -1L), + source.minimums().getOrDefault(fullkey + SUFFIX_MIN, -1L), + source.meanStatistics() + .get(fullkey + SUFFIX_MEAN)); + } + + /** + * Fetch the duration timing summary from an IOStatistics source. + * If the duration key is unknown, the summary will be incomplete. + * @param source source of data + * @param key duration statistic key + * @return a summary of the statistics. 
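A sketch of how the summary type above is intended to be consumed, typically in tests: fetch the success summary for one duration key and assert on it. The key name "op_rename" and the helper class are illustrative only, not part of the patch.

    import org.apache.hadoop.fs.statistics.DurationStatisticSummary;
    import org.apache.hadoop.fs.statistics.IOStatistics;

    public final class DurationSummaryCheck {
      private DurationSummaryCheck() {
      }

      /** Fail if the tracked operation never completed successfully. */
      public static void verifyAtLeastOneSuccess(IOStatistics stats, String key) {
        DurationStatisticSummary summary =
            DurationStatisticSummary.fetchSuccessSummary(stats, key);
        if (summary.getCount() < 1) {
          throw new AssertionError("No successful invocations of " + key
              + ": " + summary);
        }
        // max and min are -1 when the .max/.min statistics were never recorded
        System.out.println(key + " max duration = " + summary.getMax());
      }
    }

    // e.g. DurationSummaryCheck.verifyAtLeastOneSuccess(stats, "op_rename");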
+ */ + public static DurationStatisticSummary fetchSuccessSummary( + IOStatistics source, + String key) { + return fetchDurationSummary(source, key, true); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/DurationTracker.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/DurationTracker.java new file mode 100644 index 0000000000000..5a15c7ad66c4f --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/DurationTracker.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import java.time.Duration; + +/** + * Interface to be implemented by objects which can track duration. + * It extends AutoCloseable to fit into a try-with-resources statement, + * but then strips out the {@code throws Exception} aspect of the signature + * so it doesn't force code to add extra handling for any failures. + * + * If a duration is declared as "failed()" then the failure counters + * will be updated. + */ +public interface DurationTracker extends AutoCloseable { + + /** + * The operation failed. Failure statistics will be updated. + */ + void failed(); + + /** + * Finish tracking: update the statistics with the timings. + */ + void close(); + + /** + * Get the duration of an operation as a java Duration + * instance. If the duration tracker hasn't completed, + * or its duration tracking doesn't actually measure duration, + * returns Duration.ZERO. + * @return a duration, value of ZERO until close(). + */ + default Duration asDuration() { + return Duration.ZERO; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/DurationTrackerFactory.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/DurationTrackerFactory.java new file mode 100644 index 0000000000000..b1d87c9100f95 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/DurationTrackerFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +/** + * Interface for a source of duration tracking. + * + * This is intended for uses where it can be passed into classes + * which update operation durations, without tying those + * classes to internal implementation details. + */ +public interface DurationTrackerFactory { + + /** + * Initiate a duration tracking operation by creating/returning + * an object whose {@code close()} call will + * update the statistics. + * + * The statistics counter with the key name will be incremented + * by the given count. + * + * The expected use is within a try-with-resources clause. + * @param key statistic key prefix + * @param count #of times to increment the matching counter in this + * operation. + * @return an object to close after an operation completes. + */ + DurationTracker trackDuration(String key, long count); + + /** + * Initiate a duration tracking operation by creating/returning + * an object whose {@code close()} call will + * update the statistics. + * The expected use is within a try-with-resources clause. + * @param key statistic key + * @return an object to close after an operation completes. + */ + default DurationTracker trackDuration(String key) { + return trackDuration(key, 1); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatistics.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatistics.java new file mode 100644 index 0000000000000..75d9965128101 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatistics.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import java.util.Map; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * IO Statistics. + *
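The calling pattern the two interfaces above are designed for is a try-with-resources block: close() records the elapsed time, and failed() is called first when the operation throws so that the failure counters are updated instead. A sketch; the key name is an illustrative assumption.

    import org.apache.hadoop.fs.statistics.DurationTracker;
    import org.apache.hadoop.fs.statistics.DurationTrackerFactory;

    public final class TrackedOperation {
      private TrackedOperation() {
      }

      /** Time one operation; any factory, including a stub, may be passed in. */
      public static void runTimed(DurationTrackerFactory factory,
          Runnable operation) {
        try (DurationTracker tracker = factory.trackDuration("op_copy")) {
          try {
            operation.run();
          } catch (RuntimeException e) {
            // flag the failure before close() updates the statistics
            tracker.failed();
            throw e;
          }
        }
      }
    }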
+ * These are low-cost per-instance statistics provided by any Hadoop
+ * I/O class instance.
+ *
    + * Consult the filesystem specification document for the requirements + * of an implementation of this interface. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public interface IOStatistics { + + /** + * Map of counters. + * @return the current map of counters. + */ + Map counters(); + + /** + * Map of gauges. + * @return the current map of gauges. + */ + Map gauges(); + + /** + * Map of minimums. + * @return the current map of minimums. + */ + Map minimums(); + + /** + * Map of maximums. + * @return the current map of maximums. + */ + Map maximums(); + + /** + * Map of meanStatistics. + * @return the current map of MeanStatistic statistics. + */ + Map meanStatistics(); + + /** + * Value when a minimum value has never been set. + */ + long MIN_UNSET_VALUE = -1; + + /** + * Value when a max value has never been set. + */ + long MAX_UNSET_VALUE = -1; +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsAggregator.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsAggregator.java new file mode 100644 index 0000000000000..1c5451c6f0e83 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsAggregator.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import javax.annotation.Nullable; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Interface exported by classes which support + * aggregation of {@link IOStatistics}. + * Implementations MAY aggregate all statistics + * exported by the IOStatistics reference passed in to + * {@link #aggregate(IOStatistics)}, or they + * may selectively aggregate specific values/classes + * of statistics. + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public interface IOStatisticsAggregator { + + /** + * Aggregate the supplied statistics into the current + * set. + * + * @param statistics statistics; may be null + * @return true if the statistics reference was not null and + * so aggregated. 
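Consuming the interface above is a matter of walking its maps, which are keyed by statistic name: Long values for counters, gauges, minimums and maximums, MeanStatistic values for the means. A sketch; the helper class is illustrative.

    import java.util.Map;

    import org.apache.hadoop.fs.statistics.IOStatistics;
    import org.apache.hadoop.fs.statistics.MeanStatistic;

    public final class StatisticsPrinter {
      private StatisticsPrinter() {
      }

      public static void print(IOStatistics stats) {
        for (Map.Entry<String, Long> e : stats.counters().entrySet()) {
          System.out.println("counter " + e.getKey() + " = " + e.getValue());
        }
        for (Map.Entry<String, MeanStatistic> e
            : stats.meanStatistics().entrySet()) {
          System.out.println("mean " + e.getKey() + " = " + e.getValue().mean());
        }
      }
    }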
+ */ + boolean aggregate(@Nullable IOStatistics statistics); +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsLogging.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsLogging.java new file mode 100644 index 0000000000000..c7230e25c3434 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsLogging.java @@ -0,0 +1,301 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import javax.annotation.Nullable; +import java.util.Map; +import java.util.TreeMap; +import java.util.function.Predicate; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding; + +import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics; + +/** + * Utility operations convert IO Statistics sources/instances + * to strings, especially for robustly logging. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public final class IOStatisticsLogging { + + private static final Logger LOG = + LoggerFactory.getLogger(IOStatisticsLogging.class); + + private IOStatisticsLogging() { + } + + /** + * Extract the statistics from a source object -or "" + * if it is not an instance of {@link IOStatistics}, + * {@link IOStatisticsSource} or the retrieved + * statistics are null. + *
    + * Exceptions are caught and downgraded to debug logging. + * @param source source of statistics. + * @return a string for logging. + */ + public static String ioStatisticsSourceToString(@Nullable Object source) { + try { + return ioStatisticsToString(retrieveIOStatistics(source)); + } catch (RuntimeException e) { + LOG.debug("Ignoring", e); + return ""; + } + } + + /** + * Convert IOStatistics to a string form. + * @param statistics A statistics instance. + * @return string value or the empty string if null + */ + public static String ioStatisticsToString( + @Nullable final IOStatistics statistics) { + if (statistics != null) { + StringBuilder sb = new StringBuilder(); + mapToString(sb, "counters", statistics.counters(), " "); + mapToString(sb, "gauges", statistics.gauges(), " "); + mapToString(sb, "minimums", statistics.minimums(), " "); + mapToString(sb, "maximums", statistics.maximums(), " "); + mapToString(sb, "means", statistics.meanStatistics(), " "); + + return sb.toString(); + } else { + return ""; + } + } + + /** + * Convert IOStatistics to a string form, with all the metrics sorted + * and empty value stripped. + * This is more expensive than the simple conversion, so should only + * be used for logging/output where it's known/highly likely that the + * caller wants to see the values. Not for debug logging. + * @param statistics A statistics instance. + * @return string value or the empty string if null + */ + public static String ioStatisticsToPrettyString( + @Nullable final IOStatistics statistics) { + if (statistics != null) { + StringBuilder sb = new StringBuilder(); + mapToSortedString(sb, "counters", statistics.counters(), + p -> p == 0); + mapToSortedString(sb, "\ngauges", statistics.gauges(), + p -> p == 0); + mapToSortedString(sb, "\nminimums", statistics.minimums(), + p -> p < 0); + mapToSortedString(sb, "\nmaximums", statistics.maximums(), + p -> p < 0); + mapToSortedString(sb, "\nmeans", statistics.meanStatistics(), + MeanStatistic::isEmpty); + + return sb.toString(); + } else { + return ""; + } + } + + /** + * Given a map, add its entryset to the string. + * The entries are only sorted if the source entryset + * iterator is sorted, such as from a TreeMap. + * @param sb string buffer to append to + * @param type type (for output) + * @param map map to evaluate + * @param separator separator + * @param type of values of the map + */ + private static void mapToString(StringBuilder sb, + final String type, + final Map map, + final String separator) { + int count = 0; + sb.append(type); + sb.append("=("); + for (Map.Entry entry : map.entrySet()) { + if (count > 0) { + sb.append(separator); + } + count++; + sb.append(IOStatisticsBinding.entryToString( + entry.getKey(), entry.getValue())); + } + sb.append(");\n"); + } + + /** + * Given a map, produce a string with all the values, sorted. + * Needs to create a treemap and insert all the entries. + * @param sb string buffer to append to + * @param type type (for output) + * @param map map to evaluate + * @param type of values of the map + */ + private static void mapToSortedString(StringBuilder sb, + final String type, + final Map map, + final Predicate isEmpty) { + mapToString(sb, type, sortedMap(map, isEmpty), "\n"); + } + + /** + * Create a sorted (tree) map from an unsorted map. + * This incurs the cost of creating a map and that + * of inserting every object into the tree. + * @param source source map + * @param value type + * @return a treemap with all the entries. 
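These two converters are the ones most callers will reach for: the plain form is cheap enough for debug logs, while the pretty form sorts the keys and strips empty values for end-of-job reports. A sketch with an SLF4J logger; the class and method names are illustrative.

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    import org.apache.hadoop.fs.statistics.IOStatistics;

    import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
    import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToString;

    public final class StatsReport {
      private static final Logger LOG = LoggerFactory.getLogger(StatsReport.class);

      private StatsReport() {
      }

      public static void report(IOStatistics stats, boolean verbose) {
        if (verbose) {
          // sorted and stripped: intended for final reports, not hot paths
          LOG.info("Statistics:\n{}", ioStatisticsToPrettyString(stats));
        } else {
          LOG.debug("Statistics: {}", ioStatisticsToString(stats));
        }
      }
    }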
+ */
+ private static <E> Map<String, E> sortedMap(
+ final Map<String, E> source,
+ final Predicate<E> isEmpty) {
+ Map<String, E> tm = new TreeMap<>();
+ for (Map.Entry<String, E> entry : source.entrySet()) {
+ if (!isEmpty.test(entry.getValue())) {
+ tm.put(entry.getKey(), entry.getValue());
+ }
+ }
+ return tm;
+ }
+
+ /**
+ * On demand stringifier of an IOStatisticsSource instance.
+ *
+ * Whenever this object's toString() method is called, it evaluates the
+ * statistics.
+ *
+ * This is designed to be affordable to use in log statements.
+ * @param source source of statistics -may be null.
+ * @return an object whose toString() operation returns the current values.
+ */
+ public static Object demandStringifyIOStatisticsSource(
+ @Nullable IOStatisticsSource source) {
+ return new SourceToString(source);
+ }
+
+ /**
+ * On demand stringifier of an IOStatistics instance.
+ *
+ * Whenever this object's toString() method is called, it evaluates the
+ * statistics.
+ *
    + * This is for use in log statements where for the cost of creation + * of this entry is low; it is affordable to use in log statements. + * @param statistics statistics to stringify -may be null. + * @return an object whose toString() operation returns the current values. + */ + public static Object demandStringifyIOStatistics( + @Nullable IOStatistics statistics) { + return new StatisticsToString(statistics); + } + + /** + * Extract any statistics from the source and log at debug, if + * the log is set to log at debug. + * No-op if logging is not at debug or the source is null/of + * the wrong type/doesn't provide statistics. + * @param log log to log to + * @param message message for log -this must contain "{}" for the + * statistics report to actually get logged. + * @param source source object + */ + public static void logIOStatisticsAtDebug( + Logger log, + String message, + Object source) { + if (log.isDebugEnabled()) { + // robust extract and convert to string + String stats = ioStatisticsSourceToString(source); + if (!stats.isEmpty()) { + log.debug(message, stats); + } + } + } + + /** + * Extract any statistics from the source and log to + * this class's log at debug, if + * the log is set to log at debug. + * No-op if logging is not at debug or the source is null/of + * the wrong type/doesn't provide statistics. + * @param message message for log -this must contain "{}" for the + * statistics report to actually get logged. + * @param source source object + */ + public static void logIOStatisticsAtDebug( + String message, + Object source) { + logIOStatisticsAtDebug(LOG, message, source); + } + + /** + * On demand stringifier. + *
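Because evaluation is deferred to toString(), the demand stringifier above can sit directly in a parameterized log call without paying the formatting cost when the level is disabled; logIOStatisticsAtDebug bundles the level check and the extraction. A sketch; class and method names are illustrative.

    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    import org.apache.hadoop.fs.statistics.IOStatisticsSource;

    import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.demandStringifyIOStatisticsSource;
    import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.logIOStatisticsAtDebug;

    public final class LazyStatsLogging {
      private static final Logger LOG =
          LoggerFactory.getLogger(LazyStatsLogging.class);

      private LazyStatsLogging() {
      }

      public static void logStreamStats(IOStatisticsSource stream) {
        // the source is only stringified if debug logging is enabled
        LOG.debug("stream statistics {}",
            demandStringifyIOStatisticsSource(stream));
        // equivalent, letting the helper do the level check and extraction
        logIOStatisticsAtDebug(LOG, "stream statistics {}", stream);
      }
    }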
    + * Whenever this object's toString() method is called, it + * retrieves the latest statistics instance and re-evaluates it. + */ + private static final class SourceToString { + + private final IOStatisticsSource source; + + private SourceToString(@Nullable IOStatisticsSource source) { + this.source = source; + } + + @Override + public String toString() { + return source != null + ? ioStatisticsSourceToString(source) + : IOStatisticsBinding.NULL_SOURCE; + } + } + + /** + * Stringifier of statistics: low cost to instantiate and every + * toString/logging will re-evaluate the statistics. + */ + private static final class StatisticsToString { + + private final IOStatistics statistics; + + /** + * Constructor. + * @param statistics statistics + */ + private StatisticsToString(@Nullable IOStatistics statistics) { + this.statistics = statistics; + } + + /** + * Evaluate and stringify the statistics. + * @return a string value. + */ + @Override + public String toString() { + return statistics != null + ? ioStatisticsToString(statistics) + : IOStatisticsBinding.NULL_SOURCE; + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsSnapshot.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsSnapshot.java new file mode 100644 index 0000000000000..5b8b2e284cc11 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsSnapshot.java @@ -0,0 +1,285 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.concurrent.ConcurrentHashMap; +import java.util.stream.Collectors; + +import com.fasterxml.jackson.annotation.JsonProperty; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding; +import org.apache.hadoop.util.JsonSerialization; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkNotNull; +import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToString; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.aggregateMaps; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.snapshotMap; + +/** + * Snapshot of statistics from a different source. + *
+ * It is serializable so that frameworks which can use java serialization
+ * to propagate data (Spark, Flink...) can send the statistics
+ * back. For this reason, TreeMaps are explicitly used as field types,
+ * even though IDEs can recommend use of Map instead.
+ * For security reasons, untrusted java object streams should never be
+ * deserialized. If for some reason this is required, use
+ * {@link #requiredSerializationClasses()} to get the list of classes
+ * used when deserializing instances of this object.
+ *
+ * It is annotated for correct serializations with jackson2.
+ *
    + */ +@SuppressWarnings("CollectionDeclaredAsConcreteClass") +@InterfaceAudience.Public +@InterfaceStability.Evolving +public final class IOStatisticsSnapshot + implements IOStatistics, Serializable, IOStatisticsAggregator { + + private static final long serialVersionUID = -1762522703841538084L; + + /** + * List of chasses needed to deserialize. + */ + private static final Class[] DESERIALIZATION_CLASSES = { + IOStatisticsSnapshot.class, + TreeMap.class, + Long.class, + MeanStatistic.class, + }; + + /** + * Counters. + */ + @JsonProperty + private transient Map counters; + + /** + * Gauges. + */ + @JsonProperty + private transient Map gauges; + + /** + * Minimum values. + */ + @JsonProperty + private transient Map minimums; + + /** + * Maximum values. + */ + @JsonProperty + private transient Map maximums; + + /** + * mean statistics. The JSON key is all lower case.. + */ + @JsonProperty("meanstatistics") + private transient Map meanStatistics; + + /** + * Construct. + */ + public IOStatisticsSnapshot() { + createMaps(); + } + + /** + * Construct, taking a snapshot of the source statistics data + * if the source is non-null. + * If the source is null, the empty maps are created + * @param source statistics source. Nullable. + */ + public IOStatisticsSnapshot(IOStatistics source) { + if (source != null) { + snapshot(source); + } else { + createMaps(); + } + } + + /** + * Create the maps. + */ + private synchronized void createMaps() { + counters = new ConcurrentHashMap<>(); + gauges = new ConcurrentHashMap<>(); + minimums = new ConcurrentHashMap<>(); + maximums = new ConcurrentHashMap<>(); + meanStatistics = new ConcurrentHashMap<>(); + } + + /** + * Clear all the maps. + */ + public synchronized void clear() { + counters.clear(); + gauges.clear(); + minimums.clear(); + maximums.clear(); + meanStatistics.clear(); + } + + /** + * Take a snapshot. + * + * This completely overwrites the map data with the statistics + * from the source. + * @param source statistics source. + */ + public synchronized void snapshot(IOStatistics source) { + checkNotNull(source); + counters = snapshotMap(source.counters()); + gauges = snapshotMap(source.gauges()); + minimums = snapshotMap(source.minimums()); + maximums = snapshotMap(source.maximums()); + meanStatistics = snapshotMap(source.meanStatistics(), + MeanStatistic::copy); + } + + /** + * Aggregate the current statistics with the + * source reference passed in. + * + * The operation is synchronized. + * @param source source; may be null + * @return true if a merge took place. 
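A sketch of the aggregation workflow this class enables, using the aggregate() member whose implementation follows just below: start from an empty snapshot, fold in whatever statistics each stream offers, and hand back one serializable value. Names are illustrative, not part of the patch.

    import java.util.List;

    import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;
    import org.apache.hadoop.fs.statistics.IOStatisticsSource;

    import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics;

    public final class StatsAggregation {
      private StatsAggregation() {
      }

      /** Merge the statistics of many streams into one serializable snapshot. */
      public static IOStatisticsSnapshot aggregateAll(
          List<? extends IOStatisticsSource> streams) {
        IOStatisticsSnapshot snapshot = new IOStatisticsSnapshot();
        for (IOStatisticsSource s : streams) {
          // aggregate() returns false and does nothing when the source is null
          snapshot.aggregate(retrieveIOStatistics(s));
        }
        // the result can be java-serialized, or marshalled via serializer()
        return snapshot;
      }
    }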
+ */ + @Override + public synchronized boolean aggregate( + @Nullable IOStatistics source) { + if (source == null) { + return false; + } + aggregateMaps(counters, source.counters(), + IOStatisticsBinding::aggregateCounters, + IOStatisticsBinding::passthroughFn); + aggregateMaps(gauges, source.gauges(), + IOStatisticsBinding::aggregateGauges, + IOStatisticsBinding::passthroughFn); + aggregateMaps(minimums, source.minimums(), + IOStatisticsBinding::aggregateMinimums, + IOStatisticsBinding::passthroughFn); + aggregateMaps(maximums, source.maximums(), + IOStatisticsBinding::aggregateMaximums, + IOStatisticsBinding::passthroughFn); + aggregateMaps(meanStatistics, source.meanStatistics(), + IOStatisticsBinding::aggregateMeanStatistics, MeanStatistic::copy); + return true; + } + + @Override + public synchronized Map counters() { + return counters; + } + + @Override + public synchronized Map gauges() { + return gauges; + } + + @Override + public synchronized Map minimums() { + return minimums; + } + + @Override + public synchronized Map maximums() { + return maximums; + } + + @Override + public synchronized Map meanStatistics() { + return meanStatistics; + } + + @Override + public String toString() { + return ioStatisticsToString(this); + } + + /** + * Get a JSON serializer for this class. + * @return a serializer. + */ + public static JsonSerialization serializer() { + return new JsonSerialization<>(IOStatisticsSnapshot.class, false, true); + } + + /** + * Serialize by converting each map to a TreeMap, and saving that + * to the stream. + */ + private synchronized void writeObject(ObjectOutputStream s) + throws IOException { + // Write out the core + s.defaultWriteObject(); + s.writeObject(new TreeMap(counters)); + s.writeObject(new TreeMap(gauges)); + s.writeObject(new TreeMap(minimums)); + s.writeObject(new TreeMap(maximums)); + s.writeObject(new TreeMap(meanStatistics)); + } + + /** + * Deserialize by loading each TreeMap, and building concurrent + * hash maps from them. + */ + private void readObject(final ObjectInputStream s) + throws IOException, ClassNotFoundException { + // read in core + s.defaultReadObject(); + // and rebuild a concurrent hashmap from every serialized tree map + // read back from the stream. + counters = new ConcurrentHashMap<>( + (TreeMap) s.readObject()); + gauges = new ConcurrentHashMap<>( + (TreeMap) s.readObject()); + minimums = new ConcurrentHashMap<>( + (TreeMap) s.readObject()); + maximums = new ConcurrentHashMap<>( + (TreeMap) s.readObject()); + meanStatistics = new ConcurrentHashMap<>( + (TreeMap) s.readObject()); + } + + /** + * What classes are needed to deserialize this class? + * Needed to securely unmarshall this from untrusted sources. + * @return a list of required classes to deserialize the data. + */ + public static List requiredSerializationClasses() { + return Arrays.stream(DESERIALIZATION_CLASSES) + .collect(Collectors.toList()); + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsSource.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsSource.java new file mode 100644 index 0000000000000..67bf51fc0c3ae --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsSource.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import org.apache.hadoop.classification.InterfaceStability; + +/** + * A source of IO statistics. + *
+ * These statistics MUST be instance specific, not thread local.
+ *
+ */
+
+@InterfaceStability.Unstable
+public interface IOStatisticsSource {
+
+ /**
+ * Return a statistics instance.
+ *
+ * It is not a requirement that the same instance is returned every time.
+ * {@link IOStatisticsSource}.
+ *
    + * If the object implementing this is Closeable, this method + * may return null if invoked on a closed object, even if + * it returns a valid instance when called earlier. + * @return an IOStatistics instance or null + */ + default IOStatistics getIOStatistics() { + return null; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsSupport.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsSupport.java new file mode 100644 index 0000000000000..75977047c0f2a --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/IOStatisticsSupport.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.statistics.impl.StubDurationTracker; +import org.apache.hadoop.fs.statistics.impl.StubDurationTrackerFactory; + +/** + * Support for working with IOStatistics. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public final class IOStatisticsSupport { + + private IOStatisticsSupport() { + } + + /** + * Take a snapshot of the current statistics state. + *
+ * This is not an atomic operation.
+ *
    + * The instance can be serialized, and its + * {@code toString()} method lists all the values. + * @param statistics statistics + * @return a snapshot of the current values. + */ + public static IOStatisticsSnapshot + snapshotIOStatistics(IOStatistics statistics) { + + return new IOStatisticsSnapshot(statistics); + } + + /** + * Create a snapshot statistics instance ready to aggregate data. + * + * The instance can be serialized, and its + * {@code toString()} method lists all the values. + * @return an empty snapshot + */ + public static IOStatisticsSnapshot + snapshotIOStatistics() { + + return new IOStatisticsSnapshot(); + } + + /** + * Get the IOStatistics of the source, casting it + * if it is of the relevant type, otherwise, + * if it implements {@link IOStatisticsSource} + * extracting the value. + * + * Returns null if the source isn't of the write type + * or the return value of + * {@link IOStatisticsSource#getIOStatistics()} was null. + * @return an IOStatistics instance or null + */ + + public static IOStatistics retrieveIOStatistics( + final Object source) { + if (source instanceof IOStatistics) { + return (IOStatistics) source; + } else if (source instanceof IOStatisticsSource) { + return ((IOStatisticsSource) source).getIOStatistics(); + } else { + // null source or interface not implemented + return null; + } + } + + /** + * Return a stub duration tracker factory whose returned trackers + * are always no-ops. + * + * As singletons are returned, this is very low-cost to use. + * @return a duration tracker factory. + */ + public static DurationTrackerFactory stubDurationTrackerFactory() { + return StubDurationTrackerFactory.STUB_DURATION_TRACKER_FACTORY; + } + + /** + * Get a stub duration tracker. + * @return a stub tracker. + */ + public static DurationTracker stubDurationTracker() { + return StubDurationTracker.STUB_DURATION_TRACKER; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/MeanStatistic.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/MeanStatistic.java new file mode 100644 index 0000000000000..d9ff0c25c6a21 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/MeanStatistic.java @@ -0,0 +1,290 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import java.io.Serializable; +import java.util.Objects; + +import com.fasterxml.jackson.annotation.JsonIgnore; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * A mean statistic represented as the sum and the sample count; + * the mean is calculated on demand. + *
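Pulling the support methods above together: extract statistics defensively from an arbitrary object, snapshot them when present, and fall back to the stub duration tracker where no real factory is wired up. Sketch only; the class and method names are illustrative.

    import org.apache.hadoop.fs.statistics.DurationTracker;
    import org.apache.hadoop.fs.statistics.IOStatistics;
    import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot;

    import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics;
    import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.snapshotIOStatistics;
    import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.stubDurationTracker;

    public final class SupportDemo {
      private SupportDemo() {
      }

      /** Snapshot whatever statistics the object offers, or null if none. */
      public static IOStatisticsSnapshot snapshotOf(Object maybeSource) {
        IOStatistics stats = retrieveIOStatistics(maybeSource);
        return stats == null ? null : snapshotIOStatistics(stats);
      }

      /** A tracker that is always safe to use when statistics are disabled. */
      public static DurationTracker noopTracker() {
        return stubDurationTracker();
      }
    }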
+ * It can be used to accrue values so as to dynamically update
+ * the mean. If so, know that there is no synchronization
+ * on the methods.
+ *
+ * If a statistic has 0 samples then it is considered to be empty.
+ *
+ * All 'empty' statistics are equivalent, independent of the sum value.
+ *
+ * For non-empty statistics, sum and sample values must match
+ * for equality.
+ *
+ * It is serializable and annotated for correct serializations with jackson2.
+ *
+ * Thread safety. The operations to add/copy sample data are thread safe:
+ * <ol>
+ *   <li>{@link #add(MeanStatistic)}</li>
+ *   <li>{@link #addSample(long)}</li>
+ *   <li>{@link #clear()}</li>
+ *   <li>{@link #setSamplesAndSum(long, long)}</li>
+ *   <li>{@link #set(MeanStatistic)}</li>
+ *   <li>{@link #setSamples(long)} and {@link #setSum(long)}</li>
+ * </ol>
+ *
+ * So is the {@link #mean()} method. This ensures that when
+ * used to aggregate statistics, the aggregate value and sample
+ * count are set and evaluated consistently.
+ *
+ * Other methods are marked as synchronized because Findbugs overreacts
+ * to the idea that some operations to update sum and sample count
+ * are synchronized, but that things like equals are not.
+ *
    + */ +@InterfaceAudience.Public +@InterfaceStability.Evolving +public final class MeanStatistic implements Serializable, Cloneable { + + private static final long serialVersionUID = 567888327998615425L; + + /** + * Number of samples used to calculate + * the mean. + */ + private long samples; + + /** + * sum of the values. + */ + private long sum; + + /** + * Constructor, with some resilience against invalid sample counts. + * If the sample count is 0 or less, the sum is set to 0 and + * the sample count to 0. + * @param samples sample count. + * @param sum sum value + */ + public MeanStatistic(final long samples, final long sum) { + if (samples > 0) { + this.sum = sum; + this.samples = samples; + } + } + + /** + * Create from another statistic. + * @param that source + */ + public MeanStatistic(MeanStatistic that) { + synchronized (that) { + set(that); + } + } + + /** + * Create an empty statistic. + */ + public MeanStatistic() { + } + + /** + * Get the sum of samples. + * @return the sum + */ + public synchronized long getSum() { + return sum; + } + + /** + * Get the sample count. + * @return the sample count; 0 means empty + */ + public synchronized long getSamples() { + return samples; + } + + /** + * Is a statistic empty? + * @return true if the sample count is 0 + */ + @JsonIgnore + public synchronized boolean isEmpty() { + return samples == 0; + } + + /** + * Set the values to 0. + */ + public void clear() { + setSamplesAndSum(0, 0); + } + + /** + * Set the sum and samples. + * Synchronized. + * @param sampleCount new sample count. + * @param newSum new sum + */ + public synchronized void setSamplesAndSum(long sampleCount, + long newSum) { + setSamples(sampleCount); + setSum(newSum); + } + + /** + * Set the statistic to the values of another. + * Synchronized. + * @param other the source. + */ + public void set(final MeanStatistic other) { + setSamplesAndSum(other.getSamples(), other.getSum()); + } + + /** + * Set the sum. + * @param sum new sum + */ + public synchronized void setSum(final long sum) { + this.sum = sum; + } + + /** + * Set the sample count. + * + * If this is less than zero, it is set to zero. + * This stops an ill-formed JSON entry from + * breaking deserialization, or get an invalid sample count + * into an entry. + * @param samples sample count. + */ + public synchronized void setSamples(final long samples) { + if (samples < 0) { + this.samples = 0; + } else { + this.samples = samples; + } + } + + /** + * Get the arithmetic mean value. + * @return the mean + */ + public synchronized double mean() { + return samples > 0 + ? ((double) sum) / samples + : 0.0d; + } + + /** + * Add another MeanStatistic. + * @param other other value + */ + public synchronized MeanStatistic add(final MeanStatistic other) { + if (other.isEmpty()) { + return this; + } + long otherSamples; + long otherSum; + synchronized (other) { + otherSamples = other.samples; + otherSum = other.sum; + } + if (isEmpty()) { + samples = otherSamples; + sum = otherSum; + return this; + } + samples += otherSamples; + sum += otherSum; + return this; + } + + /** + * Add a sample. + * Thread safe. + * @param value value to add to the sum + */ + public synchronized void addSample(long value) { + samples++; + sum += value; + } + + /** + * The hash code is derived from the mean + * and sample count: if either is changed + * the statistic cannot be used as a key + * for hash tables/maps. 
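A worked example of the accrual behaviour described above: samples are added one at a time, the mean is computed on demand, and adding an empty statistic changes nothing. The values are illustrative only.

    import org.apache.hadoop.fs.statistics.MeanStatistic;

    public final class MeanStatisticDemo {
      private MeanStatisticDemo() {
      }

      public static void main(String[] args) {
        MeanStatistic latency = new MeanStatistic();
        latency.addSample(10);
        latency.addSample(30);                 // sum=40, samples=2
        System.out.println(latency.mean());    // 20.0

        MeanStatistic other = new MeanStatistic(2, 60);   // samples=2, sum=60
        latency.add(other);                    // sum=100, samples=4
        System.out.println(latency.mean());    // 25.0

        // empty statistics are ignored by add() and all compare equal
        latency.add(new MeanStatistic());
        System.out.println(latency.getSamples());   // still 4
      }
    }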
+ * @return a hash value + */ + @Override + public synchronized int hashCode() { + return Objects.hash(sum, samples); + } + + @Override + public synchronized boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + MeanStatistic that = (MeanStatistic) o; + if (isEmpty()) { + // if we are empty, then so must the other. + return that.isEmpty(); + } + return getSum() == that.getSum() && + getSamples() == that.getSamples(); + } + + @Override + public MeanStatistic clone() { + return copy(); + } + + /** + * Create a copy of this instance. + * @return copy. + * + */ + public MeanStatistic copy() { + return new MeanStatistic(this); + } + + @Override + public String toString() { + return String.format("(samples=%d, sum=%d, mean=%.4f)", + samples, sum, mean()); + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/AbstractIOStatisticsImpl.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/AbstractIOStatisticsImpl.java new file mode 100644 index 0000000000000..c701a509d8951 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/AbstractIOStatisticsImpl.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import org.apache.hadoop.fs.statistics.IOStatistics; + +/** + * Base implementation in case common methods/fields need to be added + * in future. + */ +public abstract class AbstractIOStatisticsImpl implements IOStatistics { + + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/DynamicIOStatistics.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/DynamicIOStatistics.java new file mode 100644 index 0000000000000..50c2625c3513d --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/DynamicIOStatistics.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import java.util.Collections; +import java.util.Map; +import java.util.function.Function; + +import org.apache.hadoop.fs.statistics.MeanStatistic; + +/** + * These statistics are dynamically evaluated by the supplied + * String -> type functions. + * + * This allows statistic sources to supply a list of callbacks used to + * generate the statistics on demand; similar to some of the Coda Hale metrics. + * + * The evaluation actually takes place during the iteration's {@code next()} + * call. + */ +final class DynamicIOStatistics + extends AbstractIOStatisticsImpl { + + /** + * Counter evaluators. + */ + private final EvaluatingStatisticsMap counters + = new EvaluatingStatisticsMap<>(); + + private final EvaluatingStatisticsMap gauges + = new EvaluatingStatisticsMap<>(); + + private final EvaluatingStatisticsMap minimums + = new EvaluatingStatisticsMap<>(); + + private final EvaluatingStatisticsMap maximums + = new EvaluatingStatisticsMap<>(); + + private final EvaluatingStatisticsMap meanStatistics + = new EvaluatingStatisticsMap<>(MeanStatistic::copy); + + DynamicIOStatistics() { + } + + @Override + public Map counters() { + return Collections.unmodifiableMap(counters); + } + + @Override + public Map gauges() { + return Collections.unmodifiableMap(gauges); + } + + @Override + public Map minimums() { + return Collections.unmodifiableMap(minimums); + } + + @Override + public Map maximums() { + return Collections.unmodifiableMap(maximums); + } + + @Override + public Map meanStatistics() { + return Collections.unmodifiableMap(meanStatistics); + } + + /** + * add a mapping of a key to a counter function. + * @param key the key + * @param eval the evaluator + */ + void addCounterFunction(String key, Function eval) { + counters.addFunction(key, eval); + } + + /** + * add a mapping of a key to a gauge function. + * @param key the key + * @param eval the evaluator + */ + void addGaugeFunction(String key, Function eval) { + gauges.addFunction(key, eval); + } + + /** + * add a mapping of a key to a minimum function. + * @param key the key + * @param eval the evaluator + */ + void addMinimumFunction(String key, Function eval) { + minimums.addFunction(key, eval); + } + + /** + * add a mapping of a key to a maximum function. + * @param key the key + * @param eval the evaluator + */ + void addMaximumFunction(String key, Function eval) { + maximums.addFunction(key, eval); + } + + /** + * add a mapping of a key to a meanStatistic function. 
+ * @param key the key + * @param eval the evaluator + */ + void addMeanStatisticFunction(String key, + Function eval) { + meanStatistics.addFunction(key, eval); + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/DynamicIOStatisticsBuilder.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/DynamicIOStatisticsBuilder.java new file mode 100644 index 0000000000000..47a317076dcf2 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/DynamicIOStatisticsBuilder.java @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Function; +import java.util.function.ToLongFunction; + +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.MeanStatistic; +import org.apache.hadoop.metrics2.lib.MutableCounterLong; + +import static org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkState; + +/** + * Builder of {@link DynamicIOStatistics}. + * + * Instantiate through + * {@link IOStatisticsBinding#dynamicIOStatistics()}. + */ +public class DynamicIOStatisticsBuilder { + + /** + * the instance being built up. Will be null after the (single) + * call to {@link #build()}. + */ + private DynamicIOStatistics instance = new DynamicIOStatistics(); + + /** + * Build the IOStatistics instance. + * @return an instance. + * @throws IllegalStateException if the builder has already been built. + */ + public IOStatistics build() { + final DynamicIOStatistics stats = activeInstance(); + // stop the builder from working any more. + instance = null; + return stats; + } + + /** + * Get the statistics instance. + * @return the instance to build/return + * @throws IllegalStateException if the builder has already been built. + */ + private DynamicIOStatistics activeInstance() { + checkState(instance != null, "Already built"); + return instance; + } + + /** + * Add a new evaluator to the counter statistics. + * @param key key of this statistic + * @param eval evaluator for the statistic + * @return the builder. + */ + public DynamicIOStatisticsBuilder withLongFunctionCounter(String key, + ToLongFunction eval) { + activeInstance().addCounterFunction(key, eval::applyAsLong); + return this; + } + + /** + * Add a counter statistic to dynamically return the + * latest value of the source. + * @param key key of this statistic + * @param source atomic long counter + * @return the builder. 
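A minimal usage sketch of this builder (illustrative, not part of the patch; the key name "bytes_read" is invented, and only methods declared here plus IOStatisticsBinding#dynamicIOStatistics() are used):

    import java.util.concurrent.atomic.AtomicLong;
    import org.apache.hadoop.fs.statistics.IOStatistics;
    import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.dynamicIOStatistics;

    AtomicLong bytesRead = new AtomicLong();
    IOStatistics stats = dynamicIOStatistics()
        .withAtomicLongCounter("bytes_read", bytesRead)   // invented key name
        .build();
    bytesRead.addAndGet(4096);
    // the counter is evaluated on demand, so the latest value is always visible
    long current = stats.counters().get("bytes_read");    // 4096

Because evaluation happens at lookup time, no further wiring is needed to keep the reported statistic current.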
+ */ + public DynamicIOStatisticsBuilder withAtomicLongCounter(String key, + AtomicLong source) { + withLongFunctionCounter(key, s -> source.get()); + return this; + } + + /** + * Add a counter statistic to dynamically return the + * latest value of the source. + * @param key key of this statistic + * @param source atomic int counter + * @return the builder. + */ + public DynamicIOStatisticsBuilder withAtomicIntegerCounter(String key, + AtomicInteger source) { + withLongFunctionCounter(key, s -> source.get()); + return this; + } + + /** + * Build a dynamic counter statistic from a + * {@link MutableCounterLong}. + * @param key key of this statistic + * @param source mutable long counter + * @return the builder. + */ + public DynamicIOStatisticsBuilder withMutableCounter(String key, + MutableCounterLong source) { + withLongFunctionCounter(key, s -> source.value()); + return this; + } + + /** + * Add a new evaluator to the gauge statistics. + * @param key key of this statistic + * @param eval evaluator for the statistic + * @return the builder. + */ + public DynamicIOStatisticsBuilder withLongFunctionGauge(String key, + ToLongFunction eval) { + activeInstance().addGaugeFunction(key, eval::applyAsLong); + return this; + } + + /** + * Add a gauge statistic to dynamically return the + * latest value of the source. + * @param key key of this statistic + * @param source atomic long gauge + * @return the builder. + */ + public DynamicIOStatisticsBuilder withAtomicLongGauge(String key, + AtomicLong source) { + withLongFunctionGauge(key, s -> source.get()); + return this; + } + + /** + * Add a gauge statistic to dynamically return the + * latest value of the source. + * @param key key of this statistic + * @param source atomic int gauge + * @return the builder. + */ + public DynamicIOStatisticsBuilder withAtomicIntegerGauge(String key, + AtomicInteger source) { + withLongFunctionGauge(key, s -> source.get()); + return this; + } + + /** + * Add a new evaluator to the minimum statistics. + * @param key key of this statistic + * @param eval evaluator for the statistic + * @return the builder. + */ + public DynamicIOStatisticsBuilder withLongFunctionMinimum(String key, + ToLongFunction eval) { + activeInstance().addMinimumFunction(key, eval::applyAsLong); + return this; + } + + /** + * Add a minimum statistic to dynamically return the + * latest value of the source. + * @param key key of this statistic + * @param source atomic long minimum + * @return the builder. + */ + public DynamicIOStatisticsBuilder withAtomicLongMinimum(String key, + AtomicLong source) { + withLongFunctionMinimum(key, s -> source.get()); + return this; + } + + /** + * Add a minimum statistic to dynamically return the + * latest value of the source. + * @param key key of this statistic + * @param source atomic int minimum + * @return the builder. + */ + public DynamicIOStatisticsBuilder withAtomicIntegerMinimum(String key, + AtomicInteger source) { + withLongFunctionMinimum(key, s -> source.get()); + return this; + } + + + /** + * Add a new evaluator to the maximum statistics. + * @param key key of this statistic + * @param eval evaluator for the statistic + * @return the builder. + */ + public DynamicIOStatisticsBuilder withLongFunctionMaximum(String key, + ToLongFunction eval) { + activeInstance().addMaximumFunction(key, eval::applyAsLong); + return this; + } + + /** + * Add a maximum statistic to dynamically return the + * latest value of the source. 
+ * @param key key of this statistic + * @param source atomic long maximum + * @return the builder. + */ + public DynamicIOStatisticsBuilder withAtomicLongMaximum(String key, + AtomicLong source) { + withLongFunctionMaximum(key, s -> source.get()); + return this; + } + + /** + * Add a maximum statistic to dynamically return the + * latest value of the source. + * @param key key of this statistic + * @param source atomic int maximum + * @return the builder. + */ + public DynamicIOStatisticsBuilder withAtomicIntegerMaximum(String key, + AtomicInteger source) { + withLongFunctionMaximum(key, s -> source.get()); + return this; + } + + /** + * Add a new evaluator to the mean statistics. + * + * This is a function which must return the mean and the sample count. + * @param key key of this statistic + * @param eval evaluator for the statistic + * @return the builder. + */ + public DynamicIOStatisticsBuilder withMeanStatisticFunction(String key, + Function eval) { + activeInstance().addMeanStatisticFunction(key, eval); + return this; + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/EmptyIOStatistics.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/EmptyIOStatistics.java new file mode 100644 index 0000000000000..f474fc209771c --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/EmptyIOStatistics.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import java.util.Map; + +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.MeanStatistic; + +import static java.util.Collections.emptyMap; + +/** + * An empty IO Statistics implementation for classes which always + * want to return a non-null set of statistics. + */ +final class EmptyIOStatistics extends AbstractIOStatisticsImpl { + + /** + * The sole instance of this class. + */ + private static final EmptyIOStatistics INSTANCE = new EmptyIOStatistics(); + + private EmptyIOStatistics() { + } + + @Override + public Map counters() { + return emptyMap(); + } + + @Override + public Map gauges() { + return emptyMap(); + } + + @Override + public Map minimums() { + return emptyMap(); + } + + @Override + public Map maximums() { + return emptyMap(); + } + + @Override + public Map meanStatistics() { + return emptyMap(); + } + + /** + * Get the single instance of this class. + * @return a shared, empty instance. 
+ */ + public static IOStatistics getInstance() { + return INSTANCE; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/EvaluatingStatisticsMap.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/EvaluatingStatisticsMap.java new file mode 100644 index 0000000000000..e4680f2d81fa0 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/EvaluatingStatisticsMap.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import java.io.Serializable; +import java.util.Collection; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.Function; +import java.util.stream.Collectors; + +/** + * A map of functions which can be invoked to dynamically + * create the value of an entry. + * @param type of entry value. + */ +final class EvaluatingStatisticsMap implements + Map { + + /** + * Functions to invoke when evaluating keys. + */ + private final Map> evaluators + = new ConcurrentHashMap<>(); + + /** + * Function to use when copying map values. + */ + private final Function copyFn; + + /** + * Construct with the copy function being simple passthrough. + */ + EvaluatingStatisticsMap() { + this(IOStatisticsBinding::passthroughFn); + } + + /** + * Construct with the copy function being that supplied in. + * @param copyFn copy function. + */ + EvaluatingStatisticsMap(final Function copyFn) { + this.copyFn = copyFn; + } + + /** + * add a mapping of a key to a function. + * @param key the key + * @param eval the evaluator + */ + void addFunction(String key, Function eval) { + evaluators.put(key, eval); + } + + @Override + public int size() { + return evaluators.size(); + } + + @Override + public boolean isEmpty() { + return evaluators.isEmpty(); + } + + @Override + public boolean containsKey(final Object key) { + return evaluators.containsKey(key); + } + + @Override + public boolean containsValue(final Object value) { + throw new UnsupportedOperationException(); + } + + @Override + public E get(final Object key) { + Function fn = evaluators.get(key); + return fn != null + ? 
fn.apply((String) key) + : null; + } + + @Override + public E put(final String key, final E value) { + throw new UnsupportedOperationException(); + } + + @Override + public E remove(final Object key) { + throw new UnsupportedOperationException(); + } + + @Override + public void putAll(final Map m) { + throw new UnsupportedOperationException(); + } + + @Override + public void clear() { + throw new UnsupportedOperationException(); + } + + @Override + public Set keySet() { + return evaluators.keySet(); + } + + /** + * Evaluate all the entries and provide a list of the results. + * + * This is not a snapshot, so if the evaluators actually return + * references to mutable objects (e.g. a MeanStatistic instance) + * then that value may still change. + * @return the current list of evaluated results. + */ + @Override + public Collection values() { + Set>> evalEntries = + evaluators.entrySet(); + return evalEntries.parallelStream().map((e) -> + e.getValue().apply(e.getKey())) + .collect(Collectors.toList()); + } + + /** + * Take a snapshot. + * @return a map snapshot. + */ + public Map snapshot() { + return IOStatisticsBinding.snapshotMap(this, copyFn); + } + + /** + * Creating the entry set forces an evaluation of the functions. + * + * This is not a snapshot, so if the evaluators actually return + * references to mutable objects (e.g. a MeanStatistic instance) + * then that value may still change. + * + * The evaluation may be parallelized. + * @return an evaluated set of values + */ + @Override + public synchronized Set> entrySet() { + Set>> evalEntries = + evaluators.entrySet(); + Set> r = evalEntries.parallelStream().map((e) -> + new EntryImpl<>(e.getKey(), e.getValue().apply(e.getKey()))) + .collect(Collectors.toSet()); + return r; + } + + /** + * Simple entry. + * @param entry type + */ + private static final class EntryImpl implements Entry { + + private String key; + + private E value; + + private EntryImpl(final String key, final E value) { + this.key = key; + this.value = value; + } + + @Override + public String getKey() { + return key; + } + + @Override + public E getValue() { + return value; + } + + @Override + public E setValue(final E val) { + this.value = val; + return val; + } + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStore.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStore.java new file mode 100644 index 0000000000000..1b4139e463a9e --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStore.java @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.statistics.impl; + +import java.time.Duration; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsAggregator; +import org.apache.hadoop.fs.statistics.DurationTrackerFactory; +import org.apache.hadoop.fs.statistics.MeanStatistic; + +/** + * Interface of an IOStatistics store intended for + * use in classes which track statistics for reporting. + */ +public interface IOStatisticsStore extends IOStatistics, + IOStatisticsAggregator, + DurationTrackerFactory { + + /** + * Increment a counter by one. + * + * No-op if the counter is unknown. + * @param key statistics key + * @return old value or, if the counter is unknown: 0 + */ + default long incrementCounter(String key) { + return incrementCounter(key, 1); + } + + /** + * Increment a counter. + * + * No-op if the counter is unknown. + * If the value is negative, it is ignored. + * @param key statistics key + * @param value value to increment + * @return the updated value or, if the counter is unknown: 0 + */ + long incrementCounter(String key, long value); + + /** + * Set a counter. + * + * No-op if the counter is unknown. + * @param key statistics key + * @param value value to set + */ + void setCounter(String key, long value); + + /** + * Set a gauge. + * + * No-op if the gauge is unknown. + * @param key statistics key + * @param value value to set + */ + void setGauge(String key, long value); + + /** + * Increment a gauge. + *

    + * No-op if the gauge is unknown. + *

    + * @param key statistics key + * @param value value to increment + * @return new value or 0 if the key is unknown + */ + long incrementGauge(String key, long value); + + /** + * Set a maximum. + * No-op if the maximum is unknown. + * @param key statistics key + * @param value value to set + */ + void setMaximum(String key, long value); + + /** + * Increment a maximum. + *

    + * No-op if the maximum is unknown. + *

    + * @param key statistics key + * @param value value to increment + * @return new value or 0 if the key is unknown + */ + long incrementMaximum(String key, long value); + + /** + * Set a minimum. + *

    + * No-op if the minimum is unknown. + *

    + * @param key statistics key + * @param value value to set + */ + void setMinimum(String key, long value); + + /** + * Increment a minimum. + *

    + * No-op if the minimum is unknown. + *

    + * @param key statistics key + * @param value value to increment + * @return new value or 0 if the key is unknown + */ + long incrementMinimum(String key, long value); + + /** + * Add a minimum sample: if less than the current value, + * updates the value. + *

    + * No-op if the minimum is unknown. + *

    + * @param key statistics key + * @param value sample value + */ + void addMinimumSample(String key, long value); + + /** + * Add a maximum sample: if greater than the current value, + * updates the value. + *

    + * No-op if the key is unknown. + *

    + * @param key statistics key + * @param value sample value + */ + void addMaximumSample(String key, long value); + + /** + * Set a mean statistic to a given value. + *

    + * No-op if the key is unknown. + *

    + * @param key statistic key + * @param value new value. + */ + void setMeanStatistic(String key, MeanStatistic value); + + /** + * Add a sample to the mean statistics. + *

    + * No-op if the key is unknown. + *

    + * @param key key + * @param value sample value. + */ + void addMeanStatisticSample(String key, long value); + + /** + * Reset all statistics. + * Unsynchronized. + */ + void reset(); + + /** + * Get a reference to the atomic instance providing the + * value for a specific counter. This is useful if + * the value is passed around. + * @param key statistic name + * @return the reference + * @throws NullPointerException if there is no entry of that name + */ + AtomicLong getCounterReference(String key); + + /** + * Get a reference to the atomic instance providing the + * value for a specific maximum. This is useful if + * the value is passed around. + * @param key statistic name + * @return the reference + * @throws NullPointerException if there is no entry of that name + */ + AtomicLong getMaximumReference(String key); + + /** + * Get a reference to the atomic instance providing the + * value for a specific minimum. This is useful if + * the value is passed around. + * @param key statistic name + * @return the reference + * @throws NullPointerException if there is no entry of that name + */ + AtomicLong getMinimumReference(String key); + + /** + * Get a reference to the atomic instance providing the + * value for a specific gauge. This is useful if + * the value is passed around. + * @param key statistic name + * @return the reference + * @throws NullPointerException if there is no entry of that name + */ + AtomicLong getGaugeReference(String key); + + /** + * Get a reference to the atomic instance providing the + * value for a specific meanStatistic. This is useful if + * the value is passed around. + * @param key statistic name + * @return the reference + * @throws NullPointerException if there is no entry of that name + */ + MeanStatistic getMeanStatistic(String key); + + /** + * Add a duration to the min/mean/max statistics, using the + * given prefix and adding a suffix for each specific value. + * + * The update is not-atomic, even though each individual statistic + * is updated thread-safely. If two threads update the values + * simultaneously, at the end of each operation the state will + * be correct. It is only during the sequence that the statistics + * may be observably inconsistent. + * @param prefix statistic prefix + * @param durationMillis duration in milliseconds. + */ + void addTimedOperation(String prefix, long durationMillis); + + /** + * Add a duration to the min/mean/max statistics, using the + * given prefix and adding a suffix for each specific value.; + * increment tha counter whose name == prefix. + * + * If any of the statistics are not registered, that part of + * the sequence will be omitted -the rest will proceed. + * + * The update is not-atomic, even though each individual statistic + * is updated thread-safely. If two threads update the values + * simultaneously, at the end of each operation the state will + * be correct. It is only during the sequence that the statistics + * may be observably inconsistent. 
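To make the prefix/suffix naming concrete, a short sketch (store is an IOStatisticsStore assumed to have been built with withDurationTracking("op_rename"); the suffix constants are assumed to resolve to .min, .max, .mean and .failures):

    import java.time.Duration;

    // withDurationTracking("op_rename") registers the counters op_rename and
    // op_rename.failures, plus op_rename{.min,.max,.mean} and
    // op_rename.failures{.min,.max,.mean}
    store.addTimedOperation("op_rename", Duration.ofMillis(120));
    // records a 120 ms sample in op_rename.min, op_rename.mean and op_rename.max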
+ * @param prefix statistic prefix + * @param duration duration + */ + void addTimedOperation(String prefix, Duration duration); + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStoreBuilder.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStoreBuilder.java new file mode 100644 index 0000000000000..d94a8389b7ff8 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStoreBuilder.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +/** + * Builder of the {@link IOStatisticsStore} implementation. + */ +public interface IOStatisticsStoreBuilder { + + /** + * Declare a varargs list of counters to add. + * @param keys names of statistics. + * @return this builder. + */ + IOStatisticsStoreBuilder withCounters(String... keys); + + /** + * Declare a varargs list of gauges to add. + * @param keys names of statistics. + * @return this builder. + */ + IOStatisticsStoreBuilder withGauges(String... keys); + + /** + * Declare a varargs list of maximums to add. + * @param keys names of statistics. + * @return this builder. + */ + IOStatisticsStoreBuilder withMaximums(String... keys); + + /** + * Declare a varargs list of minimums to add. + * @param keys names of statistics. + * @return this builder. + */ + IOStatisticsStoreBuilder withMinimums(String... keys); + + /** + * Declare a varargs list of means to add. + * @param keys names of statistics. + * @return this builder. + */ + IOStatisticsStoreBuilder withMeanStatistics(String... keys); + + /** + * Add a statistic in the counter, min, max and mean maps for each + * declared statistic prefix. + * @param prefixes prefixes for the stats. + * @return this + */ + IOStatisticsStoreBuilder withDurationTracking( + String... prefixes); + + /** + * Build the collector. + * @return a new collector. + */ + IOStatisticsStore build(); +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStoreBuilderImpl.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStoreBuilderImpl.java new file mode 100644 index 0000000000000..0562271db3ef8 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStoreBuilderImpl.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_FAILURES; +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_MAX; +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_MEAN; +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_MIN; + +/** + * Builder for an IOStatistics store.. + */ +final class IOStatisticsStoreBuilderImpl implements + IOStatisticsStoreBuilder { + + private final List counters = new ArrayList<>(); + + private final List gauges = new ArrayList<>(); + + private final List minimums = new ArrayList<>(); + + private final List maximums = new ArrayList<>(); + + private final List meanStatistics = new ArrayList<>(); + + @Override + public IOStatisticsStoreBuilderImpl withCounters(final String... keys) { + counters.addAll(Arrays.asList(keys)); + return this; + } + + @Override + public IOStatisticsStoreBuilderImpl withGauges(final String... keys) { + gauges.addAll(Arrays.asList(keys)); + return this; + } + + @Override + public IOStatisticsStoreBuilderImpl withMaximums(final String... keys) { + maximums.addAll(Arrays.asList(keys)); + return this; + } + + @Override + public IOStatisticsStoreBuilderImpl withMinimums(final String... keys) { + minimums.addAll(Arrays.asList(keys)); + return this; + } + + @Override + public IOStatisticsStoreBuilderImpl withMeanStatistics( + final String... keys) { + meanStatistics.addAll(Arrays.asList(keys)); + return this; + } + + @Override + public IOStatisticsStoreBuilderImpl withDurationTracking( + final String... prefixes) { + for (String p : prefixes) { + withCounters(p, p + SUFFIX_FAILURES); + withMinimums( + p + SUFFIX_MIN, + p + SUFFIX_FAILURES + SUFFIX_MIN); + withMaximums( + p + SUFFIX_MAX, + p + SUFFIX_FAILURES + SUFFIX_MAX); + withMeanStatistics( + p + SUFFIX_MEAN, + p + SUFFIX_FAILURES + SUFFIX_MEAN); + } + return this; + } + + @Override + public IOStatisticsStore build() { + return new IOStatisticsStoreImpl(counters, gauges, minimums, + maximums, meanStatistics); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStoreImpl.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStoreImpl.java new file mode 100644 index 0000000000000..0471703b3b040 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/IOStatisticsStoreImpl.java @@ -0,0 +1,469 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import javax.annotation.Nullable; +import java.time.Duration; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicLong; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.fs.statistics.DurationTracker; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.MeanStatistic; + +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.stubDurationTracker; +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_MAX; +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_MEAN; +import static org.apache.hadoop.fs.statistics.StoreStatisticNames.SUFFIX_MIN; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.aggregateMaximums; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.aggregateMinimums; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.dynamicIOStatistics; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.maybeUpdateMaximum; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.maybeUpdateMinimum; + +/** + * Implementation of {@link IOStatisticsStore}. + *

    + * A ConcurrentHashMap is created for each set of statistics; + * the AtomicLong/MeanStatistic entries are fetched as required. + * When the statistics are updated, the referenced objects + * are updated rather than new values being set in the map. + *
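As an orientation sketch of how such a store is typically built and driven (the iostatisticsStore() factory on IOStatisticsBinding and the key names are assumptions for illustration):

    import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.iostatisticsStore;

    IOStatisticsStore store = iostatisticsStore()
        .withCounters("stream_read_bytes")          // assumed key names
        .withGauges("stream_read_outstanding")
        .build();
    store.incrementCounter("stream_read_bytes", 4096);
    store.incrementGauge("stream_read_outstanding", 1);
    // counters() and gauges() expose the live values of the AtomicLongs in the maps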

    + */ +final class IOStatisticsStoreImpl extends WrappedIOStatistics + implements IOStatisticsStore { + + /** + * Log changes at debug. + * Noisy, but occasionally useful. + */ + private static final Logger LOG = + LoggerFactory.getLogger(IOStatisticsStoreImpl.class); + + /** All the counters are atomic longs. */ + private final Map counterMap = new ConcurrentHashMap<>(); + + /** All the gauges are atomic longs. */ + private final Map gaugeMap = new ConcurrentHashMap<>(); + + /** All the minimum values are atomic longs. */ + private final Map minimumMap = new ConcurrentHashMap<>(); + + /** All the maximum values are atomic longs. */ + private final Map maximumMap = new ConcurrentHashMap<>(); + + /** + * The mean statistics. + * Relies on the MeanStatistic operations being synchronized. + */ + private final Map meanStatisticMap + = new ConcurrentHashMap<>(); + + /** + * Constructor invoked via the builder. + * @param counters keys to use for the counter statistics. + * @param gauges names of gauges + * @param minimums names of minimums + * @param maximums names of maximums + * @param meanStatistics names of mean statistics. + */ + IOStatisticsStoreImpl( + final List counters, + final List gauges, + final List minimums, + final List maximums, + final List meanStatistics) { + // initially create the superclass with no wrapped mapping; + super(null); + + // now construct a dynamic statistics source mapping to + // the various counters, gauges etc dynamically created + // into maps + DynamicIOStatisticsBuilder builder = dynamicIOStatistics(); + if (counters != null) { + for (String key : counters) { + AtomicLong counter = new AtomicLong(); + counterMap.put(key, counter); + builder.withAtomicLongCounter(key, counter); + } + } + if (gauges != null) { + for (String key : gauges) { + AtomicLong gauge = new AtomicLong(); + gaugeMap.put(key, gauge); + builder.withAtomicLongGauge(key, gauge); + } + } + if (maximums != null) { + for (String key : maximums) { + AtomicLong maximum = new AtomicLong(MAX_UNSET_VALUE); + maximumMap.put(key, maximum); + builder.withAtomicLongMaximum(key, maximum); + } + } + if (minimums != null) { + for (String key : minimums) { + AtomicLong minimum = new AtomicLong(MIN_UNSET_VALUE); + minimumMap.put(key, minimum); + builder.withAtomicLongMinimum(key, minimum); + } + } + if (meanStatistics != null) { + for (String key : meanStatistics) { + meanStatisticMap.put(key, new MeanStatistic()); + builder.withMeanStatisticFunction(key, k -> meanStatisticMap.get(k)); + } + } + setWrapped(builder.build()); + } + + /** + * Set an atomic long to a value. + * @param aLong atomic long; may be null + * @param value value to set to + */ + private void setAtomicLong(final AtomicLong aLong, final long value) { + if (aLong != null) { + aLong.set(value); + } + } + + /** + * increment an atomic long and return its value; + * null long is no-op returning 0. + * @param aLong atomic long; may be null + * param increment amount to increment; negative for a decrement + * @return final value or 0 if the long is null + */ + private long incAtomicLong(final AtomicLong aLong, + final long increment) { + if (aLong != null) { + // optimization: zero is a get rather than addAndGet() + return increment != 0 + ? 
aLong.addAndGet(increment) + : aLong.get(); + } else { + return 0; + } + } + + @Override + public void setCounter(final String key, final long value) { + setAtomicLong(counterMap.get(key), value); + LOG.debug("Setting counter {} to {}", key, value); + } + + @Override + public long incrementCounter(final String key, final long value) { + AtomicLong counter = counterMap.get(key); + if (counter == null) { + LOG.debug("Ignoring counter increment for unknown counter {}", + key); + return 0; + } + if (value < 0) { + LOG.debug("Ignoring negative increment value {} for counter {}", + value, key); + // returns old value + return counter.get(); + } else { + long l = incAtomicLong(counter, value); + LOG.debug("Incrementing counter {} by {} with final value {}", + key, value, l); + return l; + } + } + + @Override + public void setMaximum(final String key, final long value) { + setAtomicLong(maximumMap.get(key), value); + } + + @Override + public long incrementMaximum(final String key, final long value) { + return incAtomicLong(maximumMap.get(key), value); + } + + @Override + public void setMinimum(final String key, final long value) { + setAtomicLong(minimumMap.get(key), value); + } + + @Override + public long incrementMinimum(final String key, final long value) { + return incAtomicLong(minimumMap.get(key), value); + } + + @Override + public void addMinimumSample(final String key, final long value) { + AtomicLong min = minimumMap.get(key); + if (min != null) { + maybeUpdateMinimum(min, value); + } + } + + @Override + public void addMaximumSample(final String key, final long value) { + AtomicLong max = maximumMap.get(key); + if (max != null) { + maybeUpdateMaximum(max, value); + } + } + + @Override + public void setGauge(final String key, final long value) { + setAtomicLong(gaugeMap.get(key), value); + } + + @Override + public long incrementGauge(final String key, final long value) { + return incAtomicLong(gaugeMap.get(key), value); + } + + @Override + public void setMeanStatistic(final String key, final MeanStatistic value) { + final MeanStatistic ref = meanStatisticMap.get(key); + if (ref != null) { + ref.set(value); + } + } + + @Override + public void addMeanStatisticSample(final String key, final long value) { + final MeanStatistic ref = meanStatisticMap.get(key); + if (ref != null) { + ref.addSample(value); + } + } + + /** + * Reset all statistics. + */ + @Override + public synchronized void reset() { + counterMap.values().forEach(a -> a.set(0)); + gaugeMap.values().forEach(a -> a.set(0)); + minimumMap.values().forEach(a -> a.set(0)); + maximumMap.values().forEach(a -> a.set(0)); + meanStatisticMap.values().forEach(a -> a.clear()); + } + + /** + * Aggregate those statistics which the store is tracking; + * ignore the rest. + * + * @param source statistics; may be null + * @return true if a statistics reference was supplied/aggregated. + */ + @Override + public synchronized boolean aggregate( + @Nullable final IOStatistics source) { + + if (source == null) { + return false; + } + // counters: addition + Map sourceCounters = source.counters(); + counterMap.entrySet(). 
+ forEach(e -> { + Long sourceValue = lookupQuietly(sourceCounters, e.getKey()); + if (sourceValue != null) { + e.getValue().addAndGet(sourceValue); + } + }); + // gauge: add positive values only + Map sourceGauges = source.gauges(); + gaugeMap.entrySet().forEach(e -> { + Long sourceGauge = lookupQuietly(sourceGauges, e.getKey()); + if (sourceGauge != null && sourceGauge > 0) { + e.getValue().addAndGet(sourceGauge); + } + }); + // min: min of current and source + Map sourceMinimums = source.minimums(); + minimumMap.entrySet().forEach(e -> { + Long sourceValue = lookupQuietly(sourceMinimums, e.getKey()); + if (sourceValue != null) { + AtomicLong dest = e.getValue(); + dest.set(aggregateMaximums(dest.get(), sourceValue)); + dest.set(aggregateMinimums(dest.get(), sourceValue)); + } + }); + // max: max of current and source + Map sourceMaximums = source.maximums(); + maximumMap.entrySet().forEach(e -> { + Long sourceValue = lookupQuietly(sourceMaximums, e.getKey()); + if (sourceValue != null) { + AtomicLong dest = e.getValue(); + dest.set(aggregateMaximums(dest.get(), sourceValue)); + } + }); + // the most complex + Map sourceMeans = source.meanStatistics(); + meanStatisticMap.entrySet().forEach(e -> { + MeanStatistic current = e.getValue(); + MeanStatistic sourceValue = lookupQuietly( + sourceMeans, e.getKey()); + if (sourceValue != null) { + current.add(sourceValue); + } + }); + return true; + } + + /** + * Get a reference to the map type providing the + * value for a specific key, raising an exception if + * there is no entry for that key. + * @param type of map/return type. + * @param map map to look up + * @param key statistic name + * @return the value + * @throws NullPointerException if there is no entry of that name + */ + private static T lookup(final Map map, String key) { + T val = map.get(key); + requireNonNull(val, () -> ("unknown statistic " + key)); + return val; + } + + /** + * Get a reference to the map type providing the + * value for a specific key, returning null if it not found. + * @param type of map/return type. + * @param map map to look up + * @param key statistic name + * @return the value + */ + private static T lookupQuietly(final Map map, String key) { + return map.get(key); + } + + /** + * Get a reference to the atomic instance providing the + * value for a specific counter. This is useful if + * the value is passed around. + * @param key statistic name + * @return the reference + * @throws NullPointerException if there is no entry of that name + */ + @Override + public AtomicLong getCounterReference(String key) { + return lookup(counterMap, key); + } + + /** + * Get a reference to the atomic instance providing the + * value for a specific maximum. This is useful if + * the value is passed around. + * @param key statistic name + * @return the reference + * @throws NullPointerException if there is no entry of that name + */ + @Override + public AtomicLong getMaximumReference(String key) { + return lookup(maximumMap, key); + } + + /** + * Get a reference to the atomic instance providing the + * value for a specific minimum. This is useful if + * the value is passed around. + * @param key statistic name + * @return the reference + * @throws NullPointerException if there is no entry of that name + */ + @Override + public AtomicLong getMinimumReference(String key) { + return lookup(minimumMap, key); + } + + /** + * Get a reference to the atomic instance providing the + * value for a specific gauge. This is useful if + * the value is passed around. 
+ * @param key statistic name + * @return the reference + * @throws NullPointerException if there is no entry of that name + */ + @Override + public AtomicLong getGaugeReference(String key) { + return lookup(gaugeMap, key); + } + + /** + * Get a mean statistic. + * @param key statistic name + * @return the reference + * @throws NullPointerException if there is no entry of that name + */ + @Override + public MeanStatistic getMeanStatistic(String key) { + return lookup(meanStatisticMap, key); + } + + /** + * Add a duration to the min/mean/max statistics, using the + * given prefix and adding a suffix for each specific value. + *

    + * The update is non-atomic, even though each individual statistic + * is updated thread-safely. If two threads update the values + * simultaneously, at the end of each operation the state will + * be correct. It is only during the sequence that the statistics + * may be observably inconsistent. + *

    + * @param prefix statistic prefix + * @param durationMillis duration in milliseconds. + */ + @Override + public void addTimedOperation(String prefix, long durationMillis) { + addMeanStatisticSample(prefix + SUFFIX_MEAN, durationMillis); + addMinimumSample(prefix + SUFFIX_MIN, durationMillis); + addMaximumSample(prefix + SUFFIX_MAX, durationMillis); + } + + @Override + public void addTimedOperation(String prefix, Duration duration) { + addTimedOperation(prefix, duration.toMillis()); + } + + /** + * If the store is tracking the given key, return the + * duration tracker for it. If not tracked, return the + * stub tracker. + * @param key statistic key prefix + * @param count #of times to increment the matching counter in this + * operation. + * @return a tracker. + */ + @Override + public DurationTracker trackDuration(final String key, final long count) { + if (counterMap.containsKey(key)) { + return new StatisticDurationTracker(this, key, count); + } else { + return stubDurationTracker(); + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/PairedDurationTrackerFactory.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/PairedDurationTrackerFactory.java new file mode 100644 index 0000000000000..33b13f78418a9 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/PairedDurationTrackerFactory.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import java.time.Duration; + +import org.apache.hadoop.fs.statistics.DurationTracker; +import org.apache.hadoop.fs.statistics.DurationTrackerFactory; + +/** + * A duration tracker factory which aggregates two other trackers + * to have the same lifecycle. + * + * This is to ease having instance-level tracking alongside global + * values, such as an input stream and a filesystem. + * + * It's got some inefficiencies -assuming system time is used for + * the tracking, System.currentTimeMillis will be invoked twice + * at each point of the process -and the results may actually be different. + * However, it enables multiple duration tracker factories to be given the + * opportunity to collect the statistics. 
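A sketch of the intended pairing, usable from inside this package since the class is package-private (streamStatistics and fsStatistics stand in for any two DurationTrackerFactory instances):

    DurationTrackerFactory paired =
        new PairedDurationTrackerFactory(streamStatistics, fsStatistics);
    DurationTracker tracker = paired.trackDuration("op_open", 1);
    // ... perform the operation ...
    tracker.close();   // both the stream-level and filesystem-level trackers
                       // record the same duration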
+ */ +final class PairedDurationTrackerFactory implements DurationTrackerFactory { + + private final DurationTrackerFactory local; + private final DurationTrackerFactory global; + + PairedDurationTrackerFactory(final DurationTrackerFactory local, + final DurationTrackerFactory global) { + this.local = local; + this.global = global; + } + + @Override + public DurationTracker trackDuration(final String key, final long count) { + return new PairedDurationTracker( + global.trackDuration(key, count), + local.trackDuration(key, count)); + } + + /** + * Tracker which wraps the two duration trackers created for the operation. + */ + private static final class PairedDurationTracker + implements DurationTracker { + private final DurationTracker firstDuration; + private final DurationTracker secondDuration; + + private PairedDurationTracker( + final DurationTracker firstDuration, + final DurationTracker secondDuration) { + this.firstDuration = firstDuration; + this.secondDuration = secondDuration; + } + + @Override + public void failed() { + firstDuration.failed(); + secondDuration.failed(); + } + + @Override + public void close() { + firstDuration.close(); + secondDuration.close(); + } + + /** + * @return the global duration + */ + @Override + public Duration asDuration() { + return firstDuration.asDuration(); + } + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/SourceWrappedStatistics.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/SourceWrappedStatistics.java new file mode 100644 index 0000000000000..5aced7c5cddbf --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/SourceWrappedStatistics.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; + +/** + * Wrap a statistics instance with an {@link IOStatisticsSource} + * instance which will then serve up the statistics when asked. + */ +public class SourceWrappedStatistics implements IOStatisticsSource { + + private final IOStatistics source; + + /** + * Constructor. + * @param source source of statistics. 
+ */ + public SourceWrappedStatistics(final IOStatistics source) { + this.source = source; + } + + @Override + public IOStatistics getIOStatistics() { + return source; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StatisticDurationTracker.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StatisticDurationTracker.java new file mode 100644 index 0000000000000..ef9e7cb107a0d --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StatisticDurationTracker.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import org.apache.hadoop.fs.statistics.DurationTracker; +import org.apache.hadoop.fs.statistics.StoreStatisticNames; +import org.apache.hadoop.util.OperationDuration; + +/** + * Track the duration of an object. + * + * When closed the + * min/max/mean statistics are updated. + * + * In the constructor, the counter with name of 'key' is + * incremented -default is by 1, but can be set to other + * values, including 0. + */ +public class StatisticDurationTracker extends OperationDuration + implements DurationTracker { + + /** + * Statistics to update. + */ + private final IOStatisticsStore iostats; + + /** + * Key to use as prefix of values. + */ + private final String key; + + /** + * Flag to indicate the operation failed. + */ + private boolean failed; + + /** + * Constructor -increments the counter by 1. + * @param iostats statistics to update + * @param key prefix of values. + */ + public StatisticDurationTracker( + final IOStatisticsStore iostats, + final String key) { + this(iostats, key, 1); + } + + /** + * Constructor. + * If the supplied count is greater than zero, the counter + * of the key name is updated. + * @param iostats statistics to update + * @param key Key to use as prefix of values. + * @param count #of times to increment the matching counter. + */ + public StatisticDurationTracker( + final IOStatisticsStore iostats, + final String key, + final long count) { + this.iostats = iostats; + this.key = key; + if (count > 0) { + iostats.incrementCounter(key, count); + } + } + + @Override + public void failed() { + failed = true; + } + + /** + * Set the finished time and then update the statistics. + * If the operation failed then the key + .failures counter will be + * incremented by one. + * The operation min/mean/max values will be updated with the duration; + * on a failure these will all be the .failures metrics. 
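A sketch of that failure path (store and the key "op_delete" are placeholders):

    StatisticDurationTracker t =
        new StatisticDurationTracker(store, "op_delete"); // increments op_delete
    try {
      // ... the tracked operation fails ...
      throw new java.io.IOException("simulated failure");
    } catch (java.io.IOException e) {
      t.failed();
    } finally {
      t.close();
    }
    // close() increments the op_delete.failures counter and records the duration
    // under op_delete.failures.min/.mean/.max rather than op_delete.min/.mean/.max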
+ */ + @Override + public void close() { + finished(); + String name = key; + if (failed) { + // failure: + name = key + StoreStatisticNames.SUFFIX_FAILURES; + iostats.incrementCounter(name); + } + iostats.addTimedOperation(name, asDuration()); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StorageStatisticsFromIOStatistics.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StorageStatisticsFromIOStatistics.java new file mode 100644 index 0000000000000..a55f04cae8c4c --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StorageStatisticsFromIOStatistics.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.hadoop.fs.StorageStatistics; +import org.apache.hadoop.fs.statistics.IOStatistics; + +/** + * Returns all the counters of an IOStatistics instance as StorageStatistics. + * This is dynamic. + * The {@link #reset()} is downgraded to a no-op. + */ +public class StorageStatisticsFromIOStatistics + extends StorageStatistics + implements Iterable { + + private final IOStatistics ioStatistics; + private final String scheme; + + /** + * Instantiate. + * @param name storage statistics name. + * @param scheme FS scheme; may be null. + * @param ioStatistics IOStatistics source. + */ + public StorageStatisticsFromIOStatistics( + final String name, + final String scheme, + final IOStatistics ioStatistics) { + super(name); + this.scheme = scheme; + this.ioStatistics = ioStatistics; + } + + @Override + public Iterator iterator() { + return getLongStatistics(); + } + + /** + * Take a snapshot of the current counter values + * and return an iterator over them. + * @return all the counter statistics. 
+ */ + @Override + public Iterator getLongStatistics() { + final Set> counters = counters() + .entrySet(); + return counters.stream().map(e -> + new StorageStatistics.LongStatistic(e.getKey(), e.getValue())) + .collect(Collectors.toSet()).iterator(); + } + + private Map counters() { + return ioStatistics.counters(); + } + + @Override + public Long getLong(final String key) { + return counters().get(key); + } + + @Override + public boolean isTracked(final String key) { + return counters().containsKey(key); + } + + @Override + public void reset() { + /* no-op */ + } + + @Override + public String getScheme() { + return scheme; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StubDurationTracker.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StubDurationTracker.java new file mode 100644 index 0000000000000..638a9da9c7b51 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StubDurationTracker.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import java.time.Duration; + +import org.apache.hadoop.fs.statistics.DurationTracker; + +/** + * A simple stub duration tracker which can be issued in interfaces + * and other places where full duration tracking is not implemented. + */ +public final class StubDurationTracker implements DurationTracker { + + public static final DurationTracker STUB_DURATION_TRACKER = + new StubDurationTracker(); + + private StubDurationTracker() { + } + + @Override + public void failed() { + + } + + @Override + public void close() { + + } + + @Override + public Duration asDuration() { + return Duration.ZERO; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StubDurationTrackerFactory.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StubDurationTrackerFactory.java new file mode 100644 index 0000000000000..8856b6330cee6 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/StubDurationTrackerFactory.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import org.apache.hadoop.fs.statistics.DurationTracker; +import org.apache.hadoop.fs.statistics.DurationTrackerFactory; + +/** + * This is a stub factory which always returns no-op duration + * trackers. Allows for code to always be handed a factory. + */ +public final class StubDurationTrackerFactory + implements DurationTrackerFactory { + + /** + * Single instance. + */ + public static final StubDurationTrackerFactory STUB_DURATION_TRACKER_FACTORY + = new StubDurationTrackerFactory(); + + private StubDurationTrackerFactory() { + } + + @Override + public DurationTracker trackDuration(final String key, final long count) { + return StubDurationTracker.STUB_DURATION_TRACKER; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/WrappedIOStatistics.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/WrappedIOStatistics.java new file mode 100644 index 0000000000000..4e5fc6a6a1071 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/WrappedIOStatistics.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics.impl; + +import java.util.Map; + +import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions; + +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.MeanStatistic; + +import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToString; + +/** + * Wrap IOStatistics source with another (dynamic) wrapper. + */ +public class WrappedIOStatistics extends AbstractIOStatisticsImpl { + + /** + * The wrapped statistics. + */ + private IOStatistics wrapped; + + /** + * Instantiate. + * @param wrapped nullable wrapped statistics. + */ + public WrappedIOStatistics(final IOStatistics wrapped) { + this.wrapped = wrapped; + } + + /** + * Instantiate without setting the statistics. + * This is for subclasses which build up the map during their own + * construction. + */ + protected WrappedIOStatistics() { + } + + @Override + public Map counters() { + return getWrapped().counters(); + } + + /** + * Get at the wrapped inner statistics. 
+ * @return the wrapped value + */ + protected IOStatistics getWrapped() { + return wrapped; + } + + /** + * Set the wrapped statistics. + * Will fail if the field is already set. + * @param wrapped new value + */ + protected void setWrapped(final IOStatistics wrapped) { + Preconditions.checkState(this.wrapped == null, + "Attempted to overwrite existing wrapped statistics"); + this.wrapped = wrapped; + } + + @Override + public Map gauges() { + return getWrapped().gauges(); + } + + @Override + public Map minimums() { + return getWrapped().minimums(); + } + + @Override + public Map maximums() { + return getWrapped().maximums(); + } + + @Override + public Map meanStatistics() { + return getWrapped().meanStatistics(); + } + + /** + * Return the statistics dump of the wrapped statistics. + * @return the statistics for logging. + */ + @Override + public String toString() { + return ioStatisticsToString(wrapped); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/package-info.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/package-info.java new file mode 100644 index 0000000000000..3ff7dacadce7a --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/impl/package-info.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Implementation support for statistics. + * For use internally; external filesystems MAY use this if the implementors + * accept that it is unstable and that incompatible changes may take + * place over minor point releases. + */ + +@InterfaceAudience.LimitedPrivate("Filesystems") +@InterfaceStability.Unstable +package org.apache.hadoop.fs.statistics.impl; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/package-info.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/package-info.java new file mode 100644 index 0000000000000..bf46b33a516c6 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/package-info.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This package contains support for statistic collection and reporting.
+ * This is the public API; implementation classes are to be kept elsewhere.
+ * <p>
+ * This package defines two interfaces:
+ * <p>
+ * {@link org.apache.hadoop.fs.statistics.IOStatisticsSource}:
+ * a source of statistic data, which can be retrieved
+ * through a call to
+ * {@link org.apache.hadoop.fs.statistics.IOStatisticsSource#getIOStatistics()}.
+ * <p>
+ * {@link org.apache.hadoop.fs.statistics.IOStatistics}: the statistics
+ * retrieved from a statistics source.
+ * <p>
+ * The retrieved statistics may be an immutable snapshot, in which case another
+ * call to
+ * {@link org.apache.hadoop.fs.statistics.IOStatisticsSource#getIOStatistics()}
+ * must be made to get updated statistics. Or they may be dynamic, in which
+ * case every time a specific statistic is retrieved, the latest version is
+ * returned. Callers should assume that if a statistics instance is dynamic,
+ * there is no atomicity when querying multiple statistics. If the statistics
+ * source was a closeable object (e.g. a stream), the statistics MUST remain
+ * valid after the stream is closed.
+ * <p>
+ * Use pattern:
+ * <p>
+ * An application probes an object (filesystem, stream etc.) to see if it
+ * implements {@code IOStatisticsSource}, and, if it does,
+ * calls {@code getIOStatistics()} to get its statistics.
+ * If this is non-null, the client has access to the current
+ * statistics of that instance.
+ * <p>
+ * The expectation is that a statistics source is dynamic: when a value is
+ * looked up the most recent values are returned.
+ * When iterating through the set, the values of the iterator SHOULD
+ * be frozen at the time the iterator was requested.
+ * <p>
+ * These statistics can be used to: log operations, profile applications,
+ * and make assertions about the state of the output.
+ * <p>
+ * The names of statistics are a matter of choice of the specific source.
+ * However, {@link org.apache.hadoop.fs.statistics.StoreStatisticNames}
+ * contains a set of names recommended for object store operations.
+ * {@link org.apache.hadoop.fs.statistics.StreamStatisticNames} declares
+ * recommended names for statistics provided for
+ * input and output streams.
+ * <p>
+ * Utility classes:
+ * <ul>
+ * <li>
+ * {@link org.apache.hadoop.fs.statistics.IOStatisticsSupport}.
+ * General support, including the ability to take a serializable
+ * snapshot of the current state of an IOStatistics instance.
+ * </li>
+ * <li>
+ * {@link org.apache.hadoop.fs.statistics.IOStatisticsLogging}.
+ * Methods for robust/on-demand string conversion, designed
+ * for use in logging statements and {@code toString()} implementations.
+ * </li>
+ * <li>
+ * {@link org.apache.hadoop.fs.statistics.IOStatisticsSnapshot}.
+ * A static snapshot of statistics which can be marshalled via
+ * java serialization or as JSON via jackson. It supports
+ * aggregation, so can be used to generate aggregate statistics.
+ * </li>
+ * </ul>
+ * <p>
+ * Implementors notes:
+ * <ol>
+ * <li>
+ * IOStatistics keys SHOULD be standard names where possible.
+ * </li>
+ * <li>
+ * An IOStatistics instance MUST be unique to that specific instance of
+ * {@link org.apache.hadoop.fs.statistics.IOStatisticsSource}.
+ * (i.e. not shared the way StorageStatistics are)
+ * </li>
+ * <li>
+ * MUST return the same values irrespective of which thread the statistics are
+ * retrieved or its keys evaluated.
+ * </li>
+ * <li>
+ * MUST NOT remove keys once a statistic instance has been created.
+ * </li>
+ * <li>
+ * MUST NOT add keys once a statistic instance has been created.
+ * </li>
+ * <li>
+ * MUST NOT block for long periods of time while blocking operations
+ * (reads, writes) are taking place in the source.
+ * That is: minimal synchronization points (AtomicLongs etc.) may be
+ * used to share values, but retrieval of statistics should
+ * be fast and return values even while slow/blocking remote IO is underway.
+ * </li>
+ * <li>
+ * MUST support value enumeration and retrieval after the source has been
+ * closed.
+ * </li>
+ * <li>
+ * SHOULD NOT have back-references to potentially expensive objects
+ * (filesystem instances etc.)
+ * </li>
+ * <li>
+ * SHOULD provide statistics which can be added to generate aggregate
+ * statistics.
+ * </li>
+ * </ol>
    + */ + +@InterfaceAudience.Public +@InterfaceStability.Unstable +package org.apache.hadoop.fs.statistics; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CompressionInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CompressionInputStream.java index 2dfa30bf76ec4..55bb132e9c87c 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CompressionInputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CompressionInputStream.java @@ -25,6 +25,10 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.PositionedReadable; import org.apache.hadoop.fs.Seekable; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; +import org.apache.hadoop.fs.statistics.IOStatisticsSupport; + /** * A compression input stream. * @@ -34,7 +38,8 @@ */ @InterfaceAudience.Public @InterfaceStability.Evolving -public abstract class CompressionInputStream extends InputStream implements Seekable { +public abstract class CompressionInputStream extends InputStream + implements Seekable, IOStatisticsSource { /** * The input stream to be compressed. */ @@ -68,7 +73,16 @@ public void close() throws IOException { } } } - + + /** + * Return any IOStatistics provided by the underlying stream. + * @return IO stats from the inner stream. + */ + @Override + public IOStatistics getIOStatistics() { + return IOStatisticsSupport.retrieveIOStatistics(in); + } + /** * Read bytes from the stream. * Made abstract to prevent leakage to underlying stream. diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CompressionOutputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CompressionOutputStream.java index 71c7f32e665e5..2a11ace81702c 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CompressionOutputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/CompressionOutputStream.java @@ -23,13 +23,17 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; +import org.apache.hadoop.fs.statistics.IOStatisticsSupport; /** * A compression output stream. */ @InterfaceAudience.Public @InterfaceStability.Evolving -public abstract class CompressionOutputStream extends OutputStream { +public abstract class CompressionOutputStream extends OutputStream + implements IOStatisticsSource { /** * The output stream to be compressed. */ @@ -94,4 +98,12 @@ public void flush() throws IOException { */ public abstract void resetState() throws IOException; + /** + * Return any IOStatistics provided by the underlying stream. + * @return IO stats from the inner stream. 
+ */ + @Override + public IOStatistics getIOStatistics() { + return IOStatisticsSupport.retrieveIOStatistics(out); + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LineReader.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LineReader.java index e2cd3048d5843..520ddf6bdf401 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LineReader.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LineReader.java @@ -25,6 +25,9 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; +import org.apache.hadoop.fs.statistics.IOStatisticsSupport; import org.apache.hadoop.io.Text; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY; @@ -42,7 +45,7 @@ */ @InterfaceAudience.LimitedPrivate({"MapReduce"}) @InterfaceStability.Unstable -public class LineReader implements Closeable { +public class LineReader implements Closeable, IOStatisticsSource { private static final int DEFAULT_BUFFER_SIZE = 64 * 1024; private int bufferSize = DEFAULT_BUFFER_SIZE; private InputStream in; @@ -148,7 +151,16 @@ public LineReader(InputStream in, Configuration conf, public void close() throws IOException { in.close(); } - + + /** + * Return any IOStatistics provided by the source. + * @return IO stats from the input stream. + */ + @Override + public IOStatistics getIOStatistics() { + return IOStatisticsSupport.retrieveIOStatistics(in); + } + /** * Read one line from the InputStream into the given Text. * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/BiFunctionRaisingIOE.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/BiFunctionRaisingIOE.java new file mode 100644 index 0000000000000..ea17c16d01e87 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/BiFunctionRaisingIOE.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.util.functional; + +import java.io.IOException; + +/** + * Function of arity 2 which may raise an IOException. + * @param type of arg1 + * @param type of arg2 + * @param type of return value. + */ +@FunctionalInterface +public interface BiFunctionRaisingIOE { + + /** + * Apply the function. 
+ * @param t argument 1 + * @param u argument 2 + * @return result + * @throws IOException Any IO failure + */ + R apply(T t, U u) throws IOException; +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/CallableRaisingIOE.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/CallableRaisingIOE.java new file mode 100644 index 0000000000000..65b3a63b2b9a0 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/CallableRaisingIOE.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.util.functional; + +import java.io.IOException; + +/** + * This is a callable which only raises an IOException. + * @param return type + */ +@FunctionalInterface +public interface CallableRaisingIOE { + + /** + * Apply the operation. + * @return result + * @throws IOException Any IO failure + */ + R apply() throws IOException; +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/ConsumerRaisingIOE.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/ConsumerRaisingIOE.java new file mode 100644 index 0000000000000..24a3b55c58d4a --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/ConsumerRaisingIOE.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.util.functional; + +import java.io.IOException; + +/** + * Version of java.util.function.Consumer which raises + * exceptions. + * @param type of argument,. + */ +@FunctionalInterface +public interface ConsumerRaisingIOE { + + /** + * Process the argument. + * @param t type + * @throws IOException if needed + */ + void accept(T t) throws IOException; + + /** + * after calling {@link #accept(Object)}, + * invoke the next consumer in the chain. + * @param next next consumer + * @return the chain. 
+ */ + default ConsumerRaisingIOE andThen( + ConsumerRaisingIOE next) { + return (T t) -> { + accept(t); + next.accept(t); + }; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/FunctionRaisingIOE.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/FunctionRaisingIOE.java new file mode 100644 index 0000000000000..83e041e2b3160 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/FunctionRaisingIOE.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.util.functional; + +import java.io.IOException; + +/** + * Function of arity 1 which may raise an IOException. + * @param type of arg1 + * @param type of return value. + */ +@FunctionalInterface +public interface FunctionRaisingIOE { + + /** + * Apply the function. + * @param t argument 1 + * @return result + * @throws IOException Any IO failure + */ + R apply(T t) throws IOException; +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/InvocationRaisingIOE.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/InvocationRaisingIOE.java new file mode 100644 index 0000000000000..b59dabea89ea9 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/InvocationRaisingIOE.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.util.functional; + +import java.io.IOException; + +/** + * This is a lambda-expression which may raises an IOException. + * This is a recurrent design patten in the hadoop codebase, e.g + * {@code LambdaTestUtils.VoidCallable} and + * the S3A {@code Invoker.VoidOperation}}. Hopefully this should + * be the last. 
+ * Note for implementors of methods which take this as an argument: + * don't use method overloading to determine which specific functional + * interface is to be used. + */ +@FunctionalInterface +public interface InvocationRaisingIOE { + + /** + * Apply the operation. + * @throws IOException Any IO failure + */ + void apply() throws IOException; + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/RemoteIterators.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/RemoteIterators.java new file mode 100644 index 0000000000000..3ac0fced1493d --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/RemoteIterators.java @@ -0,0 +1,698 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.util.functional; + +import javax.annotation.Nullable; +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.Objects; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; +import org.apache.hadoop.io.IOUtils; + +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.logIOStatisticsAtDebug; +import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics; + +/** + * A set of remote iterators supporting transformation and filtering, + * with IOStatisticsSource passthrough, and of conversions of + * the iterators to lists/arrays and of performing actions + * on the values. + *

    + * This aims to make it straightforward to use lambda-expressions to + * transform the results of an iterator, without losing the statistics + * in the process, and to chain the operations together. + *

    + * The closeable operation will be passed through RemoteIterators which + * wrap other RemoteIterators. This is to support any iterator which + * can be closed to release held connections, file handles etc. + * Unless client code is written to assume that RemoteIterator instances + * may be closed, this is not likely to be broadly used. It is added + * to make it possible to adopt this feature in a managed way. + *

    + * One notable feature is that the + * {@link #foreach(RemoteIterator, ConsumerRaisingIOE)} method will + * LOG at debug any IOStatistics provided by the iterator, if such + * statistics are provided. There's no attempt at retrieval and logging + * if the LOG is not set to debug, so it is a zero cost feature unless + * the logger {@code org.apache.hadoop.fs.functional.RemoteIterators} + * is at DEBUG. + *

    + * Based on the S3A Listing code, and some some work on moving other code + * to using iterative listings so as to pick up the statistics. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public final class RemoteIterators { + + /** + * Log used for logging any statistics in + * {@link #foreach(RemoteIterator, ConsumerRaisingIOE)} + * at DEBUG. + */ + private static final Logger LOG = LoggerFactory.getLogger( + RemoteIterators.class); + + private RemoteIterators() { + } + + /** + * Create an iterator from a singleton. + * @param singleton instance + * @param type + * @return a remote iterator + */ + public static RemoteIterator remoteIteratorFromSingleton( + @Nullable T singleton) { + return new SingletonIterator<>(singleton); + } + + /** + * Create a remote iterator from a java.util.Iterator. + * @param type + * @return a remote iterator + */ + public static RemoteIterator remoteIteratorFromIterator( + Iterator iterator) { + return new WrappedJavaIterator<>(iterator); + } + + /** + * Create a remote iterator from a java.util.Iterable -e.g. a list + * or other collection. + * @param type + * @return a remote iterator + */ + public static RemoteIterator remoteIteratorFromIterable( + Iterable iterable) { + return new WrappedJavaIterator<>(iterable.iterator()); + } + + /** + * Create a remote iterator from an array. + * @param type + * @return a remote iterator + */ + public static RemoteIterator remoteIteratorFromArray(T[] array) { + return new WrappedJavaIterator<>(Arrays.stream(array).iterator()); + } + + /** + * Create an iterator from an iterator and a transformation function. + * @param source type + * @param result type + * @param iterator source + * @param mapper transformation + * @return a remote iterator + */ + public static RemoteIterator mappingRemoteIterator( + RemoteIterator iterator, + FunctionRaisingIOE mapper) { + return new MappingRemoteIterator<>(iterator, mapper); + } + + /** + * Create a RemoteIterator from a RemoteIterator, casting the + * type in the process. This is to help with filesystem API + * calls where overloading causes confusion (e.g. listStatusIterator()) + * @param source type + * @param result type + * @param iterator source + * @return a remote iterator + */ + public static RemoteIterator typeCastingRemoteIterator( + RemoteIterator iterator) { + return new TypeCastingRemoteIterator<>(iterator); + } + + /** + * Create a RemoteIterator from a RemoteIterator and a filter + * function which returns true for every element to be passed + * through. + *

    + * Elements are filtered in the hasNext() method; if not used + * the filtering will be done on demand in the {@code next()} + * call. + * @param type + * @param iterator source + * @param filter filter + * @return a remote iterator + */ + public static RemoteIterator filteringRemoteIterator( + RemoteIterator iterator, + FunctionRaisingIOE filter) { + return new FilteringRemoteIterator<>(iterator, filter); + } + + /** + * This adds an extra close operation alongside the passthrough + * to any Closeable.close() method supported by the source iterator. + * @param iterator source + * @param toClose extra object to close. + * @param source type. + * @return a new iterator + */ + public static RemoteIterator closingRemoteIterator( + RemoteIterator iterator, + Closeable toClose) { + return new CloseRemoteIterator<>(iterator, toClose); + } + + /** + * Build a list from a RemoteIterator. + * @param type + * @return a list of the values. + * @throws IOException if the source RemoteIterator raises it. + */ + public static List toList(RemoteIterator source) + throws IOException { + List l = new ArrayList<>(); + foreach(source, l::add); + return l; + } + + /** + * Build an array from a RemoteIterator. + * @param type + * @return an array of the values. + * @throws IOException if the source RemoteIterator raises it. + */ + public static T[] toArray(RemoteIterator source) throws IOException { + return (T[]) toList(source).toArray(); + } + + /** + * Apply an operation to all values of a RemoteIterator. + *

    + * If the iterator is an IOStatisticsSource returning a non-null + * set of statistics, and this classes log is set to DEBUG, + * then the statistics of the operation are evaluated and logged at + * debug. + *

    + * The number of entries processed is returned, as it is useful to + * know this, especially during tests or when reporting values + * to users. + *

    + * This does not close the iterator afterwards. + * @param source iterator source + * @param consumer consumer of the values. + * @return the number of elements processed + * @param type of source + * @throws IOException if the source RemoteIterator or the consumer raise one. + */ + public static long foreach( + RemoteIterator source, + ConsumerRaisingIOE consumer) throws IOException { + long count = 0; + + try { + while (source.hasNext()) { + count++; + consumer.accept(source.next()); + } + + // maybe log the results + logIOStatisticsAtDebug(LOG, "RemoteIterator Statistics: {}", source); + } finally { + if (source instanceof Closeable) { + // source is closeable, so close. + IOUtils.cleanupWithLogger(LOG, (Closeable) source); + } + } + + return count; + } + + /** + * A remote iterator from a singleton. It has a single next() + * value, after which hasNext() returns false and next() fails. + *

    + * If it is a source of + * remote statistics, these are returned. + * @param type. + */ + private static final class SingletonIterator + implements RemoteIterator, IOStatisticsSource { + + /** + * Single entry. + */ + private final T singleton; + + /** Has the entry been processed? */ + private boolean processed; + + /** + * Instantiate. + * @param singleton single value...may be null + */ + private SingletonIterator(@Nullable T singleton) { + this.singleton = singleton; + // if the entry is null, consider it processed. + this.processed = singleton == null; + } + + @Override + public boolean hasNext() throws IOException { + return !processed; + } + + @SuppressWarnings("NewExceptionWithoutArguments") + @Override + public T next() throws IOException { + if (hasNext()) { + processed = true; + return singleton; + } else { + throw new NoSuchElementException(); + } + } + + @Override + public IOStatistics getIOStatistics() { + return retrieveIOStatistics(singleton); + } + + @Override + public String toString() { + return "SingletonIterator{" + + (singleton != null ? singleton : "") + + '}'; + } + + } + + /** + * Create a remote iterator from a simple java.util.Iterator, or + * an iterable. + *

    + * If the iterator is a source of statistics that is passed through. + *

    + * The {@link #close()} will close the source iterator if it is + * Closeable; + * @param iterator type. + */ + private static final class WrappedJavaIterator + implements RemoteIterator, IOStatisticsSource, Closeable { + + /** + * inner iterator.. + */ + private final Iterator source; + + private final Closeable sourceToClose; + + + /** + * Construct from an interator. + * @param source source iterator. + */ + private WrappedJavaIterator(Iterator source) { + this.source = requireNonNull(source); + sourceToClose = new MaybeClose(source); + } + + @Override + public boolean hasNext() { + return source.hasNext(); + } + + @Override + public T next() { + return source.next(); + } + + @Override + public IOStatistics getIOStatistics() { + return retrieveIOStatistics(source); + } + + @Override + public String toString() { + return "FromIterator{" + source + '}'; + } + + @Override + public void close() throws IOException { + sourceToClose.close(); + + } + } + + /** + * Wrapper of another remote iterator; IOStatistics + * and Closeable methods are passed down if implemented. + * @param source type + * @param type of returned value + */ + private static abstract class WrappingRemoteIterator + implements RemoteIterator, IOStatisticsSource, Closeable { + + /** + * Source iterator. + */ + private final RemoteIterator source; + + private final Closeable sourceToClose; + + protected WrappingRemoteIterator(final RemoteIterator source) { + this.source = requireNonNull(source); + sourceToClose = new MaybeClose(source); + } + + protected RemoteIterator getSource() { + return source; + } + + @Override + public IOStatistics getIOStatistics() { + return retrieveIOStatistics(source); + } + + @Override + public void close() throws IOException { + sourceToClose.close(); + } + + /** + * Check for the source having a next element. + * If it does not, this object's close() method + * is called and false returned + * @return true if there is a new value + * @throws IOException failure to retrieve next value + */ + protected boolean sourceHasNext() throws IOException { + boolean hasNext; + try { + hasNext = getSource().hasNext(); + } catch (IOException e) { + IOUtils.cleanupWithLogger(LOG, this); + throw e; + } + if (!hasNext) { + // there is nothing less so automatically close. + close(); + } + return hasNext; + } + + /** + * Get the next source value. + * This calls {@link #sourceHasNext()} first to verify + * that there is data. + * @return the next value + * @throws IOException failure + * @throws NoSuchElementException no more data + */ + protected S sourceNext() throws IOException { + try { + if (!sourceHasNext()) { + throw new NoSuchElementException(); + } + return getSource().next(); + } catch (NoSuchElementException | IOException e) { + IOUtils.cleanupWithLogger(LOG, this); + throw e; + } + } + + @Override + public String toString() { + return source.toString(); + } + + } + + /** + * Iterator taking a source and a transformational function. + * @param source type + * @param final output type.There + */ + private static final class MappingRemoteIterator + extends WrappingRemoteIterator { + + /** + * Mapper to invoke. 
+ */ + private final FunctionRaisingIOE mapper; + + private MappingRemoteIterator( + RemoteIterator source, + FunctionRaisingIOE mapper) { + super(source); + this.mapper = requireNonNull(mapper); + } + + @Override + public boolean hasNext() throws IOException { + return sourceHasNext(); + } + + @Override + public T next() throws IOException { + return mapper.apply(sourceNext()); + } + + @Override + public String toString() { + return "FunctionRemoteIterator{" + getSource() + '}'; + } + } + + /** + * RemoteIterator which can change the type of the input. + * This is useful in some situations. + * @param source type + * @param final output type. + */ + private static final class TypeCastingRemoteIterator + extends WrappingRemoteIterator { + + private TypeCastingRemoteIterator( + RemoteIterator source) { + super(source); + } + + @Override + public boolean hasNext() throws IOException { + return sourceHasNext(); + } + + @Override + public T next() throws IOException { + return (T)sourceNext(); + } + + @Override + public String toString() { + return getSource().toString(); + } + } + + /** + * Extend the wrapped iterator by filtering source values out. + * Only those values for which the filter predicate returns true + * will be returned. + * @param type of iterator. + */ + @SuppressWarnings("NewExceptionWithoutArguments") + private static final class FilteringRemoteIterator + extends WrappingRemoteIterator { + + /** + * Filter Predicate. + * Takes the input type or any superclass. + */ + private final FunctionRaisingIOE + filter; + + /** + * Next value; will be null if none has been evaluated, or the + * last one was already returned by next(). + */ + private S next; + + /** + * An iterator which combines filtering with transformation. + * All source elements for which filter = true are returned, + * transformed via the mapper. + * @param source source iterator. + * @param filter filter predicate. + */ + private FilteringRemoteIterator( + RemoteIterator source, + FunctionRaisingIOE filter) { + super(source); + + this.filter = requireNonNull(filter); + } + + /** + * Fetch: retrieve the next value. + * @return true if a new value was found after filtering. + * @throws IOException failure in retrieval from source or mapping + */ + private boolean fetch() throws IOException { + while (next == null && sourceHasNext()) { + S candidate = getSource().next(); + if (filter.apply(candidate)) { + next = candidate; + return true; + } + } + return false; + } + + /** + * Trigger a fetch if an entry is needed. + * @return true if there was already an entry return, + * or there was not but one could then be retrieved.set + * @throws IOException failure in fetch operation + */ + @Override + public boolean hasNext() throws IOException { + if (next != null) { + return true; + } + return fetch(); + } + + /** + * Return the next value. + * Will retrieve the next elements if needed. + * This is where the mapper takes place. + * @return true if there is another data element. + * @throws IOException failure in fetch operation or the transformation. + * @throws NoSuchElementException no more data + */ + @Override + public S next() throws IOException { + if (hasNext()) { + S result = next; + next = null; + return result; + } + throw new NoSuchElementException(); + } + + @Override + public String toString() { + return "FilteringRemoteIterator{" + getSource() + '}'; + } + } + + /** + * A wrapping remote iterator which adds another entry to + * close. This is to assist cleanup. 
+ * @param type + */ + private static final class CloseRemoteIterator + extends WrappingRemoteIterator { + + private final MaybeClose toClose; + private boolean closed; + + private CloseRemoteIterator( + final RemoteIterator source, + final Closeable toClose) { + super(source); + this.toClose = new MaybeClose(Objects.requireNonNull(toClose)); + } + + @Override + public boolean hasNext() throws IOException { + return sourceHasNext(); + } + + @Override + public S next() throws IOException { + + return sourceNext(); + } + + @Override + public void close() throws IOException { + if (closed) { + return; + } + closed = true; + LOG.debug("Closing {}", this); + try { + super.close(); + } finally { + toClose.close(); + } + } + } + + /** + * Class to help with Closeable logic, where sources may/may not + * be closeable, only one invocation is allowed. + * On the second and later call of close(), it is a no-op. + */ + private static final class MaybeClose implements Closeable { + + private Closeable toClose; + + /** + * Construct. + * @param o object to close. + */ + private MaybeClose(Object o) { + this(o, true); + } + + /** + * Construct -close the object if it is closeable and close==true. + * @param o object to close. + * @param close should close? + */ + private MaybeClose(Object o, boolean close) { + if (close && o instanceof Closeable) { + this.toClose = (Closeable) o; + } else { + this.toClose = null; + } + } + + @Override + public void close() throws IOException { + if (toClose != null) { + try { + toClose.close(); + } finally { + toClose = null; + } + } + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/package-info.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/package-info.java new file mode 100644 index 0000000000000..1c204bb9979a8 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/package-info.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Support for functional programming within the Hadoop APIs. + *

    + * Much of this is needed simply to cope with Java's checked exceptions and + * the fact that the java.util.function can only throw runtime exceptions. + *

    + * Pretty much all the Hadoop FS APIs raise IOExceptions, hence the need + * for these classes. If Java had made a different decision about the + * nature of exceptions, life would be better. + *

    + * Do note that the {@link org.apache.hadoop.util.functional.RemoteIterators} + * iterators go beyond that of the java ones, in terms of declaring themselves + * Closeable and implementors of + * {@link org.apache.hadoop.fs.statistics.IOStatisticsSource}; a chain + * of wrapped iterators can supply statistics of the inner iterators, and + * encourage close() to be called after use. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +package org.apache.hadoop.util.functional; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; \ No newline at end of file diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/iostatistics.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/iostatistics.md new file mode 100644 index 0000000000000..bd77dc7e0f8a7 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/iostatistics.md @@ -0,0 +1,432 @@ + + +# Statistic collection with the IOStatistics API + +```java +@InterfaceAudience.Public +@InterfaceStability.Unstable +``` + +The `IOStatistics` API is intended to provide statistics on individual IO +classes -such as input and output streams, *in a standard way which +applications can query* + +Many filesystem-related classes have implemented statistics gathering +and provided private/unstable ways to query this, but as they were +not common across implementations it was unsafe for applications +to reference these values. Example: `S3AInputStream` and its statistics +API. This is used in internal tests, but cannot be used downstream in +applications such as Apache Hive or Apache HBase. + +The IOStatistics API is intended to + +1. Be instance specific:, rather than shared across multiple instances + of a class, or thread local. +1. Be public and stable enough to be used by applications. +1. Be easy to use in applications written in Java, Scala, and, via libhdfs, C/C++ +1. Have foundational interfaces and classes in the `hadoop-common` JAR. + +## Core Model + +Any class *may* implement `IOStatisticsSource` in order to +provide statistics. + +Wrapper I/O Classes such as `FSDataInputStream` anc `FSDataOutputStream` *should* +implement the interface and forward it to the wrapped class, if they also +implement it -and return `null` if they do not. + +`IOStatisticsSource` implementations `getIOStatistics()` return an +instance of `IOStatistics` enumerating the statistics of that specific +instance. + +The `IOStatistics` Interface exports five kinds of statistic: + + +| Category | Type | Description | +|------|------|-------------| +| `counter` | `long` | a counter which may increase in value; SHOULD BE >= 0 | +| `gauge` | `long` | an arbitrary value which can down as well as up; SHOULD BE >= 0 | +| `minimum` | `long` | an minimum value; MAY BE negative | +| `maximum` | `long` | a maximum value; MAY BE negative | +| `meanStatistic` | `MeanStatistic` | an arithmetic mean and sample size; mean MAY BE negative | + +Four are simple `long` values, with the variations how they are likely to +change and how they are aggregated. 
+ + +#### Aggregation of Statistic Values + +For the different statistic category, the result of `aggregate(x, y)` is + +| Category | Aggregation | +|------------------|-------------| +| `counter` | `max(0, x) + max(0, y)` | +| `gauge` | `max(0, x) + max(0, y)` | +| `minimum` | `min(x, y)` | +| `maximum` | `max(x, y)` | +| `meanStatistic` | calculation of the mean of `x` and `y` ) | + + +#### Class `MeanStatistic` + +## package `org.apache.hadoop.fs.statistics` + +This package contains the public statistics APIs intended +for use by applications. + + + + + +`MeanStatistic` is a tuple of `(mean, samples)` to support aggregation. + +A `MeanStatistic` with a sample of `0` is considered an empty statistic. + +All `MeanStatistic` instances where `sample = 0` are considered equal, +irrespective of the `mean` value. + +Algorithm to calculate the mean : + +```python +if x.samples = 0: + y +else if y.samples = 0 : + x +else: + samples' = x.samples + y.samples + mean' = (x.mean * x.samples) + (y.mean * y.samples) / samples' + (samples', mean') +``` + +Implicitly, this means that if both samples are empty, then the aggregate value is also empty. + +```java +public final class MeanStatistic implements Serializable, Cloneable { + /** + * Arithmetic mean. + */ + private double mean; + + /** + * Number of samples used to calculate + * the mean. + */ + private long samples; + + /** + * Get the mean value. + * @return the mean + */ + public double getMean() { + return mean; + } + + /** + * Get the sample count. + * @return the sample count; 0 means empty + */ + public long getSamples() { + return samples; + } + + /** + * Is a statistic empty? + * @return true if the sample count is 0 + */ + public boolean isEmpty() { + return samples == 0; + } + /** + * Add another mean statistic to create a new statistic. + * When adding two statistics, if either is empty then + * a copy of the non-empty statistic is returned. + * If both are empty then a new empty statistic is returned. + * + * @param other other value + * @return the aggregate mean + */ + public MeanStatistic add(final MeanStatistic other) { + /* Implementation elided. */ + } + @Override + public int hashCode() { + return Objects.hash(mean, samples); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { return true; } + if (o == null || getClass() != o.getClass()) { return false; } + MeanStatistic that = (MeanStatistic) o; + if (this.isEmpty()) { + return that.isEmpty(); + } + return Double.compare(that.mean, mean) == 0 && + samples == that.samples; + } + + @Override + public MeanStatistic clone() { + return new MeanStatistic(this); + } + + public MeanStatistic copy() { + return new MeanStatistic(this); + } + +} +``` + + + + + +### class `org.apache.hadoop.fs.statistics.IOStatisticsSource` + +```java + +/** + * A source of IO statistics. + * These statistics MUST be instance specific, not thread local. + */ +@InterfaceStability.Unstable +public interface IOStatisticsSource { + + /** + * Return a statistics instance. + * It is not a requirement that the same instance is returned every time. + * {@link IOStatisticsSource}. + * If the object implementing this is Closeable, this method + * may return null if invoked on a closed object, even if + * it returns a valid instance when called earlier. + * @return an IOStatistics instance or null + */ + IOStatistics getIOStatistics(); +} +``` + +This is the interface which an object instance MUST implement if they are a source of +IOStatistics information. 
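+
+As a sketch of the probe in application code (non-normative; `in` is
+assumed to be an `FSDataInputStream` or any other object whose statistics
+support is unknown to the caller):
+
+```java
+// Returns null if `in` is not an IOStatisticsSource,
+// or if it has no statistics to offer.
+IOStatistics stats = IOStatisticsSupport.retrieveIOStatistics(in);
+if (stats != null) {
+  Long bytesRead = stats.counters()
+      .get(StreamStatisticNames.STREAM_READ_BYTES);
+}
+```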
+ +#### Invariants + +The result of `getIOStatistics()` must be one of + +* `null` +* an immutable `IOStatistics` for which each map of entries is +an empty map. +* an instance of an `IOStatistics` whose statistics MUST BE unique to that +instance of the class implementing `IOStatisticsSource`. + +Less formally: if the statistics maps returned are non-empty, all the statistics +must be collected from the current instance, and not from any other instances, the way +some of the `FileSystem` statistics are collected. + + +The result of `getIOStatistics()`, if non-null, MAY be a different instance +on every invocation. + + + + + + +### class `org.apache.hadoop.fs.statistics.IOStatistics` + +These are per-instance statistics provided by an object which +implements `IOStatisticsSource`. + +```java +@InterfaceAudience.Public +@InterfaceStability.Unstable +public interface IOStatistics { + + /** + * Map of counters. + * @return the current map of counters. + */ + Map counters(); + + /** + * Map of gauges. + * @return the current map of gauges. + */ + Map gauges(); + + /** + * Map of minumums. + * @return the current map of minumums. + */ + Map minumums(); + + /** + * Map of maximums. + * @return the current map of maximums. + */ + Map maximums(); + + /** + * Map of meanStatistics. + * @return the current map of MeanStatistic statistics. + */ + Map meanStatistics(); + +} +``` + +### Statistic Naming + +The naming policy of statistics is designed to be readable, shareable +and ideally consistent across `IOStatisticSource` implementations. + +* Characters in key names MUST match the regular expression + `[a-z|0-9|_]` with the exception of the first character, which + MUST be in the range `[a-z]`. Thus the full regular expression + for a valid statistic name is: + + [a-z][a-z|0-9|_]+ + +* Where possible, the names of statistics SHOULD be those defined + with common names. + + org.apache.hadoop.fs.statistics.StreamStatisticNames + org.apache.hadoop.fs.statistics.StoreStatisticNames + + Note 1.: these are evolving; for clients to safely reference their + statistics by name they SHOULD be copied to the application. + (i.e. for an application compiled hadoop 3.4.2 to link against hadoop 3.4.1, + copy the strings). + + Note 2: keys defined in these classes SHALL NOT be removed + from subsequent Hadoop releases. + +* A common statistic name MUST NOT be used to report any other statistic and + MUST use the pre-defined unit of measurement. + +* A statistic name in one of the maps SHOULD NOT be re-used in another map. + This aids diagnostics of logged statistics. + +### Statistic Maps + +For each map of statistics returned: + +* The operations to add/remove entries are unsupported: the map returned + MAY be mutable by the source of statistics. + +* The map MAY be empty. + +* The map keys each represent a measured statistic. + +* The set of keys in a map SHOULD remain unchanged, and MUST NOT remove keys. + +* The statistics SHOULD be dynamic: every lookup of an entry SHOULD + return the latest value. + +* The values MAY change across invocations of `Map.values()` and `Map.entries()` + +* The update MAY be in the `iterable()` calls of the iterators returned, + or MAY be in the actual `iterable.next()` operation. That is: there is + no guarantee as to when the evaluation takes place. + +* The returned `Map.Entry` instances MUST return the same value on + repeated `getValue()` calls. (i.e once you have the entry, it is immutable). 
+ +* Queries of statistics SHOULD be fast and non-blocking to the extent + that if invoked during a long operation, they will prioritize + returning fast over most timely values. + +* The statistics MAY lag; especially for statistics collected in separate + operations (e.g stream IO statistics as provided by a filesystem + instance). + +* Statistics which represent time SHOULD use milliseconds as their unit. + +* Statistics which represent time and use a different unit MUST document + the unit used. + +### Thread Model + +1. An instance of `IOStatistics` can be shared across threads; + +1. Read access to the supplied statistics maps MUST be thread safe. + +1. Iterators returned from the maps MUST NOT be shared across threads. + +1. The statistics collected MUST include all operations which took + place across all threads performing work for the monitored object. + +1. The statistics reported MUST NOT be local to the active thread. + +This is different from the `FileSystem.Statistics` behavior where per-thread statistics +are collected and reported. + +That mechanism supports collecting limited read/write statistics for different +worker threads sharing the same FS instance, but as the collection is thread local, +it invariably under-reports IO performed in other threads on behalf of a worker thread. + + +## Statisic Snapshot + +A snapshot of the current statistic values MAY be obtained by calling +`IOStatisticsSupport.snapshotIOStatistics()` + +```java + public static X + snapshotIOStatistics(IOStatistics statistics) +``` + +This snapshot is serializable through Java serialization and through +Jackson to/from JSON. + +## Helper Classes + + +### class `org.apache.hadoop.fs.statistics.IOStatisticsSupport` + +This provides helper methods to work with IOStatistics sources and instances. + +Consult the javadocs for its operations. + +### class `org.apache.hadoop.fs.statistics.IOStatisticsLogging` + +Support for efficiently logging `IOStatistics`/`IOStatisticsSource` +instances. + +These are intended for assisting logging, including only enumerating the +state of an `IOStatistics` instance when the log level needs it. + +```java +LOG.info("IOStatistics after upload: {}", demandStringify(iostats)); + +// or even better, as it results in only a single object creations +Object latest = demandStringify(iostats); +LOG.info("IOStatistics : {}", latest); +/* do some work. */ +LOG.info("IOStatistics : {}", latest); + +``` + +## Package `org.apache.hadoop.fs.statistics.impl` + +This contains implementation classes to support providing statistics to applications. + +These MUST NOT BE used by applications. If a feature is needed from this package then +the provisioning of a public implementation MAY BE raised via the Hadoop development +channels. + +These MAY be used by those implementations of the Hadoop `FileSystem`, `AbstractFileSystem` +and related classes which are not in the hadoop source tree. Implementors MUST BE +aware that the implementation this code is unstable and may change across +minor point releases of Hadoop. 
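+
+Finally, a non-normative sketch which ties the public classes above
+together: take a snapshot, aggregate a stream's statistics into it, and
+log the result. Here `fs`, `path`, `buffer` and `LOG` are assumed to be
+an existing FileSystem, Path, byte array and SLF4J logger respectively.
+
+```java
+// empty, serializable, aggregating snapshot
+IOStatisticsSnapshot snapshot = IOStatisticsSupport.snapshotIOStatistics();
+try (FSDataInputStream in = fs.open(path)) {
+  in.read(buffer);
+  // add the stream's statistics, if it offers any
+  snapshot.aggregate(IOStatisticsSupport.retrieveIOStatistics(in));
+}
+LOG.info("aggregate statistics: {}",
+    IOStatisticsLogging.ioStatisticsToPrettyString(snapshot));
+```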
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractStreamIOStatisticsTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractStreamIOStatisticsTest.java new file mode 100644 index 0000000000000..89b21c497083b --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractStreamIOStatisticsTest.java @@ -0,0 +1,313 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.contract; + +import java.util.Collections; +import java.util.List; + +import org.assertj.core.api.Assertions; +import org.junit.AfterClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; + +import static org.apache.hadoop.fs.contract.ContractTestUtils.dataset; +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.extractStatistics; +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticCounterValue; +import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.demandStringifyIOStatisticsSource; +import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString; +import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.snapshotIOStatistics; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_BYTES; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_WRITE_BYTES; + +/** + * Tests {@link IOStatistics} support in input and output streams. + *

    + * Requires both the input and output streams to offer the basic + * bytes read/written statistics. + *

    + * If the IO is buffered, that information must be provided, + * especially the input buffer size. + */ +public abstract class AbstractContractStreamIOStatisticsTest + extends AbstractFSContractTestBase { + + private static final Logger LOG = + LoggerFactory.getLogger(AbstractContractStreamIOStatisticsTest.class); + + /** + * FileSystem statistics are collected across every test case. + */ + protected static final IOStatisticsSnapshot FILESYSTEM_IOSTATS = + snapshotIOStatistics(); + + @Override + public void teardown() throws Exception { + final FileSystem fs = getFileSystem(); + if (fs instanceof IOStatisticsSource) { + FILESYSTEM_IOSTATS.aggregate(((IOStatisticsSource)fs).getIOStatistics()); + } + super.teardown(); + } + + /** + * Dump the filesystem statistics after the class if contains any values. + */ + @AfterClass + public static void dumpFileSystemIOStatistics() { + if (!FILESYSTEM_IOSTATS.counters().isEmpty()) { + // if there is at least one counter + LOG.info("Aggregate FileSystem Statistics {}", + ioStatisticsToPrettyString(FILESYSTEM_IOSTATS)); + } + } + + @Test + public void testOutputStreamStatisticKeys() throws Throwable { + describe("Look at the statistic keys of an output stream"); + Path path = methodPath(); + FileSystem fs = getFileSystem(); + fs.mkdirs(path.getParent()); + try (FSDataOutputStream out = fs.create(path, true)) { + IOStatistics statistics = extractStatistics(out); + final List keys = outputStreamStatisticKeys(); + Assertions.assertThat(statistics.counters().keySet()) + .describedAs("statistic keys of %s", statistics) + .containsAll(keys); + Assertions.assertThat(keys) + .describedAs("Statistics supported by the stream %s", out) + .contains(STREAM_WRITE_BYTES); + } finally { + fs.delete(path, false); + } + } + + /** + * If the stream writes in blocks, then counters during the write may be + * zero until a whole block is written -or the write has finished. + * @return true if writes are buffered into whole blocks. + */ + public boolean streamWritesInBlocks() { + return false; + } + + @Test + public void testWriteSingleByte() throws Throwable { + describe("Write a byte to a file and verify" + + " the stream statistics are updated"); + Path path = methodPath(); + FileSystem fs = getFileSystem(); + fs.mkdirs(path.getParent()); + boolean writesInBlocks = streamWritesInBlocks(); + try (FSDataOutputStream out = fs.create(path, true)) { + IOStatistics statistics = extractStatistics(out); + // before a write, no bytes + verifyStatisticCounterValue(statistics, STREAM_WRITE_BYTES, 0); + out.write('0'); + verifyStatisticCounterValue(statistics, STREAM_WRITE_BYTES, + writesInBlocks ? 
0 : 1); + // close the stream + out.close(); + // statistics are still valid after the close + // always call the output stream to check that behavior + statistics = extractStatistics(out); + final String strVal = statistics.toString(); + LOG.info("Statistics = {}", strVal); + verifyStatisticCounterValue(statistics, STREAM_WRITE_BYTES, 1); + } finally { + fs.delete(path, false); + } + } + + @Test + public void testWriteByteArrays() throws Throwable { + describe("Write byte arrays to a file and verify" + + " the stream statistics are updated"); + Path path = methodPath(); + FileSystem fs = getFileSystem(); + fs.mkdirs(path.getParent()); + boolean writesInBlocks = streamWritesInBlocks(); + try (FSDataOutputStream out = fs.create(path, true)) { + Object demandStatsString = demandStringifyIOStatisticsSource(out); + // before a write, no bytes + final byte[] bytes = ContractTestUtils.toAsciiByteArray( + "statistically-speaking"); + final long len = bytes.length; + out.write(bytes); + out.flush(); + LOG.info("stats {}", demandStatsString); + IOStatistics statistics = extractStatistics(out); + verifyStatisticCounterValue(statistics, STREAM_WRITE_BYTES, + writesInBlocks ? 0 : len); + out.write(bytes); + out.flush(); + verifyStatisticCounterValue(statistics, STREAM_WRITE_BYTES, + writesInBlocks ? 0 : len * 2); + // close the stream + out.close(); + LOG.info("stats {}", demandStatsString); + // statistics are still valid after the close + // always call the output stream to check that behavior + statistics = extractStatistics(out); + verifyStatisticCounterValue(statistics, STREAM_WRITE_BYTES, len * 2); + // the to string value must contain the same counterHiCable you mean + Assertions.assertThat(demandStatsString.toString()) + .contains(Long.toString(len * 2)); + } finally { + fs.delete(path, false); + } + } + + @Test + public void testInputStreamStatisticKeys() throws Throwable { + describe("Look at the statistic keys of an input stream"); + Path path = methodPath(); + FileSystem fs = getFileSystem(); + ContractTestUtils.touch(fs, path); + try (FSDataInputStream in = fs.open(path)) { + IOStatistics statistics = extractStatistics(in); + final List keys = inputStreamStatisticKeys(); + Assertions.assertThat(statistics.counters().keySet()) + .describedAs("statistic keys of %s", statistics) + .containsAll(keys); + Assertions.assertThat(keys) + .describedAs("Statistics supported by the stream %s", in) + .contains(STREAM_READ_BYTES); + verifyStatisticCounterValue(statistics, STREAM_READ_BYTES, 0); + } finally { + fs.delete(path, false); + } + } + + @Test + public void testInputStreamStatisticRead() throws Throwable { + describe("Read Data from an input stream"); + Path path = methodPath(); + FileSystem fs = getFileSystem(); + final int fileLen = 1024; + final byte[] ds = dataset(fileLen, 'a', 26); + ContractTestUtils.writeDataset(fs, path, ds, fileLen, 8_000, true); + + try (FSDataInputStream in = fs.open(path)) { + long current = 0; + IOStatistics statistics = extractStatistics(in); + verifyStatisticCounterValue(statistics, STREAM_READ_BYTES, 0); + Assertions.assertThat(in.read()).isEqualTo('a'); + int bufferSize = readBufferSize(); + // either a single byte was read or a whole block + current = verifyBytesRead(statistics, current, 1, bufferSize); + final int bufferLen = 128; + byte[] buf128 = new byte[bufferLen]; + in.read(buf128); + current = verifyBytesRead(statistics, current, bufferLen, bufferSize); + in.readFully(buf128); + current = verifyBytesRead(statistics, current, bufferLen, bufferSize); + 
in.readFully(0, buf128); + current = verifyBytesRead(statistics, current, bufferLen, bufferSize); + // seek must not increment the read counter + in.seek(256); + verifyBytesRead(statistics, current, 0, bufferSize); + + // if a stream implements lazy-seek the seek operation + // may be postponed until the read + final int sublen = 32; + Assertions.assertThat(in.read(buf128, 0, sublen)) + .isEqualTo(sublen); + current = verifyBytesRead(statistics, current, sublen, bufferSize); + + // perform some read operations near the end of the file such that + // the buffer will not be completely read. + // skip these tests for buffered IO as it is too complex to work out + if (bufferSize == 0) { + final int pos = fileLen - sublen; + in.seek(pos); + Assertions.assertThat(in.read(buf128)) + .describedAs("Read overlapping EOF") + .isEqualTo(sublen); + current = verifyStatisticCounterValue(statistics, STREAM_READ_BYTES, + current + sublen); + Assertions.assertThat(in.read(pos, buf128, 0, bufferLen)) + .describedAs("Read(buffer) overlapping EOF") + .isEqualTo(sublen); + verifyStatisticCounterValue(statistics, STREAM_READ_BYTES, + current + sublen); + } + } finally { + fs.delete(path, false); + } + } + + /** + * Verify the bytes read value, taking into account block size. + * @param statistics stats + * @param current current count + * @param bytesRead bytes explicitly read + * @param bufferSize buffer size of stream + * @return the current count of bytes read ignoring block size + */ + public long verifyBytesRead(final IOStatistics statistics, + final long current, + final int bytesRead, final int bufferSize) { + // final position. for unbuffered read, this is the expected value + long finalPos = current + bytesRead; + long expected = finalPos; + if (bufferSize > 0) { + // buffered. count of read is number of buffers already read + // plus the current buffer, multiplied by that buffer size + expected = bufferSize * (1 + (current / bufferSize)); + } + verifyStatisticCounterValue(statistics, STREAM_READ_BYTES, expected); + return finalPos; + } + + /** + * Buffer size for reads. + * Filesystems performing block reads (checksum, etc) + * must return their buffer value is + * @return buffer capacity; 0 for unbuffered + */ + public int readBufferSize() { + return 0; + } + + /** + * Keys which the output stream must support. + * @return a list of keys + */ + public List outputStreamStatisticKeys() { + return Collections.singletonList(STREAM_WRITE_BYTES); + } + + /** + * Keys which the input stream must support. + * @return a list of keys + */ + public List inputStreamStatisticKeys() { + return Collections.singletonList(STREAM_READ_BYTES); + } + +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSContractStreamIOStatistics.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSContractStreamIOStatistics.java new file mode 100644 index 0000000000000..642baec502d2e --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSContractStreamIOStatistics.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.contract.localfs; + +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractContractStreamIOStatisticsTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; + +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_BYTES; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_EXCEPTIONS; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_SEEK_OPERATIONS; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_SKIP_BYTES; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_SKIP_OPERATIONS; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_WRITE_BYTES; +import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_WRITE_EXCEPTIONS; + +/** + * Test IOStatistics through the local FS. + */ +public class TestLocalFSContractStreamIOStatistics extends + AbstractContractStreamIOStatisticsTest { + + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new LocalFSContract(conf); + } + + /** + * Keys which the input stream must support. + * @return a list of keys + */ + public List inputStreamStatisticKeys() { + return Arrays.asList(STREAM_READ_BYTES, + STREAM_READ_EXCEPTIONS, + STREAM_READ_SEEK_OPERATIONS, + STREAM_READ_SKIP_OPERATIONS, + STREAM_READ_SKIP_BYTES); + } + + /** + * Keys which the output stream must support. + * @return a list of keys + */ + @Override + public List outputStreamStatisticKeys() { + return Arrays.asList(STREAM_WRITE_BYTES, + STREAM_WRITE_EXCEPTIONS); + } + + @Override + public int readBufferSize() { + return 1024; + } + + @Override + public boolean streamWritesInBlocks() { + return true; + } + + +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestDynamicIOStatistics.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestDynamicIOStatistics.java new file mode 100644 index 0000000000000..9b929ac82ff11 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestDynamicIOStatistics.java @@ -0,0 +1,311 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import java.util.Iterator; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +import org.assertj.core.api.Assertions; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.fs.statistics.impl.SourceWrappedStatistics; +import org.apache.hadoop.metrics2.MetricsInfo; +import org.apache.hadoop.metrics2.lib.MutableCounterLong; +import org.apache.hadoop.test.AbstractHadoopTestBase; + +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertStatisticCounterIsTracked; +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertStatisticCounterIsUntracked; +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticCounterValue; +import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.demandStringifyIOStatistics; +import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.demandStringifyIOStatisticsSource; +import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToString; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.ENTRY_PATTERN; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.NULL_SOURCE; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.dynamicIOStatistics; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.emptyStatistics; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** + * verify dynamic statistics are dynamic, except when you iterate through + * them, along with other tests of the class's behavior. + */ +public class TestDynamicIOStatistics extends AbstractHadoopTestBase { + + private static final Logger LOG = + LoggerFactory.getLogger(TestDynamicIOStatistics.class); + + private static final String ALONG = "along"; + + private static final String AINT = "aint"; + + private static final String COUNT = "count"; + + private static final String EVAL = "eval"; + + /** + * The statistics. + */ + private IOStatistics statistics = emptyStatistics(); + + /** + * A source of these statistics. + */ + private IOStatisticsSource statsSource; + + private final AtomicLong aLong = new AtomicLong(); + + private final AtomicInteger aInt = new AtomicInteger(); + + private final MutableCounterLong counter = new MutableCounterLong( + new Info("counter"), 0); + + private long evalLong; + + private static final String[] KEYS = new String[]{ALONG, AINT, COUNT, EVAL}; + + @Before + public void setUp() throws Exception { + statistics = dynamicIOStatistics() + .withAtomicLongCounter(ALONG, aLong) + .withAtomicIntegerCounter(AINT, aInt) + .withMutableCounter(COUNT, counter) + .withLongFunctionCounter(EVAL, x -> evalLong) + .build(); + statsSource = new SourceWrappedStatistics(statistics); + } + + /** + * The eval operation is foundational. 
+ */ + @Test + public void testEval() throws Throwable { + verifyStatisticCounterValue(statistics, EVAL, 0); + evalLong = 10; + verifyStatisticCounterValue(statistics, EVAL, 10); + } + + /** + * Atomic Long statistic. + */ + @Test + public void testAlong() throws Throwable { + verifyStatisticCounterValue(statistics, ALONG, 0); + aLong.addAndGet(1); + verifyStatisticCounterValue(statistics, ALONG, 1); + } + + /** + * Atomic Int statistic. + */ + @Test + public void testAint() throws Throwable { + verifyStatisticCounterValue(statistics, AINT, 0); + aInt.addAndGet(1); + verifyStatisticCounterValue(statistics, AINT, 1); + } + + /** + * Metrics2 counter. + */ + @Test + public void testCounter() throws Throwable { + verifyStatisticCounterValue(statistics, COUNT, 0); + counter.incr(); + verifyStatisticCounterValue(statistics, COUNT, 1); + } + + /** + * keys() returns all the keys. + */ + @Test + public void testKeys() throws Throwable { + Assertions.assertThat(statistics.counters().keySet()) + .describedAs("statistic keys of %s", statistics) + .containsExactlyInAnyOrder(KEYS); + } + + @Test + public void testIteratorHasAllKeys() throws Throwable { + // go through the statistics iterator and assert that it contains exactly + // the values. + assertThat(statistics.counters().keySet()) + .containsExactlyInAnyOrder(KEYS); + } + + /** + * Verify that the iterator is taken from + * a snapshot of the values. + */ + @Test + public void testIteratorIsSnapshot() throws Throwable { + // set the counters all to 1 + incrementAllCounters(); + // take the snapshot + final Iterator> it = + statistics.counters().entrySet().iterator(); + // increment the counters + incrementAllCounters(); + // now assert that all the iterator values are of value 1 + while (it.hasNext()) { + Map.Entry next = it.next(); + assertThat(next.getValue()) + .describedAs("Value of entry %s", next) + .isEqualTo(1); + } + } + + @Test + public void testUnknownStatistic() throws Throwable { + assertStatisticCounterIsUntracked(statistics, "anything"); + } + + @Test + public void testStatisticsTrackedAssertion() throws Throwable { + // expect an exception to be raised when an assertion + // is made that an unknown statistic is tracked,. + assertThatThrownBy(() -> + assertStatisticCounterIsTracked(statistics, "anything")) + .isInstanceOf(AssertionError.class); + } + + @Test + public void testStatisticsValueAssertion() throws Throwable { + // expect an exception to be raised when + // an assertion is made about the value of an unknown statistics + assertThatThrownBy(() -> + verifyStatisticCounterValue(statistics, "anything", 0)) + .isInstanceOf(AssertionError.class); + } + + /** + * Serialization round trip will preserve all the values. 
+ */ + @Test + public void testSerDeser() throws Throwable { + incrementAllCounters(); + IOStatistics stat = IOStatisticsSupport.snapshotIOStatistics(statistics); + incrementAllCounters(); + IOStatistics deser = IOStatisticAssertions.statisticsJavaRoundTrip(stat); + assertThat(deser.counters().keySet()) + .containsExactlyInAnyOrder(KEYS); + for (Map.Entry e : deser.counters().entrySet()) { + assertThat(e.getValue()) + .describedAs("Value of entry %s", e) + .isEqualTo(1); + } + } + + @Test + public void testStringification() throws Throwable { + assertThat(ioStatisticsToString(statistics)) + .isNotBlank() + .contains(KEYS); + } + + @Test + public void testDemandStringification() throws Throwable { + String counterPattern = ENTRY_PATTERN; + // this is not yet evaluated + Object demand = demandStringifyIOStatistics(statistics); + // nor is this. + Object demandSource = demandStringifyIOStatisticsSource(statsSource); + + // show it evaluates + String formatted1 = String.format(counterPattern, ALONG, aLong.get()); + assertThat(demand + .toString()) + .contains(formatted1); + assertThat(demandSource + .toString()) + .contains(formatted1); + + // when the counters are incremented + incrementAllCounters(); + incrementAllCounters(); + // there are new values to expect + String formatted2 = String.format(counterPattern, ALONG, aLong.get()); + assertThat(demand + .toString()) + .doesNotContain(formatted1) + .contains(formatted2); + assertThat(demandSource + .toString()) + .doesNotContain(formatted1) + .contains(formatted2); + } + + @Test + public void testNullSourceStringification() throws Throwable { + assertThat(demandStringifyIOStatisticsSource((IOStatisticsSource) null) + .toString()) + .isEqualTo(NULL_SOURCE); + } + + @Test + public void testNullStatStringification() throws Throwable { + assertThat(demandStringifyIOStatistics((IOStatistics) null) + .toString()) + .isEqualTo(NULL_SOURCE); + } + + @Test + public void testStringLogging() throws Throwable { + LOG.info("Output {}", demandStringifyIOStatistics(statistics)); + } + + /** + * Increment all the counters from their current value. + */ + private void incrementAllCounters() { + aLong.incrementAndGet(); + aInt.incrementAndGet(); + evalLong += 1; + counter.incr(); + } + + /** + * Needed to provide a metrics info instance for the counter + * constructor. + */ + private static final class Info implements MetricsInfo { + + private final String name; + + private Info(final String name) { + this.name = name; + } + + @Override + public String name() { + return name; + } + + @Override + public String description() { + return name; + } + } + +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestEmptyIOStatistics.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestEmptyIOStatistics.java new file mode 100644 index 0000000000000..296470abaa9bf --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestEmptyIOStatistics.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import org.junit.Test; + +import org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding; +import org.apache.hadoop.test.AbstractHadoopTestBase; + +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertStatisticCounterIsTracked; +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertStatisticCounterIsUntracked; +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticCounterValue; +import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToString; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.emptyStatistics; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** + * Test handling of the empty IO statistics class. + */ +public class TestEmptyIOStatistics extends AbstractHadoopTestBase { + + private final IOStatistics empty = emptyStatistics(); + + @Test + public void testUnknownStatistic() throws Throwable { + assertStatisticCounterIsUntracked(empty, "anything"); + } + + @Test + public void testStatisticsTrackedAssertion() throws Throwable { + // expect an exception to be raised when an assertion + // is made that an unknown statistic is tracked,. + assertThatThrownBy(() -> + assertStatisticCounterIsTracked(empty, "anything")) + .isInstanceOf(AssertionError.class); + } + + @Test + public void testStatisticsValueAssertion() throws Throwable { + // expect an exception to be raised when + // an assertion is made about the value of an unknown statistics + assertThatThrownBy(() -> + verifyStatisticCounterValue(empty, "anything", 0)) + .isInstanceOf(AssertionError.class); + } + + @Test + public void testEmptySnapshot() throws Throwable { + final IOStatistics stat = IOStatisticsSupport.snapshotIOStatistics(empty); + assertThat(stat.counters().keySet()) + .describedAs("keys of snapshot") + .isEmpty(); + IOStatistics deser = IOStatisticAssertions.statisticsJavaRoundTrip(stat); + assertThat(deser.counters().keySet()) + .describedAs("keys of deserialized snapshot") + .isEmpty(); + } + + @Test + public void testStringification() throws Throwable { + assertThat(ioStatisticsToString(empty)) + .isNotBlank(); + } + + @Test + public void testWrap() throws Throwable { + IOStatisticsSource statisticsSource = IOStatisticsBinding.wrap(empty); + assertThat(statisticsSource.getIOStatistics()) + .isSameAs(empty); + } + + @Test + public void testStringifyNullSource() throws Throwable { + assertThat(IOStatisticsLogging.ioStatisticsSourceToString(null)) + .isEmpty(); + } + + @Test + public void testStringifyNullStats() throws Throwable { + assertThat( + IOStatisticsLogging.ioStatisticsSourceToString( + IOStatisticsBinding.wrap(null))) + .isEmpty(); + } + + @Test + public void testStringificationNull() throws Throwable { + assertThat(ioStatisticsToString(null)) + .describedAs("Null statistics should stringify to \"\"") + .isEmpty(); + } + +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestIOStatisticsSnapshot.java 
b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestIOStatisticsSnapshot.java new file mode 100644 index 0000000000000..41e9bffefe834 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestIOStatisticsSnapshot.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import org.assertj.core.api.Assertions; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding; +import org.apache.hadoop.test.AbstractHadoopTestBase; +import org.apache.hadoop.util.JsonSerialization; + +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.*; +import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToString; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** + * Test handling of the {@link IOStatisticsSnapshot} class. + */ +public class TestIOStatisticsSnapshot extends AbstractHadoopTestBase { + + private static final Logger LOG = + LoggerFactory.getLogger(TestIOStatisticsSnapshot.class); + + /** + * Simple snapshot built up in test setup. + */ + private final IOStatisticsSnapshot snapshot = new IOStatisticsSnapshot(); + + /** Saved to the snapshot as "mean01". */ + private MeanStatistic mean0; + + /** Saved to the snapshot as "mean1". 
*/ + private MeanStatistic mean1; + + @Before + public void setup() throws Exception { + snapshot.counters().put("c1", 0L); + snapshot.gauges().put("g1", 1L); + snapshot.minimums().put("m1", -1L); + mean1 = new MeanStatistic(1, 1); + snapshot.meanStatistics().put("mean1", + mean1); + mean0 = new MeanStatistic(0, 1); + snapshot.meanStatistics().put("mean0", + mean0); + } + + @Test + public void testTrackedValues() throws Throwable { + verifyStatisticCounterValue(snapshot, "c1", 0L); + verifyStatisticGaugeValue(snapshot, "g1", 1L); + verifyStatisticMinimumValue(snapshot, "m1", -1L); + verifyStatisticMeanValue(snapshot, "mean0", + new MeanStatistic(0, 1)); + } + + @Test + public void testStatisticsValueAssertion() throws Throwable { + // expect an exception to be raised when + // an assertion is made about the value of an unknown statistics + assertThatThrownBy(() -> + verifyStatisticCounterValue(snapshot, "anything", 0)) + .isInstanceOf(AssertionError.class); + } + + @Test + public void testStringification() throws Throwable { + assertThat(ioStatisticsToString(snapshot)) + .isNotBlank(); + } + + @Test + public void testStringification2() throws Throwable { + + String ss = snapshot.toString(); + LOG.info("original {}", ss); + Assertions.assertThat(ss) + .describedAs("snapshot toString()") + .contains("c1=0") + .contains("g1=1"); + } + + @Test + public void testWrap() throws Throwable { + IOStatisticsSource statisticsSource = IOStatisticsBinding.wrap(snapshot); + assertThat(statisticsSource.getIOStatistics()) + .isSameAs(snapshot); + } + + @Test + public void testJsonRoundTrip() throws Throwable { + JsonSerialization serializer + = IOStatisticsSnapshot.serializer(); + + String json = serializer.toJson(snapshot); + LOG.info("serialized form\n{}", json); + IOStatisticsSnapshot deser = serializer.fromJson(json); + verifyDeserializedInstance(deser); + } + + /** + * Verify the deserialized instance's data + * matches the expected values. + * @param deser deserialized vlaue. + */ + public void verifyDeserializedInstance( + final IOStatistics deser) { + LOG.info("deserialized {}", deser); + verifyStatisticCounterValue(deser, "c1", 0L); + verifyStatisticGaugeValue(deser, "g1", 1L); + verifyStatisticMinimumValue(deser, "m1", -1L); + verifyStatisticMeanValue(deser, "mean0", + new MeanStatistic(0, 1)); + verifyStatisticMeanValue(deser, "mean1", + snapshot.meanStatistics().get("mean1")); + } + + @Test + public void testJavaRoundTrip() throws Throwable { + verifyDeserializedInstance( + IOStatisticAssertions.statisticsJavaRoundTrip( + snapshot)); + + + } + +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestIOStatisticsStore.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestIOStatisticsStore.java new file mode 100644 index 0000000000000..778eab8315aa5 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestIOStatisticsStore.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import org.assertj.core.api.Assertions; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.fs.statistics.impl.IOStatisticsStore; +import org.apache.hadoop.test.AbstractHadoopTestBase; +import org.apache.hadoop.util.JsonSerialization; + +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.assertThatStatisticMeanMatches; +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticCounterValue; +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticGaugeValue; +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticMaximumValue; +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.verifyStatisticMinimumValue; +import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.snapshotIOStatistics; +import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.iostatisticsStore; + +/** + * Test the IOStatisticStore implementation. + */ +public class TestIOStatisticsStore extends AbstractHadoopTestBase { + + private static final Logger LOG = + LoggerFactory.getLogger(TestIOStatisticsStore.class); + + + private static final String COUNT = "count"; + + private static final String GAUGE = "gauge"; + + private static final String MIN = "min"; + + private static final String MAX = "max"; + + private static final String MEAN = "mean"; + + public static final String UNKNOWN = "unknown"; + + private IOStatisticsStore stats; + + @Before + public void setup() { + stats = iostatisticsStore() + .withCounters(COUNT) + .withGauges(GAUGE) + .withMinimums(MIN) + .withMaximums(MAX) + .withMeanStatistics(MEAN) + .build(); + } + + @After + public void teardown() { + LOG.info("stats {}", stats); + } + + /** + * Gauges go up and down. 
+ */ + @Test + public void testGauges() throws Throwable { + stats.setGauge(GAUGE, 1); + verifyStatisticGaugeValue(stats, GAUGE, 1); + stats.incrementGauge(GAUGE, 1); + verifyStatisticGaugeValue(stats, GAUGE, 2); + stats.setGauge(GAUGE, -1); + verifyStatisticGaugeValue(stats, GAUGE, -1); + Assertions.assertThat(stats.incrementGauge(GAUGE, -1)) + .isEqualTo(-2); + verifyStatisticGaugeValue(stats, GAUGE, -2); + Assertions.assertThat(stats.getGaugeReference(GAUGE).get()) + .isEqualTo(-2); + stats.setGauge(UNKNOWN, 1); + Assertions.assertThat(stats.incrementGauge(UNKNOWN, 1)) + .isEqualTo(0); + } + + @Test + public void testMinimums() throws Throwable { + stats.setMinimum(MIN, 100); + verifyStatisticMinimumValue(stats, MIN, 100); + stats.setMinimum(MIN, 100); + // will do nothing as it is higher + stats.addMinimumSample(MIN, 200); + verifyStatisticMinimumValue(stats, MIN, 100); + stats.addMinimumSample(MIN, 10); + verifyStatisticMinimumValue(stats, MIN, 10); + stats.setMinimum(UNKNOWN, 100); + stats.addMinimumSample(UNKNOWN, 200); + } + + @Test + public void testMaximums() throws Throwable { + stats.setMaximum(MAX, 100); + verifyStatisticMaximumValue(stats, MAX, 100); + stats.setMaximum(MAX, 100); + stats.addMaximumSample(MAX, 200); + verifyStatisticMaximumValue(stats, MAX, 200); + stats.addMaximumSample(MAX, 10); + verifyStatisticMaximumValue(stats, MAX, 200); + stats.setMaximum(UNKNOWN, 100); + stats.addMaximumSample(UNKNOWN, 200); + } + + @Test + public void testMeans() throws Throwable { + stats.setMeanStatistic(MEAN, + new MeanStatistic(1, 1)); + + assertThatStatisticMeanMatches(stats, MEAN, 1, 1) + .matches(p -> p.mean() == 1, "mean"); + stats.addMeanStatisticSample(MEAN, 9); + assertThatStatisticMeanMatches(stats, MEAN, 2, 10) + .matches(p -> p.mean() == 5, "mean"); + } + + @Test + public void testRoundTrip() throws Throwable { + JsonSerialization serializer + = IOStatisticsSnapshot.serializer(); + stats.incrementCounter(COUNT); + stats.setGauge(GAUGE, -1); + stats.addMaximumSample(MAX, 200); + stats.addMinimumSample(MIN, -100); + stats.addMeanStatisticSample(MEAN, 1); + stats.addMeanStatisticSample(MEAN, 9); + + String json = serializer.toJson(snapshotIOStatistics(stats)); + LOG.info("serialized form\n{}", json); + IOStatisticsSnapshot deser = serializer.fromJson(json); + LOG.info("deserialized {}", deser); + verifyStatisticCounterValue(deser, COUNT, 1L); + verifyStatisticGaugeValue(deser, GAUGE, -1); + verifyStatisticMaximumValue(deser, MAX, 200); + verifyStatisticMinimumValue(deser, MIN, -100); + assertThatStatisticMeanMatches(deser, MEAN, 2, 10) + .matches(p -> p.mean() == 5, "mean"); + + } + + @Test + public void testUnknownCounter() throws Throwable { + Assertions.assertThat(stats.incrementCounter("unknown", -10)) + .isEqualTo(0); + } + + @Test + public void testNegativeCounterIncrementIgnored() throws Throwable { + Assertions.assertThat(stats.incrementCounter(COUNT, 2)) + .isEqualTo(2); + Assertions.assertThat(stats.incrementCounter(COUNT, -10)) + .isEqualTo(2); + } + +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestMeanStatistic.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestMeanStatistic.java new file mode 100644 index 0000000000000..749a6ee4d9eb4 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/statistics/TestMeanStatistic.java @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.statistics; + +import org.assertj.core.api.Assertions; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.test.AbstractHadoopTestBase; +import org.apache.hadoop.util.JsonSerialization; + +/** + * Test the {@link MeanStatistic} class. + */ +public class TestMeanStatistic extends AbstractHadoopTestBase { + + private static final Logger LOG = + LoggerFactory.getLogger(TestMeanStatistic.class); + + private static final int TEN = 10; + + private static final double ZEROD = 0.0d; + + private static final double TEND = 10.0d; + + private final MeanStatistic empty = new MeanStatistic(0, 0); + + private final MeanStatistic tenFromOne = new MeanStatistic(1, TEN); + + private final MeanStatistic tenFromTen = new MeanStatistic(TEN, TEN); + + @Test + public void testEmptiness() throws Throwable { + Assertions.assertThat(empty) + .matches(MeanStatistic::isEmpty, "is empty") + .isEqualTo(new MeanStatistic(0, TEN)) + .isEqualTo(new MeanStatistic()) + .isNotEqualTo(tenFromOne); + Assertions.assertThat(empty.mean()) + .isEqualTo(ZEROD); + Assertions.assertThat(empty.toString()) + .contains("0.0"); + } + + @Test + public void testTenFromOne() throws Throwable { + Assertions.assertThat(tenFromOne) + .matches(p -> !p.isEmpty(), "is not empty") + .isEqualTo(tenFromOne) + .isNotEqualTo(tenFromTen); + Assertions.assertThat(tenFromOne.mean()) + .isEqualTo(TEND); + } + + @Test + public void testNegativeSamplesAreEmpty() throws Throwable { + MeanStatistic stat = new MeanStatistic(-10, 1); + Assertions.assertThat(stat) + .describedAs("stat with negative samples") + .matches(MeanStatistic::isEmpty, "is empty") + .isEqualTo(empty) + .extracting(MeanStatistic::mean) + .isEqualTo(ZEROD); + Assertions.assertThat(stat.toString()) + .contains("0.0"); + + } + + @Test + public void testCopyNonEmpty() throws Throwable { + MeanStatistic stat = tenFromOne.copy(); + Assertions.assertThat(stat) + .describedAs("copy of " + tenFromOne) + .isEqualTo(tenFromOne) + .isNotSameAs(tenFromOne); + } + + @Test + public void testCopyEmpty() throws Throwable { + MeanStatistic stat = empty.copy(); + Assertions.assertThat(stat) + .describedAs("copy of " + empty) + .isEqualTo(empty) + .isNotSameAs(empty); + } + + @Test + public void testDoubleSamples() throws Throwable { + MeanStatistic stat = tenFromOne.copy(); + Assertions.assertThat(stat.add(tenFromOne)) + .isEqualTo(new MeanStatistic(2, 20)) + .extracting(MeanStatistic::mean) + .isEqualTo(TEND); + } + + @Test + public void testAddEmptyR() throws Throwable { + MeanStatistic stat = tenFromOne.copy(); + Assertions.assertThat(stat.add(empty)) + .isEqualTo(tenFromOne); + } + + @Test + public void testAddEmptyL() throws Throwable { + MeanStatistic stat = empty.copy(); + 
Assertions.assertThat(stat.add(tenFromOne)) + .isEqualTo(tenFromOne); + } + + @Test + public void testAddEmptyLR() throws Throwable { + MeanStatistic stat = empty.copy(); + Assertions.assertThat(stat.add(empty)) + .isEqualTo(empty); + } + + @Test + public void testAddSampleToEmpty() throws Throwable { + MeanStatistic stat = empty.copy(); + stat.addSample(TEN); + Assertions.assertThat(stat) + .isEqualTo(tenFromOne); + } + + @Test + public void testAddZeroValueSamples() throws Throwable { + MeanStatistic stat = tenFromOne.copy(); + for (int i = 0; i < 9; i++) { + stat.addSample(0); + } + Assertions.assertThat(stat) + .isEqualTo(tenFromTen); + } + + @Test + public void testSetSamples() throws Throwable { + MeanStatistic stat = tenFromOne.copy(); + stat.setSamples(10); + Assertions.assertThat(stat) + .isEqualTo(tenFromTen); + } + + @Test + public void testSetSums() throws Throwable { + MeanStatistic stat = tenFromOne.copy(); + stat.setSum(100); + stat.setSamples(20); + Assertions.assertThat(stat) + .isEqualTo(new MeanStatistic(20, 100)) + .extracting(MeanStatistic::mean) + .isEqualTo(5.0d); + } + + @Test + public void testSetNegativeSamplesMakesEmpty() throws Throwable { + MeanStatistic stat = tenFromOne.copy(); + stat.setSamples(-3); + Assertions.assertThat(stat) + .isEqualTo(empty); + } + + @Test + public void testJsonRoundTrip() throws Throwable { + JsonSerialization serializer = serializer(); + + String json = serializer.toJson(tenFromTen); + LOG.info("serialized form\n{}", json); + Assertions.assertThat(json) + .describedAs("JSON form of %s", tenFromTen) + .doesNotContain("empty") + .doesNotContain("mean"); + + MeanStatistic deser = serializer.fromJson(json); + LOG.info("deserialized {}", deser); + Assertions.assertThat(deser) + .isEqualTo(tenFromTen); + } + + /** + * negative sample counts in the json convert the stat to being empty. + */ + @Test + public void testHandleMaliciousStat() throws Throwable { + String json = "{\n" + + " \"sum\" : 10,\n" + + " \"samples\" : -10\n" + + "}"; + JsonSerialization serializer = serializer(); + MeanStatistic deser = serializer.fromJson(json); + LOG.info("deserialized {}", deser); + Assertions.assertThat(deser) + .isEqualTo(empty); + } + + /** + * Get a JSON serializer. + * @return a serializer. + */ + public static JsonSerialization serializer() { + return new JsonSerialization<>(MeanStatistic.class, true, true); + } +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/functional/TestRemoteIterators.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/functional/TestRemoteIterators.java new file mode 100644 index 0000000000000..8cd5c58585e6f --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/functional/TestRemoteIterators.java @@ -0,0 +1,469 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.util.functional; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Iterator; +import java.util.NoSuchElementException; + +import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.fs.statistics.IOStatistics; +import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot; +import org.apache.hadoop.fs.statistics.IOStatisticsSource; +import org.apache.hadoop.test.AbstractHadoopTestBase; + +import static org.apache.hadoop.fs.statistics.IOStatisticAssertions.extractStatistics; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.apache.hadoop.util.functional.RemoteIterators.*; +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Test for {@link RemoteIterators}. + * + */ +public class TestRemoteIterators extends AbstractHadoopTestBase { + + private static final Logger LOG = LoggerFactory.getLogger( + TestRemoteIterators.class); + + private static final String[] DATA = {"a", "b", "c"}; + + /** Counter for lambda-expressions. */ + private int counter; + + @Test + public void testIterateArray() throws Throwable { + verifyInvoked(remoteIteratorFromArray(DATA), DATA.length, + (s) -> LOG.info(s)); + } + + @Test + public void testIterateArrayMapped() throws Throwable { + verifyInvoked( + mappingRemoteIterator( + remoteIteratorFromArray(DATA), + (d) -> { + counter += d.length(); + return d; + }), + DATA.length, + this::log); + assertCounterValue(3); + } + + public void log(Object o) { + LOG.info("{}", o); + } + + /** + * Singleton is iterated through once. + * The toString() call is passed through. + */ + @Test + public void testSingleton() throws Throwable { + StringBuffer result = new StringBuffer(); + String name = "singleton"; + RemoteIterator it = remoteIteratorFromSingleton(name); + assertStringValueContains(it, "SingletonIterator"); + assertStringValueContains(it, name); + verifyInvoked( + it, + 1, + (s) -> result.append(s)); + assertThat(result.toString()) + .isEqualTo(name); + } + + @Test + public void testSingletonNotClosed() throws Throwable { + CloseCounter closeCounter = new CloseCounter(); + RemoteIterator it = remoteIteratorFromSingleton(closeCounter); + verifyInvoked(it, 1, this::log); + close(it); + closeCounter.assertCloseCount(0); + } + + /** + * A null singleton is not an error. + */ + @Test + public void testNullSingleton() throws Throwable { + verifyInvoked(remoteIteratorFromSingleton(null), 0, this::log); + } + + + /** + * If you create a singleton iterator and it is an IOStatisticsSource, + * then that is the statistics which can be extracted from the + * iterator. + */ + @Test + public void testSingletonStats() throws Throwable { + IOStatsInstance singleton = new IOStatsInstance(); + RemoteIterator it + = remoteIteratorFromSingleton(singleton); + extractStatistics(it); + } + + /** + * The mapping remote iterator passes IOStatistics + * calls down. + */ + @Test + public void testMappedSingletonStats() throws Throwable { + IOStatsInstance singleton = new IOStatsInstance(); + RemoteIterator it + = mappingRemoteIterator(remoteIteratorFromSingleton(singleton), + Object::toString); + verifyInvoked(it, 1, this::log); + extractStatistics(it); + } + + /** + * Close() calls are passed through. 
+ */ + @Test + public void testClosePassthrough() throws Throwable { + CountdownRemoteIterator countdown = new CountdownRemoteIterator(0); + RemoteIterator it = mappingRemoteIterator( + countdown, + i -> i); + verifyInvoked(it, 0, this::log); + // the foreach() operation called close() + countdown.assertCloseCount(1); + extractStatistics(countdown); + ((Closeable)it).close(); + countdown.assertCloseCount(1); + } + + @Test + public void testMapping() throws Throwable { + CountdownRemoteIterator countdown = new CountdownRemoteIterator(100); + RemoteIterator it = mappingRemoteIterator( + countdown, + i -> i); + verifyInvoked(it, 100, c -> counter++); + assertCounterValue(100); + extractStatistics(it); + assertStringValueContains(it, "CountdownRemoteIterator"); + close(it); + countdown.assertCloseCount(1); + } + + @Test + public void testFiltering() throws Throwable { + CountdownRemoteIterator countdown = new CountdownRemoteIterator(100); + // only even numbers are passed through + RemoteIterator it = filteringRemoteIterator( + countdown, + i -> (i % 2) == 0); + verifyInvoked(it, 50, c -> counter++); + assertCounterValue(50); + extractStatistics(it); + close(it); + countdown.assertCloseCount(1); + } + + /** + * A filter which accepts nothing results in + * an empty iteration. + */ + @Test + public void testFilterNoneAccepted() throws Throwable { + // nothing gets through + RemoteIterator it = filteringRemoteIterator( + new CountdownRemoteIterator(100), + i -> false); + verifyInvoked(it, 0, c -> counter++); + assertCounterValue(0); + extractStatistics(it); + } + + @Test + public void testFilterAllAccepted() throws Throwable { + // nothing gets through + RemoteIterator it = filteringRemoteIterator( + new CountdownRemoteIterator(100), + i -> true); + verifyInvoked(it, 100, c -> counter++); + assertStringValueContains(it, "CountdownRemoteIterator"); + } + + @Test + public void testJavaIteratorSupport() throws Throwable { + CountdownIterator countdownIterator = new CountdownIterator(100); + RemoteIterator it = remoteIteratorFromIterator( + countdownIterator); + verifyInvoked(it, 100, c -> counter++); + assertStringValueContains(it, "CountdownIterator"); + extractStatistics(it); + close(it); + countdownIterator.assertCloseCount(1); + } + + @Test + public void testJavaIterableSupport() throws Throwable { + CountdownIterable countdown = new CountdownIterable(100); + RemoteIterator it = remoteIteratorFromIterable( + countdown); + verifyInvoked(it, 100, c -> counter++); + assertStringValueContains(it, "CountdownIterator"); + extractStatistics(it); + // close the iterator + close(it); + countdown.assertCloseCount(0); + // and a new iterator can be crated + verifyInvoked(remoteIteratorFromIterable(countdown), + 100, c -> counter++); + } + + /** + * If a RemoteIterator is constructed from an iterable + * and that is to be closed, we close it. 
+ */ + @Test + public void testJavaIterableClose() throws Throwable { + CountdownIterable countdown = new CountdownIterable(100); + RemoteIterator it = closingRemoteIterator( + remoteIteratorFromIterable(countdown), + countdown); + verifyInvoked(it, 100, c -> counter++); + assertStringValueContains(it, "CountdownIterator"); + extractStatistics(it); + + // verify the iterator was self closed in hasNext() + countdown.assertCloseCount(1); + + // explicitly close the iterator + close(it); + countdown.assertCloseCount(1); + // and a new iterator cannot be created + intercept(IllegalStateException.class, () -> + remoteIteratorFromIterable(countdown)); + } + + /** + * If a RemoteIterator is constructed from an iterable + * and that is to be closed, we close it. + */ + @SuppressWarnings("InfiniteLoopStatement") + @Test + public void testJavaIterableCloseInNextLoop() throws Throwable { + CountdownIterable countdown = new CountdownIterable(100); + RemoteIterator it = closingRemoteIterator( + remoteIteratorFromIterable(countdown), + countdown); + try { + while(true) { + it.next(); + } + } catch (NoSuchElementException expected) { + + } + // verify the iterator was self closed in next() + countdown.assertCloseCount(1); + + } + + /** + * assert that the string value of an object contains the + * expected text. + * @param o object + * @param expected expected text + */ + protected void assertStringValueContains( + final Object o, + final String expected) { + assertThat(o.toString()) + .describedAs("Object string value") + .contains(expected); + } + + /** + * Assert that the counter field is at a specific value. + * @param expected counter + */ + protected void assertCounterValue(final int expected) { + assertThat(counter) + .describedAs("Counter value") + .isEqualTo(expected); + } + + /** + * Verify that the iteration completes with a given size. + * @param it iterator + * @param type. + * @param length expected size + * @param consumer consumer + */ + protected void verifyInvoked(final RemoteIterator it, + int length, + ConsumerRaisingIOE consumer) + throws IOException { + assertThat(foreach(it, consumer)) + .describedAs("Scan through iterator %s", it) + .isEqualTo(length); + } + + /** + * Close an iterator if it is iterable. + * @param it iterator + * @param type. + */ + private void close(final RemoteIterator it) throws IOException { + if (it instanceof Closeable) { + ((Closeable) it).close(); + } + } + + /** + * Class whose close() call increments a counter. + */ + private static class CloseCounter extends + IOStatsInstance implements Closeable { + + private int closeCount; + + @Override + public void close() throws IOException { + closeCount++; + LOG.info("close ${}", closeCount); + } + + public int getCloseCount() { + return closeCount; + } + + public void reset() { + closeCount = 0; + } + + public void assertCloseCount(int expected) { + assertThat(closeCount) + .describedAs("Close count") + .isEqualTo(expected); + } + + } + + /** + * Simple class to implement IOStatistics. + */ + private static class IOStatsInstance implements IOStatisticsSource { + + private IOStatisticsSnapshot stats = new IOStatisticsSnapshot(); + + @Override + public IOStatistics getIOStatistics() { + return stats; + } + + } + + /** + * Iterator which counts down. 
+ */ + private static final class CountdownRemoteIterator extends CloseCounter + implements RemoteIterator { + + private int limit; + + private CountdownRemoteIterator(final int limit) { + this.limit = limit; + } + + @Override + public boolean hasNext() throws IOException { + return limit > 0; + } + + @Override + public Integer next() throws IOException { + return limit--; + } + + @Override + public String toString() { + return "CountdownRemoteIterator{" + + "limit=" + limit + + '}'; + } + } + + /** + * Iterator which counts down. + */ + private static final class CountdownIterator extends CloseCounter + implements Iterator { + + private int limit; + + private CountdownIterator(final int limit) { + this.limit = limit; + } + + @Override + public boolean hasNext() { + return limit > 0; + } + + @Override + public Integer next() { + if (!hasNext()) { + throw new NoSuchElementException("limit reached"); + } + return limit--; + } + + @Override + public String toString() { + return "CountdownIterator{" + + "limit=" + limit + + '}'; + } + } + + /** + * Iterable for countdown iterators. + * Once closed, calls to iterator() raise an exception. + */ + private static final class CountdownIterable extends CloseCounter + implements Iterable { + + private int limit; + + private CountdownIterable(final int limit) { + this.limit = limit; + } + + @Override + public Iterator iterator() { + Preconditions.checkState(getCloseCount() == 0); + + return new CountdownIterator(limit); + } + } + +} From eca7b37dffff8c513a418e8148239d5250f5ca47 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Mon, 14 Jan 2019 23:29:27 +0530 Subject: [PATCH 32/40] HADOOP-14556. S3A to support Delegation Tokens. Contributed by Steve Loughran and Daryn Sharp. --- .../apache/hadoop/fs/StorageStatistics.java | 10 +- .../java/org/apache/hadoop/mapreduce/Job.java | 10 +- hadoop-project/pom.xml | 5 + .../fs/s3a/AWSCredentialProviderList.java | 80 +- .../org/apache/hadoop/fs/s3a/Constants.java | 51 +- .../hadoop/fs/s3a/DefaultS3ClientFactory.java | 11 +- .../org/apache/hadoop/fs/s3a/Invoker.java | 2 +- .../java/org/apache/hadoop/fs/s3a/S3A.java | 12 +- .../hadoop/fs/s3a/S3AEncryptionMethods.java | 43 +- .../apache/hadoop/fs/s3a/S3AFileSystem.java | 244 +---- .../org/apache/hadoop/fs/s3a/S3AUtils.java | 143 ++- .../apache/hadoop/fs/s3a/S3ClientFactory.java | 6 +- .../fs/s3a/SimpleAWSCredentialsProvider.java | 33 +- .../org/apache/hadoop/fs/s3a/Statistic.java | 7 +- .../s3a/TemporaryAWSCredentialsProvider.java | 89 +- .../auth/AbstractAWSCredentialProvider.java | 70 ++ .../AbstractSessionCredentialsProvider.java | 170 ++++ .../auth/AssumedRoleCredentialProvider.java | 41 +- .../auth/IAMInstanceCredentialsProvider.java | 75 ++ .../s3a/auth/MarshalledCredentialBinding.java | 205 +++++ .../auth/MarshalledCredentialProvider.java | 92 ++ .../fs/s3a/auth/MarshalledCredentials.java | 409 ++++++++ .../fs/s3a/auth/NoAuthWithAWSException.java | 8 +- .../s3a/auth/NoAwsCredentialsException.java | 69 ++ .../hadoop/fs/s3a/auth/RolePolicies.java | 190 +++- .../hadoop/fs/s3a/auth/STSClientFactory.java | 173 +++- .../auth/delegation/AWSPolicyProvider.java | 59 ++ .../auth/delegation/AbstractDTService.java | 154 ++++ .../AbstractDelegationTokenBinding.java | 305 ++++++ .../AbstractS3ATokenIdentifier.java | 305 ++++++ .../auth/delegation/DelegationConstants.java | 165 ++++ .../DelegationTokenIOException.java | 50 + .../EncryptionSecretOperations.java | 73 ++ .../auth/delegation/EncryptionSecrets.java | 221 +++++ .../FullCredentialsTokenBinding.java | 172 ++++ 
.../FullCredentialsTokenIdentifier.java | 50 + .../s3a/auth/delegation/RoleTokenBinding.java | 176 ++++ .../auth/delegation/RoleTokenIdentifier.java} | 40 +- .../auth/delegation/S3ADelegationTokens.java | 685 ++++++++++++++ .../fs/s3a/auth/delegation/S3ADtFetcher.java | 80 ++ .../auth/delegation/SessionTokenBinding.java | 421 +++++++++ .../delegation/SessionTokenIdentifier.java | 146 +++ .../fs/s3a/auth/delegation/package-info.java | 34 + .../hadoop/fs/s3a/auth/package-info.java | 6 +- .../hadoop/fs/s3a/commit/DurationInfo.java | 39 +- .../fs/s3a/s3guard/DynamoDBMetadataStore.java | 37 +- .../hadoop/fs/s3native/S3xLoginHelper.java | 2 - ...org.apache.hadoop.security.token.DtFetcher | 18 + ...ache.hadoop.security.token.TokenIdentifier | 20 + .../tools/hadoop-aws/assumed_roles.md | 289 +++--- .../delegation_token_architecture.md | 466 ++++++++++ .../tools/hadoop-aws/delegation_tokens.md | 870 ++++++++++++++++++ .../site/markdown/tools/hadoop-aws/index.md | 87 +- .../site/markdown/tools/hadoop-aws/testing.md | 44 +- .../tools/hadoop-aws/troubleshooting_s3a.md | 18 +- .../hadoop/fs/s3a/AbstractS3ATestBase.java | 6 +- ...SSEKMSUserDefinedKeyBlockOutputStream.java | 50 - .../fs/s3a/ITestS3ATemporaryCredentials.java | 364 +++++++- .../hadoop/fs/s3a/MockS3AFileSystem.java | 19 +- .../hadoop/fs/s3a/MockS3ClientFactory.java | 3 +- .../hadoop/fs/s3a/S3ATestConstants.java | 20 + .../apache/hadoop/fs/s3a/S3ATestUtils.java | 79 +- .../fs/s3a/TestS3AAWSCredentialsProvider.java | 218 +++-- .../hadoop/fs/s3a/TestSSEConfiguration.java | 25 + .../hadoop/fs/s3a/auth/ITestAssumeRole.java | 55 +- .../ITestAssumedRoleCommitOperations.java | 2 +- .../hadoop/fs/s3a/auth/RoleTestUtils.java | 41 +- .../s3a/auth/TestMarshalledCredentials.java | 138 +++ .../auth/delegation/AbstractDelegationIT.java | 207 +++++ .../delegation/CountInvocationsProvider.java | 52 ++ .../hadoop/fs/s3a/auth/delegation/Csvout.java | 103 +++ .../delegation/ILoadTestRoleCredentials.java} | 24 +- .../ILoadTestSessionCredentials.java | 295 ++++++ .../auth/delegation/ITestDelegatedMRJob.java | 272 ++++++ .../ITestRoleDelegationInFileystem.java | 68 ++ .../delegation/ITestRoleDelegationTokens.java | 122 +++ .../ITestSessionDelegationInFileystem.java | 727 +++++++++++++++ .../ITestSessionDelegationTokens.java | 282 ++++++ .../MiniKerberizedHadoopCluster.java | 378 ++++++++ .../TestS3ADelegationTokenSupport.java | 171 ++++ .../s3a/commit/staging/StagingTestBase.java | 28 +- .../commit/staging/TestStagingCommitter.java | 6 +- .../TestStagingDirectoryOutputCommitter.java | 22 +- .../TestStagingPartitionedFileListing.java | 4 +- .../TestStagingPartitionedJobCommit.java | 4 +- .../TestStagingPartitionedTaskCommit.java | 24 +- .../ITestS3AFileContextStatistics.java | 20 +- .../hadoop/fs/s3a/scale/NanoTimerStats.java | 192 ++++ .../fs/s3a/yarn/ITestS3AMiniYarnCluster.java | 38 +- .../org/apache/hadoop/mapreduce/MockJob.java | 115 +++ 90 files changed, 10542 insertions(+), 912 deletions(-) create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/AbstractAWSCredentialProvider.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/AbstractSessionCredentialsProvider.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/IAMInstanceCredentialsProvider.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/MarshalledCredentialBinding.java create mode 100644 
hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/MarshalledCredentialProvider.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/MarshalledCredentials.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/NoAwsCredentialsException.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AWSPolicyProvider.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDTService.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDelegationTokenBinding.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractS3ATokenIdentifier.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/DelegationConstants.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/DelegationTokenIOException.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/EncryptionSecretOperations.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/EncryptionSecrets.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/FullCredentialsTokenBinding.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/FullCredentialsTokenIdentifier.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/RoleTokenBinding.java rename hadoop-tools/hadoop-aws/src/{test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSECBlockOutputStream.java => main/java/org/apache/hadoop/fs/s3a/auth/delegation/RoleTokenIdentifier.java} (52%) create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/S3ADelegationTokens.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/S3ADtFetcher.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/SessionTokenBinding.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/SessionTokenIdentifier.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/package-info.java create mode 100644 hadoop-tools/hadoop-aws/src/main/resources/META-INF/services/org.apache.hadoop.security.token.DtFetcher create mode 100644 hadoop-tools/hadoop-aws/src/main/resources/META-INF/services/org.apache.hadoop.security.token.TokenIdentifier create mode 100644 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_token_architecture.md create mode 100644 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_tokens.md delete mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSEKMSUserDefinedKeyBlockOutputStream.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/TestMarshalledCredentials.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDelegationIT.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/CountInvocationsProvider.java create mode 100644 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/Csvout.java rename hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/{ITestS3AEncryptionSSES3BlockOutputStream.java => auth/delegation/ILoadTestRoleCredentials.java} (55%) create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ILoadTestSessionCredentials.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestDelegatedMRJob.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestRoleDelegationInFileystem.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestRoleDelegationTokens.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestSessionDelegationInFileystem.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestSessionDelegationTokens.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/MiniKerberizedHadoopCluster.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/TestS3ADelegationTokenSupport.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/NanoTimerStats.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/mapreduce/MockJob.java diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StorageStatistics.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StorageStatistics.java index 5a3d7363107ee..74631b5695537 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StorageStatistics.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/StorageStatistics.java @@ -18,6 +18,7 @@ package org.apache.hadoop.fs; import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; import java.util.Iterator; @@ -37,9 +38,13 @@ public abstract class StorageStatistics { * * When adding new common statistic name constants, please make them unique. * By convention, they are implicitly unique: - * - the name of the constants are uppercase, words separated by underscores. - * - the value of the constants are lowercase of the constant names. + *
<ul>
+   *   <li>the name of the constants are uppercase, words separated by
+   *   underscores.</li>
+   *   <li>the value of the constants are lowercase of the constant names.</li>
+   * </ul>
    */ + @InterfaceStability.Evolving public interface CommonStatisticNames { // The following names are for file system operation invocations String OP_APPEND = "op_append"; @@ -49,6 +54,7 @@ public interface CommonStatisticNames { String OP_DELETE = "op_delete"; String OP_EXISTS = "op_exists"; String OP_GET_CONTENT_SUMMARY = "op_get_content_summary"; + String OP_GET_DELEGATION_TOKEN = "op_get_delegation_token"; String OP_GET_FILE_CHECKSUM = "op_get_file_checksum"; String OP_GET_FILE_STATUS = "op_get_file_status"; String OP_GET_STATUS = "op_get_status"; diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java index d7fa75d7e8903..9a998dacd9820 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/Job.java @@ -42,6 +42,8 @@ import org.apache.hadoop.mapreduce.util.ConfigUtil; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.api.records.ReservationId; + +import com.google.common.annotations.VisibleForTesting; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -1524,7 +1526,10 @@ public static Map getArchiveSharedCacheUploadPolicies( return getSharedCacheUploadPolicies(conf, false); } - private synchronized void connect() + /** Only for mocking via unit tests. */ + @Private + @VisibleForTesting + synchronized void connect() throws IOException, InterruptedException, ClassNotFoundException { if (cluster == null) { cluster = @@ -1544,7 +1549,8 @@ boolean isConnected() { /** Only for mocking via unit tests. */ @Private - public JobSubmitter getJobSubmitter(FileSystem fs, + @VisibleForTesting + JobSubmitter getJobSubmitter(FileSystem fs, ClientProtocol submitClient) throws IOException { return new JobSubmitter(fs, submitClient); } diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml index c3837cd8b5620..171a9299a94ee 100644 --- a/hadoop-project/pom.xml +++ b/hadoop-project/pom.xml @@ -1732,6 +1732,11 @@ snakeyaml ${snakeyaml.version} + + org.hamcrest + hamcrest-library + 1.3 + org.assertj assertj-core diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSCredentialProviderList.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSCredentialProviderList.java index f9052fa97b9f8..542e6f4871cf3 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSCredentialProviderList.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/AWSCredentialProviderList.java @@ -21,6 +21,7 @@ import java.io.Closeable; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.List; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; @@ -39,6 +40,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.s3a.auth.NoAuthWithAWSException; +import org.apache.hadoop.fs.s3a.auth.NoAwsCredentialsException; import org.apache.hadoop.io.IOUtils; /** @@ -52,7 +54,8 @@ * an {@link AmazonClientException}, that is rethrown, rather than * swallowed. *
<li>Has some more diagnostics.</li>
- * <li>On failure, the last AmazonClientException raised is rethrown.</li>
+ * <li>On failure, the last "relevant" AmazonClientException raised is
+ * rethrown; exceptions other than 'no credentials' have priority.</li>
 * <li>Special handling of {@link AnonymousAWSCredentials}.</li>
  • * */ @@ -78,6 +81,12 @@ public class AWSCredentialProviderList implements AWSCredentialsProvider, private final AtomicBoolean closed = new AtomicBoolean(false); + /** + * The name, which is empty by default. + * Uses in the code assume if non empty there's a trailing space. + */ + private String name = ""; + /** * Empty instance. This is not ready to be used. */ @@ -93,6 +102,29 @@ public AWSCredentialProviderList( this.providers.addAll(providers); } + /** + * Create with an initial list of providers. + * @param name name for error messages, may be "" + * @param providerArgs provider list. + */ + public AWSCredentialProviderList(final String name, + final AWSCredentialsProvider... providerArgs) { + setName(name); + Collections.addAll(providers, providerArgs); + } + + /** + * Set the name; adds a ": " if needed. + * @param name name to add, or "" for no name. + */ + public void setName(final String name) { + if (!name.isEmpty() && !name.endsWith(": ")) { + this.name = name + ": "; + } else { + this.name = name; + } + } + /** * Add a new provider. * @param p provider @@ -101,6 +133,14 @@ public void add(AWSCredentialsProvider p) { providers.add(p); } + /** + * Add all providers from another list to this one. + * @param other the other list. + */ + public void addAll(AWSCredentialProviderList other) { + providers.addAll(other.providers); + } + /** * Refresh all child entries. */ @@ -123,7 +163,7 @@ public void refresh() { public AWSCredentials getCredentials() { if (isClosed()) { LOG.warn(CREDENTIALS_REQUESTED_WHEN_CLOSED); - throw new NoAuthWithAWSException( + throw new NoAuthWithAWSException(name + CREDENTIALS_REQUESTED_WHEN_CLOSED); } checkNotEmpty(); @@ -135,6 +175,8 @@ public AWSCredentials getCredentials() { for (AWSCredentialsProvider provider : providers) { try { AWSCredentials credentials = provider.getCredentials(); + Preconditions.checkNotNull(credentials, + "Null credentials returned by %s", provider); if ((credentials.getAWSAccessKeyId() != null && credentials.getAWSSecretKey() != null) || (credentials instanceof AnonymousAWSCredentials)) { @@ -142,6 +184,18 @@ public AWSCredentials getCredentials() { LOG.debug("Using credentials from {}", provider); return credentials; } + } catch (NoAwsCredentialsException e) { + // don't bother with the stack trace here as it is usually a + // minor detail. + + // only update the last exception if it isn't set. + // Why so? Stops delegation token issues being lost on the fallback + // values. + if (lastException == null) { + lastException = e; + } + LOG.debug("No credentials from {}: {}", + provider, e.toString()); } catch (AmazonClientException e) { lastException = e; LOG.debug("No credentials provided by {}: {}", @@ -151,12 +205,16 @@ public AWSCredentials getCredentials() { // no providers had any credentials. Rethrow the last exception // or create a new one. 
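Usage sketch (not part of this patch) of how such a chain is assembled and queried, using only constructors and methods visible in this diff; the bucket URI is a placeholder.

    import java.net.URI;

    import com.amazonaws.auth.AWSCredentials;
    import com.amazonaws.auth.EnvironmentVariableCredentialsProvider;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.s3a.AWSCredentialProviderList;
    import org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider;
    import org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider;

    public class ProviderChainSketch {

      public static AWSCredentials resolve(Configuration conf) throws Exception {
        URI fsUri = new URI("s3a://example-bucket/");       // placeholder bucket
        AWSCredentialProviderList providers = new AWSCredentialProviderList(
            "delegation",                                    // name prefixes error messages
            new TemporaryAWSCredentialsProvider(fsUri, conf),
            new SimpleAWSCredentialsProvider(fsUri, conf),
            new EnvironmentVariableCredentialsProvider());
        // Providers are tried in order; one that merely has no credentials
        // (NoAwsCredentialsException) does not mask an earlier, harder failure.
        return providers.getCredentials();
      }
    }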
- String message = "No AWS Credentials provided by " + String message = name + "No AWS Credentials provided by " + listProviderNames(); if (lastException != null) { message += ": " + lastException; } - throw new NoAuthWithAWSException(message, lastException); + if (lastException instanceof CredentialInitializationException) { + throw lastException; + } else { + throw new NoAuthWithAWSException(message, lastException); + } } /** @@ -175,7 +233,7 @@ List getProviders() { */ public void checkNotEmpty() { if (providers.isEmpty()) { - throw new NoAuthWithAWSException(NO_AWS_CREDENTIAL_PROVIDERS); + throw new NoAuthWithAWSException(name + NO_AWS_CREDENTIAL_PROVIDERS); } } @@ -198,8 +256,10 @@ public String listProviderNames() { @Override public String toString() { return "AWSCredentialProviderList[" + + name + "refcount= " + refCount.get() + ": [" + - StringUtils.join(providers, ", ") + ']'; + StringUtils.join(providers, ", ") + ']' + + (lastProvider != null ? (" last provider: " + lastProvider) : ""); } /** @@ -265,4 +325,12 @@ public void close() { } } } + + /** + * Get the size of this list. + * @return the number of providers in the list. + */ + public int size() { + return providers.size(); + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java index 57b437ef42b77..1f15efb7cd942 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java @@ -49,7 +49,7 @@ private Constants() { // s3 secret key public static final String SECRET_KEY = "fs.s3a.secret.key"; - // aws credentials provider + // aws credentials providers public static final String AWS_CREDENTIALS_PROVIDER = "fs.s3a.aws.credentials.provider"; @@ -61,18 +61,20 @@ private Constants() { public static final String S3A_SECURITY_CREDENTIAL_PROVIDER_PATH = "fs.s3a.security.credential.provider.path"; - // session token for when using TemporaryAWSCredentialsProvider + /** + * session token for when using TemporaryAWSCredentialsProvider: : {@value}. + */ public static final String SESSION_TOKEN = "fs.s3a.session.token"; /** - * AWS Role to request. + * ARN of AWS Role to request: {@value}. */ public static final String ASSUMED_ROLE_ARN = "fs.s3a.assumed.role.arn"; /** * Session name for the assumed role, must be valid characters according - * to the AWS APIs. + * to the AWS APIs: {@value}. * If not set, one is generated from the current Hadoop/Kerberos username. */ public static final String ASSUMED_ROLE_SESSION_NAME = @@ -84,34 +86,50 @@ private Constants() { public static final String ASSUMED_ROLE_SESSION_DURATION = "fs.s3a.assumed.role.session.duration"; - /** Security Token Service Endpoint. If unset, uses the default endpoint. */ + /** + * Security Token Service Endpoint: {@value}. + * If unset, uses the default endpoint. + */ public static final String ASSUMED_ROLE_STS_ENDPOINT = "fs.s3a.assumed.role.sts.endpoint"; /** - * Region for the STS endpoint; only relevant if the endpoint - * is set. + * Default endpoint for session tokens: {@value}. + * This is the central STS endpoint which, for v3 signing, can + * issue STS tokens for any region. + */ + public static final String DEFAULT_ASSUMED_ROLE_STS_ENDPOINT = ""; + + /** + * Region for the STS endpoint; needed if the endpoint + * is set to anything other then the central one.: {@value}. 
*/ public static final String ASSUMED_ROLE_STS_ENDPOINT_REGION = "fs.s3a.assumed.role.sts.endpoint.region"; /** * Default value for the STS endpoint region; needed for - * v4 signing. + * v4 signing: {@value}. */ - public static final String ASSUMED_ROLE_STS_ENDPOINT_REGION_DEFAULT = - "us-west-1"; + public static final String ASSUMED_ROLE_STS_ENDPOINT_REGION_DEFAULT = ""; /** - * Default duration of an assumed role. + * Default duration of an assumed role: {@value}. */ - public static final String ASSUMED_ROLE_SESSION_DURATION_DEFAULT = "30m"; + public static final String ASSUMED_ROLE_SESSION_DURATION_DEFAULT = "1h"; - /** list of providers to authenticate for the assumed role. */ + /** + * List of providers to authenticate for the assumed role: {@value}. + */ public static final String ASSUMED_ROLE_CREDENTIALS_PROVIDER = "fs.s3a.assumed.role.credentials.provider"; - /** JSON policy containing the policy to apply to the role. */ + /** + * JSON policy containing the policy to apply to the role: {@value}. + * This is not used for delegation tokens, which generate the policy + * automatically, and restrict it to the S3, KMS and S3Guard services + * needed. + */ public static final String ASSUMED_ROLE_POLICY = "fs.s3a.assumed.role.policy"; @@ -318,7 +336,10 @@ private Constants() { /** Prefix for S3A bucket-specific properties: {@value}. */ public static final String FS_S3A_BUCKET_PREFIX = "fs.s3a.bucket."; - public static final int S3A_DEFAULT_PORT = -1; + /** + * Default port for this is 443: HTTPS. + */ + public static final int S3A_DEFAULT_PORT = 443; public static final String USER_AGENT_PREFIX = "fs.s3a.user.agent.prefix"; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java index ade317fd60776..3e9368d10f624 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java @@ -28,6 +28,9 @@ import com.amazonaws.services.s3.S3ClientOptions; import org.slf4j.Logger; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; @@ -39,6 +42,8 @@ * This which calls the AWS SDK to configure and create an * {@link AmazonS3Client} that communicates with the S3 service. 
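Putting the assumed-role keys above together, a configuration might be built as in the following sketch; the role ARN is a placeholder and nothing here is mandated by the patch.

    import org.apache.hadoop.conf.Configuration;

    public class AssumedRoleConfigSketch {

      public static Configuration assumedRoleConf() {
        Configuration conf = new Configuration();
        conf.set("fs.s3a.aws.credentials.provider",
            "org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider");
        conf.set("fs.s3a.assumed.role.arn",
            "arn:aws:iam::111122223333:role/example-role");     // placeholder ARN
        conf.set("fs.s3a.assumed.role.session.duration", "1h"); // the new default above
        // fs.s3a.assumed.role.sts.endpoint is left unset: the central STS
        // endpoint is now the default, so no endpoint region is required.
        return conf;
      }
    }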
*/ +@InterfaceAudience.Private +@InterfaceStability.Unstable public class DefaultS3ClientFactory extends Configured implements S3ClientFactory { @@ -47,9 +52,13 @@ public class DefaultS3ClientFactory extends Configured @Override public AmazonS3 createS3Client(URI name, final String bucket, - final AWSCredentialsProvider credentials) throws IOException { + final AWSCredentialsProvider credentials, + final String userAgentSuffix) throws IOException { Configuration conf = getConf(); final ClientConfiguration awsConf = S3AUtils.createAwsConf(getConf(), bucket); + if (!StringUtils.isEmpty(userAgentSuffix)) { + awsConf.setUserAgentSuffix(userAgentSuffix); + } return configureAmazonS3Client( newAmazonS3Client(credentials, awsConf), conf); } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Invoker.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Invoker.java index 45912a0ac3dd6..68a69f39321be 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Invoker.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Invoker.java @@ -476,7 +476,7 @@ public void onFailure(String text, }; /** - * Log summary at info, full stack at debug. + * Log retries at debug. */ public static final Retried LOG_EVENT = new Retried() { @Override diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3A.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3A.java index d856d802d5c35..78643cc5e0424 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3A.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3A.java @@ -42,6 +42,16 @@ public S3A(URI theUri, Configuration conf) @Override public int getUriDefaultPort() { - return Constants.S3A_DEFAULT_PORT; + // return Constants.S3A_DEFAULT_PORT; + return super.getUriDefaultPort(); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("S3A{"); + sb.append("URI =").append(fsImpl.getUri()); + sb.append("; fsImpl=").append(fsImpl); + sb.append('}'); + return sb.toString(); } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AEncryptionMethods.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AEncryptionMethods.java index e718cd4caa45b..85a00b11b73ea 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AEncryptionMethods.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AEncryptionMethods.java @@ -25,27 +25,44 @@ /** * This enum is to centralize the encryption methods and * the value required in the configuration. + * + * There's two enum values for the two client encryption mechanisms the AWS + * S3 SDK supports, even though these are not currently supported in S3A. + * This is to aid supporting CSE in some form in future, fundamental + * issues about file length of encrypted data notwithstanding. 
+ * */ public enum S3AEncryptionMethods { - SSE_S3("AES256"), - SSE_KMS("SSE-KMS"), - SSE_C("SSE-C"), - NONE(""); + NONE("", false), + SSE_S3("AES256", true), + SSE_KMS("SSE-KMS", true), + SSE_C("SSE-C", true), + CSE_KMS("CSE-KMS", false), + CSE_CUSTOM("CSE-CUSTOM", false); static final String UNKNOWN_ALGORITHM - = "Unknown Server Side Encryption algorithm "; + = "Unknown encryption algorithm "; private String method; + private boolean serverSide; - S3AEncryptionMethods(String method) { + S3AEncryptionMethods(String method, final boolean serverSide) { this.method = method; + this.serverSide = serverSide; } public String getMethod() { return method; } + /** + * Flag to indicate this is a server-side encryption option. + * @return true if this is server side. + */ + public boolean isServerSide() { + return serverSide; + } /** * Get the encryption mechanism from the value provided. @@ -57,16 +74,12 @@ public static S3AEncryptionMethods getMethod(String name) throws IOException { if(StringUtils.isBlank(name)) { return NONE; } - switch(name) { - case "AES256": - return SSE_S3; - case "SSE-KMS": - return SSE_KMS; - case "SSE-C": - return SSE_C; - default: - throw new IOException(UNKNOWN_ALGORITHM + name); + for (S3AEncryptionMethods v : values()) { + if (v.getMethod().equals(name)) { + return v; + } } + throw new IOException(UNKNOWN_ALGORITHM + name); } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index 7b046bef5162d..eb055dc6bc334 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -34,11 +34,11 @@ import java.util.EnumSet; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.Objects; -import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; @@ -84,12 +84,9 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.CommonPathCapabilities; import org.apache.hadoop.fs.CreateFlag; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.s3a.select.InternalSelectConstants; -import org.apache.hadoop.util.LambdaUtils; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -114,8 +111,6 @@ import org.apache.hadoop.fs.s3a.commit.CommitConstants; import org.apache.hadoop.fs.s3a.commit.PutTracker; import org.apache.hadoop.fs.s3a.commit.MagicCommitIntegration; -import org.apache.hadoop.fs.s3a.select.SelectBinding; -import org.apache.hadoop.fs.s3a.select.SelectConstants; import org.apache.hadoop.fs.s3a.s3guard.DirListingMetadata; import org.apache.hadoop.fs.s3a.s3guard.MetadataStoreListFilesIterator; import org.apache.hadoop.fs.s3a.s3guard.MetadataStore; @@ -131,8 +126,6 @@ import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.SemaphoredDelegatingExecutor; -import static org.apache.hadoop.fs.impl.AbstractFSBuilderImpl.rejectUnknownMandatoryKeys; -import static 
org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; import static org.apache.hadoop.fs.s3a.Constants.*; import static org.apache.hadoop.fs.s3a.Invoker.*; import static org.apache.hadoop.fs.s3a.S3AUtils.*; @@ -175,7 +168,6 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities, * retryable results in files being deleted. */ public static final boolean DELETE_CONSIDERED_IDEMPOTENT = true; - private URI uri; private Path workingDir; private String username; @@ -232,7 +224,6 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities, private S3ADataBlocks.BlockFactory blockFactory; private int blockOutputActiveBlocks; private WriteOperationHelper writeHelper; - private SelectBinding selectBinding; private boolean useListV1; private MagicCommitIntegration committerIntegration; @@ -370,9 +361,6 @@ public void initialize(URI name, Configuration originalConf) committerIntegration = new MagicCommitIntegration( this, magicCommitterEnabled); - // instantiate S3 Select support - selectBinding = new SelectBinding(writeHelper); - boolean blockUploadEnabled = conf.getBoolean(FAST_UPLOAD, true); if (!blockUploadEnabled) { @@ -842,87 +830,31 @@ protected URI canonicalizeUri(URI rawUri) { * @param f the file name to open * @param bufferSize the size of the buffer to be used. */ - @Retries.RetryTranslated public FSDataInputStream open(Path f, int bufferSize) throws IOException { - return open(f, Optional.empty()); - } - - /** - * Opens an FSDataInputStream at the indicated Path. - * @param path the file to open - * @param options configuration options if opened with the builder API. - * @throws IOException IO failure. - */ - @Retries.RetryTranslated - private FSDataInputStream open( - final Path path, - final Optional options) - throws IOException { - entryPoint(INVOCATION_OPEN); - final FileStatus fileStatus = getFileStatus(path); + LOG.debug("Opening '{}' for reading; input policy = {}", f, inputPolicy); + final FileStatus fileStatus = getFileStatus(f); if (fileStatus.isDirectory()) { - throw new FileNotFoundException("Can't open " + path + throw new FileNotFoundException("Can't open " + f + " because it is a directory"); } - S3AReadOpContext readContext; - if (options.isPresent()) { - Configuration o = options.get(); - // normal path. Open the file with the chosen seek policy, if different - // from the normal one. - // and readahead. - S3AInputPolicy policy = S3AInputPolicy.getPolicy( - o.get(INPUT_FADVISE, inputPolicy.toString())); - long readAheadRange2 = o.getLong(READAHEAD_RANGE, readAhead); - readContext = createReadContext(fileStatus, policy, readAheadRange2); - } else { - readContext = createReadContext(fileStatus, inputPolicy, readAhead); - } - LOG.debug("Opening '{}'", readContext); - return new FSDataInputStream( - new S3AInputStream( - readContext, - createObjectAttributes(path), + new S3AInputStream(new S3AReadOpContext(hasMetadataStore(), + invoker, + s3guardInvoker, + statistics, + instrumentation, + fileStatus), + new S3ObjectAttributes(bucket, + pathToKey(f), + getServerSideEncryptionAlgorithm(), + encryptionSecrets.getEncryptionKey()), fileStatus.getLen(), - s3)); - } - - /** - * Create the read context for reading from the referenced file, - * using FS state as well as the status. - * @param fileStatus file status. - * @param seekPolicy input policy for this operation - * @param readAheadRange readahead value. - * @return a context for read and select operations. 
- */ - private S3AReadOpContext createReadContext( - final FileStatus fileStatus, - final S3AInputPolicy seekPolicy, - final long readAheadRange) { - return new S3AReadOpContext(fileStatus.getPath(), - hasMetadataStore(), - invoker, - s3guardInvoker, - statistics, - instrumentation, - fileStatus, - seekPolicy, - readAheadRange); - } - - /** - * Create the attributes of an object for a get/select request. - * @param f path path of the request. - * @return attributes to use when building the query. - */ - private S3ObjectAttributes createObjectAttributes(final Path f) { - return new S3ObjectAttributes(bucket, - pathToKey(f), - getServerSideEncryptionAlgorithm(), - encryptionSecrets.getEncryptionKey()); + s3, + readAhead, + inputPolicy)); } /** @@ -3603,47 +3535,21 @@ public S3AInstrumentation.CommitterStatistics newCommitterStatistics() { return instrumentation.newCommitterStatistics(); } - @SuppressWarnings("deprecation") - @Override - public boolean hasPathCapability(final Path path, final String capability) - throws IOException { - final Path p = makeQualified(path); - switch (validatePathCapabilityArgs(p, capability)) { - - case CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER: - case CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER_OLD: - // capability depends on FS configuration - return isMagicCommitEnabled(); - - case SelectConstants.S3_SELECT_CAPABILITY: - // select is only supported if enabled - return selectBinding.isEnabled(); - - case CommonPathCapabilities.FS_CHECKSUMS: - // capability depends on FS configuration - return getConf().getBoolean(ETAG_CHECKSUM_ENABLED, - ETAG_CHECKSUM_ENABLED_DEFAULT); - - default: - return super.hasPathCapability(p, capability); - } - } - /** * Return the capabilities of this filesystem instance. - * - * This has been supplanted by {@link #hasPathCapability(Path, String)}. * @param capability string to query the stream support for. * @return whether the FS instance has the capability. */ - @Deprecated @Override public boolean hasCapability(String capability) { - try { - return hasPathCapability(workingDir, capability); - } catch (IOException ex) { - // should never happen, so log and downgrade. - LOG.debug("Ignoring exception on hasCapability({}})", capability, ex); + + switch (capability.toLowerCase(Locale.ENGLISH)) { + + case CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER: + // capability depends on FS configuration + return isMagicCommitEnabled(); + + default: return false; } } @@ -3670,104 +3576,4 @@ protected S3Guard.ITtlTimeProvider getTtlTimeProvider() { protected void setTtlTimeProvider(S3Guard.ITtlTimeProvider ttlTimeProvider) { this.ttlTimeProvider = ttlTimeProvider; } - - /** - * This is a proof of concept of a select API. - * Once a proper factory mechanism for opening files is added to the - * FileSystem APIs, this will be deleted without any warning. - * @param source path to source data - * @param expression select expression - * @param options request configuration from the builder. - * @return the stream of the results - * @throws IOException IO failure - */ - @Retries.RetryTranslated - private FSDataInputStream select(final Path source, - final String expression, - final Configuration options) - throws IOException { - entryPoint(OBJECT_SELECT_REQUESTS); - requireSelectSupport(source); - final Path path = makeQualified(source); - // call getFileStatus(), which will look at S3Guard first, - // so the operation will fail if it is not there or S3Guard believes it has - // been deleted. 
- // validation of the file status are delegated to the binding. - final FileStatus fileStatus = getFileStatus(path); - - // readahead range can be dynamically set - long ra = options.getLong(READAHEAD_RANGE, readAhead); - // build and execute the request - return selectBinding.select( - createReadContext(fileStatus, inputPolicy, ra), - expression, - options, - generateSSECustomerKey(), - createObjectAttributes(path)); - } - - /** - * Verify the FS supports S3 Select. - * @param source source file. - * @throws UnsupportedOperationException if not. - */ - private void requireSelectSupport(final Path source) throws - UnsupportedOperationException { - if (!selectBinding.isEnabled()) { - throw new UnsupportedOperationException( - SelectConstants.SELECT_UNSUPPORTED); - } - } - - /** - * Initiate the open or select operation. - * This is invoked from both the FileSystem and FileContext APIs - * @param path path to the file - * @param mandatoryKeys set of options declared as mandatory. - * @param options options set during the build sequence. - * @return a future which will evaluate to the opened/selected file. - * @throws IOException failure to resolve the link. - * @throws PathIOException operation is a select request but S3 select is - * disabled - * @throws IllegalArgumentException unknown mandatory key - */ - @Override - @Retries.RetryTranslated - public CompletableFuture openFileWithOptions( - final Path path, - final Set mandatoryKeys, - final Configuration options, - final int bufferSize) throws IOException { - String sql = options.get(SelectConstants.SELECT_SQL, null); - boolean isSelect = sql != null; - // choice of keys depends on open type - if (isSelect) { - rejectUnknownMandatoryKeys( - mandatoryKeys, - InternalSelectConstants.SELECT_OPTIONS, - "for " + path + " in S3 Select operation"); - } else { - rejectUnknownMandatoryKeys( - mandatoryKeys, - InternalConstants.STANDARD_OPENFILE_KEYS, - "for " + path + " in non-select file I/O"); - } - CompletableFuture result = new CompletableFuture<>(); - if (!isSelect) { - // normal path. - unboundedThreadPool.submit(() -> - LambdaUtils.eval(result, - () -> open(path, Optional.of(options)))); - } else { - // it is a select statement. 
- // fail fast if the method is not present - requireSelectSupport(path); - // submit the query - unboundedThreadPool.submit(() -> - LambdaUtils.eval(result, - () -> select(path, sql, options))); - } - return result; - } - } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java index a00fb1a79c4e8..cc548eca189e6 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java @@ -26,7 +26,6 @@ import com.amazonaws.SdkBaseException; import com.amazonaws.auth.AWSCredentialsProvider; import com.amazonaws.auth.EnvironmentVariableCredentialsProvider; -import com.amazonaws.auth.InstanceProfileCredentialsProvider; import com.amazonaws.retry.RetryUtils; import com.amazonaws.services.dynamodbv2.model.AmazonDynamoDBException; import com.amazonaws.services.dynamodbv2.model.LimitExceededException; @@ -36,6 +35,7 @@ import com.amazonaws.services.s3.model.MultiObjectDeleteException; import com.amazonaws.services.s3.model.S3ObjectSummary; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import org.apache.commons.lang3.StringUtils; @@ -47,6 +47,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.fs.s3a.auth.IAMInstanceCredentialsProvider; import org.apache.hadoop.fs.s3a.auth.NoAuthWithAWSException; import org.apache.hadoop.fs.s3native.S3xLoginHelper; import org.apache.hadoop.net.ConnectTimeoutException; @@ -71,11 +72,15 @@ import java.net.URI; import java.nio.file.AccessDeniedException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.Date; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.concurrent.ExecutionException; import static org.apache.commons.lang3.StringUtils.isEmpty; @@ -127,6 +132,13 @@ public final class S3AUtils { private static final String BUCKET_PATTERN = FS_S3A_BUCKET_PREFIX + "%s.%s"; + /** + * Error message when the AWS provider list built up contains a forbidden + * entry. + */ + @VisibleForTesting + public static final String E_FORBIDDEN_AWS_PROVIDER + = "AWS provider class cannot be used"; private S3AUtils() { } @@ -169,7 +181,7 @@ public static IOException translateException(@Nullable String operation, SdkBaseException exception) { String message = String.format("%s%s: %s", operation, - path != null ? (" on " + path) : "", + StringUtils.isNotEmpty(path)? (" on " + path) : "", exception); if (!(exception instanceof AmazonServiceException)) { Exception innerCause = containsInterruptedException(exception); @@ -587,36 +599,40 @@ public static long dateToLong(final Date date) { return date.getTime(); } + /** + * The standard AWS provider list for AWS connections. + */ + public static final List> + STANDARD_AWS_PROVIDERS = Collections.unmodifiableList( + Arrays.asList( + TemporaryAWSCredentialsProvider.class, + SimpleAWSCredentialsProvider.class, + EnvironmentVariableCredentialsProvider.class, + IAMInstanceCredentialsProvider.class)); + /** * Create the AWS credentials from the providers, the URI and * the key {@link Constants#AWS_CREDENTIALS_PROVIDER} in the configuration. 
- * @param binding Binding URI, may contain user:pass login details; - * may be null + * @param binding Binding URI -may be null * @param conf filesystem configuration * @return a credentials provider list * @throws IOException Problems loading the providers (including reading * secrets from credential files). */ public static AWSCredentialProviderList createAWSCredentialProviderSet( - URI binding, Configuration conf) throws IOException { - AWSCredentialProviderList credentials = new AWSCredentialProviderList(); - - Class[] awsClasses = loadAWSProviderClasses(conf, - AWS_CREDENTIALS_PROVIDER); - if (awsClasses.length == 0) { - credentials.add(new SimpleAWSCredentialsProvider(binding, conf)); - credentials.add(new EnvironmentVariableCredentialsProvider()); - credentials.add(InstanceProfileCredentialsProvider.getInstance()); - } else { - for (Class aClass : awsClasses) { - credentials.add(createAWSCredentialProvider(conf, - aClass, - binding)); - } - } + @Nullable URI binding, + Configuration conf) throws IOException { + // this will reject any user:secret entries in the URI + S3xLoginHelper.rejectSecretsInURIs(binding); + AWSCredentialProviderList credentials = + buildAWSProviderList(binding, + conf, + AWS_CREDENTIALS_PROVIDER, + STANDARD_AWS_PROVIDERS, + new HashSet<>()); // make sure the logging message strips out any auth details LOG.debug("For URI {}, using credentials {}", - S3xLoginHelper.toString(binding), credentials); + binding, credentials); return credentials; } @@ -628,17 +644,60 @@ public static AWSCredentialProviderList createAWSCredentialProviderSet( * @return the list of classes, possibly empty * @throws IOException on a failure to load the list. */ - public static Class[] loadAWSProviderClasses(Configuration conf, + public static List> loadAWSProviderClasses(Configuration conf, String key, Class... defaultValue) throws IOException { try { - return conf.getClasses(key, defaultValue); + return Arrays.asList(conf.getClasses(key, defaultValue)); } catch (RuntimeException e) { Throwable c = e.getCause() != null ? e.getCause() : e; throw new IOException("From option " + key + ' ' + c, c); } } + /** + * Load list of AWS credential provider/credential provider factory classes; + * support a forbidden list to prevent loops, mandate full secrets, etc. + * @param binding Binding URI -may be null + * @param conf configuration + * @param key key + * @param forbidden a possibly empty set of forbidden classes. + * @param defaultValues list of default providers. + * @return the list of classes, possibly empty + * @throws IOException on a failure to load the list. + */ + public static AWSCredentialProviderList buildAWSProviderList( + @Nullable final URI binding, + final Configuration conf, + final String key, + final List> defaultValues, + final Set> forbidden) throws IOException { + + // build up the base provider + List> awsClasses = loadAWSProviderClasses(conf, + key, + defaultValues.toArray(new Class[defaultValues.size()])); + // and if the list is empty, switch back to the defaults. + // this is to address the issue that configuration.getClasses() + // doesn't return the default if the config value is just whitespace. 
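Usage sketch (not part of this patch) for buildAWSProviderList(): an assumed-role binding resolving its inner provider chain while forbidding itself, to prevent recursion. The wiring shown is illustrative only.

    import java.io.IOException;
    import java.net.URI;
    import java.util.HashSet;
    import java.util.Set;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.s3a.AWSCredentialProviderList;
    import org.apache.hadoop.fs.s3a.S3AUtils;
    import org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider;

    public class ProviderListBuilderSketch {

      public static AWSCredentialProviderList forAssumedRole(URI fsUri,
          Configuration conf) throws IOException {
        // The assumed-role provider must not appear in its own inner chain,
        // so it goes into the forbidden set to stop the loop.
        Set<Class<?>> forbidden = new HashSet<>();
        forbidden.add(AssumedRoleCredentialProvider.class);
        return S3AUtils.buildAWSProviderList(
            fsUri,
            conf,
            "fs.s3a.assumed.role.credentials.provider",
            S3AUtils.STANDARD_AWS_PROVIDERS,
            forbidden);
      }
    }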
+ if (awsClasses.isEmpty()) { + awsClasses = defaultValues; + } + // iterate through, checking for blacklists and then instantiating + // each provider + AWSCredentialProviderList providers = new AWSCredentialProviderList(); + for (Class aClass : awsClasses) { + + if (forbidden.contains(aClass)) { + throw new IOException(E_FORBIDDEN_AWS_PROVIDER + + " in option " + key + ": " + aClass); + } + providers.add(createAWSCredentialProvider(conf, + aClass, binding)); + } + return providers; + } + /** * Create an AWS credential provider from its class by using reflection. The * class must implement one of the following means of construction, which are @@ -647,6 +706,8 @@ public static Class[] loadAWSProviderClasses(Configuration conf, *
 * <ol>
 * <li>a public constructor accepting java.net.URI and
 *     org.apache.hadoop.conf.Configuration</li>
+ * <li>a public constructor accepting
+ *    org.apache.hadoop.conf.Configuration</li>
 * <li>a public static method named getInstance that accepts no
 *    arguments and returns an instance of
 *    com.amazonaws.auth.AWSCredentialsProvider, or</li>
    6. @@ -659,11 +720,11 @@ public static Class[] loadAWSProviderClasses(Configuration conf, * @return the instantiated class * @throws IOException on any instantiation failure. */ - public static AWSCredentialsProvider createAWSCredentialProvider( + private static AWSCredentialsProvider createAWSCredentialProvider( Configuration conf, Class credClass, - URI uri) throws IOException { - AWSCredentialsProvider credentials; + @Nullable URI uri) throws IOException { + AWSCredentialsProvider credentials = null; String className = credClass.getName(); if (!AWSCredentialsProvider.class.isAssignableFrom(credClass)) { throw new IOException("Class " + credClass + " " + NOT_AWS_PROVIDER); @@ -706,9 +767,9 @@ public static AWSCredentialsProvider createAWSCredentialProvider( // no supported constructor or factory method found throw new IOException(String.format("%s " + CONSTRUCTOR_EXCEPTION + ". A class specified in %s must provide a public constructor " - + "accepting Configuration, or a public factory method named " - + "getInstance that accepts no arguments, or a public default " - + "constructor.", className, AWS_CREDENTIALS_PROVIDER)); + + "of a supported signature, or a public factory method named " + + "getInstance that accepts no arguments.", + className, AWS_CREDENTIALS_PROVIDER)); } catch (InvocationTargetException e) { Throwable targetException = e.getTargetException(); if (targetException == null) { @@ -733,6 +794,24 @@ public static AWSCredentialsProvider createAWSCredentialProvider( } } + /** + * Set a key if the value is non-empty. + * @param config config to patch + * @param key key to set + * @param val value to probe and set + * @param origin origin + * @return true if the property was set + */ + public static boolean setIfDefined(Configuration config, String key, + String val, String origin) { + if (StringUtils.isNotEmpty(val)) { + config.set(key, val, origin); + return true; + } else { + return false; + } + } + /** * Return the access key and secret for S3 API use. * or indicated in the UserInfo of the name URI param. @@ -1406,7 +1485,7 @@ static void patchSecurityCredentialProviders(Configuration conf) { * @return the encryption key or "" * @throws IllegalArgumentException bad arguments. */ - static String getServerSideEncryptionKey(String bucket, + public static String getServerSideEncryptionKey(String bucket, Configuration conf) { try { return lookupPassword(bucket, conf, SERVER_SIDE_ENCRYPTION_KEY); @@ -1427,7 +1506,7 @@ static String getServerSideEncryptionKey(String bucket, * one is set. * @throws IOException on any validation problem. 
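Usage sketch (not part of this patch) of the now-public getEncryptionAlgorithm(): resolving the SSE settings for a bucket from configuration. The bucket name and key ARN are placeholders.

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.s3a.S3AEncryptionMethods;
    import org.apache.hadoop.fs.s3a.S3AUtils;

    public class EncryptionLookupSketch {

      public static S3AEncryptionMethods lookup() throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.s3a.server-side-encryption-algorithm", "SSE-KMS");
        conf.set("fs.s3a.server-side-encryption.key",
            "arn:aws:kms:us-west-2:111122223333:key/example");   // placeholder key ARN
        // The bucket argument lets any fs.s3a.bucket.<name>.* overrides win.
        return S3AUtils.getEncryptionAlgorithm("example-bucket", conf);
      }
    }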
*/ - static S3AEncryptionMethods getEncryptionAlgorithm(String bucket, + public static S3AEncryptionMethods getEncryptionAlgorithm(String bucket, Configuration conf) throws IOException { S3AEncryptionMethods sse = S3AEncryptionMethods.getMethod( lookupPassword(bucket, conf, @@ -1437,6 +1516,7 @@ static S3AEncryptionMethods getEncryptionAlgorithm(String bucket, String diagnostics = passwordDiagnostics(sseKey, "key"); switch (sse) { case SSE_C: + LOG.debug("Using SSE-C with {}", diagnostics); if (sseKeyLen == 0) { throw new IOException(SSE_C_NO_KEY_ERROR); } @@ -1459,7 +1539,6 @@ static S3AEncryptionMethods getEncryptionAlgorithm(String bucket, LOG.debug("Data is unencrypted"); break; } - LOG.debug("Using SSE-C with {}", diagnostics); return sse; } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java index b237e850d2e85..e0a1d780ccf5f 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ClientFactory.java @@ -40,11 +40,13 @@ public interface S3ClientFactory { * @param name raw input S3A file system URI * @param bucket Optional bucket to use to look up per-bucket proxy secrets * @param credentialSet credentials to use + * @param userAgentSuffix optional suffix for the UA field. * @return S3 client * @throws IOException IO problem */ AmazonS3 createS3Client(URI name, - final String bucket, - final AWSCredentialsProvider credentialSet) throws IOException; + String bucket, + AWSCredentialsProvider credentialSet, + String userAgentSuffix) throws IOException; } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/SimpleAWSCredentialsProvider.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/SimpleAWSCredentialsProvider.java index b31b72a52139a..255d0095f80c4 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/SimpleAWSCredentialsProvider.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/SimpleAWSCredentialsProvider.java @@ -21,19 +21,18 @@ import com.amazonaws.auth.AWSCredentials; import com.amazonaws.auth.AWSCredentialsProvider; import com.amazonaws.auth.BasicAWSCredentials; +import com.google.common.annotations.VisibleForTesting; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.auth.NoAwsCredentialsException; import org.apache.hadoop.fs.s3native.S3xLoginHelper; -import org.apache.hadoop.security.ProviderUtils; import java.io.IOException; import java.net.URI; -import static org.apache.hadoop.fs.s3a.Constants.ACCESS_KEY; -import static org.apache.hadoop.fs.s3a.Constants.SECRET_KEY; import static org.apache.hadoop.fs.s3a.S3AUtils.getAWSAccessKeys; /** @@ -49,13 +48,29 @@ public class SimpleAWSCredentialsProvider implements AWSCredentialsProvider { public static final String NAME = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"; - private String accessKey; - private String secretKey; + private final String accessKey; + private final String secretKey; - public SimpleAWSCredentialsProvider(URI uri, Configuration conf) + /** + * Build the credentials from a filesystem URI and configuration. + * @param uri FS URI + * @param conf configuration containing secrets/references to. 
+ * @throws IOException failure + */ + public SimpleAWSCredentialsProvider(final URI uri, final Configuration conf) throws IOException { + this(getAWSAccessKeys(uri, conf)); + } - S3xLoginHelper.Login login = getAWSAccessKeys(uri, conf); + /** + * Instantiate from a login tuple. + * For testing, hence package-scoped. + * @param login login secrets + * @throws IOException failure + */ + @VisibleForTesting + SimpleAWSCredentialsProvider(final S3xLoginHelper.Login login) + throws IOException { this.accessKey = login.getUser(); this.secretKey = login.getPassword(); } @@ -65,8 +80,8 @@ public AWSCredentials getCredentials() { if (!StringUtils.isEmpty(accessKey) && !StringUtils.isEmpty(secretKey)) { return new BasicAWSCredentials(accessKey, secretKey); } - throw new CredentialInitializationException( - "Access key or secret key is unset"); + throw new NoAwsCredentialsException("SimpleAWSCredentialsProvider", + "No AWS credentials in the Hadoop configuration"); } @Override diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java index 919cad4f35d9e..54a2c60254167 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java @@ -57,6 +57,8 @@ public enum Statistic { "Calls of delete()"), INVOCATION_EXISTS(CommonStatisticNames.OP_EXISTS, "Calls of exists()"), + INVOCATION_GET_DELEGATION_TOKEN(CommonStatisticNames.OP_GET_DELEGATION_TOKEN, + "Calls of getDelegationToken()"), INVOCATION_GET_FILE_CHECKSUM(CommonStatisticNames.OP_GET_FILE_CHECKSUM, "Calls of getFileChecksum()"), INVOCATION_GET_FILE_STATUS(CommonStatisticNames.OP_GET_FILE_STATUS, @@ -213,7 +215,10 @@ public enum Statistic { "s3guard_metadatastore_throttle_rate", "S3Guard metadata store throttle rate"), - STORE_IO_THROTTLED("store_io_throttled", "Requests throttled and retried"); + STORE_IO_THROTTLED("store_io_throttled", "Requests throttled and retried"), + + DELEGATION_TOKENS_ISSUED("delegation_tokens_issued", + "Number of delegation tokens issued"); private static final Map SYMBOL_MAP = new HashMap<>(Statistic.values().length); diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/TemporaryAWSCredentialsProvider.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/TemporaryAWSCredentialsProvider.java index d42f68e905308..f124bd0337cd5 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/TemporaryAWSCredentialsProvider.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/TemporaryAWSCredentialsProvider.java @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information @@ -18,21 +18,21 @@ package org.apache.hadoop.fs.s3a; -import com.amazonaws.auth.AWSCredentialsProvider; -import com.amazonaws.auth.BasicSessionCredentials; +import javax.annotation.Nullable; +import java.io.IOException; + import com.amazonaws.auth.AWSCredentials; -import org.apache.commons.lang3.StringUtils; -import java.io.IOException; import java.net.URI; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.security.ProviderUtils; - -import static org.apache.hadoop.fs.s3a.Constants.*; -import static org.apache.hadoop.fs.s3a.S3AUtils.lookupPassword; +import org.apache.hadoop.fs.s3a.auth.AbstractSessionCredentialsProvider; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials; +import org.apache.hadoop.fs.s3a.auth.NoAuthWithAWSException; +import org.apache.hadoop.fs.s3a.auth.NoAwsCredentialsException; /** * Support session credentials for authenticating with AWS. @@ -40,50 +40,65 @@ * Please note that users may reference this class name from configuration * property fs.s3a.aws.credentials.provider. Therefore, changing the class name * would be a backward-incompatible change. + * + * This credential provider must not fail in creation because that will + * break a chain of credential providers. */ @InterfaceAudience.Public @InterfaceStability.Stable -public class TemporaryAWSCredentialsProvider implements AWSCredentialsProvider { +public class TemporaryAWSCredentialsProvider extends + AbstractSessionCredentialsProvider { public static final String NAME = "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider"; - private String accessKey; - private String secretKey; - private String sessionToken; - public TemporaryAWSCredentialsProvider(Configuration conf) + public static final String COMPONENT + = "Session credentials in Hadoop configuration"; + + /** + * Construct from just a configuration. + * @param conf configuration. + */ + public TemporaryAWSCredentialsProvider(final Configuration conf) throws IOException { this(null, conf); } - public TemporaryAWSCredentialsProvider(URI uri, Configuration conf) + /** + * Constructor: the URI will be null if the provider is inited unbonded + * to a filesystem. + * @param uri binding to a filesystem URI. + * @param conf configuration. + */ + public TemporaryAWSCredentialsProvider( + @Nullable final URI uri, + final Configuration conf) throws IOException { - - // determine the bucket - String bucket = uri != null ? uri.getHost(): ""; - Configuration c = ProviderUtils.excludeIncompatibleCredentialProviders( - conf, S3AFileSystem.class); - this.accessKey = lookupPassword(bucket, c, ACCESS_KEY); - this.secretKey = lookupPassword(bucket, c, SECRET_KEY); - this.sessionToken = lookupPassword(bucket, c, SESSION_TOKEN); + super(uri, conf); } + /** + * The credentials here must include a session token, else this operation + * will raise an exception. + * @param config the configuration + * @return temporary credentials. + * @throws IOException on any failure to load the credentials. + * @throws NoAuthWithAWSException validation failure + * @throws NoAwsCredentialsException the credentials are actually empty. 
+ */ @Override - public AWSCredentials getCredentials() { - if (!StringUtils.isEmpty(accessKey) && !StringUtils.isEmpty(secretKey) - && !StringUtils.isEmpty(sessionToken)) { - return new BasicSessionCredentials(accessKey, secretKey, sessionToken); + protected AWSCredentials createCredentials(Configuration config) + throws IOException { + MarshalledCredentials creds = MarshalledCredentialBinding.fromFileSystem( + getUri(), config); + MarshalledCredentials.CredentialTypeRequired sessionOnly + = MarshalledCredentials.CredentialTypeRequired.SessionOnly; + // treat only having non-session creds as empty. + if (!creds.isValid(sessionOnly)) { + throw new NoAwsCredentialsException(COMPONENT); } - throw new CredentialInitializationException( - "Access key, secret key or session token is unset"); - } - - @Override - public void refresh() {} - - @Override - public String toString() { - return getClass().getSimpleName(); + return MarshalledCredentialBinding.toAWSCredentials(creds, + sessionOnly, COMPONENT); } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/AbstractAWSCredentialProvider.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/AbstractAWSCredentialProvider.java new file mode 100644 index 0000000000000..1f714b0555285 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/AbstractAWSCredentialProvider.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth; + +import javax.annotation.Nullable; +import java.net.URI; + +import com.amazonaws.auth.AWSCredentialsProvider; + +import org.apache.hadoop.conf.Configuration; + +/** + * Base class for AWS credential providers which + * take a URI and config in their constructor. + */ +public abstract class AbstractAWSCredentialProvider + implements AWSCredentialsProvider { + + private final URI binding; + + private final Configuration conf; + + /** + * Construct from URI + configuration. + * @param uri URI: may be null. + * @param conf configuration. + */ + protected AbstractAWSCredentialProvider( + @Nullable final URI uri, + final Configuration conf) { + this.conf = conf; + this.binding = uri; + } + + public Configuration getConf() { + return conf; + } + + /** + * Get the binding URI: may be null. + * @return the URI this instance was constructed with, + * if any. + */ + public URI getUri() { + return binding; + } + + /** + * Refresh is a no-op by default. 
+ */ + @Override + public void refresh() { + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/AbstractSessionCredentialsProvider.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/AbstractSessionCredentialsProvider.java new file mode 100644 index 0000000000000..7822035ebe867 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/AbstractSessionCredentialsProvider.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth; + +import javax.annotation.Nullable; +import java.net.URI; +import java.io.IOException; +import java.util.concurrent.atomic.AtomicBoolean; + +import com.amazonaws.SdkBaseException; +import com.amazonaws.auth.AWSCredentials; +import com.google.common.annotations.VisibleForTesting; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.CredentialInitializationException; +import org.apache.hadoop.fs.s3a.Invoker; +import org.apache.hadoop.fs.s3a.Retries; + +/** + * Base class for session credential support. + */ +@InterfaceAudience.Private +public abstract class AbstractSessionCredentialsProvider + extends AbstractAWSCredentialProvider { + + /** Credentials, created in {@link #init()}. */ + private AWSCredentials awsCredentials; + + /** Atomic flag for on-demand initialization. */ + private final AtomicBoolean initialized = new AtomicBoolean(false); + + /** + * The (possibly translated) initialization exception. + * Used for testing. + */ + private IOException initializationException; + + /** + * Constructor. + * @param uri possibly null filesystem URI. + * @param conf configuration. + */ + public AbstractSessionCredentialsProvider( + @Nullable final URI uri, + final Configuration conf) { + super(uri, conf); + } + + /** + * Initialize the credentials by calling + * {@link #createCredentials(Configuration)} with the current config. + */ + @Retries.OnceTranslated + protected void init() throws IOException { + // stop re-entrant attempts + if (initialized.getAndSet(true)) { + return; + } + try { + awsCredentials = Invoker.once("create credentials", "", + () -> createCredentials(getConf())); + } catch (IOException e) { + initializationException = e; + throw e; + } + } + + /** + * Has an attempt to initialize the credentials been attempted? + * @return true if {@code init()} was called. + */ + public boolean isInitialized() { + return initialized.get(); + } + + /** + * Implementation point: whatever the subclass must do to load credentials. + * This is called from {@link #init()} and then the credentials are cached, + * along with any exception. 
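To make the lazy-initialization contract described here concrete, a trivial subclass might look like the sketch below; the class name and the example.* option names are invented for illustration and are not part of this patch.

    import java.io.IOException;
    import java.net.URI;

    import com.amazonaws.auth.AWSCredentials;
    import com.amazonaws.auth.BasicSessionCredentials;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.s3a.auth.AbstractSessionCredentialsProvider;

    /** Hypothetical provider returning fixed session credentials from the config. */
    public class FixedSessionCredentialsProvider
        extends AbstractSessionCredentialsProvider {

      public FixedSessionCredentialsProvider(URI uri, Configuration conf) {
        super(uri, conf);
      }

      @Override
      protected AWSCredentials createCredentials(Configuration config)
          throws IOException {
        // invoked at most once from init(); the result, or any IOException,
        // is cached and surfaced by getCredentials() in the base class
        return new BasicSessionCredentials(
            config.getTrimmed("example.access.key", ""),
            config.getTrimmed("example.secret.key", ""),
            config.getTrimmed("example.session.token", ""));
      }
    }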
+ * @param config the configuration + * @return the credentials + * @throws IOException on any failure. + */ + protected abstract AWSCredentials createCredentials(Configuration config) + throws IOException; + + /** + * Get the credentials. + * Any exception raised in + * {@link #createCredentials(Configuration)} + * is thrown here before any attempt to return the credentials + * is made. + * @return credentials, if set. + * @throws SdkBaseException if one was raised during init + * @throws CredentialInitializationException on other failures. + */ + public AWSCredentials getCredentials() throws SdkBaseException { + // do an on-demand init then raise an AWS SDK exception if + // there was a failure. + try { + if (!isInitialized()) { + init(); + } + } catch (IOException e) { + if (e.getCause() instanceof SdkBaseException) { + throw (SdkBaseException) e.getCause(); + } else { + throw new CredentialInitializationException(e.getMessage(), e); + } + } + if (awsCredentials == null) { + throw new CredentialInitializationException( + "Provider " + this + " has no credentials"); + } + return awsCredentials; + } + + public final boolean hasCredentials() { + return awsCredentials != null; + } + + @Override + public String toString() { + return getClass().getSimpleName(); + } + + /** + * Get any IOE raised during initialization. + * Null if {@link #init()} hasn't been called, or it actually worked. + * @return an exception or null. + */ + @VisibleForTesting + public IOException getInitializationException() { + return initializationException; + } + + /** + * A special set of null credentials which are not the anonymous class. + * This will be interpreted as "this provider has no credentials to offer", + * rather than an explicit error or anonymous access. + */ + protected static final class NoCredentials implements AWSCredentials { + @Override + public String getAWSAccessKeyId() { + return null; + } + + @Override + public String getAWSSecretKey() { + return null; + } + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/AssumedRoleCredentialProvider.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/AssumedRoleCredentialProvider.java index e5a363952f66a..afad1f8458994 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/AssumedRoleCredentialProvider.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/AssumedRoleCredentialProvider.java @@ -18,19 +18,22 @@ package org.apache.hadoop.fs.s3a.auth; +import javax.annotation.Nullable; import java.io.Closeable; import java.io.IOException; import java.net.URI; +import java.util.Arrays; import java.util.Locale; import java.util.concurrent.TimeUnit; -import com.amazonaws.AmazonClientException; import com.amazonaws.auth.AWSCredentials; import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.EnvironmentVariableCredentialsProvider; import com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider; import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClientBuilder; import com.amazonaws.services.securitytoken.model.AWSSecurityTokenServiceException; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Sets; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,6 +42,8 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.s3a.AWSCredentialProviderList; +import
org.apache.hadoop.fs.s3a.CredentialInitializationException; +import org.apache.hadoop.fs.s3a.Retries; import org.apache.hadoop.fs.s3a.S3AUtils; import org.apache.hadoop.fs.s3a.Invoker; import org.apache.hadoop.fs.s3a.S3ARetryPolicy; @@ -46,8 +51,7 @@ import org.apache.hadoop.security.UserGroupInformation; import static org.apache.hadoop.fs.s3a.Constants.*; -import static org.apache.hadoop.fs.s3a.S3AUtils.createAWSCredentialProvider; -import static org.apache.hadoop.fs.s3a.S3AUtils.loadAWSProviderClasses; +import static org.apache.hadoop.fs.s3a.S3AUtils.buildAWSProviderList; /** * Support IAM Assumed roles by instantiating an instance of @@ -67,10 +71,6 @@ public class AssumedRoleCredentialProvider implements AWSCredentialsProvider, public static final String NAME = "org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider"; - static final String E_FORBIDDEN_PROVIDER = - "AssumedRoleCredentialProvider cannot be in " - + ASSUMED_ROLE_CREDENTIALS_PROVIDER; - public static final String E_NO_ROLE = "Unset property " + ASSUMED_ROLE_ARN; @@ -90,13 +90,13 @@ public class AssumedRoleCredentialProvider implements AWSCredentialsProvider, * Instantiate. * This calls {@link #getCredentials()} to fail fast on the inner * role credential retrieval. - * @param fsUri URI of the filesystem. + * @param fsUri possibly null URI of the filesystem. * @param conf configuration * @throws IOException on IO problems and some parameter checking * @throws IllegalArgumentException invalid parameters * @throws AWSSecurityTokenServiceException problems getting credentials */ - public AssumedRoleCredentialProvider(URI fsUri, Configuration conf) + public AssumedRoleCredentialProvider(@Nullable URI fsUri, Configuration conf) throws IOException { arn = conf.getTrimmed(ASSUMED_ROLE_ARN, ""); @@ -105,16 +105,12 @@ public AssumedRoleCredentialProvider(URI fsUri, Configuration conf) } // build up the base provider - Class[] awsClasses = loadAWSProviderClasses(conf, + credentialsToSTS = buildAWSProviderList(fsUri, conf, ASSUMED_ROLE_CREDENTIALS_PROVIDER, - SimpleAWSCredentialsProvider.class); - credentialsToSTS = new AWSCredentialProviderList(); - for (Class aClass : awsClasses) { - if (this.getClass().equals(aClass)) { - throw new IOException(E_FORBIDDEN_PROVIDER); - } - credentialsToSTS.add(createAWSCredentialProvider(conf, aClass, fsUri)); - } + Arrays.asList( + SimpleAWSCredentialsProvider.class, + EnvironmentVariableCredentialsProvider.class), + Sets.newHashSet(this.getClass())); LOG.debug("Credentials to obtain role credentials: {}", credentialsToSTS); // then the STS binding @@ -132,13 +128,13 @@ public AssumedRoleCredentialProvider(URI fsUri, Configuration conf) LOG.debug("Scope down policy {}", policy); builder.withScopeDownPolicy(policy); } - String endpoint = conf.get(ASSUMED_ROLE_STS_ENDPOINT, ""); - String region = conf.get(ASSUMED_ROLE_STS_ENDPOINT_REGION, + String endpoint = conf.getTrimmed(ASSUMED_ROLE_STS_ENDPOINT, ""); + String region = conf.getTrimmed(ASSUMED_ROLE_STS_ENDPOINT_REGION, ASSUMED_ROLE_STS_ENDPOINT_REGION_DEFAULT); AWSSecurityTokenServiceClientBuilder stsbuilder = STSClientFactory.builder( conf, - fsUri.getHost(), + fsUri != null ? fsUri.getHost() : "", credentialsToSTS, endpoint, region); @@ -164,6 +160,7 @@ public AssumedRoleCredentialProvider(URI fsUri, Configuration conf) * @throws AWSSecurityTokenServiceException if none could be obtained. 
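For readers new to assumed roles, the provider above is driven purely from configuration; a sketch follows, with a placeholder role ARN and bucket, using only option names already defined by the S3A module (fs.s3a.aws.credentials.provider, fs.s3a.assumed.role.arn, fs.s3a.assumed.role.credentials.provider). A real run needs valid credentials and an accessible bucket.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class AssumedRoleSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // route all S3A authentication through the assumed-role provider
        conf.set("fs.s3a.aws.credentials.provider",
            "org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider");
        // placeholder ARN of the role to assume
        conf.set("fs.s3a.assumed.role.arn",
            "arn:aws:iam::123456789012:role/example-role");
        // credentials used to authenticate the AssumeRole call itself
        conf.set("fs.s3a.assumed.role.credentials.provider",
            "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider");

        try (FileSystem fs = new Path("s3a://example-bucket/").getFileSystem(conf)) {
          System.out.println("Bound to " + fs.getUri());
        }
      }
    }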
*/ @Override + @Retries.RetryRaw public AWSCredentials getCredentials() { try { return invoker.retryUntranslated("getCredentials", @@ -174,7 +171,7 @@ public AWSCredentials getCredentials() { // its hard to see how this could be raised, but for // completeness, it is wrapped as an Amazon Client Exception // and rethrown. - throw new AmazonClientException( + throw new CredentialInitializationException( "getCredentials failed: " + e, e); } catch (AWSSecurityTokenServiceException e) { diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/IAMInstanceCredentialsProvider.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/IAMInstanceCredentialsProvider.java new file mode 100644 index 0000000000000..7ff451005e2a2 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/IAMInstanceCredentialsProvider.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth; + +import java.io.Closeable; +import java.io.IOException; + +import com.amazonaws.AmazonClientException; +import com.amazonaws.auth.AWSCredentials; +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.InstanceProfileCredentialsProvider; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * This is going to be an IAM credential provider which performs + * async refresh for lower-latency on IO calls. + * Initially it does not do this, simply shares the single IAM instance + * across all instances. This makes it less expensive to declare. + * + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class IAMInstanceCredentialsProvider + implements AWSCredentialsProvider, Closeable { + + private static final InstanceProfileCredentialsProvider INSTANCE = + InstanceProfileCredentialsProvider.getInstance(); + + public IAMInstanceCredentialsProvider() { + } + + /** + * Ask for the credentials. + * as it invariably means "you aren't running on EC2" + * @return the credentials + */ + @Override + public AWSCredentials getCredentials() { + try { + return INSTANCE.getCredentials(); + } catch (AmazonClientException e) { + throw new NoAwsCredentialsException("IAMInstanceCredentialsProvider", + e.getMessage(), + e); + } + } + + @Override + public void refresh() { + INSTANCE.refresh(); + } + + @Override + public void close() throws IOException { + // until async, no-op. 
+ } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/MarshalledCredentialBinding.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/MarshalledCredentialBinding.java new file mode 100644 index 0000000000000..58c9c8035f2dd --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/MarshalledCredentialBinding.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth; + +import java.io.IOException; +import java.net.URI; +import java.util.Date; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +import com.amazonaws.ClientConfiguration; +import com.amazonaws.auth.AWSCredentials; +import com.amazonaws.auth.AWSCredentialsProvider; +import com.amazonaws.auth.AWSSessionCredentials; +import com.amazonaws.auth.BasicAWSCredentials; +import com.amazonaws.auth.BasicSessionCredentials; +import com.amazonaws.services.securitytoken.AWSSecurityTokenService; +import com.amazonaws.services.securitytoken.model.Credentials; +import com.google.common.annotations.VisibleForTesting; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.Invoker; +import org.apache.hadoop.fs.s3a.Retries; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.security.ProviderUtils; + +import static org.apache.hadoop.fs.s3a.Constants.ACCESS_KEY; +import static org.apache.hadoop.fs.s3a.Constants.SECRET_KEY; +import static org.apache.hadoop.fs.s3a.Constants.SESSION_TOKEN; +import static org.apache.hadoop.fs.s3a.S3AUtils.lookupPassword; + +/** + * Class to bridge from the serializable/marshallabled + * {@link MarshalledCredentialBinding} class to/from AWS classes. + * This is to keep that class isolated and not dependent on aws-sdk JARs + * to load. + */ +public final class MarshalledCredentialBinding { + + private MarshalledCredentialBinding() { + } + + /** + * Error text on empty credentials: {@value}. + */ + @VisibleForTesting + public static final String NO_AWS_CREDENTIALS = "No AWS credentials"; + + /** + * Create a set of marshalled credentials from a set of credentials + * issued by an STS call. + * @param credentials AWS-provided session credentials + */ + public static MarshalledCredentials fromSTSCredentials( + final Credentials credentials) { + MarshalledCredentials marshalled = new MarshalledCredentials( + credentials.getAccessKeyId(), + credentials.getSecretAccessKey(), + credentials.getSessionToken()); + Date date = credentials.getExpiration(); + marshalled.setExpiration(date != null ? date.getTime() : 0); + return marshalled; + } + + /** + * Create from a set of AWS credentials. + * @param credentials source credential. + * @return a set of marshalled credentials. 
+ */ + public static MarshalledCredentials fromAWSCredentials( + final AWSSessionCredentials credentials) { + return new MarshalledCredentials( + credentials.getAWSAccessKeyId(), + credentials.getAWSSecretKey(), + credentials.getSessionToken()); + } + + /** + * Build a set of credentials from the environment. + * @param env environment. + * @return a possibly incomplete/invalid set of credentials. + */ + public static MarshalledCredentials fromEnvironment( + final Map env) { + return new MarshalledCredentials( + nullToEmptyString(env.get("AWS_ACCESS_KEY")), + nullToEmptyString(env.get("AWS_SECRET_KEY")), + nullToEmptyString(env.get("AWS_SESSION_TOKEN"))); + } + + /** + * Take a string where a null value is remapped to an empty string. + * @param src source string. + * @return the value of the string or "" + */ + private static String nullToEmptyString(final String src) { + return src == null ? "" : src; + } + + /** + * Loads the credentials from the owning S3A FS, including + * from Hadoop credential providers. + * There is no validation. + * @param conf configuration to load from + * @return the component + * @throws IOException on any load failure + */ + public static MarshalledCredentials fromFileSystem( + final URI uri, + final Configuration conf) throws IOException { + // determine the bucket + final String bucket = uri != null ? uri.getHost() : ""; + final Configuration leanConf = + ProviderUtils.excludeIncompatibleCredentialProviders( + conf, S3AFileSystem.class); + return new MarshalledCredentials( + lookupPassword(bucket, leanConf, ACCESS_KEY), + lookupPassword(bucket, leanConf, SECRET_KEY), + lookupPassword(bucket, leanConf, SESSION_TOKEN)); + } + + /** + * Create an AWS credential set from a set of marshalled credentials. + * + * This code would seem to fit into (@link MarshalledCredentials}, and + * while it would from a code-hygiene perspective, to keep all AWS + * SDK references out of that class, the logic is implemented here instead, + * @param marshalled marshalled credentials + * @param typeRequired type of credentials required + * @param component component name for exception messages. + * @return a new set of credentials + * @throws NoAuthWithAWSException validation failure + * @throws NoAwsCredentialsException the credentials are actually empty. + */ + public static AWSCredentials toAWSCredentials( + final MarshalledCredentials marshalled, + final MarshalledCredentials.CredentialTypeRequired typeRequired, + final String component) + throws NoAuthWithAWSException, NoAwsCredentialsException { + + if (marshalled.isEmpty()) { + throw new NoAwsCredentialsException(component, NO_AWS_CREDENTIALS); + } + if (!marshalled.isValid(typeRequired)) { + throw new NoAuthWithAWSException(component + ":" + + marshalled.buildInvalidCredentialsError(typeRequired)); + } + final String accessKey = marshalled.getAccessKey(); + final String secretKey = marshalled.getSecretKey(); + if (marshalled.hasSessionToken()) { + // a session token was supplied, so return session credentials + return new BasicSessionCredentials(accessKey, secretKey, + marshalled.getSessionToken()); + } else { + // these are full credentials + return new BasicAWSCredentials(accessKey, secretKey); + } + } + + /** + * Request a set of credentials from an STS endpoint. + * @param parentCredentials the parent credentials needed to talk to STS + * @param stsEndpoint an endpoint, use "" for none + * @param stsRegion region; use if the endpoint isn't the AWS default. + * @param duration duration of the credentials in seconds. 
Minimum value: 900. + * @param invoker invoker to use for retrying the call. + * @return the credentials + * @throws IOException on a failure of the request + */ + @Retries.RetryTranslated + public static MarshalledCredentials requestSessionCredentials( + final AWSCredentialsProvider parentCredentials, + final ClientConfiguration awsConf, + final String stsEndpoint, + final String stsRegion, + final int duration, + final Invoker invoker) throws IOException { + final AWSSecurityTokenService tokenService = + STSClientFactory.builder(parentCredentials, + awsConf, + stsEndpoint.isEmpty() ? null : stsEndpoint, + stsRegion) + .build(); + return fromSTSCredentials( + STSClientFactory.createClientConnection(tokenService, invoker) + .requestSessionCredentials(duration, TimeUnit.SECONDS)); + } + +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/MarshalledCredentialProvider.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/MarshalledCredentialProvider.java new file mode 100644 index 0000000000000..03e26e7d8c53b --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/MarshalledCredentialProvider.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth; + +import java.io.IOException; +import java.net.URI; + +import com.amazonaws.auth.AWSCredentials; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.CredentialInitializationException; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding.toAWSCredentials; + +/** + * AWS credential provider driven from marshalled session/full credentials + * (full, simple session or role). + * This is not intended for explicit use in job/app configurations, + * instead it is returned by Delegation Token Bindings, as needed. + * The constructor implicitly prevents explicit use. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class MarshalledCredentialProvider extends + AbstractSessionCredentialsProvider { + + /** Name: {@value}. */ + public static final String NAME + = "org.apache.hadoop.fs.s3a.auth.MarshalledCredentialProvider"; + + private final MarshalledCredentials credentials; + + private final MarshalledCredentials.CredentialTypeRequired typeRequired; + + private final String component; + + /** + * Constructor. + * + * @param component component name for exception messages. + * @param uri filesystem URI: must not be null. + * @param conf configuration. + * @param credentials marshalled credentials. 
+ * @param typeRequired credential type required. + * @throws CredentialInitializationException validation failure + * @throws IOException failure + */ + public MarshalledCredentialProvider( + final String component, + final URI uri, + final Configuration conf, + final MarshalledCredentials credentials, + final MarshalledCredentials.CredentialTypeRequired typeRequired) + throws IOException { + super(checkNotNull(uri, "No filesystem URI"), conf); + this.component = component; + this.typeRequired = typeRequired; + this.credentials = checkNotNull(credentials); + } + + /** + * Perform the binding, looking up the DT and parsing it. + * @return true if there were some credentials + * @throws CredentialInitializationException validation failure + * @throws IOException on a failure + */ + @Override + protected AWSCredentials createCredentials(final Configuration config) + throws IOException { + return toAWSCredentials(credentials, typeRequired, component); + } + +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/MarshalledCredentials.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/MarshalledCredentials.java new file mode 100644 index 0000000000000..5737dbc4aa677 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/MarshalledCredentials.java @@ -0,0 +1,409 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.io.Serializable; +import java.time.OffsetDateTime; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.util.Date; +import java.util.Objects; +import java.util.Optional; + +import com.google.common.annotations.VisibleForTesting; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.S3AUtils; +import org.apache.hadoop.fs.s3a.auth.delegation.DelegationTokenIOException; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +import static java.util.Objects.requireNonNull; +import static org.apache.commons.lang3.StringUtils.isNotEmpty; +import static org.apache.hadoop.fs.s3a.Constants.ACCESS_KEY; +import static org.apache.hadoop.fs.s3a.Constants.SECRET_KEY; +import static org.apache.hadoop.fs.s3a.Constants.SESSION_TOKEN; + +/** + * Stores the credentials for a session or for a full login. + * This structure is {@link Writable}, so can be marshalled inside a + * delegation token. + * + * The class is designed so that keys inside are kept non-null; to be + * unset just set them to the empty string. This is to simplify marshalling. 
+ * + * Important: Add no references to any AWS SDK class, to + * ensure it can be safely deserialized whenever the relevant token + * identifier of a token type declared in this JAR is examined. + */ +@InterfaceAudience.Private +public final class MarshalledCredentials implements Writable, Serializable { + + /** + * Error text on invalid non-empty credentials: {@value}. + */ + @VisibleForTesting + public static final String INVALID_CREDENTIALS + = "Invalid AWS credentials"; + + /** + * How long can any of the secrets be: {@value}. + * This is much longer than the current tokens, but leaves space for + * future enhancements. + */ + private static final int MAX_SECRET_LENGTH = 8192; + + private static final long serialVersionUID = 8444610385533920692L; + + /** + * Access key of IAM account. + */ + private String accessKey = ""; + + /** + * Secret key of IAM account. + */ + private String secretKey = ""; + + /** + * Optional session token. + * If non-empty: the credentials can be converted into + * session credentials. + */ + private String sessionToken = ""; + + /** + * ARN of a role. Purely for diagnostics. + */ + private String roleARN = ""; + + /** + * Expiry time milliseconds in UTC; the {@code Java.Util.Date} value. + * 0 means "does not expire/unknown". + */ + private long expiration; + + /** + * Constructor. + */ + public MarshalledCredentials() { + } + + /** + * Create from a set of properties. + * No expiry time is expected/known here. + * @param accessKey access key + * @param secretKey secret key + * @param sessionToken session token + */ + public MarshalledCredentials( + final String accessKey, + final String secretKey, + final String sessionToken) { + this(); + this.accessKey = requireNonNull(accessKey); + this.secretKey = requireNonNull(secretKey); + this.sessionToken = sessionToken == null ? "" : sessionToken; + } + + public String getAccessKey() { + return accessKey; + } + + public String getSecretKey() { + return secretKey; + } + + public String getSessionToken() { + return sessionToken; + } + + /** + * Expiration; will be 0 for none known. + * @return any expiration timestamp + */ + public long getExpiration() { + return expiration; + } + + public void setExpiration(final long expiration) { + this.expiration = expiration; + } + + /** + * Get a temporal representing the time of expiration, if there + * is one. + * This is here to wrap up expectations about timestamps and zones. + * @return the expiration time. + */ + public Optional getExpirationDateTime() { + return expiration == 0 + ? 
Optional.empty() + : Optional.of( + OffsetDateTime.ofInstant( + new Date(expiration).toInstant(), + ZoneOffset.UTC)); + } + + public String getRoleARN() { + return roleARN; + } + + public void setRoleARN(String roleARN) { + this.roleARN = requireNonNull(roleARN); + } + + public void setAccessKey(final String accessKey) { + this.accessKey = requireNonNull(accessKey, "access key"); + } + + public void setSecretKey(final String secretKey) { + this.secretKey = requireNonNull(secretKey, "secret key"); + } + + public void setSessionToken(final String sessionToken) { + this.sessionToken = requireNonNull(sessionToken, "session token"); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + MarshalledCredentials that = (MarshalledCredentials) o; + return expiration == that.expiration && + Objects.equals(accessKey, that.accessKey) && + Objects.equals(secretKey, that.secretKey) && + Objects.equals(sessionToken, that.sessionToken) && + Objects.equals(roleARN, that.roleARN); + } + + @Override + public int hashCode() { + return Objects.hash(accessKey, secretKey, sessionToken, roleARN, + expiration); + } + + /** + * String value MUST NOT include any secrets. + * @return a string value for logging. + */ + @Override + public String toString() { + if (isEmpty()) { + return "Empty credentials"; + } + + String validity = isValid(CredentialTypeRequired.AnyNonEmpty) + ? "valid" + : "invalid"; + if (!hasSessionToken()) { + // full credentials have the simplest string value. + return "full credentials (" + validity + ")"; + } else { + // session/role credentials may have an expiry and role ARN. + return String.format("session credentials, expiry %s; %s(%s)", + getExpirationDateTime() + .map(x -> x.format(DateTimeFormatter.ISO_DATE)) + .orElse("unknown"), + (isNotEmpty(roleARN) + ? ("role \"" + roleARN + "\" ") + : ""), + validity); + } + } + + /** + * Is this empty: does it contain any credentials at all? + * This test returns true if either the access key or secret key is empty. + * @return true if there are no credentials. + */ + public boolean isEmpty() { + return !(isNotEmpty(accessKey) && isNotEmpty(secretKey)); + } + + /** + * Is this a valid set of credentials tokens? + * @param required credential type required. + * @return true if the requirements are met. + */ + public boolean isValid(final CredentialTypeRequired required) { + if (accessKey == null || secretKey == null || sessionToken == null) { + // null fields are not permitted, empty is OK for marshalling around. + return false; + } + // now look at whether values are set/unset. + boolean hasAccessAndSecretKeys = isNotEmpty(accessKey) + && isNotEmpty(secretKey); + boolean hasSessionToken = hasSessionToken(); + switch (required) { + + case AnyIncludingEmpty: + // this is simplest. + return true; + + case Empty: + // empty. ignore session value if the other keys are unset. + return !hasAccessAndSecretKeys; + + case AnyNonEmpty: + // just look for the access key and secret key being non-empty + return hasAccessAndSecretKeys; + + case FullOnly: + return hasAccessAndSecretKeys && !hasSessionToken; + + case SessionOnly: + return hasAccessAndSecretKeys && hasSessionToken(); + + // this is here to keep the IDE quiet + default: + return false; + } + } + + /** + * Does this set of credentials have a session token. + * @return true if there's a session token. 
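The validation rules implemented in isValid() above can be exercised directly; a small sketch with placeholder key material, using only the constructors and enum shown in this patch:

    import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials;
    import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials.CredentialTypeRequired;

    public class MarshalledCredentialsSketch {
      public static void main(String[] args) {
        // full credentials: access key + secret key, no session token
        MarshalledCredentials full =
            new MarshalledCredentials("ACCESS-KEY", "SECRET-KEY", "");
        // session credentials: the token is present
        MarshalledCredentials session =
            new MarshalledCredentials("ACCESS-KEY", "SECRET-KEY", "SESSION-TOKEN");

        System.out.println(full.isValid(CredentialTypeRequired.FullOnly));       // true
        System.out.println(full.isValid(CredentialTypeRequired.SessionOnly));    // false
        System.out.println(session.isValid(CredentialTypeRequired.SessionOnly)); // true
        System.out.println(MarshalledCredentials.empty().isEmpty());             // true
      }
    }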
+ */ + public boolean hasSessionToken() { + return isNotEmpty(sessionToken); + } + + /** + * Write the token. + * Only works if valid. + * @param out stream to serialize to. + * @throws IOException if the serialization failed. + */ + @Override + public void write(DataOutput out) throws IOException { + validate("Writing " + this + ": ", + CredentialTypeRequired.AnyIncludingEmpty); + Text.writeString(out, accessKey); + Text.writeString(out, secretKey); + Text.writeString(out, sessionToken); + Text.writeString(out, roleARN); + out.writeLong(expiration); + } + + /** + * Read in the fields. + * @throws IOException IO problem + */ + @Override + public void readFields(DataInput in) throws IOException { + accessKey = Text.readString(in, MAX_SECRET_LENGTH); + secretKey = Text.readString(in, MAX_SECRET_LENGTH); + sessionToken = Text.readString(in, MAX_SECRET_LENGTH); + roleARN = Text.readString(in, MAX_SECRET_LENGTH); + expiration = in.readLong(); + } + + /** + * Verify that a set of credentials is valid. + * @throws DelegationTokenIOException if they aren't + * @param message message to prefix errors; + * @param typeRequired credential type required. + */ + public void validate(final String message, + final CredentialTypeRequired typeRequired) throws IOException { + if (!isValid(typeRequired)) { + throw new DelegationTokenIOException(message + + buildInvalidCredentialsError(typeRequired)); + } + } + + /** + * Build an error string for when the credentials do not match + * those required. + * @param typeRequired credential type required. + * @return an error string. + */ + public String buildInvalidCredentialsError( + final CredentialTypeRequired typeRequired) { + if (isEmpty()) { + return " " + MarshalledCredentialBinding.NO_AWS_CREDENTIALS; + } else { + return " " + INVALID_CREDENTIALS + + " in " + toString() + " required: " + typeRequired; + } + } + + /** + * Patch a configuration with the secrets. + * This does not set any per-bucket options (it doesn't know the bucket...). + * Warning: once done the configuration must be considered sensitive. + * @param config configuration to patch + */ + public void setSecretsInConfiguration(Configuration config) { + config.set(ACCESS_KEY, accessKey); + config.set(SECRET_KEY, secretKey); + S3AUtils.setIfDefined(config, SESSION_TOKEN, sessionToken, + "session credentials"); + } + + + /** + * Return a set of empty credentials. + * These can be marshalled, but not used for login. + * @return a new set of credentials. + */ + public static MarshalledCredentials empty() { + return new MarshalledCredentials("", "", ""); + } + + /** + * Enumeration of credential types for use in validation methods. + */ + public enum CredentialTypeRequired { + /** No entry at all. */ + Empty("None"), + /** Any credential type including "unset". */ + AnyIncludingEmpty("Full, Session or None"), + /** Any credential type is OK. */ + AnyNonEmpty("Full or Session"), + /** The credentials must be session or role credentials. */ + SessionOnly("Session"), + /** Full credentials are required. 
*/ + FullOnly("Full"); + + private final String text; + + CredentialTypeRequired(final String text) { + this.text = text; + } + + public String getText() { + return text; + } + + @Override + public String toString() { + return getText(); + } + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/NoAuthWithAWSException.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/NoAuthWithAWSException.java index f48e17a621055..7ec13b092c9bc 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/NoAuthWithAWSException.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/NoAuthWithAWSException.java @@ -18,14 +18,14 @@ package org.apache.hadoop.fs.s3a.auth; -import com.amazonaws.AmazonClientException; +import org.apache.hadoop.fs.s3a.CredentialInitializationException; /** - * A specific subclass of {@code AmazonClientException} which can - * be used in the retry logic to fail fast when there is any + * A specific subclass of {@code AmazonClientException} which is + * used in the S3A retry policy to fail fast when there is any * authentication problem. */ -public class NoAuthWithAWSException extends AmazonClientException { +public class NoAuthWithAWSException extends CredentialInitializationException { public NoAuthWithAWSException(final String message, final Throwable t) { super(message, t); diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/NoAwsCredentialsException.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/NoAwsCredentialsException.java new file mode 100644 index 0000000000000..bff5f27f80a54 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/NoAwsCredentialsException.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth; + +import javax.annotation.Nonnull; + +/** + * A special exception which declares that no credentials were found; + * this can be treated specially in logging, handling, etc. + * As it subclasses {@link NoAuthWithAWSException}, the S3A retry handler + * knows not to attempt to ask for the credentials again. + */ +public class NoAwsCredentialsException extends + NoAuthWithAWSException { + + /** + * The default error message: {@value}. + */ + public static final String E_NO_AWS_CREDENTIALS = "No AWS Credentials"; + + /** + * Construct. + * @param credentialProvider name of the credential provider. + * @param message message. + */ + public NoAwsCredentialsException( + @Nonnull final String credentialProvider, + @Nonnull final String message) { + this(credentialProvider, message, null); + } + + /** + * Construct with the default message of {@link #E_NO_AWS_CREDENTIALS}. 
+ * @param credentialProvider name of the credential provider. + */ + public NoAwsCredentialsException( + @Nonnull final String credentialProvider) { + this(credentialProvider, E_NO_AWS_CREDENTIALS, null); + } + + /** + * Construct with exception. + * @param credentialProvider name of the credential provider. + * @param message message. + * @param thrown inner exception + */ + public NoAwsCredentialsException( + @Nonnull final String credentialProvider, + @Nonnull final String message, + final Throwable thrown) { + super(credentialProvider + ": " + message, thrown); + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java index 34ed2958e49e6..610dbcc6765be 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/RolePolicies.java @@ -18,12 +18,24 @@ package org.apache.hadoop.fs.s3a.auth; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import com.google.common.collect.Lists; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + import static org.apache.hadoop.fs.s3a.auth.RoleModel.*; /** * Operations, statements and policies covering the operations * needed to work with S3 and S3Guard. */ +@InterfaceAudience.LimitedPrivate("Tests") +@InterfaceStability.Unstable public final class RolePolicies { private RolePolicies() { @@ -88,27 +100,36 @@ private RolePolicies() { */ public static final String S3_ALL_BUCKETS = "arn:aws:s3:::*"; + /** + * All bucket list operations, including + * {@link #S3_BUCKET_LIST_BUCKET} and + * {@link #S3_BUCKET_LIST_MULTIPART_UPLOADS}. + */ + public static final String S3_BUCKET_ALL_LIST = "s3:ListBucket*"; - public static final String S3_ALL_LIST_OPERATIONS = "s3:List*"; - - public static final String S3_ALL_LIST_BUCKET = "s3:ListBucket*"; - - public static final String S3_LIST_BUCKET = "s3:ListBucket"; + /** + * List the contents of a bucket. + * It applies to a bucket, not to a path in a bucket. + */ + public static final String S3_BUCKET_LIST_BUCKET = "s3:ListBucket"; /** * This is used by the abort operation in S3A commit work. + * It applies to a bucket, not to a path in a bucket. */ - public static final String S3_LIST_BUCKET_MULTPART_UPLOADS = + public static final String S3_BUCKET_LIST_MULTIPART_UPLOADS = "s3:ListBucketMultipartUploads"; /** * List multipart upload is needed for the S3A Commit protocols. + * It applies to a path in a bucket. */ public static final String S3_LIST_MULTIPART_UPLOAD_PARTS = "s3:ListMultipartUploadParts"; /** - * abort multipart upload is needed for the S3A Commit protocols. + * Abort multipart upload is needed for the S3A Commit protocols. + * It applies to a path in a bucket. */ public static final String S3_ABORT_MULTIPART_UPLOAD = "s3:AbortMultipartUpload"; @@ -181,20 +202,41 @@ private RolePolicies() { * Actions needed to read a file in S3 through S3A, excluding * S3Guard and SSE-KMS. */ - public static final String[] S3_PATH_READ_OPERATIONS = + private static final String[] S3_PATH_READ_OPERATIONS = new String[]{ S3_GET_OBJECT, }; /** * Base actions needed to read data from S3 through S3A, - * excluding SSE-KMS data and S3Guard-ed buckets. + * excluding: + *
+ * <ol>
+ *   <li>bucket-level operations</li>
+ *   <li>SSE-KMS key operations</li>
+ *   <li>DynamoDB operations for S3Guard.</li>
+ * </ol>
      + * As this excludes the bucket list operations, it is not sufficient + * to read from a bucket on its own. + */ + private static final String[] S3_ROOT_READ_OPERATIONS = + new String[]{ + S3_ALL_GET, + }; + + public static final List S3_ROOT_READ_OPERATIONS_LIST = + Collections.unmodifiableList(Arrays.asList(S3_ALL_GET)); + + /** + * Policies which can be applied to bucket resources for read operations. + *
+ * <ol>
+ *   <li>SSE-KMS key operations</li>
+ *   <li>DynamoDB operations for S3Guard.</li>
+ * </ol>
      */ - public static final String[] S3_ROOT_READ_OPERATIONS = + public static final String[] S3_BUCKET_READ_OPERATIONS = new String[]{ - S3_LIST_BUCKET, - S3_LIST_BUCKET_MULTPART_UPLOADS, S3_ALL_GET, + S3_BUCKET_ALL_LIST, }; /** @@ -202,43 +244,39 @@ private RolePolicies() { * This includes the appropriate read operations, but * not SSE-KMS or S3Guard support. */ - public static final String[] S3_PATH_RW_OPERATIONS = - new String[]{ + public static final List S3_PATH_RW_OPERATIONS = + Collections.unmodifiableList(Arrays.asList(new String[]{ S3_ALL_GET, S3_PUT_OBJECT, S3_DELETE_OBJECT, S3_ABORT_MULTIPART_UPLOAD, - S3_LIST_MULTIPART_UPLOAD_PARTS, - }; + })); /** * Actions needed to write data to an S3A Path. * This is purely the extra operations needed for writing atop * of the read operation set. * Deny these and a path is still readable, but not writeable. - * Excludes: SSE-KMS and S3Guard permissions. + * Excludes: bucket-ARN, SSE-KMS and S3Guard permissions. */ - public static final String[] S3_PATH_WRITE_OPERATIONS = - new String[]{ + public static final List S3_PATH_WRITE_OPERATIONS = + Collections.unmodifiableList(Arrays.asList(new String[]{ S3_PUT_OBJECT, S3_DELETE_OBJECT, S3_ABORT_MULTIPART_UPLOAD - }; + })); /** * Actions needed for R/W IO from the root of a bucket. - * Excludes: SSE-KMS and S3Guard permissions. + * Excludes: bucket-ARN, SSE-KMS and S3Guard permissions. */ - public static final String[] S3_ROOT_RW_OPERATIONS = - new String[]{ - S3_LIST_BUCKET, + public static final List S3_ROOT_RW_OPERATIONS = + Collections.unmodifiableList(Arrays.asList(new String[]{ S3_ALL_GET, S3_PUT_OBJECT, S3_DELETE_OBJECT, S3_ABORT_MULTIPART_UPLOAD, - S3_LIST_MULTIPART_UPLOAD_PARTS, - S3_ALL_LIST_BUCKET, - }; + })); /** * All DynamoDB operations: {@value}. @@ -300,24 +338,15 @@ private RolePolicies() { /** * Statement to allow all DDB access. */ - public static final Statement STATEMENT_ALL_DDB = statement(true, - ALL_DDB_TABLES, DDB_ALL_OPERATIONS); + public static final Statement STATEMENT_ALL_DDB = + allowAllDynamoDBOperations(ALL_DDB_TABLES); /** * Statement to allow all client operations needed for S3Guard, * but none of the admin operations. */ - public static final Statement STATEMENT_S3GUARD_CLIENT = statement(true, - ALL_DDB_TABLES, - DDB_BATCH_GET_ITEM, - DDB_BATCH_WRITE_ITEM, - DDB_DELETE_ITEM, - DDB_DESCRIBE_TABLE, - DDB_GET_ITEM, - DDB_PUT_ITEM, - DDB_QUERY, - DDB_UPDATE_ITEM - ); + public static final Statement STATEMENT_S3GUARD_CLIENT = + allowS3GuardClientOperations(ALL_DDB_TABLES); /** * Allow all S3 Operations. @@ -327,13 +356,92 @@ private RolePolicies() { S3_ALL_BUCKETS, S3_ALL_OPERATIONS); + /** + * The s3:GetBucketLocation permission is for all buckets, not for + * any named bucket, which complicates permissions. + */ + public static final Statement STATEMENT_ALL_S3_GET_BUCKET_LOCATION = + statement(true, + S3_ALL_BUCKETS, + S3_GET_BUCKET_LOCATION); + /** * Policy for all S3 and S3Guard operations, and SSE-KMS. 
*/ public static final Policy ALLOW_S3_AND_SGUARD = policy( STATEMENT_ALL_S3, STATEMENT_ALL_DDB, - STATEMENT_ALLOW_SSE_KMS_RW + STATEMENT_ALLOW_SSE_KMS_RW, + STATEMENT_ALL_S3_GET_BUCKET_LOCATION ); + public static Statement allowS3GuardClientOperations(String tableArn) { + return statement(true, + tableArn, + DDB_BATCH_GET_ITEM, + DDB_BATCH_WRITE_ITEM, + DDB_DELETE_ITEM, + DDB_DESCRIBE_TABLE, + DDB_GET_ITEM, + DDB_PUT_ITEM, + DDB_QUERY, + DDB_UPDATE_ITEM + ); + } + + public static Statement allowAllDynamoDBOperations(String tableArn) { + return statement(true, + tableArn, + DDB_ALL_OPERATIONS); + } + + /** + * From an S3 bucket name, build an ARN to refer to it. + * @param bucket bucket name. + * @param write are write permissions required + * @return return statement granting access. + */ + public static List allowS3Operations(String bucket, + boolean write) { + // add the bucket operations for the specific bucket ARN + ArrayList statements = + Lists.newArrayList( + statement(true, + bucketToArn(bucket), + S3_GET_BUCKET_LOCATION, S3_BUCKET_ALL_LIST)); + // then add the statements for objects in the buckets + if (write) { + statements.add( + statement(true, + bucketObjectsToArn(bucket), + S3_ROOT_RW_OPERATIONS)); + } else { + statements.add( + statement(true, + bucketObjectsToArn(bucket), + S3_ROOT_READ_OPERATIONS_LIST)); + } + return statements; + } + + /** + * From an S3 bucket name, build an ARN to refer to all objects in + * it. + * @param bucket bucket name. + * @return return the ARN to use in statements. + */ + public static String bucketObjectsToArn(String bucket) { + return String.format("arn:aws:s3:::%s/*", bucket); + } + + + /** + * From an S3 bucket name, build an ARN to refer to it. + * @param bucket bucket name. + * @return return the ARN to use in statements. 
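As a usage sketch of the ARN helpers and allowS3Operations introduced in this class (the bucket name is a placeholder; nothing beyond the methods shown in this diff is assumed):

    import org.apache.hadoop.fs.s3a.auth.RolePolicies;

    public class BucketPolicySketch {
      public static void main(String[] args) {
        String bucket = "example-bucket";   // placeholder bucket name
        // ARN of the bucket itself: needed for list/location permissions
        System.out.println(RolePolicies.bucketToArn(bucket));
        // ARN covering every object in the bucket: needed for object I/O
        System.out.println(RolePolicies.bucketObjectsToArn(bucket));
        // statements granting read-only access to the bucket and its objects
        System.out.println(RolePolicies.allowS3Operations(bucket, false));
      }
    }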
+ */ + public static String bucketToArn(String bucket) { + return String.format("arn:aws:s3:::%s", bucket); + } + } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/STSClientFactory.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/STSClientFactory.java index 10bf88c61f9af..74aca50fa9c53 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/STSClientFactory.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/STSClientFactory.java @@ -18,22 +18,33 @@ package org.apache.hadoop.fs.s3a.auth; +import java.io.Closeable; import java.io.IOException; +import java.util.concurrent.TimeUnit; import com.amazonaws.ClientConfiguration; import com.amazonaws.auth.AWSCredentialsProvider; import com.amazonaws.client.builder.AwsClientBuilder; +import com.amazonaws.services.securitytoken.AWSSecurityTokenService; import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClientBuilder; +import com.amazonaws.services.securitytoken.model.AssumeRoleRequest; +import com.amazonaws.services.securitytoken.model.Credentials; +import com.amazonaws.services.securitytoken.model.GetSessionTokenRequest; import com.google.common.base.Preconditions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.Invoker; +import org.apache.hadoop.fs.s3a.Retries; import org.apache.hadoop.fs.s3a.S3AUtils; +import static org.apache.commons.lang3.StringUtils.isEmpty; +import static org.apache.commons.lang3.StringUtils.isNotEmpty; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.*; + /** * Factory for creating STS Clients. */ @@ -44,6 +55,32 @@ public class STSClientFactory { private static final Logger LOG = LoggerFactory.getLogger(STSClientFactory.class); + /** + * Create the builder ready for any final configuration options. + * Picks up connection settings from the Hadoop configuration, including + * proxy secrets. + * The endpoint comes from the configuration options + * {@link org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants#DELEGATION_TOKEN_ENDPOINT} + * and + * {@link org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants#DELEGATION_TOKEN_REGION} + * @param conf Configuration to act as source of options. + * @param bucket Optional bucket to use to look up per-bucket proxy secrets + * @param credentials AWS credential chain to use + * @return the builder to call {@code build()} + * @throws IOException problem reading proxy secrets + */ + public static AWSSecurityTokenServiceClientBuilder builder( + final Configuration conf, + final String bucket, + final AWSCredentialsProvider credentials) throws IOException { + final ClientConfiguration awsConf = S3AUtils.createAwsConf(conf, bucket); + String endpoint = conf.getTrimmed(DELEGATION_TOKEN_ENDPOINT, + DEFAULT_DELEGATION_TOKEN_ENDPOINT); + String region = conf.getTrimmed(DELEGATION_TOKEN_REGION, + DEFAULT_DELEGATION_TOKEN_REGION); + return builder(credentials, awsConf, endpoint, region); + } + /** * Create the builder ready for any final configuration options. 
* Picks up connection settings from the Hadoop configuration, including @@ -52,27 +89,149 @@ public class STSClientFactory { * @param bucket Optional bucket to use to look up per-bucket proxy secrets * @param credentials AWS credential chain to use * @param stsEndpoint optional endpoint "https://sns.us-west-1.amazonaws.com" - * @param stsRegion the region, e.g "us-west-1" + * @param stsRegion AWS recommend setting the endpoint instead. * @return the builder to call {@code build()} * @throws IOException problem reading proxy secrets */ public static AWSSecurityTokenServiceClientBuilder builder( final Configuration conf, final String bucket, - final AWSCredentialsProvider credentials, final String stsEndpoint, + final AWSCredentialsProvider credentials, + final String stsEndpoint, final String stsRegion) throws IOException { - Preconditions.checkArgument(credentials != null, "No credentials"); + final ClientConfiguration awsConf = S3AUtils.createAwsConf(conf, bucket); + return builder(credentials, awsConf, stsEndpoint, stsRegion); + } + + /** + * Create the builder ready for any final configuration options. + * Picks up connection settings from the Hadoop configuration, including + * proxy secrets. + * @param awsConf AWS configuration. + * @param credentials AWS credential chain to use + * @param stsEndpoint optional endpoint "https://sns.us-west-1.amazonaws.com" + * @param stsRegion the region, e.g "us-west-1". Must be set if endpoint is. + * @return the builder to call {@code build()} + */ + public static AWSSecurityTokenServiceClientBuilder builder( + final AWSCredentialsProvider credentials, + final ClientConfiguration awsConf, + final String stsEndpoint, + final String stsRegion) { final AWSSecurityTokenServiceClientBuilder builder = AWSSecurityTokenServiceClientBuilder.standard(); - final ClientConfiguration awsConf = S3AUtils.createAwsConf(conf, bucket); + Preconditions.checkArgument(credentials != null, "No credentials"); builder.withClientConfiguration(awsConf); builder.withCredentials(credentials); - if (StringUtils.isNotEmpty(stsEndpoint)) { - LOG.debug("STS Endpoint ={}", stsEndpoint); + boolean destIsStandardEndpoint = STS_STANDARD.equals(stsEndpoint); + if (isNotEmpty(stsEndpoint) && !destIsStandardEndpoint) { + Preconditions.checkArgument( + isNotEmpty(stsRegion), + "STS endpoint is set to %s but no signing region was provided", + stsEndpoint); + LOG.debug("STS Endpoint={}; region='{}'", stsEndpoint, stsRegion); builder.withEndpointConfiguration( new AwsClientBuilder.EndpointConfiguration(stsEndpoint, stsRegion)); + } else { + Preconditions.checkArgument(isEmpty(stsRegion), + "STS signing region set set to %s but no STS endpoint specified", + stsRegion); } return builder; } + /** + * Create an STS Client instance. + * @param tokenService STS instance + * @param invoker invoker to use + * @return an STS client bonded to that interface. + * @throws IOException on any failure + */ + public static STSClient createClientConnection( + final AWSSecurityTokenService tokenService, + final Invoker invoker) + throws IOException { + return new STSClient(tokenService, invoker); + } + + /** + * STS client connection with retries. 
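A sketch of the intended call sequence, assuming a credential provider chain and an S3A Invoker (with its retry policy) are already available; the bucket name here is used only to pick up per-bucket proxy settings:

    import java.io.IOException;
    import java.util.concurrent.TimeUnit;

    import com.amazonaws.auth.AWSCredentialsProvider;
    import com.amazonaws.services.securitytoken.AWSSecurityTokenService;
    import com.amazonaws.services.securitytoken.model.Credentials;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.s3a.Invoker;

    static Credentials fetchSessionCredentials(
        Configuration conf,
        AWSCredentialsProvider credentials,
        Invoker invoker) throws IOException {
      // The builder picks up proxy options plus the DT endpoint/region settings.
      AWSSecurityTokenService sts =
          STSClientFactory.builder(conf, "example-bucket", credentials).build();
      // Wrap the raw SDK client so every call is retried and translated.
      try (STSClientFactory.STSClient client =
               STSClientFactory.createClientConnection(sts, invoker)) {
        return client.requestSessionCredentials(1, TimeUnit.HOURS);
      }
    }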
+ */ + public static final class STSClient implements Closeable { + + private final AWSSecurityTokenService tokenService; + + private final Invoker invoker; + + private STSClient(final AWSSecurityTokenService tokenService, + final Invoker invoker) { + this.tokenService = tokenService; + this.invoker = invoker; + } + + @Override + public void close() throws IOException { + try { + tokenService.shutdown(); + } catch (UnsupportedOperationException ignored) { + // ignore this, as it is what the STS client currently + // does. + } + } + + /** + * Request a set of session credentials. + * + * @param duration duration of the credentials + * @param timeUnit time unit of duration + * @return the role result + * @throws IOException on a failure of the request + */ + @Retries.RetryTranslated + public Credentials requestSessionCredentials( + final long duration, + final TimeUnit timeUnit) throws IOException { + int durationSeconds = (int) timeUnit.toSeconds(duration); + LOG.debug("Requesting session token of duration {}", duration); + final GetSessionTokenRequest request = new GetSessionTokenRequest(); + request.setDurationSeconds(durationSeconds); + return invoker.retry("request session credentials", "", + true, + () ->{ + LOG.info("Requesting Amazon STS Session credentials"); + return tokenService.getSessionToken(request).getCredentials(); + }); + } + + /** + * Request a set of role credentials. + * + * @param roleARN ARN to request + * @param sessionName name of the session + * @param policy optional policy; "" is treated as "none" + * @param duration duration of the credentials + * @param timeUnit time unit of duration + * @return the role result + * @throws IOException on a failure of the request + */ + @Retries.RetryTranslated + public Credentials requestRole( + final String roleARN, + final String sessionName, + final String policy, + final long duration, + final TimeUnit timeUnit) throws IOException { + LOG.debug("Requesting role {} with duration {}; policy = {}", + roleARN, duration, policy); + AssumeRoleRequest request = new AssumeRoleRequest(); + request.setDurationSeconds((int) timeUnit.toSeconds(duration)); + request.setRoleArn(roleARN); + request.setRoleSessionName(sessionName); + if (isNotEmpty(policy)) { + request.setPolicy(policy); + } + return invoker.retry("request role credentials", "", true, + () -> tokenService.assumeRole(request).getCredentials()); + } + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AWSPolicyProvider.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AWSPolicyProvider.java new file mode 100644 index 0000000000000..aaca10f1aea5e --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AWSPolicyProvider.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.util.List; +import java.util.Set; + +import org.apache.hadoop.fs.s3a.auth.RoleModel; + +/** + * Interface for providers of AWS policy for accessing data. + * This is used when building up the role permissions for a delegation + * token. + * + * The permissions requested are from the perspective of + * S3A filesystem operations on the data, not the simpler + * model of "permissions on the the remote service". + * As an example, to use S3Guard effectively, the client needs full CRUD + * access to the table, even for {@link AccessLevel#READ}. + */ +public interface AWSPolicyProvider { + + /** + * Get the AWS policy statements required for accessing this service. + * + * @param access access level desired. + * @return a possibly empty list of statements to grant access at that + * level. + */ + List listAWSPolicyRules(Set access); + + /** + * Access levels. + */ + enum AccessLevel { + /** Filesystem data read operations. */ + READ, + /** Data write, encryption, etc. */ + WRITE, + /** Administration of the data, tables, etc. */ + ADMIN, + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDTService.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDTService.java new file mode 100644 index 0000000000000..dcb83c2c28b6a --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDTService.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.IOException; +import java.net.URI; + +import com.google.common.base.Preconditions; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.service.AbstractService; + +import static java.util.Objects.requireNonNull; + +/** + * This is the base class for both the delegation binding + * code and the back end service created; allows for + * shared methods across both. + * + * The lifecycle sequence is as follows + *
      + *   - create
      + *   - bindToFileSystem(uri, ownerFS)
      + *   - init
      + *   - start
      + *   ...api calls...
      + *   - stop
      + * 
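A sketch of that sequence from the caller's side, using a hypothetical concrete subclass (ExampleDTService); in the real code the owning S3AFileSystem drives these steps during its own initialize():

    import java.io.IOException;
    import java.net.URI;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.s3a.S3AFileSystem;

    static void runLifecycle(URI fsUri, S3AFileSystem fs, Configuration conf)
        throws IOException {
      ExampleDTService service = new ExampleDTService();  // hypothetical subclass
      service.bindToFileSystem(fsUri, fs);                // must precede init()
      service.init(conf);
      service.start();
      try {
        // ... token operations against the started service ...
      } finally {
        service.stop();
      }
    }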
      + * + * As the S3ADelegation mechanism is all configured during the filesystem + * initalize() operation, it is not ready for use through all the start process. + */ +public abstract class AbstractDTService + extends AbstractService { + + /** + * URI of the filesystem. + * Valid after {@link #bindToFileSystem(URI, S3AFileSystem)}. + */ + private URI canonicalUri; + + /** + * The owning filesystem. + * Valid after {@link #bindToFileSystem(URI, S3AFileSystem)}. + */ + private S3AFileSystem fileSystem; + + /** + * Owner of the filesystem. + * Valid after {@link #bindToFileSystem(URI, S3AFileSystem)}. + */ + private UserGroupInformation owner; + + /** + * Protected constructor. + * @param name service name. + */ + protected AbstractDTService(final String name) { + super(name); + } + + /** + * Bind to the filesystem. + * Subclasses can use this to perform their own binding operations - + * but they must always call their superclass implementation. + * This Must be called before calling {@code init()}. + * + * Important: + * This binding will happen during FileSystem.initialize(); the FS + * is not live for actual use and will not yet have interacted with + * AWS services. + * @param uri the canonical URI of the FS. + * @param fs owning FS. + * @throws IOException failure. + */ + public void bindToFileSystem( + final URI uri, + final S3AFileSystem fs) throws IOException { + requireServiceState(STATE.NOTINITED); + Preconditions.checkState(canonicalUri == null, + "bindToFileSystem called twice"); + this.canonicalUri = requireNonNull(uri); + this.fileSystem = requireNonNull(fs); + this.owner = fs.getOwner(); + } + + /** + * Get the canonical URI of the filesystem, which is what is + * used to identify the tokens. + * @return the URI. + */ + public URI getCanonicalUri() { + return canonicalUri; + } + + /** + * Get the owner of the FS. + * @return the owner fs + */ + protected S3AFileSystem getFileSystem() { + return fileSystem; + } + + /** + * Get the owner of this Service. + * @return owner; non-null after binding to an FS. + */ + public UserGroupInformation getOwner() { + return owner; + } + + /** + * Require that the service is in a given state. + * @param state desired state. + * @throws IllegalStateException if the condition is not met + */ + protected void requireServiceState(final STATE state) + throws IllegalStateException { + Preconditions.checkState(isInState(state), + "Required State: %s; Actual State %s", state, getServiceState()); + } + + /** + * Require the service to be started. + * @throws IllegalStateException if it is not. + */ + protected void requireServiceStarted() throws IllegalStateException { + requireServiceState(STATE.STARTED); + } + + @Override + protected void serviceInit(final Configuration conf) throws Exception { + super.serviceInit(conf); + requireNonNull(canonicalUri, "service does not have a canonical URI"); + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDelegationTokenBinding.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDelegationTokenBinding.java new file mode 100644 index 0000000000000..73660ea88b41d --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDelegationTokenBinding.java @@ -0,0 +1,305 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.IOException; +import java.net.URI; +import java.nio.charset.Charset; +import java.util.Optional; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.AWSCredentialProviderList; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.auth.RoleModel; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.security.token.SecretManager; +import org.apache.hadoop.security.token.Token; + +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DURATION_LOG_AT_INFO; + +/** + * An AbstractDelegationTokenBinding implementation is a class which + * handles the binding of its underlying authentication mechanism to the + * Hadoop Delegation token mechanism. + * + * See also {@code org.apache.hadoop.fs.azure.security.WasbDelegationTokenManager} + * but note that it assumes Kerberos tokens for which the renewal mechanism + * is the sole plugin point. + * This class is designed to be more generic. + * + * Lifecycle + * + * It is a Hadoop Service, so has a standard lifecycle: once started + * its lifecycle will follow that of the {@link S3ADelegationTokens} + * instance which created it --which itself follows the lifecycle of the FS. + * + * One big difference is that + * {@link #bindToFileSystem(URI, S3AFileSystem)} will be called + * before the {@link #init(Configuration)} operation, this is where + * the owning FS is passed in. + * + * Implementations are free to start background operations in their + * {@code serviceStart()} method, provided they are safely stopped in + * {@code serviceStop()}. + * + * When to check for the ability to issue tokens + * Implementations MUST start up without actually holding the secrets + * needed to issue tokens (config options, credentials to talk to STS etc) + * as in server-side deployments they are not expected to have these. + * + * Retry Policy + * + * All methods which talk to AWS services are expected to do translation, + * with retries as they see fit. + */ +public abstract class AbstractDelegationTokenBinding extends AbstractDTService { + + /** Token kind: must match that of the token identifiers issued. */ + private final Text kind; + + private SecretManager secretManager; + + private static final Logger LOG = LoggerFactory.getLogger( + AbstractDelegationTokenBinding.class); + + /** + * Constructor. + * + * @param name as passed to superclass for use in log messages. + * @param kind token kind. 
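As a sketch of that contract, a hypothetical no-op binding could look like the following; the class name, token kind string and empty credential lists are illustrative only, and a real binding returns working credentials and its own identifier subclass:

    import java.io.IOException;
    import java.util.Optional;

    import org.apache.hadoop.fs.s3a.AWSCredentialProviderList;
    import org.apache.hadoop.fs.s3a.auth.RoleModel;
    import org.apache.hadoop.io.Text;

    public class ExampleTokenBinding extends AbstractDelegationTokenBinding {

      private static final Text EXAMPLE_KIND =
          new Text("S3ADelegationToken/Example");

      public ExampleTokenBinding() {
        super("ExampleTokenBinding", EXAMPLE_KIND);
      }

      @Override
      public AWSCredentialProviderList deployUnbonded() throws IOException {
        // Credentials to use when the filesystem starts without a token.
        return new AWSCredentialProviderList();
      }

      @Override
      public AWSCredentialProviderList bindToTokenIdentifier(
          AbstractS3ATokenIdentifier retrievedIdentifier) throws IOException {
        // Unmarshall secrets from the identifier and build providers from them.
        return new AWSCredentialProviderList();
      }

      @Override
      public AbstractS3ATokenIdentifier createTokenIdentifier(
          Optional<RoleModel.Policy> policy,
          EncryptionSecrets encryptionSecrets) throws IOException {
        // Package whatever secrets the issued token must carry.
        return createEmptyIdentifier();
      }

      @Override
      public AbstractS3ATokenIdentifier createEmptyIdentifier() {
        return null;  // a real binding returns its identifier subclass here
      }
    }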
+ */ + protected AbstractDelegationTokenBinding(final String name, + final Text kind) { + super(name); + this.kind = requireNonNull(kind); + } + + /** + * Get the kind of the tokens managed here. + * @return the token kind. + */ + public Text getKind() { + return kind; + } + + /** + * Return the name of the owner to be used in tokens. + * This may be that of the UGI owner, or it could be related to + * the AWS login. + * @return a text name of the owner. + */ + public Text getOwnerText() { + return new Text(getOwner().getUserName()); + } + + /** + * Predicate: will this binding issue a DT? + * That is: should the filesystem declare that it is issuing + * delegation tokens? If true + * @return a declaration of what will happen when asked for a token. + */ + public S3ADelegationTokens.TokenIssuingPolicy getTokenIssuingPolicy() { + return S3ADelegationTokens.TokenIssuingPolicy.RequestNewToken; + } + + /** + * Create a delegation token for the user. + * This will only be called if a new DT is needed, that is: the + * filesystem has been deployed unbonded. + * @param policy minimum policy to use, if known. + * @param encryptionSecrets encryption secrets for the token. + * @return the token or null if the back end does not want to issue one. + * @throws IOException if one cannot be created + */ + public Token createDelegationToken( + final Optional policy, + final EncryptionSecrets encryptionSecrets) throws IOException { + requireServiceStarted(); + final AbstractS3ATokenIdentifier tokenIdentifier = + createTokenIdentifier(policy, encryptionSecrets); + if (tokenIdentifier != null) { + Token token = + new Token<>(tokenIdentifier, secretManager); + token.setKind(getKind()); + LOG.debug("Created token {} with token identifier {}", + token, tokenIdentifier); + return token; + } else { + return null; + } + } + + /** + * Create a token identifier with all the information needed + * to be included in a delegation token. + * This is where session credentials need to be extracted, etc. + * This will only be called if a new DT is needed, that is: the + * filesystem has been deployed unbonded. + * + * If {@link #createDelegationToken(Optional, EncryptionSecrets)} + * is overridden, this method can be replaced with a stub. + * + * @param policy minimum policy to use, if known. + * @param encryptionSecrets encryption secrets for the token. + * @return the token data to include in the token identifier. + * @throws IOException failure creating the token data. + */ + public abstract AbstractS3ATokenIdentifier createTokenIdentifier( + Optional policy, + EncryptionSecrets encryptionSecrets) throws IOException; + + /** + * Verify that a token identifier is of a specific class. + * This will reject subclasses (i.e. it is stricter than + * {@code instanceof}, then cast it to that type. + * @param identifier identifier to validate + * @param expectedClass class of the expected token identifier. + * @throws DelegationTokenIOException If the wrong class was found. 
+ */ + protected T convertTokenIdentifier( + final AbstractS3ATokenIdentifier identifier, + final Class expectedClass) throws DelegationTokenIOException { + if (!identifier.getClass().equals(expectedClass)) { + throw new DelegationTokenIOException( + DelegationTokenIOException.TOKEN_WRONG_CLASS + + "; expected a token identifier of type " + + expectedClass + + " but got " + + identifier.getClass() + + " and kind " + identifier.getKind()); + } + return (T) identifier; + } + + /** + * Perform any actions when deploying unbonded, and return a list + * of credential providers. + * @return non-empty list of AWS credential providers to use for + * authenticating this client with AWS services. + * @throws IOException any failure. + */ + public abstract AWSCredentialProviderList deployUnbonded() + throws IOException; + + /** + * Bind to the token identifier, returning the credential providers to use + * for the owner to talk to S3, DDB and related AWS Services. + * @param retrievedIdentifier the unmarshalled data + * @return non-empty list of AWS credential providers to use for + * authenticating this client with AWS services. + * @throws IOException any failure. + */ + public abstract AWSCredentialProviderList bindToTokenIdentifier( + AbstractS3ATokenIdentifier retrievedIdentifier) + throws IOException; + + /** + * Create a new subclass of {@link AbstractS3ATokenIdentifier}. + * This is used in the secret manager. + * @return an empty identifier. + */ + public abstract AbstractS3ATokenIdentifier createEmptyIdentifier(); + + @Override + public String toString() { + return super.toString() + + " token kind = " + getKind(); + } + + /** + * Service startup: create the secret manager. + * @throws Exception failure. + */ + @Override + protected void serviceStart() throws Exception { + super.serviceStart(); + secretManager = createSecretMananger(); + } + + /** + * Return a description. + * This is logged during after service start and binding: + * it should be as informative as possible. + * @return a description to log. + */ + public String getDescription() { + return "Token binding " + getKind().toString(); + } + + /** + * Create a secret manager. + * @return a secret manager. + * @throws IOException on failure + */ + protected SecretManager createSecretMananger() + throws IOException { + return new TokenSecretManager(); + } + + /** + * Return a string for use in building up the User-Agent field, so + * get into the S3 access logs. Useful for diagnostics. + * @return a string for the S3 logs or "" for "nothing to add" + */ + public String getUserAgentField() { + return ""; + } + + /** + * Get the password to use in secret managers. + * This is a constant; its just recalculated every time to stop findbugs + * highlighting security risks of shared mutable byte arrays. + * @return a password. + */ + protected static byte[] getSecretManagerPasssword() { + return "non-password".getBytes(Charset.forName("UTF-8")); + } + + /** + * The secret manager always uses the same secret; the + * factory for new identifiers is that of the token manager. 
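From a caller's perspective the issue path is, as a sketch, the following; the empty EncryptionSecrets stands in for real secrets, and a null return means the binding declined to issue a token:

    import java.io.IOException;
    import java.util.Optional;

    import org.apache.hadoop.security.token.Token;

    static String issueAndEncode(AbstractDelegationTokenBinding binding)
        throws IOException {
      Token<AbstractS3ATokenIdentifier> token =
          binding.createDelegationToken(Optional.empty(), new EncryptionSecrets());
      return token == null ? null : token.encodeToUrlString();
    }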
+ */ + protected class TokenSecretManager + extends SecretManager { + + @Override + protected byte[] createPassword(AbstractS3ATokenIdentifier identifier) { + return getSecretManagerPasssword(); + } + + @Override + public byte[] retrievePassword(AbstractS3ATokenIdentifier identifier) + throws InvalidToken { + return getSecretManagerPasssword(); + } + + @Override + public AbstractS3ATokenIdentifier createIdentifier() { + try (DurationInfo ignored = new DurationInfo(LOG, DURATION_LOG_AT_INFO, + "Creating Delegation Token Identifier")) { + return AbstractDelegationTokenBinding.this.createEmptyIdentifier(); + } + } + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractS3ATokenIdentifier.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractS3ATokenIdentifier.java new file mode 100644 index 0000000000000..7c1c0e3891da4 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractS3ATokenIdentifier.java @@ -0,0 +1,305 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.ByteArrayInputStream; +import java.io.DataInput; +import java.io.DataInputStream; +import java.io.DataOutput; +import java.io.IOException; +import java.net.URI; +import java.util.Objects; +import java.util.UUID; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.security.token.delegation.web.DelegationTokenIdentifier; + +import static java.util.Objects.requireNonNull; + +/** + * An S3A Delegation Token Identifier: contains the information needed + * to talk to S3A. + * + * These are loaded via the service loader API an used in a map of + * Kind to class, which is then looked up to deserialize token + * identifiers of a given class. + * + * Every non-abstract class must provide + *
+ *   1. Their unique token kind.
+ *   2. An empty constructor.
+ *   3. An entry in the resource file
+ *      {@code /META-INF/services/org.apache.hadoop.security.token.TokenIdentifier}
+ *
+ * The base implementation contains
+ *   1. The URI of the FS.
+ *   2. Encryption secrets for use in the destination FS.
+ *
      + * Subclasses are required to add whatever information is needed to authenticate + * the user with the credential provider which their binding class will + * provide. + * + * Important: Add no references to any AWS SDK class, to + * ensure it can be safely deserialized whenever the relevant token + * identifier of a token type declared in this JAR is examined. + */ +public abstract class AbstractS3ATokenIdentifier + extends DelegationTokenIdentifier { + + /** + * The maximum string length supported for text fields. + */ + protected static final int MAX_TEXT_LENGTH = 8192; + + /** Canonical URI of the bucket. */ + private URI uri; + + /** + * Encryption secrets to also marshall with any credentials. + * Set during creation to ensure it is never null. + */ + private EncryptionSecrets encryptionSecrets = new EncryptionSecrets(); + + /** + * Timestamp of creation. + * This is set to the current time; it will be overridden when + * deserializing data. + */ + private long created = System.currentTimeMillis(); + + /** + * An origin string for diagnostics. + */ + private String origin = ""; + + /** + * This marshalled UUID can be used in testing to verify transmission, + * and reuse; as it is printed you can see what is happending too. + */ + private String uuid = UUID.randomUUID().toString(); + + /** + * Constructor. + * @param kind token kind. + * @param uri filesystem URI. + * @param owner token owner + * @param origin origin text for diagnostics. + * @param encryptionSecrets encryption secrets to set. + */ + protected AbstractS3ATokenIdentifier( + final Text kind, + final URI uri, + final Text owner, + final String origin, + final EncryptionSecrets encryptionSecrets) { + this(kind, owner, new Text(), new Text(), uri); + this.origin = requireNonNull(origin); + this.encryptionSecrets = requireNonNull(encryptionSecrets); + } + + /** + * Constructor. + * @param kind token kind. + * @param owner token owner + * @param renewer token renewer + * @param realUser token real user + * @param uri filesystem URI. + */ + protected AbstractS3ATokenIdentifier( + final Text kind, + final Text owner, + final Text renewer, + final Text realUser, + final URI uri) { + super(kind, owner, renewer, realUser); + this.uri = requireNonNull(uri); + } + + /** + * Build from a token. + * This has been written for refresh operations; + * if someone implements refresh it will be relevant. + * @param token to to build from + * @throws IOException failure to build the identifier. + */ + protected AbstractS3ATokenIdentifier( + final Text kind, + final Token token) throws IOException { + super(kind); + ByteArrayInputStream bais = new ByteArrayInputStream(token.getIdentifier()); + readFields(new DataInputStream(bais)); + } + + /** + * For subclasses to use in their own empty-constructors. + */ + protected AbstractS3ATokenIdentifier(final Text kind) { + super(kind); + } + + public String getBucket() { + return uri.getHost(); + } + + public URI getUri() { + return uri; + } + + public String getOrigin() { + return origin; + } + + public void setOrigin(final String origin) { + this.origin = origin; + } + + public long getCreated() { + return created; + } + + /** + * Write state. + * {@link org.apache.hadoop.io.Writable#write(DataOutput)}. 
+ * @param out destination + * @throws IOException failure + */ + @Override + public void write(final DataOutput out) throws IOException { + super.write(out); + Text.writeString(out, uri.toString()); + Text.writeString(out, origin); + Text.writeString(out, uuid); + encryptionSecrets.write(out); + out.writeLong(created); + } + + /** + * Read state. + * {@link org.apache.hadoop.io.Writable#readFields(DataInput)}. + * + * Note: this operation gets called in toString() operations on tokens, so + * must either always succeed, or throw an IOException to trigger the + * catch and downgrade. RuntimeExceptions (e.g. Preconditions checks) are + * not to be used here for this reason.) + * + * @param in input stream + * @throws DelegationTokenIOException if the token binding is wrong. + * @throws IOException IO problems. + */ + @Override + public void readFields(final DataInput in) + throws DelegationTokenIOException, IOException { + super.readFields(in); + uri = URI.create(Text.readString(in, MAX_TEXT_LENGTH)); + origin = Text.readString(in, MAX_TEXT_LENGTH); + uuid = Text.readString(in, MAX_TEXT_LENGTH); + encryptionSecrets.readFields(in); + created = in.readLong(); + } + + /** + * Validate the token by looking at its fields. + * @throws IOException on failure. + */ + public void validate() throws IOException { + if (uri == null) { + throw new DelegationTokenIOException("No URI in " + this); + } + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder( + "S3ATokenIdentifier{"); + sb.append(getKind()); + sb.append("; uri=").append(uri); + sb.append("; timestamp=").append(created); + sb.append("; encryption=").append(encryptionSecrets.toString()); + sb.append("; ").append(uuid); + sb.append("; ").append(origin); + sb.append('}'); + return sb.toString(); + } + + /** + * Equality check is on superclass and UUID only. + * @param o other. + * @return true if the base class considers them equal and the URIs match. + */ + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + if (!super.equals(o)) { + return false; + } + final AbstractS3ATokenIdentifier that = (AbstractS3ATokenIdentifier) o; + return Objects.equals(uuid, that.uuid) && + Objects.equals(uri, that.uri); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), uri); + } + + /** + * Return the expiry time in seconds since 1970-01-01. + * @return the time when the session credential expire. + */ + public long getExpiryTime() { + return 0; + } + + /** + * Get the UUID of this token identifier. + * @return a UUID. + */ + public String getUuid() { + return uuid; + } + + /** + * Get the encryption secrets. + * @return the encryption secrets within this identifier. + */ + public EncryptionSecrets getEncryptionSecrets() { + return encryptionSecrets; + } + + /** + * Create the default origin text message with local hostname and + * timestamp. + * @return a string for token diagnostics. 
+ */ + public static String createDefaultOriginMessage() { + return String.format("Created on %s at time %s.", + NetUtils.getHostname(), + java.time.Instant.now()); + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/DelegationConstants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/DelegationConstants.java new file mode 100644 index 0000000000000..7674c6920ddc2 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/DelegationConstants.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.s3a.Constants; +import org.apache.hadoop.io.Text; + +/** + * All the constants related to delegation tokens. + * Not in the normal S3 constants while unstable. + * + * Where possible, the existing assumed role properties are used to configure + * STS binding, default ARN, etc. This makes documenting everything that + * much easier and avoids trying to debug precisely which sts endpoint + * property should be set. + * + * Most settings here are replicated in {@code core-default.xml}; the + * values MUST be kept in sync. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public final class DelegationConstants { + + /** + * Endpoint for session tokens, used when building delegation tokens: + * {@value}. + * @see STS regions + */ + public static final String DELEGATION_TOKEN_ENDPOINT = + Constants.ASSUMED_ROLE_STS_ENDPOINT; + + /** + * Default endpoint for session tokens: {@value}. + */ + public static final String DEFAULT_DELEGATION_TOKEN_ENDPOINT = + Constants.DEFAULT_ASSUMED_ROLE_STS_ENDPOINT; + + /** + * Region for DT issuing; must be non-empty if the endpoint is set: {@value}. + */ + public static final String DELEGATION_TOKEN_REGION = + Constants.ASSUMED_ROLE_STS_ENDPOINT_REGION; + + /** + * Region default: {@value}. + */ + public static final String DEFAULT_DELEGATION_TOKEN_REGION = + Constants.ASSUMED_ROLE_STS_ENDPOINT_REGION_DEFAULT; + + /** + * Duration of tokens in time: {@value}. + */ + public static final String DELEGATION_TOKEN_DURATION = + Constants.ASSUMED_ROLE_SESSION_DURATION; + + /** + * Default duration of a delegation token: {@value}. + * Must be in the range supported by STS. + */ + public static final String DEFAULT_DELEGATION_TOKEN_DURATION = + Constants.ASSUMED_ROLE_SESSION_DURATION_DEFAULT; + + /** + * Key to list AWS credential providers for Session/role + * credentials: {@value}. 
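A sketch of the client-side wiring these constants drive; the session binding class is named just below, and the regional endpoint/region pair is only an example:

    import org.apache.hadoop.conf.Configuration;

    import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.*;

    static Configuration enableSessionTokens() {
      Configuration conf = new Configuration();
      conf.set(DELEGATION_TOKEN_BINDING, DELEGATION_TOKEN_SESSION_BINDING);
      // A non-default STS endpoint must be paired with its signing region.
      conf.set(DELEGATION_TOKEN_ENDPOINT, "sts.eu-west-1.amazonaws.com");
      conf.set(DELEGATION_TOKEN_REGION, "eu-west-1");
      return conf;
    }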
+ */ + public static final String DELEGATION_TOKEN_CREDENTIALS_PROVIDER = + Constants.AWS_CREDENTIALS_PROVIDER; + + /** + * ARN of the delegation token: {@value}. + * Required for the role token. + */ + public static final String DELEGATION_TOKEN_ROLE_ARN = + Constants.ASSUMED_ROLE_ARN; + + /** + * Property containing classname for token binding: {@value}. + */ + public static final String DELEGATION_TOKEN_BINDING = + "fs.s3a.delegation.token.binding"; + /** + * Session Token binding classname: {@value}. + */ + public static final String DELEGATION_TOKEN_SESSION_BINDING = + "org.apache.hadoop.fs.s3a.auth.delegation.SessionTokenBinding"; + + /** + * Default token binding {@value}. + */ + public static final String DEFAULT_DELEGATION_TOKEN_BINDING = ""; + + /** + * Token binding to pass full credentials: {@value}. + */ + public static final String DELEGATION_TOKEN_FULL_CREDENTIALS_BINDING = + "org.apache.hadoop.fs.s3a.auth.delegation.FullCredentialsTokenBinding"; + + /** + * Role DTs: {@value}. + */ + public static final String DELEGATION_TOKEN_ROLE_BINDING = + "org.apache.hadoop.fs.s3a.auth.delegation.RoleTokenBinding"; + + /** Prefix for token names: {@value}. */ + public static final String TOKEN_NAME_PREFIX = "S3ADelegationToken/"; + + /** Name of session token: {@value}. */ + public static final String SESSION_TOKEN_NAME = TOKEN_NAME_PREFIX + "Session"; + + /** Kind of the session token; value is {@link #SESSION_TOKEN_NAME}. */ + public static final Text SESSION_TOKEN_KIND = new Text(SESSION_TOKEN_NAME); + + /** Name of full token: {@value}. */ + public static final String FULL_TOKEN_NAME = TOKEN_NAME_PREFIX + "Full"; + + /** Kind of the full token; value is {@link #FULL_TOKEN_NAME}. */ + public static final Text FULL_TOKEN_KIND = new Text(FULL_TOKEN_NAME); + + /** Name of role token: {@value}. */ + public static final String ROLE_TOKEN_NAME = TOKEN_NAME_PREFIX + "Role"; + + /** Kind of the role token; value is {@link #ROLE_TOKEN_NAME}. */ + public static final Text ROLE_TOKEN_KIND = new Text(ROLE_TOKEN_NAME); + + /** + * Package-scoped option to control level that duration info on token + * binding operations are logged at. + * Value: {@value}. + */ + static final boolean DURATION_LOG_AT_INFO = true; + + /** + * If the token binding auth chain is only session-level auth, you + * can't use the role binding: {@value}. + */ + public static final String E_NO_SESSION_TOKENS_FOR_ROLE_BINDING + = "Cannot issue S3A Role Delegation Tokens without full AWS credentials"; + + /** + * The standard STS server. + */ + public static final String STS_STANDARD = "sts.amazonaws.com"; + + private DelegationConstants() { + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/DelegationTokenIOException.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/DelegationTokenIOException.java new file mode 100644 index 0000000000000..32d45bc17dd55 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/DelegationTokenIOException.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.IOException; + +/** + * General IOException for Delegation Token issues. + * Includes recommended error strings, which can be used in tests when + * looking for specific errors. + */ +public class DelegationTokenIOException extends IOException { + + private static final long serialVersionUID = 599813827985340023L; + + /** Error: delegation token/token identifier class isn't the right one. */ + public static final String TOKEN_WRONG_CLASS + = "Delegation token is wrong class"; + + /** + * The far end is expecting a different token kind than + * that which the client created. + */ + protected static final String TOKEN_MISMATCH = "Token mismatch"; + + public DelegationTokenIOException(final String message) { + super(message); + } + + public DelegationTokenIOException(final String message, + final Throwable cause) { + super(message, cause); + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/EncryptionSecretOperations.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/EncryptionSecretOperations.java new file mode 100644 index 0000000000000..6526f9a947815 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/EncryptionSecretOperations.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.util.Optional; + +import com.amazonaws.services.s3.model.SSEAwsKeyManagementParams; +import com.amazonaws.services.s3.model.SSECustomerKey; + +import org.apache.hadoop.fs.s3a.S3AEncryptionMethods; + +/** + * These support operations on {@link EncryptionSecrets} which use the AWS SDK + * operations. Isolating them here ensures that that class is not required on + * the classpath. + */ +public final class EncryptionSecretOperations { + + private EncryptionSecretOperations() { + } + + /** + * Create SSE-C client side key encryption options on demand. + * @return an optional key to attach to a request. + * @param secrets source of the encryption secrets. 
+ */ + public static Optional createSSECustomerKey( + final EncryptionSecrets secrets) { + if (secrets.hasEncryptionKey() && + secrets.getEncryptionMethod() == S3AEncryptionMethods.SSE_C) { + return Optional.of(new SSECustomerKey(secrets.getEncryptionKey())); + } else { + return Optional.empty(); + } + } + + /** + * Create SSE-KMS options for a request, iff the encryption is SSE-KMS. + * @return an optional SSE-KMS param to attach to a request. + * @param secrets source of the encryption secrets. + */ + public static Optional createSSEAwsKeyManagementParams( + final EncryptionSecrets secrets) { + + //Use specified key, otherwise default to default master aws/s3 key by AWS + if (secrets.getEncryptionMethod() == S3AEncryptionMethods.SSE_KMS) { + if (secrets.hasEncryptionKey()) { + return Optional.of(new SSEAwsKeyManagementParams( + secrets.getEncryptionKey())); + } else { + return Optional.of(new SSEAwsKeyManagementParams()); + } + } else { + return Optional.empty(); + } + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/EncryptionSecrets.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/EncryptionSecrets.java new file mode 100644 index 0000000000000..092653de557f0 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/EncryptionSecrets.java @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.Serializable; +import java.util.Objects; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.fs.s3a.S3AEncryptionMethods; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +/** + * Encryption options in a form which can serialized or marshalled as a hadoop + * Writeable. + * + * Maintainers: For security reasons, don't print any of this. + * + * Note this design marshalls/unmarshalls its serialVersionUID + * in its writable, which is used to compare versions. + * + * Important. + * If the wire format is ever changed incompatibly, + * update the serial version UID to ensure that older clients get safely + * rejected. + * + * Important + * Do not import any AWS SDK classes, directly or indirectly. + * This is to ensure that S3A Token identifiers can be unmarshalled even + * without that SDK. 
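That isolation means SDK types appear only on the request-building side; a sketch of that use of EncryptionSecretOperations, with placeholder bucket, key and KMS ARN, and the PutObjectRequest shown as one plausible consumer:

    import java.io.File;
    import java.io.IOException;

    import com.amazonaws.services.s3.model.PutObjectRequest;

    import org.apache.hadoop.fs.s3a.S3AEncryptionMethods;

    static PutObjectRequest encryptedPut(File source) throws IOException {
      EncryptionSecrets secrets = new EncryptionSecrets(
          S3AEncryptionMethods.SSE_KMS,
          "arn:aws:kms:eu-west-1:000000000000:key/example");
      PutObjectRequest request =
          new PutObjectRequest("example-bucket", "example/key", source);
      EncryptionSecretOperations.createSSEAwsKeyManagementParams(secrets)
          .ifPresent(request::setSSEAwsKeyManagementParams);
      return request;
    }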
+ */ +public class EncryptionSecrets implements Writable, Serializable { + + public static final int MAX_SECRET_LENGTH = 2048; + + private static final long serialVersionUID = 1208329045511296375L; + + /** + * Encryption algorithm to use: must match one in + * {@link S3AEncryptionMethods}. + */ + private String encryptionAlgorithm = ""; + + /** + * Encryption key: possibly sensitive information. + */ + private String encryptionKey = ""; + + /** + * This field isn't serialized/marshalled; it is rebuilt from the + * encryptionAlgorithm field. + */ + private transient S3AEncryptionMethods encryptionMethod = + S3AEncryptionMethods.NONE; + + /** + * Empty constructor, for use in marshalling. + */ + public EncryptionSecrets() { + } + + /** + * Create a pair of secrets. + * @param encryptionAlgorithm algorithm enumeration. + * @param encryptionKey key/key reference. + * @throws IOException failure to initialize. + */ + public EncryptionSecrets(final S3AEncryptionMethods encryptionAlgorithm, + final String encryptionKey) throws IOException { + this(encryptionAlgorithm.getMethod(), encryptionKey); + } + + /** + * Create a pair of secrets. + * @param encryptionAlgorithm algorithm name + * @param encryptionKey key/key reference. + * @throws IOException failure to initialize. + */ + public EncryptionSecrets(final String encryptionAlgorithm, + final String encryptionKey) throws IOException { + this.encryptionAlgorithm = encryptionAlgorithm; + this.encryptionKey = encryptionKey; + init(); + } + + /** + * Write out the encryption secrets. + * @param out {@code DataOutput} to serialize this object into. + * @throws IOException IO failure + */ + @Override + public void write(final DataOutput out) throws IOException { + new LongWritable(serialVersionUID).write(out); + Text.writeString(out, encryptionAlgorithm); + Text.writeString(out, encryptionKey); + } + + /** + * Read in from the writable stream. + * After reading, call {@link #init()}. + * @param in {@code DataInput} to deserialize this object from. + * @throws IOException failure to read/validate data. + */ + @Override + public void readFields(final DataInput in) throws IOException { + final LongWritable version = new LongWritable(); + version.readFields(in); + if (version.get() != serialVersionUID) { + throw new DelegationTokenIOException( + "Incompatible EncryptionSecrets version"); + } + encryptionAlgorithm = Text.readString(in, MAX_SECRET_LENGTH); + encryptionKey = Text.readString(in, MAX_SECRET_LENGTH); + init(); + } + + /** + * For java serialization: read and then call {@link #init()}. + * @param in input + * @throws IOException IO problem + * @throws ClassNotFoundException problem loading inner class. + */ + private void readObject(ObjectInputStream in) + throws IOException, ClassNotFoundException { + in.defaultReadObject(); + init(); + } + + /** + * Init all state, including after any read. + * @throws IOException error rebuilding state. + */ + private void init() throws IOException { + encryptionMethod = S3AEncryptionMethods.getMethod( + encryptionAlgorithm); + } + + public String getEncryptionAlgorithm() { + return encryptionAlgorithm; + } + + public String getEncryptionKey() { + return encryptionKey; + } + + /** + * Does this instance have encryption options? + * That is: is the algorithm non-null. + * @return true if there's an encryption algorithm. + */ + public boolean hasEncryptionAlgorithm() { + return StringUtils.isNotEmpty(encryptionAlgorithm); + } + + /** + * Does this instance have an encryption key? 
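A sketch of the wire round trip using the stock Hadoop serialization buffers; a serialVersionUID mismatch would surface in readFields() as a DelegationTokenIOException:

    import java.io.IOException;

    import org.apache.hadoop.io.DataInputBuffer;
    import org.apache.hadoop.io.DataOutputBuffer;

    static EncryptionSecrets roundTrip(EncryptionSecrets original)
        throws IOException {
      DataOutputBuffer out = new DataOutputBuffer();
      original.write(out);

      DataInputBuffer in = new DataInputBuffer();
      in.reset(out.getData(), out.getLength());
      EncryptionSecrets copy = new EncryptionSecrets();
      copy.readFields(in);  // also re-runs init(), rebuilding the enum field
      return copy;
    }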
+ * @return true if there's an encryption key. + */ + public boolean hasEncryptionKey() { + return StringUtils.isNotEmpty(encryptionKey); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final EncryptionSecrets that = (EncryptionSecrets) o; + return Objects.equals(encryptionAlgorithm, that.encryptionAlgorithm) + && Objects.equals(encryptionKey, that.encryptionKey); + } + + @Override + public int hashCode() { + return Objects.hash(encryptionAlgorithm, encryptionKey); + } + + /** + * Get the encryption method. + * @return the encryption method + */ + public S3AEncryptionMethods getEncryptionMethod() { + return encryptionMethod; + } + + /** + * String function returns the encryption mode but not any other + * secrets. + * @return a string safe for logging. + */ + @Override + public String toString() { + return S3AEncryptionMethods.NONE.equals(encryptionMethod) + ? "(no encryption)" + : encryptionMethod.getMethod(); + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/FullCredentialsTokenBinding.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/FullCredentialsTokenBinding.java new file mode 100644 index 0000000000000..138667b07d299 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/FullCredentialsTokenBinding.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.IOException; +import java.net.URI; +import java.util.Optional; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.AWSCredentialProviderList; +import org.apache.hadoop.fs.s3a.S3AUtils; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentialProvider; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials; +import org.apache.hadoop.fs.s3a.auth.RoleModel; +import org.apache.hadoop.fs.s3native.S3xLoginHelper; + +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.FULL_TOKEN_KIND; + +/** + * Full credentials: they are simply passed as-is, rather than + * converted to a session. + * These aren't as secure; this class exists to (a) support deployments + * where there is not STS service and (b) validate the design of + * S3A DT support to support different managers. + */ +public class FullCredentialsTokenBinding extends + AbstractDelegationTokenBinding { + + /** + * Wire name of this binding includes a version marker: {@value}. 
+ */ + private static final String NAME = "FullCredentials/001"; + + public static final String FULL_TOKEN = "Full Delegation Token"; + + /** + * Long-lived AWS credentials. + */ + private MarshalledCredentials awsCredentials; + + /** + * Origin of credentials. + */ + private String credentialOrigin; + + /** + * Constructor, uses name of {@link #name} and token kind of + * {@link DelegationConstants#FULL_TOKEN_KIND}. + * + */ + public FullCredentialsTokenBinding() { + super(NAME, FULL_TOKEN_KIND); + } + + @Override + protected void serviceStart() throws Exception { + super.serviceStart(); + loadAWSCredentials(); + } + + /** + * Load the AWS credentials. + * @throws IOException failure + */ + private void loadAWSCredentials() throws IOException { + credentialOrigin = AbstractS3ATokenIdentifier.createDefaultOriginMessage(); + Configuration conf = getConfig(); + URI uri = getCanonicalUri(); + // look for access keys to FS + S3xLoginHelper.Login secrets = S3AUtils.getAWSAccessKeys(uri, conf); + if (secrets.hasLogin()) { + awsCredentials = new MarshalledCredentials( + secrets.getUser(), secrets.getPassword(), ""); + credentialOrigin += "; source = Hadoop configuration data"; + } else { + // if there are none, look for the environment variables. + awsCredentials = MarshalledCredentialBinding.fromEnvironment( + System.getenv()); + if (awsCredentials.isValid( + MarshalledCredentials.CredentialTypeRequired.AnyNonEmpty)) { + // valid tokens, so mark as origin + credentialOrigin += "; source = Environment variables"; + } else { + credentialOrigin = "no credentials in configuration or" + + " environment variables"; + } + } + awsCredentials.validate(credentialOrigin +": ", + MarshalledCredentials.CredentialTypeRequired.AnyNonEmpty); + } + + /** + * Serve up the credentials retrieved from configuration/environment in + * {@link #loadAWSCredentials()}. + * @return a credential provider for the unbonded instance. + * @throws IOException failure to load + */ + @Override + public AWSCredentialProviderList deployUnbonded() throws IOException { + requireServiceStarted(); + return new AWSCredentialProviderList( + "Full Credentials Token Binding", + new MarshalledCredentialProvider( + FULL_TOKEN, + getFileSystem().getUri(), + getConfig(), + awsCredentials, + MarshalledCredentials.CredentialTypeRequired.AnyNonEmpty)); + } + + /** + * Create a new delegation token. + * + * It's slightly inefficient to create a new one every time, but + * it avoids concurrency problems with managing any singleton. + * @param policy minimum policy to use, if known. + * @param encryptionSecrets encryption secrets. 
+ * @return a DT identifier + * @throws IOException failure + */ + @Override + public AbstractS3ATokenIdentifier createTokenIdentifier( + final Optional policy, + final EncryptionSecrets encryptionSecrets) throws IOException { + requireServiceStarted(); + + return new FullCredentialsTokenIdentifier(getCanonicalUri(), + getOwnerText(), + awsCredentials, + encryptionSecrets, + credentialOrigin); + } + + @Override + public AWSCredentialProviderList bindToTokenIdentifier( + final AbstractS3ATokenIdentifier retrievedIdentifier) + throws IOException { + FullCredentialsTokenIdentifier tokenIdentifier = + convertTokenIdentifier(retrievedIdentifier, + FullCredentialsTokenIdentifier.class); + return new AWSCredentialProviderList( + "", new MarshalledCredentialProvider( + FULL_TOKEN, + getFileSystem().getUri(), + getConfig(), + tokenIdentifier.getMarshalledCredentials(), + MarshalledCredentials.CredentialTypeRequired.AnyNonEmpty)); + } + + @Override + public AbstractS3ATokenIdentifier createEmptyIdentifier() { + return new FullCredentialsTokenIdentifier(); + } + +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/FullCredentialsTokenIdentifier.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/FullCredentialsTokenIdentifier.java new file mode 100644 index 0000000000000..95e4a28970caa --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/FullCredentialsTokenIdentifier.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.net.URI; + +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials; +import org.apache.hadoop.io.Text; + +/** + * The full credentials payload is the same of that for a session token, but + * a different token kind is used. + * + * Token kind is {@link DelegationConstants#FULL_TOKEN_KIND}. 
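For context, a sketch of how a job-submission client would collect whichever S3A token the configured binding issues, through the standard FileSystem API; the renewer name and bucket are illustrative:

    import java.io.IOException;
    import java.net.URI;
    import java.net.URISyntaxException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.security.Credentials;
    import org.apache.hadoop.security.token.Token;

    static Credentials collectTokens(Configuration conf)
        throws IOException, URISyntaxException {
      FileSystem fs = FileSystem.get(new URI("s3a://example-bucket/"), conf);
      Credentials credentials = new Credentials();
      Token<?>[] issued = fs.addDelegationTokens("yarn", credentials);
      // issued[i].getKind() is one of the session/full/role kinds defined above
      return credentials;
    }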
+ */ +public class FullCredentialsTokenIdentifier extends SessionTokenIdentifier { + + public FullCredentialsTokenIdentifier() { + super(DelegationConstants.FULL_TOKEN_KIND); + } + + public FullCredentialsTokenIdentifier(final URI uri, + final Text owner, + final MarshalledCredentials marshalledCredentials, + final EncryptionSecrets encryptionSecrets, + String origin) { + super(DelegationConstants.FULL_TOKEN_KIND, + owner, + uri, + marshalledCredentials, + encryptionSecrets, + origin); + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/RoleTokenBinding.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/RoleTokenBinding.java new file mode 100644 index 0000000000000..f436671a8f3f0 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/RoleTokenBinding.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.IOException; +import java.util.Optional; +import java.util.UUID; +import java.util.concurrent.TimeUnit; + +import com.amazonaws.services.securitytoken.model.Credentials; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.AWSCredentialProviderList; +import org.apache.hadoop.fs.s3a.Retries; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentialProvider; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials; +import org.apache.hadoop.fs.s3a.auth.RoleModel; +import org.apache.hadoop.fs.s3a.auth.STSClientFactory; + +import static org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding.fromSTSCredentials; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_CREDENTIALS_PROVIDER; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_ROLE_ARN; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.E_NO_SESSION_TOKENS_FOR_ROLE_BINDING; + +/** + * Role Token support requests an explicit role and automatically restricts + * that role to the given policy of the binding. + * The session is locked down as much as possible. + */ +public class RoleTokenBinding extends SessionTokenBinding { + + private static final Logger LOG = LoggerFactory.getLogger( + RoleTokenBinding.class); + + private static final RoleModel MODEL = new RoleModel(); + + /** + * Wire name of this binding includes a version marker: {@value}. + */ + private static final String NAME = "RoleCredentials/001"; + + /** + * Error message when there is no Role ARN. 
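A hedged configuration sketch for the role binding described above. The role ARN and class name are placeholders; the two options shown, `DELEGATION_TOKEN_BINDING` and `DELEGATION_TOKEN_ROLE_ARN`, are the ones this binding reads during `serviceInit()`.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants;

public class RoleTokenConfigExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Select the role binding and name the role to assume; without an ARN,
    // createTokenIdentifier() fails fast with E_NO_ARN.
    conf.set(DelegationConstants.DELEGATION_TOKEN_BINDING,
        "org.apache.hadoop.fs.s3a.auth.delegation.RoleTokenBinding");
    conf.set(DelegationConstants.DELEGATION_TOKEN_ROLE_ARN,
        "arn:aws:iam::123456789012:role/example-s3-role");
    // Token issue and collection then follow the same path as in the
    // earlier full-credentials sketch.
  }
}
```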
+ */ + @VisibleForTesting + public static final String E_NO_ARN = + "No role ARN defined in " + DELEGATION_TOKEN_ROLE_ARN; + + public static final String COMPONENT = "Role Delegation Token"; + + /** + * Role ARN to use when requesting new tokens. + */ + private String roleArn; + + /** + * Constructor. + * Name is {@link #name}; token kind is + * {@link DelegationConstants#ROLE_TOKEN_KIND}. + */ + public RoleTokenBinding() { + super(NAME, DelegationConstants.ROLE_TOKEN_KIND); + } + + @Override + protected void serviceInit(final Configuration conf) throws Exception { + super.serviceInit(conf); + roleArn = getConfig().getTrimmed(DELEGATION_TOKEN_ROLE_ARN, ""); + } + + /** + * Returns a (wrapped) {@link MarshalledCredentialProvider} which + * requires the marshalled credentials to contain session secrets. + * @param retrievedIdentifier the incoming identifier. + * @return the provider chain. + * @throws IOException on failure + */ + @Override + public AWSCredentialProviderList bindToTokenIdentifier( + final AbstractS3ATokenIdentifier retrievedIdentifier) + throws IOException { + RoleTokenIdentifier tokenIdentifier = + convertTokenIdentifier(retrievedIdentifier, + RoleTokenIdentifier.class); + setTokenIdentifier(Optional.of(tokenIdentifier)); + MarshalledCredentials marshalledCredentials + = tokenIdentifier.getMarshalledCredentials(); + setExpirationDateTime(marshalledCredentials.getExpirationDateTime()); + return new AWSCredentialProviderList( + "Role Token Binding", + new MarshalledCredentialProvider( + COMPONENT, getFileSystem().getUri(), + getConfig(), + marshalledCredentials, + MarshalledCredentials.CredentialTypeRequired.SessionOnly)); + } + + /** + * Create the Token Identifier. + * Looks for the option {@link DelegationConstants#DELEGATION_TOKEN_ROLE_ARN} + * in the config and fail if it is not set. + * @param policy the policy which will be used for the requested token. + * @param encryptionSecrets encryption secrets. + * @return the token. + * @throws IllegalArgumentException if there is no role defined. + * @throws IOException any problem acquiring the role. + */ + @Override + @Retries.RetryTranslated + public RoleTokenIdentifier createTokenIdentifier( + final Optional policy, + final EncryptionSecrets encryptionSecrets) throws IOException { + requireServiceStarted(); + Preconditions.checkState(!roleArn.isEmpty(), E_NO_ARN); + String policyJson = policy.isPresent() ? + MODEL.toJson(policy.get()) : ""; + final STSClientFactory.STSClient client = prepareSTSClient() + .orElseThrow(() -> { + // we've come in on a parent binding, so fail fast + LOG.error("Cannot issue delegation tokens because the credential" + + " providers listed in " + DELEGATION_TOKEN_CREDENTIALS_PROVIDER + + " are returning session tokens"); + return new DelegationTokenIOException( + E_NO_SESSION_TOKENS_FOR_ROLE_BINDING); + }); + Credentials credentials = client + .requestRole(roleArn, + UUID.randomUUID().toString(), + policyJson, + getDuration(), + TimeUnit.SECONDS); + return new RoleTokenIdentifier( + getCanonicalUri(), + getOwnerText(), + fromSTSCredentials(credentials), + encryptionSecrets, + AbstractS3ATokenIdentifier.createDefaultOriginMessage() + + " Role ARN=" + roleArn); + } + + @Override + public RoleTokenIdentifier createEmptyIdentifier() { + return new RoleTokenIdentifier(); + } + + @Override + public String getDescription() { + return super.getDescription() + " Role ARN=" + + (roleArn.isEmpty() ? 
"(none)" : ('"' + roleArn +'"')); + } + + @Override + protected String bindingName() { + return "Role"; + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSECBlockOutputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/RoleTokenIdentifier.java similarity index 52% rename from hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSECBlockOutputStream.java rename to hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/RoleTokenIdentifier.java index 8991badd83666..342db0e9ddf68 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSECBlockOutputStream.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/RoleTokenIdentifier.java @@ -16,30 +16,34 @@ * limitations under the License. */ -package org.apache.hadoop.fs.s3a; +package org.apache.hadoop.fs.s3a.auth.delegation; -import org.apache.hadoop.conf.Configuration; +import java.net.URI; + +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials; +import org.apache.hadoop.io.Text; /** - * Run the encryption tests against the Fast output stream. - * This verifies that both file writing paths can encrypt their data. + * Role token identifier. + * Token kind is {@link DelegationConstants#ROLE_TOKEN_KIND} */ +public class RoleTokenIdentifier extends SessionTokenIdentifier { -public class ITestS3AEncryptionSSECBlockOutputStream - extends AbstractTestS3AEncryption { - - @Override - protected Configuration createConfiguration() { - Configuration conf = super.createConfiguration(); - conf.set(Constants.FAST_UPLOAD_BUFFER, - Constants.FAST_UPLOAD_BYTEBUFFER); - conf.set(Constants.SERVER_SIDE_ENCRYPTION_KEY, - "4niV/jPK5VFRHY+KNb6wtqYd4xXyMgdJ9XQJpcQUVbs="); - return conf; + public RoleTokenIdentifier() { + super(DelegationConstants.ROLE_TOKEN_KIND); } - @Override - protected S3AEncryptionMethods getSSEAlgorithm() { - return S3AEncryptionMethods.SSE_C; + public RoleTokenIdentifier(final URI uri, + final Text owner, + final MarshalledCredentials marshalledCredentials, + final EncryptionSecrets encryptionSecrets, + final String origin) { + super(DelegationConstants.ROLE_TOKEN_KIND, + owner, + uri, + marshalledCredentials, + encryptionSecrets, + origin); } + } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/S3ADelegationTokens.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/S3ADelegationTokens.java new file mode 100644 index 0000000000000..b8eeca135079f --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/S3ADelegationTokens.java @@ -0,0 +1,685 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.IOException; +import java.net.URI; +import java.util.EnumSet; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicInteger; + +import com.google.common.annotations.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.AWSCredentialProviderList; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.S3AInstrumentation; +import org.apache.hadoop.fs.s3a.auth.RoleModel; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.service.ServiceOperations; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DEFAULT_DELEGATION_TOKEN_BINDING; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_BINDING; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DURATION_LOG_AT_INFO; + +/** + * Support for creating a DT from a filesystem. + * + * Isolated from S3A for control and testability. + * + * The S3A Delegation Tokens are special in that the tokens are not directly + * used to authenticate with the AWS services. + * Instead they can session/role credentials requested off AWS on demand. + * + * The design is extensible in that different back-end bindings can be used + * to switch to different session creation mechanisms, or indeed, to any + * other authentication mechanism supported by an S3 service, provided it + * ultimately accepts some form of AWS credentials for authentication through + * the AWS SDK. That is, if someone wants to wire this up to Kerberos, or + * OAuth2, this design should support them. + * + * URIs processed must be the canonical URIs for the service. + */ +@InterfaceAudience.Private +public class S3ADelegationTokens extends AbstractDTService { + + private static final Logger LOG = LoggerFactory.getLogger( + S3ADelegationTokens.class); + + @VisibleForTesting + static final String E_ALREADY_DEPLOYED + = "S3A Delegation tokens has already been bound/deployed"; + + public static final String E_DELEGATION_TOKENS_DISABLED + = "Delegation tokens are not enabled"; + + /** + * User who owns this FS; fixed at instantiation time, so that + * in calls to getDelegationToken() and similar, this user is the one whose + * credentials are involved. + */ + private final UserGroupInformation user; + + /** + * Count of number of created tokens. + * For testing and diagnostics. + */ + private final AtomicInteger creationCount = new AtomicInteger(0); + + /** + * Text value of this token service. + */ + private Text service; + + /** + * Active Delegation token. + */ + private Optional> boundDT + = Optional.empty(); + + /** + * The DT decoded when this instance is created by bonding + * to an existing DT. 
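To illustrate the intended use of `S3ADelegationTokens` from the filesystem side, here is a hedged sketch of the standard job-submission flow. The bucket and renewer names are assumptions, and it presumes a delegation token binding has been configured for the bucket.

```java
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.token.Token;

public class CollectS3ATokens {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Credentials creds = new Credentials();
    try (FileSystem fs =
             FileSystem.newInstance(new URI("s3a://example-bucket/"), conf)) {
      // Standard Hadoop job-submission path: the filesystem adds its token(s)
      // to the credential set, which then travels with the launched job.
      Token<?>[] issued = fs.addDelegationTokens("yarn", creds);
      System.out.println("tokens issued: " + issued.length);
    }
  }
}
```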
+ */ + private Optional decodedIdentifier + = Optional.empty(); + + /** + * Dynamically loaded token binding; lifecycle matches this object. + */ + private AbstractDelegationTokenBinding tokenBinding; + + /** + * List of cred providers; unset until {@link #bindToDelegationToken(Token)}. + */ + private Optional credentialProviders + = Optional.empty(); + + /** + * The access policies we want for operations. + * There's no attempt to ask for "admin" permissions here, e.g. + * those to manipulate S3Guard tables. + */ + protected static final EnumSet ACCESS_POLICY + = EnumSet.of( + AWSPolicyProvider.AccessLevel.READ, + AWSPolicyProvider.AccessLevel.WRITE); + + /** + * Statistics for the owner FS. + */ + private S3AInstrumentation.DelegationTokenStatistics stats; + + /** + * Name of the token binding as extracted from token kind; used for + * logging. + */ + private String tokenBindingName = ""; + + /** + * Instantiate. + */ + public S3ADelegationTokens() throws IOException { + super("S3ADelegationTokens"); + user = UserGroupInformation.getCurrentUser(); + } + + @Override + public void bindToFileSystem(final URI uri, final S3AFileSystem fs) + throws IOException { + super.bindToFileSystem(uri, fs); + service = getTokenService(getCanonicalUri()); + stats = fs.getInstrumentation().newDelegationTokenStatistics(); + } + + /** + * Init the service. + * This identifies the token binding class to use and creates, initializes + * and starts it. + * Will raise an exception if delegation tokens are not enabled. + * @param conf configuration + * @throws Exception any failure to start up + */ + @Override + protected void serviceInit(final Configuration conf) throws Exception { + super.serviceInit(conf); + checkState(hasDelegationTokenBinding(conf), + E_DELEGATION_TOKENS_DISABLED); + Class binding = conf.getClass( + DelegationConstants.DELEGATION_TOKEN_BINDING, + SessionTokenBinding.class, + AbstractDelegationTokenBinding.class); + tokenBinding = binding.newInstance(); + tokenBinding.bindToFileSystem(getCanonicalUri(), getFileSystem()); + tokenBinding.init(conf); + tokenBindingName = tokenBinding.getKind().toString(); + LOG.info("Filesystem {} is using delegation tokens of kind {}", + getCanonicalUri(), tokenBindingName); + } + + /** + * Service startup includes binding to any delegation token, and + * deploying unbounded if there is none. + * It is after this that token operations can be used. + * @throws Exception any failure + */ + @Override + protected void serviceStart() throws Exception { + super.serviceStart(); + tokenBinding.start(); + bindToAnyDelegationToken(); + LOG.info("S3A Delegation support token {} with {}", + identifierToString(), + tokenBinding.getDescription()); + } + + /** + * Get the identifier as a string, or "(none)". + * @return a string value for logs etc. + */ + private String identifierToString() { + return decodedIdentifier.map(Objects::toString) + .orElse("(none)"); + } + + /** + * Stop the token binding. + * @throws Exception on any failure + */ + @SuppressWarnings("ThrowableNotThrown") + @Override + protected void serviceStop() throws Exception { + LOG.debug("Stopping delegation tokens"); + try { + super.serviceStop(); + } finally { + ServiceOperations.stopQuietly(LOG, tokenBinding); + } + } + + + /** + * Perform the unbonded deployment operations. + * Create the AWS credential provider chain to use + * when talking to AWS when there is no delegation token to work with. 
+ * authenticating this client with AWS services, and saves it + * to {@link #credentialProviders} + * + * @throws IOException any failure. + */ + private void deployUnbonded() + throws IOException { + requireServiceStarted(); + checkState(!isBoundToDT(), + "Already Bound to a delegation token"); + LOG.info("No delegation tokens present: using direct authentication"); + credentialProviders = Optional.of(tokenBinding.deployUnbonded()); + } + + /** + * Attempt to bind to any existing DT, including unmarshalling its contents + * and creating the AWS credential provider used to authenticate + * the client. + * + * If successful: + *
+ * <ol>
+ *   <li>{@link #boundDT} is set to the retrieved token.</li>
+ *   <li>{@link #decodedIdentifier} is set to the extracted identifier.</li>
+ *   <li>{@link #credentialProviders} is set to the credential
+ *   provider(s) returned by the token binding.</li>
+ * </ol>
      + * If unsuccessful, {@link #deployUnbonded()} is called for the + * unbonded codepath instead, which will set + * {@link #credentialProviders} to its value. + * + * This means after this call (and only after) the token operations + * can be invoked. + * + * This method is called from {@link #serviceStart()}, so a check on + * the service state can be used to check things; the state model + * prevents re-entrant calls. + * @throws IOException selection/extraction/validation failure. + */ + private void bindToAnyDelegationToken() throws IOException { + checkState(!credentialProviders.isPresent(), E_ALREADY_DEPLOYED); + Token token = selectTokenFromFSOwner(); + if (token != null) { + bindToDelegationToken(token); + } else { + deployUnbonded(); + } + if (credentialProviders.get().size() == 0) { + throw new DelegationTokenIOException("No AWS credential providers" + + " created by Delegation Token Binding " + + tokenBinding.getName()); + } + } + + /** + * This is a test-only back door which resets the state and binds to + * a token again. + * This allows an instance of this class to be bonded to a DT after being + * started, so avoids the need to have the token in the current user + * credentials. It is package scoped so as to only be usable in tests + * in the same package. + * + * Yes, this is ugly, but there is no obvious/easy way to test token + * binding without Kerberos getting involved. + * @param token token to decode and bind to. + * @throws IOException selection/extraction/validation failure. + */ + @VisibleForTesting + void resetTokenBindingToDT(final Token token) + throws IOException{ + credentialProviders = Optional.empty(); + bindToDelegationToken(token); + } + + /** + * Bind to a delegation token retrieved for this filesystem. + * Extract the secrets from the token and set internal fields + * to the values. + *
+ * <ol>
+ *   <li>{@link #boundDT} is set to {@code token}.</li>
+ *   <li>{@link #decodedIdentifier} is set to the extracted identifier.</li>
+ *   <li>{@link #credentialProviders} is set to the credential
+ *   provider(s) returned by the token binding.</li>
+ * </ol>
      + * @param token token to decode and bind to. + * @throws IOException selection/extraction/validation failure. + */ + @VisibleForTesting + public void bindToDelegationToken( + final Token token) + throws IOException { + checkState(!credentialProviders.isPresent(), E_ALREADY_DEPLOYED); + boundDT = Optional.of(token); + AbstractS3ATokenIdentifier dti = extractIdentifier(token); + LOG.info("Using delegation token {}", dti); + decodedIdentifier = Optional.of(dti); + try (DurationInfo ignored = new DurationInfo(LOG, DURATION_LOG_AT_INFO, + "Creating Delegation Token")) { + // extract the credential providers. + credentialProviders = Optional.of( + tokenBinding.bindToTokenIdentifier(dti)); + } + } + + /** + * Predicate: is there a bound DT? + * @return true if there's a value in {@link #boundDT}. + */ + public boolean isBoundToDT() { + return boundDT.isPresent(); + } + + /** + * Get any bound DT. + * @return a delegation token if this instance was bound to it. + */ + public Optional> getBoundDT() { + return boundDT; + } + + /** + * Predicate: will this binding issue a DT if requested + * in a call to {@link #getBoundOrNewDT(EncryptionSecrets)}? + * That is: should the filesystem declare that it is issuing + * delegation tokens? + * @return a declaration of what will happen when asked for a token. + */ + public TokenIssuingPolicy getTokenIssuingPolicy() { + return isBoundToDT() + ? TokenIssuingPolicy.ReturnExistingToken + : tokenBinding.getTokenIssuingPolicy(); + } + + /** + * Get any bound DT or create a new one. + * @return a delegation token. + * @throws IOException if one cannot be created + * @param encryptionSecrets encryption secrets for any new token. + */ + @SuppressWarnings("OptionalGetWithoutIsPresent") + public Token getBoundOrNewDT( + final EncryptionSecrets encryptionSecrets) + throws IOException { + LOG.debug("Delegation token requested"); + if (isBoundToDT()) { + // the FS was created on startup with a token, so return it. + LOG.debug("Returning current token"); + return getBoundDT().get(); + } else { + // not bound to a token, so create a new one. + // issued DTs are not cached so that long-lived filesystems can + // reliably issue session/role tokens. + return createDelegationToken(encryptionSecrets); + } + } + + /** + * How many delegation tokens have been issued? + * @return the number times {@link #createDelegationToken(EncryptionSecrets)} + * returned a token. + */ + public int getCreationCount() { + return creationCount.get(); + } + + /** + * Create a delegation token for the user. + * This will only be called if a new DT is needed, that is: the + * filesystem has been deployed unbonded. + * @param encryptionSecrets encryption secrets for the token. + * @return the token + * @throws IOException if one cannot be created + */ + @VisibleForTesting + public Token createDelegationToken( + final EncryptionSecrets encryptionSecrets) throws IOException { + requireServiceStarted(); + checkArgument(encryptionSecrets != null, + "Null encryption secrets"); + // this isn't done in in advance as it needs S3Guard initialized in the + // filesystem before it can generate complete policies. + List statements = getFileSystem() + .listAWSPolicyRules(ACCESS_POLICY); + Optional rolePolicy = + statements.isEmpty() ? 
+ Optional.empty() : Optional.of(new RoleModel.Policy(statements)); + + try(DurationInfo ignored = new DurationInfo(LOG, DURATION_LOG_AT_INFO, + "Creating New Delegation Token", tokenBinding.getKind())) { + Token token + = tokenBinding.createDelegationToken(rolePolicy, encryptionSecrets); + if (token != null) { + token.setService(service); + noteTokenCreated(token); + } + return token; + } + } + + /** + * Note that a token has been created; increment counters and statistics. + * @param token token created + */ + private void noteTokenCreated(final Token token) { + LOG.info("Created S3A Delegation Token: {}", token); + creationCount.incrementAndGet(); + stats.tokenIssued(); + } + + /** + * Get the AWS credential provider. + * @return the DT credential provider + * @throws IOException failure to parse the DT + * @throws IllegalStateException if this instance is not bound to a DT + */ + public AWSCredentialProviderList getCredentialProviders() + throws IOException { + return credentialProviders.orElseThrow( + () -> new DelegationTokenIOException("Not yet bonded")); + } + + /** + * Get the encryption secrets of the DT. + * non-empty iff service is started and was bound to a DT. + * @return any encryption settings propagated with the DT. + */ + public Optional getEncryptionSecrets() { + return decodedIdentifier.map( + AbstractS3ATokenIdentifier::getEncryptionSecrets); + } + + /** + * Get any decoded identifier from the bound DT; empty if not bound. + * @return the decoded identifier. + */ + public Optional getDecodedIdentifier() { + return decodedIdentifier; + } + + /** + * Get the service identifier of the owning FS. + * @return a service identifier to use when registering tokens + */ + public Text getService() { + return service; + } + + /** + * The canonical name of the service. + * This can be used as the canonical service name for the FS. + * @return the canonicalized FS URI. + */ + public String getCanonicalServiceName() { + return getCanonicalUri().toString(); + } + + /** + * Find a token for the FS user and canonical filesystem URI. + * @return the token, or null if one cannot be found. + * @throws IOException on a failure to unmarshall the token. + */ + @VisibleForTesting + public Token selectTokenFromFSOwner() + throws IOException { + return lookupToken(user.getCredentials(), + service, + tokenBinding.getKind()); + } + + /** + * Get the service identifier of a filesystem. + * This must be unique for (S3A, the FS URI) + * @param fsURI filesystem URI + * @return identifier to use. + */ + private static Text getTokenService(final URI fsURI) { + return getTokenService(fsURI.toString()); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder( + "S3ADelegationTokens{"); + sb.append("canonicalServiceURI=").append(getCanonicalUri()); + sb.append("; owner=").append(user.getShortUserName()); + sb.append("; isBoundToDT=").append(isBoundToDT()); + sb.append("; token creation count=").append(getCreationCount()); + sb.append("; tokenManager=").append(tokenBinding); + sb.append("; token=").append(identifierToString()); + sb.append('}'); + return sb.toString(); + } + + /** + * Get the kind of the issued tokens. + * @return token kind. + */ + public Text getTokenKind() { + return tokenBinding.getKind(); + } + + /** + * Get the service identifier of a filesystem URI. + * This must be unique for (S3a, the FS URI) + * @param fsURI filesystem URI as a string + * @return identifier to use. 
+ */ + @VisibleForTesting + static Text getTokenService(final String fsURI) { + return new Text(fsURI); + } + + /** + * From a token, get the session token identifier. + * @param token token to process + * @return the session token identifier + * @throws IOException failure to validate/read data encoded in identifier. + * @throws IllegalArgumentException if the token isn't an S3A session token + */ + public AbstractS3ATokenIdentifier extractIdentifier( + final Token token) + throws IOException { + + checkArgument(token != null, "null token"); + AbstractS3ATokenIdentifier identifier; + // harden up decode beyond that Token does itself + try { + identifier = token.decodeIdentifier(); + } catch (RuntimeException e) { + Throwable cause = e.getCause(); + if (cause != null) { + // its a wrapping around class instantiation. + throw new DelegationTokenIOException("Decoding S3A token " + cause, + cause); + } else { + throw e; + } + } + if (identifier == null) { + throw new DelegationTokenIOException("Failed to unmarshall token for " + + getCanonicalUri()); + } + identifier.validate(); + return identifier; + } + + /** + * Return a string for use in building up the User-Agent field, so + * get into the S3 access logs. Useful for diagnostics. + * Delegates to {{@link AbstractDelegationTokenBinding#getUserAgentField()}} + * for the current binding. + * @return a string for the S3 logs or "" for "nothing to add" + */ + public String getUserAgentField() { + return tokenBinding.getUserAgentField(); + } + + /** + * Look up a token from the credentials, verify it is of the correct + * kind. + * @param credentials credentials to look up. + * @param service service name + * @param kind token kind to look for + * @return the token or null if no suitable token was found + * @throws DelegationTokenIOException wrong token kind found + */ + @VisibleForTesting + public static Token lookupToken( + final Credentials credentials, + final Text service, + final Text kind) + throws DelegationTokenIOException { + + LOG.debug("Looking for token for service {} in credentials", service); + Token token = credentials.getToken(service); + if (token != null) { + Text tokenKind = token.getKind(); + LOG.debug("Found token of kind {}", tokenKind); + if (kind.equals(tokenKind)) { + // the Oauth implementation catches and logs here; this one + // throws the failure up. + return (Token) token; + } else { + + // there's a token for this URI, but its not the right DT kind + throw new DelegationTokenIOException( + DelegationTokenIOException.TOKEN_MISMATCH + ": expected token" + + " for " + service + + " of type " + kind + + " but got a token of type " + tokenKind); + } + } + // A token for the service was not found + LOG.debug("No token for {} found", service); + return null; + } + + /** + * Look up any token from the service; cast it to one of ours. + * @param credentials credentials + * @param service service to look up + * @return any token found or null if none was + * @throws ClassCastException if the token is of a wrong type. + */ + public static Token lookupToken( + final Credentials credentials, + final Text service) { + return (Token) credentials.getToken(service); + } + + /** + * Look for any S3A token for the given FS service. + * @param credentials credentials to scan. 
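A small sketch of the lookup helpers above, as a launched process might use them to find a previously issued token in its credentials. The bucket/service text and the choice of `SESSION_TOKEN_KIND` are illustrative assumptions; a kind mismatch surfaces as a `DelegationTokenIOException`.

```java
import java.io.IOException;

import org.apache.hadoop.fs.s3a.auth.delegation.AbstractS3ATokenIdentifier;
import org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants;
import org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;

public class TokenLookupExample {
  public static void main(String[] args) throws IOException {
    Credentials creds = UserGroupInformation.getCurrentUser().getCredentials();
    // The service text is the canonical filesystem URI (placeholder here).
    Text service = new Text("s3a://example-bucket");
    Token<AbstractS3ATokenIdentifier> token =
        S3ADelegationTokens.lookupToken(creds, service,
            DelegationConstants.SESSION_TOKEN_KIND);
    System.out.println(token == null
        ? "no session token for " + service
        : "found token of kind " + token.getKind());
  }
}
```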
+ * @param uri the URI of the FS to look for + * @return the token or null if none was found + */ + public static Token lookupS3ADelegationToken( + final Credentials credentials, + final URI uri) { + return lookupToken(credentials, getTokenService(uri.toString())); + } + + /** + * Predicate: does this configuration enable delegation tokens? + * That is: is there any text in the option + * {@link DelegationConstants#DELEGATION_TOKEN_BINDING} ? + * @param conf configuration to examine + * @return true iff the trimmed configuration option is not empty. + */ + public static boolean hasDelegationTokenBinding(Configuration conf) { + return StringUtils.isNotEmpty( + conf.getTrimmed(DELEGATION_TOKEN_BINDING, + DEFAULT_DELEGATION_TOKEN_BINDING)); + } + + /** + * How will tokens be issued on request? + * + * The {@link #RequestNewToken} policy does not guarantee that a tokens + * can be created, only that an attempt will be made to request one. + * It may fail (wrong credential types, wrong role, etc). + */ + public enum TokenIssuingPolicy { + + /** The existing token will be returned. */ + ReturnExistingToken, + + /** No tokens will be issued. */ + NoTokensAvailable, + + /** An attempt will be made to request a new DT. */ + RequestNewToken + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/S3ADtFetcher.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/S3ADtFetcher.java new file mode 100644 index 0000000000000..8ac07a216d310 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/S3ADtFetcher.java @@ -0,0 +1,80 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.net.URI; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.s3a.Constants; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.DtFetcher; +import org.apache.hadoop.security.token.Token; + +/** + * A DT fetcher for S3A. + * This is a copy-and-paste of + * {@code org.apache.hadoop.hdfs.HdfsDtFetcher}. + * + * It is only needed for the `hadoop dtutil` command. + */ +public class S3ADtFetcher implements DtFetcher { + + private static final String SERVICE_NAME = Constants.FS_S3A; + + private static final String FETCH_FAILED = + "Filesystem not generating Delegation Tokens"; + + /** + * Returns the service name for HDFS, which is also a valid URL prefix. 
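A short sketch of probing whether a configuration enables delegation tokens at all, using the `hasDelegationTokenBinding()` predicate defined above; the wrapper class name is invented.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens;

public class CheckDelegationEnabled {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // True only when the binding option (DELEGATION_TOKEN_BINDING) is set to
    // a non-empty class name; otherwise the filesystem will not issue tokens.
    if (S3ADelegationTokens.hasDelegationTokenBinding(conf)) {
      System.out.println("S3A delegation tokens are enabled");
    } else {
      System.out.println("S3A delegation tokens are disabled");
    }
  }
}
```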
+ */ + public Text getServiceName() { + return new Text(SERVICE_NAME); + } + + public boolean isTokenRequired() { + return UserGroupInformation.isSecurityEnabled(); + } + + /** + * Returns Token object via FileSystem, null if bad argument. + * @param conf - a Configuration object used with FileSystem.get() + * @param creds - a Credentials object to which token(s) will be added + * @param renewer - the renewer to send with the token request + * @param url - the URL to which the request is sent + * @return a Token, or null if fetch fails. + */ + public Token addDelegationTokens(Configuration conf, + Credentials creds, + String renewer, + String url) throws Exception { + if (!url.startsWith(getServiceName().toString())) { + url = getServiceName().toString() + "://" + url; + } + FileSystem fs = FileSystem.get(URI.create(url), conf); + Token token = fs.getDelegationToken(renewer); + if (token == null) { + throw new DelegationTokenIOException(FETCH_FAILED + ": " + url); + } + creds.addToken(token.getService(), token); + return token; + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/SessionTokenBinding.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/SessionTokenBinding.java new file mode 100644 index 0000000000000..67933c7cb8628 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/SessionTokenBinding.java @@ -0,0 +1,421 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.IOException; +import java.net.URI; +import java.time.OffsetDateTime; +import java.util.HashSet; +import java.util.Optional; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +import com.amazonaws.ClientConfiguration; +import com.amazonaws.auth.AWSCredentials; +import com.amazonaws.auth.AWSSessionCredentials; +import com.amazonaws.services.securitytoken.AWSSecurityTokenService; +import com.google.common.annotations.VisibleForTesting; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.AWSCredentialProviderList; +import org.apache.hadoop.fs.s3a.Invoker; +import org.apache.hadoop.fs.s3a.Retries; +import org.apache.hadoop.fs.s3a.S3ARetryPolicy; +import org.apache.hadoop.fs.s3a.S3AUtils; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentialProvider; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials; +import org.apache.hadoop.fs.s3a.auth.RoleModel; +import org.apache.hadoop.fs.s3a.auth.STSClientFactory; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.Text; + +import static org.apache.hadoop.fs.s3a.Constants.AWS_CREDENTIALS_PROVIDER; +import static org.apache.hadoop.fs.s3a.Invoker.once; +import static org.apache.hadoop.fs.s3a.S3AUtils.STANDARD_AWS_PROVIDERS; +import static org.apache.hadoop.fs.s3a.S3AUtils.buildAWSProviderList; +import static org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding.fromAWSCredentials; +import static org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding.fromSTSCredentials; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.*; + +/** + * The session token DT binding: creates an AWS session token + * for the DT, extracts and serves it up afterwards. + */ +public class SessionTokenBinding extends AbstractDelegationTokenBinding { + + private static final Logger LOG = LoggerFactory.getLogger( + SessionTokenBinding.class); + + /** + * Wire name of this binding: {@value}. + */ + private static final String NAME = "SessionTokens/001"; + + /** + * A message added to the standard origin string when the DT is + * built from session credentials passed in. + */ + @VisibleForTesting + public static final String CREDENTIALS_CONVERTED_TO_DELEGATION_TOKEN + = "Existing session credentials converted to Delegation Token"; + + public static final String SESSION_TOKEN + = "Session Delegation Token"; + + /** Invoker for STS calls. */ + private Invoker invoker; + + /** + * Has an attempt to initialize STS been attempted? + */ + private final AtomicBoolean stsInitAttempted = new AtomicBoolean(false); + + /** The STS client; created in startup if the parental credentials permit. */ + @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") + private Optional stsClient = Optional.empty(); + + /** + * Duration of session in seconds. + */ + private long duration; + + /** + * Flag to indicate that the auth chain provides session credentials. + * If true it means that STS cannot be used (and stsClient is null). + */ + private boolean hasSessionCreds; + + /** + * The auth chain for the parent options. + */ + private AWSCredentialProviderList parentAuthChain; + + /** + * Has a log message about forwarding credentials been printed yet? + */ + private final AtomicBoolean forwardMessageLogged = new AtomicBoolean(false); + + /** STS endpoint. */ + private String endpoint; + + /** STS region. 
*/ + private String region; + + /** + * Expiration date time as passed in from source. + * If unset, either we are unbound, or the token which came in does not + * know its expiry. + */ + private Optional expirationDateTime; + + /** + * Token identifier bound to. + */ + private Optional tokenIdentifier = Optional.empty(); + + /** Constructor for reflection. */ + public SessionTokenBinding() { + this(NAME, SESSION_TOKEN_KIND); + } + + /** + * Constructor for subclasses. + * @param name binding name. + * @param kind token kind. + */ + protected SessionTokenBinding(final String name, + final Text kind) { + super(name, kind); + } + + /** + * Service start will read in all configuration options + * then build that client. + */ + @Override + protected void serviceStart() throws Exception { + super.serviceStart(); + Configuration conf = getConfig(); + duration = conf.getTimeDuration(DELEGATION_TOKEN_DURATION, + DEFAULT_DELEGATION_TOKEN_DURATION, + TimeUnit.SECONDS); + endpoint = conf.getTrimmed(DELEGATION_TOKEN_ENDPOINT, + DEFAULT_DELEGATION_TOKEN_ENDPOINT); + region = conf.getTrimmed(DELEGATION_TOKEN_REGION, + DEFAULT_DELEGATION_TOKEN_REGION); + + // create the provider set for session credentials. + parentAuthChain = buildAWSProviderList( + getCanonicalUri(), + conf, + AWS_CREDENTIALS_PROVIDER, + STANDARD_AWS_PROVIDERS, + new HashSet<>()); + } + + @Override + protected void serviceStop() throws Exception { + super.serviceStop(); + // this is here to keep findbugs quiet, even though nothing + // can safely invoke stsClient as we are shut down. + synchronized (this) { + this.stsClient.ifPresent(IOUtils::closeStream); + this.stsClient = Optional.empty(); + } + } + + /** + * Return an unbonded provider chain. + * @return the auth chain built from the assumed role credentials + * @throws IOException any failure. + */ + @Override + public AWSCredentialProviderList deployUnbonded() + throws IOException { + requireServiceStarted(); + return parentAuthChain; + } + + /** + * Get the invoker for STS calls. + * @return the invoker + */ + protected Invoker getInvoker() { + return invoker; + } + + /** + * Sets the field {@link #tokenIdentifier} to the extracted/cast + * session token identifier, and {@link #expirationDateTime} to + * any expiration passed in. + * @param retrievedIdentifier the unmarshalled data + * @return the provider list + * @throws IOException failure + */ + @Override + public AWSCredentialProviderList bindToTokenIdentifier( + final AbstractS3ATokenIdentifier retrievedIdentifier) + throws IOException { + final SessionTokenIdentifier identifier = convertTokenIdentifier( + retrievedIdentifier, + SessionTokenIdentifier.class); + setTokenIdentifier(Optional.of(identifier)); + MarshalledCredentials marshalledCredentials + = identifier.getMarshalledCredentials(); + setExpirationDateTime(marshalledCredentials.getExpirationDateTime()); + return new AWSCredentialProviderList( + "Session Token Binding", + new MarshalledCredentialProvider( + SESSION_TOKEN, + getFileSystem().getUri(), + getConfig(), + marshalledCredentials, + MarshalledCredentials.CredentialTypeRequired.SessionOnly)); + } + + @Override + public String getDescription() { + return String.format( + "%s token binding for user %s, " + + "with STS endpoint \"%s\", region \"%s\"" + + " and token duration %d:%02d", + bindingName(), getOwner().getShortUserName(), endpoint, region, + TimeUnit.SECONDS.toMinutes(duration), + duration % 60); + } + + /** + * Get the role of this token; subclasses should override this + * for better logging. 
+ * @return the role of this token + */ + protected String bindingName() { + return "Session"; + } + + /** + * UA field contains the UUID of the token if present. + * @return a string for the S3 logs. + */ + public String getUserAgentField() { + if (tokenIdentifier.isPresent()) { + return "; session ID " + tokenIdentifier.get().getUuid(); + } else { + return ""; + } + } + + /** + * Attempt to init the STS connection, only does it once. + * If the AWS credential list to this service return session credentials + * then this method will return {@code empty()}; no attempt is + * made to connect to STS. + * Otherwise, the STS binding info will be looked up and an attempt + * made to connect to STS. + * Only one attempt will be made. + * @return any STS client created. + * @throws IOException any failure to bind to STS. + */ + private synchronized Optional maybeInitSTS() + throws IOException { + if (stsInitAttempted.getAndSet(true)) { + // whether or not it succeeded, the state of the STS client is what + // callers get after the first attempt. + return stsClient; + } + + Configuration conf = getConfig(); + URI uri = getCanonicalUri(); + + // Ask the owner for any session credentials which it already has + // so that it can just propagate them. + // this call may fail if there are no credentials on the auth + // chain. + // As no codepath (session propagation, STS creation) will work, + // throw this. + final AWSCredentials parentCredentials = once("get credentials", + "", + () -> parentAuthChain.getCredentials()); + hasSessionCreds = parentCredentials instanceof AWSSessionCredentials; + + if (!hasSessionCreds) { + LOG.info("Creating STS client for {}", getDescription()); + + invoker = new Invoker(new S3ARetryPolicy(conf), LOG_EVENT); + ClientConfiguration awsConf = + S3AUtils.createAwsConf(conf, uri.getHost()); + AWSSecurityTokenService tokenService = + STSClientFactory.builder(parentAuthChain, + awsConf, + endpoint, + region) + .build(); + stsClient = Optional.of( + STSClientFactory.createClientConnection(tokenService, invoker)); + } else { + LOG.debug("Parent-provided session credentials will be propagated"); + stsClient = Optional.empty(); + } + return stsClient; + } + + /** + * Log retries at debug. + */ + public static final Invoker.Retried LOG_EVENT = + (text, exception, retries, idempotent) -> { + LOG.info("{}: " + exception, text); + if (retries == 1) { + // stack on first attempt, to keep noise down + LOG.debug("{}: " + exception, text, exception); + } + }; + + /** + * Get the client to AWS STS. + * @return the STS client, when successfully inited. + */ + protected Optional prepareSTSClient() + throws IOException { + return maybeInitSTS(); + } + + /** + * Duration of sessions. + * @return duration in seconds. 
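A hedged sketch of tuning the session binding through the options read in `serviceStart()`; the duration and region values are placeholders, and the `DelegationConstants` fields are assumed to be accessible to callers (otherwise the equivalent configuration keys can be set directly).

```java
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants;

public class SessionTokenTuning {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Pick the session binding and tune the STS request it will make.
    conf.set(DelegationConstants.DELEGATION_TOKEN_BINDING,
        "org.apache.hadoop.fs.s3a.auth.delegation.SessionTokenBinding");
    // 30 minutes; read back with Configuration.getTimeDuration() at start-up.
    conf.setTimeDuration(DelegationConstants.DELEGATION_TOKEN_DURATION,
        30, TimeUnit.MINUTES);
    // Region/endpoint are only consulted when an STS client is actually
    // built, i.e. when the parent credential chain is not already session-scoped.
    conf.set(DelegationConstants.DELEGATION_TOKEN_REGION, "eu-west-1");
  }
}
```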
+ */ + public long getDuration() { + return duration; + } + + @Override + @Retries.RetryTranslated + public SessionTokenIdentifier createTokenIdentifier( + final Optional policy, + final EncryptionSecrets encryptionSecrets) throws IOException { + requireServiceStarted(); + + final MarshalledCredentials marshalledCredentials; + String origin = AbstractS3ATokenIdentifier.createDefaultOriginMessage(); + final Optional client = prepareSTSClient(); + + if (client.isPresent()) { + // this is the normal route: ask for a new STS token + marshalledCredentials = fromSTSCredentials( + client.get() + .requestSessionCredentials(duration, TimeUnit.SECONDS)); + } else { + // get a new set of parental session credentials (pick up IAM refresh) + if (!forwardMessageLogged.getAndSet(true)) { + // warn caller on the first -and only the first- use. + LOG.warn("Forwarding existing session credentials to {}" + + " -duration unknown", getCanonicalUri()); + } + origin += " " + CREDENTIALS_CONVERTED_TO_DELEGATION_TOKEN; + final AWSCredentials awsCredentials + = parentAuthChain.getCredentials(); + if (awsCredentials instanceof AWSSessionCredentials) { + marshalledCredentials = fromAWSCredentials( + (AWSSessionCredentials) awsCredentials); + } else { + throw new DelegationTokenIOException( + "AWS Authentication chain is no longer supplying session secrets"); + } + } + return new SessionTokenIdentifier(getKind(), + getOwnerText(), + getCanonicalUri(), + marshalledCredentials, + encryptionSecrets, + origin); + } + + @Override + public SessionTokenIdentifier createEmptyIdentifier() { + return new SessionTokenIdentifier(); + } + + /** + * Expiration date time as passed in from source. + * If unset, either we are unbound, or the token which came in does not + * know its expiry. + */ + protected Optional getExpirationDateTime() { + return expirationDateTime; + } + + protected void setExpirationDateTime( + Optional expirationDateTime) { + this.expirationDateTime = expirationDateTime; + } + + /** + * Token identifier bound to. + */ + protected Optional getTokenIdentifier() { + return tokenIdentifier; + } + + protected void setTokenIdentifier(Optional + tokenIdentifier) { + this.tokenIdentifier = tokenIdentifier; + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/SessionTokenIdentifier.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/SessionTokenIdentifier.java new file mode 100644 index 0000000000000..3928a0d454c96 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/SessionTokenIdentifier.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.net.URI; + +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials; +import org.apache.hadoop.io.Text; + +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.SESSION_TOKEN_KIND; + +/** + * A token identifier which contains a set of AWS session credentials, + * credentials which will be valid until they expire. + * + * Note 1: + * There's a risk here that the reference to {@link MarshalledCredentials} + * may trigger a transitive load of AWS classes, a load which will + * fail if the aws SDK isn't on the classpath. + * + * Note 2: + * This class does support subclassing, but every subclass MUST declare itself + * to be of a different token kind. + * Otherwise the process for decoding tokens breaks. + */ +public class SessionTokenIdentifier extends + AbstractS3ATokenIdentifier { + + /** + * Session credentials: initially empty but non-null. + */ + private MarshalledCredentials marshalledCredentials + = new MarshalledCredentials(); + + /** + * Constructor for service loader use. + * Created with the kind {@link DelegationConstants#SESSION_TOKEN_KIND}. + * Subclasses MUST NOT subclass this; they must provide their own + * token kind. + */ + public SessionTokenIdentifier() { + super(SESSION_TOKEN_KIND); + } + + /** + * Constructor for subclasses. + * @param kind kind of token identifier, for storage in the + * token kind to implementation map. + */ + protected SessionTokenIdentifier(final Text kind) { + super(kind); + } + + /** + * Constructor. + * @param kind token kind. + * @param owner token owner + * @param uri filesystem URI. + * @param marshalledCredentials credentials to marshall + * @param encryptionSecrets encryption secrets + * @param origin origin text for diagnostics. + */ + public SessionTokenIdentifier( + final Text kind, + final Text owner, + final URI uri, + final MarshalledCredentials marshalledCredentials, + final EncryptionSecrets encryptionSecrets, + final String origin) { + super(kind, uri, owner, origin, encryptionSecrets); + this.marshalledCredentials = marshalledCredentials; + } + + /** + * Constructor. + * @param kind token kind. + * @param owner token owner + * @param uri filesystem URI. + */ + public SessionTokenIdentifier(final Text kind, + final Text owner, + final Text renewer, + final Text realUser, + final URI uri) { + super(kind, owner, renewer, realUser, uri); + } + + @Override + public void write(final DataOutput out) throws IOException { + super.write(out); + marshalledCredentials.write(out); + } + + @Override + public void readFields(final DataInput in) + throws IOException { + super.readFields(in); + marshalledCredentials.readFields(in); + } + + /** + * Return the expiry time in seconds since 1970-01-01. + * @return the time when the AWS credentials expire. + */ + @Override + public long getExpiryTime() { + return marshalledCredentials.getExpiration(); + } + + /** + * Get the marshalled credentials. + * @return marshalled AWS credentials. + */ + public MarshalledCredentials getMarshalledCredentials() { + return marshalledCredentials; + } + + /** + * Add the (sanitized) marshalled credentials to the string value. + * @return a string value for test assertions and debugging. 
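Because every identifier subclass must register a distinct kind, decoding is driven entirely by `Token.getKind()`. A minimal sketch (the helper class name is invented):

```java
import java.io.IOException;

import org.apache.hadoop.fs.s3a.auth.delegation.AbstractS3ATokenIdentifier;
import org.apache.hadoop.security.token.Token;

public final class DescribeToken {
  private DescribeToken() {
  }

  /** Decode and print an S3A token identifier. */
  public static void describe(Token<AbstractS3ATokenIdentifier> token)
      throws IOException {
    // decodeIdentifier() instantiates whichever TokenIdentifier class is
    // registered (via META-INF/services) for token.getKind(); distinct kinds
    // per subclass keep that mapping unambiguous.
    AbstractS3ATokenIdentifier id = token.decodeIdentifier();
    // toString() appends only the sanitized marshalled credentials, so the
    // output is safe to log for diagnostics.
    System.out.println(id);
  }
}
```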
+ */ + @Override + public String toString() { + return super.toString() + + "; " + marshalledCredentials.toString(); + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/package-info.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/package-info.java new file mode 100644 index 0000000000000..f7eb6b16a5fb1 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/package-info.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Extensible delegation token support for the S3A connector. + * + * Goal: support multiple back end token issue/renewal services, from + * "pure client side" session tokens to full "Kerberos auth". + * + * It is intended for internal use only; any external implementation + * of {@link org.apache.hadoop.fs.s3a.auth.delegation.AbstractDelegationTokenBinding} + * must consider this API unstable and track changes as they happen. + */ +@InterfaceAudience.LimitedPrivate("authorization-subsystems") +@InterfaceStability.Unstable +package org.apache.hadoop.fs.s3a.auth.delegation; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/package-info.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/package-info.java index e34d68ecadc68..c3a7ee6ee55b3 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/package-info.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/package-info.java @@ -18,8 +18,12 @@ /** * Authentication and permissions support. + * + * Some of the classes in here are expected to be referred to in configuration + * files, so must not change their name. These will be explicitly identified. */ -@InterfaceAudience.Private + +@InterfaceAudience.LimitedPrivate("Authentication services") @InterfaceStability.Unstable package org.apache.hadoop.fs.s3a.auth; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/DurationInfo.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/DurationInfo.java index c6617f83d9bf4..69f90cb651632 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/DurationInfo.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/DurationInfo.java @@ -23,7 +23,8 @@ import org.apache.hadoop.classification.InterfaceAudience; /** - * A duration with logging of final state at info in the {@code close()} call. + * A duration with logging of final state at info or debug + * in the {@code close()} call. 
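A minimal usage sketch of `DurationInfo` with the `logAtInfo` flag added by the hunks that follow; the logger and the sleep are purely illustrative.

```java
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.fs.s3a.commit.DurationInfo;

public class DurationInfoExample {
  private static final Logger LOG =
      LoggerFactory.getLogger(DurationInfoExample.class);

  public static void main(String[] args) throws Exception {
    // logAtInfo=false keeps the start/finish messages at DEBUG, matching the
    // DURATION_LOG_AT_INFO pattern used by the delegation token code above.
    try (DurationInfo ignored =
             new DurationInfo(LOG, false, "sleeping for %d ms", 100)) {
      Thread.sleep(100);
    }
  }
}
```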
* This allows it to be used in a try-with-resources clause, and have the * duration automatically logged. */ @@ -35,15 +36,41 @@ public class DurationInfo extends Duration private final Logger log; /** - * Create the duration text from a {@code String.format()} code call. + * Should the log be at INFO rather than DEBUG. + */ + private final boolean logAtInfo; + + /** + * Create the duration text from a {@code String.format()} code call; + * log output at info level. * @param log log to write to * @param format format string * @param args list of arguments */ public DurationInfo(Logger log, String format, Object... args) { + this(log, true, format, args); + } + + /** + * Create the duration text from a {@code String.format()} code call + * and log either at info or debug. + * @param log log to write to + * @param logAtInfo should the log be at info, rather than debug + * @param format format string + * @param args list of arguments + */ + public DurationInfo(Logger log, + boolean logAtInfo, + String format, + Object... args) { this.text = String.format(format, args); this.log = log; - log.info("Starting: {}", text); + this.logAtInfo = logAtInfo; + if (logAtInfo) { + log.info("Starting: {}", text); + } else { + log.debug("Starting: {}", text); + } } @Override @@ -54,6 +81,10 @@ public String toString() { @Override public void close() { finished(); - log.info(this.toString()); + if (logAtInfo) { + log.info("{}", this); + } else { + log.debug("{}", this); + } } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java index 7ce9ae6ef8d3e..f0f33e9dc1e81 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java @@ -26,6 +26,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -70,6 +71,7 @@ import com.amazonaws.waiters.WaiterTimedOutException; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -90,7 +92,9 @@ import org.apache.hadoop.fs.s3a.S3AInstrumentation; import org.apache.hadoop.fs.s3a.S3AUtils; import org.apache.hadoop.fs.s3a.Tristate; +import org.apache.hadoop.fs.s3a.auth.RoleModel; import org.apache.hadoop.fs.s3a.auth.RolePolicies; +import org.apache.hadoop.fs.s3a.auth.delegation.AWSPolicyProvider; import org.apache.hadoop.io.retry.RetryPolicies; import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.security.UserGroupInformation; @@ -98,6 +102,8 @@ import static org.apache.hadoop.fs.s3a.Constants.*; import static org.apache.hadoop.fs.s3a.S3AUtils.*; +import static org.apache.hadoop.fs.s3a.auth.RolePolicies.allowAllDynamoDBOperations; +import static org.apache.hadoop.fs.s3a.auth.RolePolicies.allowS3GuardClientOperations; import static org.apache.hadoop.fs.s3a.s3guard.PathMetadataDynamoDBTranslation.*; import static org.apache.hadoop.fs.s3a.s3guard.S3Guard.*; @@ -185,7 +191,8 @@ */ @InterfaceAudience.Private @InterfaceStability.Evolving -public class DynamoDBMetadataStore implements MetadataStore { +public class DynamoDBMetadataStore implements MetadataStore, + 
AWSPolicyProvider { public static final Logger LOG = LoggerFactory.getLogger( DynamoDBMetadataStore.class); @@ -231,6 +238,7 @@ public class DynamoDBMetadataStore implements MetadataStore { private String region; private Table table; private String tableName; + private String tableArn; private Configuration conf; private String username; @@ -403,6 +411,8 @@ public void initialize(Configuration config) throws IOException { region = conf.getTrimmed(S3GUARD_DDB_REGION_KEY); Preconditions.checkArgument(!StringUtils.isEmpty(region), "No DynamoDB region configured"); + // there's no URI here, which complicates life: you cannot + // create AWS providers here which require one. credentials = createAWSCredentialProviderSet(null, conf); dynamoDB = createDynamoDB(conf, region, null, credentials); @@ -1117,9 +1127,33 @@ public String toString() { return getClass().getSimpleName() + '{' + "region=" + region + ", tableName=" + tableName + + ", tableArn=" + tableArn + '}'; } + /** + * The administrative policy includes all DDB table operations; + * application access is restricted to those operations S3Guard operations + * require when working with data in a guarded bucket. + * @param access access level desired. + * @return a possibly empty list of statements. + */ + @Override + public List listAWSPolicyRules( + final Set access) { + Preconditions.checkState(tableArn != null, "TableARN not known"); + if (access.isEmpty()) { + return Collections.emptyList(); + } + RoleModel.Statement stat; + if (access.contains(AccessLevel.ADMIN)) { + stat = allowAllDynamoDBOperations(tableArn); + } else { + stat = allowS3GuardClientOperations(tableArn); + } + return Lists.newArrayList(stat); + } + /** * Create a table if it does not exist and wait for it to become active. * @@ -1146,6 +1180,7 @@ void initTable() throws IOException { LOG.debug("Binding to table {}", tableName); TableDescription description = table.describe(); LOG.debug("Table state: {}", description); + tableArn = description.getTableArn(); final String status = description.getTableStatus(); switch (status) { case "CREATING": diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3native/S3xLoginHelper.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3native/S3xLoginHelper.java index 60d4b76407017..84e4a6768f976 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3native/S3xLoginHelper.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3native/S3xLoginHelper.java @@ -136,8 +136,6 @@ static Login extractLoginDetails(URI name) { /** * Canonicalize the given URI. * - * This strips out login information. - * * @param uri the URI to canonicalize * @param defaultPort default port to use in canonicalized URI if the input * URI has no port and this value is greater than 0 diff --git a/hadoop-tools/hadoop-aws/src/main/resources/META-INF/services/org.apache.hadoop.security.token.DtFetcher b/hadoop-tools/hadoop-aws/src/main/resources/META-INF/services/org.apache.hadoop.security.token.DtFetcher new file mode 100644 index 0000000000000..c1a3bd05ff5d4 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/resources/META-INF/services/org.apache.hadoop.security.token.DtFetcher @@ -0,0 +1,18 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +org.apache.hadoop.fs.s3a.auth.delegation.S3ADtFetcher diff --git a/hadoop-tools/hadoop-aws/src/main/resources/META-INF/services/org.apache.hadoop.security.token.TokenIdentifier b/hadoop-tools/hadoop-aws/src/main/resources/META-INF/services/org.apache.hadoop.security.token.TokenIdentifier new file mode 100644 index 0000000000000..bfd3def37aeda --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/resources/META-INF/services/org.apache.hadoop.security.token.TokenIdentifier @@ -0,0 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +org.apache.hadoop.fs.s3a.auth.delegation.FullCredentialsTokenIdentifier +org.apache.hadoop.fs.s3a.auth.delegation.RoleTokenIdentifier +org.apache.hadoop.fs.s3a.auth.delegation.SessionTokenIdentifier diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/assumed_roles.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/assumed_roles.md index 8af045776c37c..f08f40f27b0a7 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/assumed_roles.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/assumed_roles.md @@ -178,12 +178,14 @@ Here are the full set of configuration options. fs.s3a.assumed.role.credentials.provider - org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider + org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider, + com.amazonaws.auth.EnvironmentVariableCredentialsProvider + List of credential providers to authenticate with the STS endpoint and retrieve short-lived role credentials. - Only used if AssumedRoleCredentialProvider is the AWS credential provider. - If unset, uses "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider". + Used by AssumedRoleCredentialProvider and the S3A Session Delegation Token + and S3A Role Delegation Token bindings. ``` @@ -468,17 +470,69 @@ Caused by: com.amazonaws.services.securitytoken.model.AWSSecurityTokenServiceExc at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717) ``` -### "Assume Role session duration should be in the range of 15min - 1Hr" +### `Member must have value greater than or equal to 900` -The value of `fs.s3a.assumed.role.session.duration` is out of range. 
+The value of `fs.s3a.assumed.role.session.duration` is too low. ``` -java.lang.IllegalArgumentException: Assume Role session duration should be in the range of 15min -- 1Hr - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider$Builder.withRoleSessionDurationSeconds(STSAssumeRoleSessionCredentialsProvider.java:437) - at org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider.(AssumedRoleCredentialProvider.java:86) +org.apache.hadoop.fs.s3a.AWSBadRequestException: request role credentials: +com.amazonaws.services.securitytoken.model.AWSSecurityTokenServiceException: +1 validation error detected: Value '20' at 'durationSeconds' failed to satisfy constraint: +Member must have value greater than or equal to 900 (Service: AWSSecurityTokenService; +Status Code: 400; Error Code: ValidationError; +Request ID: b9a82403-d0a7-11e8-98ef-596679ee890d) +``` + +Fix: increase. + +### Error "The requested DurationSeconds exceeds the MaxSessionDuration set for this role" + +The value of `fs.s3a.assumed.role.session.duration` is too high. + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: request role credentials: + com.amazonaws.services.securitytoken.model.AWSSecurityTokenServiceException: +The requested DurationSeconds exceeds the MaxSessionDuration set for this role. +(Service: AWSSecurityTokenService; Status Code: 400; + Error Code: ValidationError; Request ID: 17875165-d0a7-11e8-b85f-d15a599a7f6d) +``` + +There are two solutions to this + +* Decrease the duration value. +* Increase the duration of a role in the [AWS IAM Console](https://console.aws.amazon.com/iam/home#/roles). + + +### "Value '345600' at 'durationSeconds' failed to satisfy constraint: Member must have value less than or equal to 43200" + +Irrespective of the maximum duration of a role, the AWS role API only permits callers to request +any role for up to 12h; attempting to use a larger number will fail. + + +``` +Caused by: com.amazonaws.services.securitytoken.model.AWSSecurityTokenServiceException: +1 validation error detected: +Value '345600' at 'durationSeconds' failed to satisfy constraint: +Member must have value less than or equal to 43200 +(Service: AWSSecurityTokenService; +Status Code: 400; Error Code: +ValidationError; +Request ID: dec1ca6b-d0aa-11e8-ac8c-4119b3ea9f7f) +``` + +For full sessions, the duration limit is 129600 seconds: 36h. + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: request session credentials: +com.amazonaws.services.securitytoken.model.AWSSecurityTokenServiceException: +1 validation error detected: Value '345600' at 'durationSeconds' failed to satisfy constraint: +Member must have value less than or equal to 129600 +(Service: AWSSecurityTokenService; Status Code: 400; Error Code: ValidationError; +Request ID: a6e73d44-d0aa-11e8-95ed-c5bba29f0635) ``` +For both these errors, the sole fix is to request a shorter duration +in `fs.s3a.assumed.role.session.duration`. ### `MalformedPolicyDocumentException` "The policy is not in the valid JSON format" @@ -487,7 +541,7 @@ The policy set in `fs.s3a.assumed.role.policy` is not valid according to the AWS specification of Role Policies. ``` -rg.apache.hadoop.fs.s3a.AWSBadRequestException: Instantiate org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider on : +org.apache.hadoop.fs.s3a.AWSBadRequestException: Instantiate org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider on : com.amazonaws.services.securitytoken.model.MalformedPolicyDocumentException: The policy is not in the valid JSON format. 
(Service: AWSSecurityTokenService; Status Code: 400; Error Code: MalformedPolicyDocument; Request ID: baf8cb62-f552-11e7-9768-9df3b384e40c): @@ -508,36 +562,9 @@ Caused by: com.amazonaws.services.securitytoken.model.MalformedPolicyDocumentExc Error Code: MalformedPolicyDocument; Request ID: baf8cb62-f552-11e7-9768-9df3b384e40c) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1638) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1303) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1055) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:743) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667) - at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649) - at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:513) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.doInvoke(AWSSecurityTokenServiceClient.java:1271) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.invoke(AWSSecurityTokenServiceClient.java:1247) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.executeAssumeRole(AWSSecurityTokenServiceClient.java:454) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.assumeRole(AWSSecurityTokenServiceClient.java:431) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.newSession(STSAssumeRoleSessionCredentialsProvider.java:321) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.access$000(STSAssumeRoleSessionCredentialsProvider.java:37) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider$1.call(STSAssumeRoleSessionCredentialsProvider.java:76) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider$1.call(STSAssumeRoleSessionCredentialsProvider.java:73) - at com.amazonaws.auth.RefreshableTask.refreshValue(RefreshableTask.java:256) - at com.amazonaws.auth.RefreshableTask.blockingRefresh(RefreshableTask.java:212) - at com.amazonaws.auth.RefreshableTask.getValue(RefreshableTask.java:153) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.getCredentials(STSAssumeRoleSessionCredentialsProvider.java:299) - at org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider.getCredentials(AssumedRoleCredentialProvider.java:127) - at org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider.(AssumedRoleCredentialProvider.java:116) - at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) - at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) - at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) - at java.lang.reflect.Constructor.newInstance(Constructor.java:423) - at org.apache.hadoop.fs.s3a.S3AUtils.createAWSCredentialProvider(S3AUtils.java:583) - ... 19 more ``` -### `MalformedPolicyDocumentException` "Syntax errors in policy" +### `MalformedPolicyDocumentException` "Syntax errors in policy" The policy set in `fs.s3a.assumed.role.policy` is not valid JSON. 
@@ -564,31 +591,6 @@ Instantiate org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider on : at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1638) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1303) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1055) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:743) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667) - at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649) - at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:513) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.doInvoke(AWSSecurityTokenServiceClient.java:1271) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.invoke(AWSSecurityTokenServiceClient.java:1247) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.executeAssumeRole(AWSSecurityTokenServiceClient.java:454) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.assumeRole(AWSSecurityTokenServiceClient.java:431) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.newSession(STSAssumeRoleSessionCredentialsProvider.java:321) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.access$000(STSAssumeRoleSessionCredentialsProvider.java:37) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider$1.call(STSAssumeRoleSessionCredentialsProvider.java:76) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider$1.call(STSAssumeRoleSessionCredentialsProvider.java:73) - at com.amazonaws.auth.RefreshableTask.refreshValue(RefreshableTask.java:256) - at com.amazonaws.auth.RefreshableTask.blockingRefresh(RefreshableTask.java:212) - at com.amazonaws.auth.RefreshableTask.getValue(RefreshableTask.java:153) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.getCredentials(STSAssumeRoleSessionCredentialsProvider.java:299) - at org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider.getCredentials(AssumedRoleCredentialProvider.java:127) - at org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider.(AssumedRoleCredentialProvider.java:116) - at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) - at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) - at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) - at java.lang.reflect.Constructor.newInstance(Constructor.java:423) - at org.apache.hadoop.fs.s3a.S3AUtils.createAWSCredentialProvider(S3AUtils.java:583) ... 
19 more ``` @@ -646,34 +648,6 @@ Caused by: com.amazonaws.services.securitytoken.model.AWSSecurityTokenServiceExc SignedHeaders=amz-sdk-invocation-id;amz-sdk-retry;host;user-agent;x-amz-date, (Service: AWSSecurityTokenService; Status Code: 400; Error Code: IncompleteSignature; at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1638) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1303) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1055) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:743) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667) - at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649) - at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:513) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.doInvoke(AWSSecurityTokenServiceClient.java:1271) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.invoke(AWSSecurityTokenServiceClient.java:1247) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.executeAssumeRole(AWSSecurityTokenServiceClient.java:454) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.assumeRole(AWSSecurityTokenServiceClient.java:431) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.newSession(STSAssumeRoleSessionCredentialsProvider.java:321) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.access$000(STSAssumeRoleSessionCredentialsProvider.java:37) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider$1.call(STSAssumeRoleSessionCredentialsProvider.java:76) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider$1.call(STSAssumeRoleSessionCredentialsProvider.java:73) - at com.amazonaws.auth.RefreshableTask.refreshValue(RefreshableTask.java:256) - at com.amazonaws.auth.RefreshableTask.blockingRefresh(RefreshableTask.java:212) - at com.amazonaws.auth.RefreshableTask.getValue(RefreshableTask.java:153) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.getCredentials(STSAssumeRoleSessionCredentialsProvider.java:299) - at org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider.getCredentials(AssumedRoleCredentialProvider.java:127) - at org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider.(AssumedRoleCredentialProvider.java:116) - at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) - at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) - at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) - at java.lang.reflect.Constructor.newInstance(Constructor.java:423) - at org.apache.hadoop.fs.s3a.S3AUtils.createAWSCredentialProvider(S3AUtils.java:583) - ... 25 more ``` ### `AccessDeniedException/InvalidClientTokenId`: "The security token included in the request is invalid" @@ -702,31 +676,6 @@ The security token included in the request is invalid. 
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1638) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1303) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1055) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:743) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667) - at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649) - at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:513) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.doInvoke(AWSSecurityTokenServiceClient.java:1271) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.invoke(AWSSecurityTokenServiceClient.java:1247) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.executeAssumeRole(AWSSecurityTokenServiceClient.java:454) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.assumeRole(AWSSecurityTokenServiceClient.java:431) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.newSession(STSAssumeRoleSessionCredentialsProvider.java:321) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.access$000(STSAssumeRoleSessionCredentialsProvider.java:37) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider$1.call(STSAssumeRoleSessionCredentialsProvider.java:76) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider$1.call(STSAssumeRoleSessionCredentialsProvider.java:73) - at com.amazonaws.auth.RefreshableTask.refreshValue(RefreshableTask.java:256) - at com.amazonaws.auth.RefreshableTask.blockingRefresh(RefreshableTask.java:212) - at com.amazonaws.auth.RefreshableTask.getValue(RefreshableTask.java:153) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.getCredentials(STSAssumeRoleSessionCredentialsProvider.java:299) - at org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider.getCredentials(AssumedRoleCredentialProvider.java:127) - at org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider.(AssumedRoleCredentialProvider.java:116) - at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) - at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) - at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) - at java.lang.reflect.Constructor.newInstance(Constructor.java:423) - at org.apache.hadoop.fs.s3a.S3AUtils.createAWSCredentialProvider(S3AUtils.java:583) ... 25 more ``` @@ -740,7 +689,8 @@ match these constraints. If set explicitly, it must be valid. ``` -org.apache.hadoop.fs.s3a.AWSBadRequestException: Instantiate org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider on +org.apache.hadoop.fs.s3a.AWSBadRequestException: + Instantiate org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider on com.amazonaws.services.securitytoken.model.AWSSecurityTokenServiceException: 1 validation error detected: Value 'Session Names cannot Hava Spaces!' 
at 'roleSessionName' failed to satisfy constraint: Member must satisfy regular expression pattern: [\w+=,.@-]* @@ -765,33 +715,6 @@ Caused by: com.amazonaws.services.securitytoken.model.AWSSecurityTokenServiceExc (Service: AWSSecurityTokenService; Status Code: 400; Error Code: ValidationError; at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1638) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1303) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1055) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:743) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667) - at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649) - at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:513) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.doInvoke(AWSSecurityTokenServiceClient.java:1271) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.invoke(AWSSecurityTokenServiceClient.java:1247) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.executeAssumeRole(AWSSecurityTokenServiceClient.java:454) - at com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient.assumeRole(AWSSecurityTokenServiceClient.java:431) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.newSession(STSAssumeRoleSessionCredentialsProvider.java:321) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.access$000(STSAssumeRoleSessionCredentialsProvider.java:37) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider$1.call(STSAssumeRoleSessionCredentialsProvider.java:76) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider$1.call(STSAssumeRoleSessionCredentialsProvider.java:73) - at com.amazonaws.auth.RefreshableTask.refreshValue(RefreshableTask.java:256) - at com.amazonaws.auth.RefreshableTask.blockingRefresh(RefreshableTask.java:212) - at com.amazonaws.auth.RefreshableTask.getValue(RefreshableTask.java:153) - at com.amazonaws.auth.STSAssumeRoleSessionCredentialsProvider.getCredentials(STSAssumeRoleSessionCredentialsProvider.java:299) - at org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider.getCredentials(AssumedRoleCredentialProvider.java:135) - at org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider.(AssumedRoleCredentialProvider.java:124) - at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) - at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) - at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) - at java.lang.reflect.Constructor.newInstance(Constructor.java:423) - at org.apache.hadoop.fs.s3a.S3AUtils.createAWSCredentialProvider(S3AUtils.java:583) - ... 
26 more ``` @@ -818,24 +741,6 @@ Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: Access Denied S3 Extended Request ID: iEXDVzjIyRbnkAc40MS8Sjv+uUQNvERRcqLsJsy9B0oyrjHLdkRKwJ/phFfA17Kjn483KSlyJNw= at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1638) at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1303) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1055) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:743) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667) - at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649) - at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:513) - at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4229) - at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:4176) - at com.amazonaws.services.s3.AmazonS3Client.deleteObject(AmazonS3Client.java:2066) - at com.amazonaws.services.s3.AmazonS3Client.deleteObject(AmazonS3Client.java:2052) - at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$deleteObject$7(S3AFileSystem.java:1338) - at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:314) - at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:280) - at org.apache.hadoop.fs.s3a.S3AFileSystem.deleteObject(S3AFileSystem.java:1334) - at org.apache.hadoop.fs.s3a.S3AFileSystem.removeKeys(S3AFileSystem.java:1657) - at org.apache.hadoop.fs.s3a.S3AFileSystem.innerRename(S3AFileSystem.java:1046) - at org.apache.hadoop.fs.s3a.S3AFileSystem.rename(S3AFileSystem.java:851) ``` This is the policy restriction behaving as intended: the caller is trying to @@ -882,3 +787,63 @@ or just that this specific permission has been omitted. If the role policy requested for the assumed role didn't ask for any DynamoDB permissions, this is where all attempts to work with a S3Guarded bucket will fail. Check the value of `fs.s3a.assumed.role.policy` + +### Error `Unable to execute HTTP request` + +This is a low-level networking error. Possible causes include: + +* The endpoint set in `fs.s3a.assumed.role.sts.endpoint` is invalid. +* There are underlying network problems. 
+ +``` +org.apache.hadoop.fs.s3a.AWSClientIOException: request session credentials: + com.amazonaws.SdkClientException: + + Unable to execute HTTP request: null: Unable to execute HTTP request: null +at com.amazonaws.thirdparty.apache.http.impl.conn.DefaultRoutePlanner.determineRoute(DefaultRoutePlanner.java:88) +at com.amazonaws.thirdparty.apache.http.impl.client.InternalHttpClient.determineRoute(InternalHttpClient.java:124) +at com.amazonaws.thirdparty.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:183) +at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:82) +at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:55) +``` + +### Error "Credential should be scoped to a valid region" + +This is based on conflict between the values of `fs.s3a.assumed.role.sts.endpoint` +and `fs.s3a.assumed.role.sts.endpoint.region` +Two variants, "not '''" + +Variant 1: `Credential should be scoped to a valid region, not 'us-west-1'` (or other string) + + +``` +java.nio.file.AccessDeniedException: : request session credentials: +com.amazonaws.services.securitytoken.model.AWSSecurityTokenServiceException: +Credential should be scoped to a valid region, not 'us-west-1'. +(Service: AWSSecurityTokenService; Status Code: 403; Error Code: SignatureDoesNotMatch; Request ID: d9065cc4-e2b9-11e8-8b7b-f35cb8d7aea4):SignatureDoesNotMatch +``` + +One of: + + +* the value of `fs.s3a.assumed.role.sts.endpoint.region` is not a valid region +* the value of `fs.s3a.assumed.role.sts.endpoint.region` is not the signing +region of the endpoint set in `fs.s3a.assumed.role.sts.endpoint` + + +Variant 2: `Credential should be scoped to a valid region, not ''` + +``` +java.nio.file.AccessDeniedException: : request session credentials: +com.amazonaws.services.securitytoken.model.AWSSecurityTokenServiceException: + Credential should be scoped to a valid region, not ''. ( + Service: AWSSecurityTokenService; Status Code: 403; Error Code: SignatureDoesNotMatch; + Request ID: bd3e5121-e2ac-11e8-a566-c1a4d66b6a16):SignatureDoesNotMatch +``` + +This should be intercepted earlier: an endpoint has been specified but +not a region. + +There's special handling for the central `sts.amazonaws.com` region; when +that is declared as the value of `fs.s3a.assumed.role.sts.endpoint.region` then +there is no need to declare a region: whatever value it has is ignored. diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_token_architecture.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_token_architecture.md new file mode 100644 index 0000000000000..90e4e5587d027 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_token_architecture.md @@ -0,0 +1,466 @@ + + +# S3A Delegation Token Architecture + +This is an architecture document to accompany +[Working with Delegation Tokens](delegation_tokens.html) + +## Background: Delegation Tokens + +Delegation Tokens, "DTs" are a common feature of Hadoop Services. +They are opaque byte arrays which can be issued by services like +HDFS, HBase, YARN, and which can be used to authenticate a request with +that service. + +### Tokens are Issued + +In a Kerberized cluster, they are issued by the service after the caller +has authenticated, and so that principal is trusted to be who they say they are. 
+The issued DT can therefore attest that whoever is including that token
+on a request is authorized to act on behalf of that principal —for the
+specific set of operations which the DT grants.
+
+As an example, an HDFS DT can be requested by a user, included in the
+launch context of a YARN application, say DistCp, and that launched application
+can then talk to HDFS as if it were that user.
+
+### Tokens are marshalled
+
+Tokens are opaque byte arrays. They are contained within a `Token`
+ class which includes an expiry time, the service identifier, and some other details.
+
+`Token<>` instances can be serialized as a Hadoop Writable, or saved to/from a protobuf
+format. This is how they are included in YARN application and container requests,
+and elsewhere. They can even be saved to files through the `hadoop dt` command.
+
+### Tokens can be unmarshalled
+
+
+At the far end, tokens can be unmarshalled and converted into instances of
+the java classes. This assumes that all the dependent classes are on the
+classpath, obviously.
+
+### Tokens can be used to authenticate callers
+
+The Hadoop RPC layer and the web SPNEGO layer support tokens.
+
+### Tokens can be renewed
+
+DTs can be renewed by the specific principal declared at creation time as
+"the renewer". In the example above, the YARN Resource Manager's principal
+can be declared as the renewer. Then, even while a token is attached
+to a queued launch request in the RM, the RM can regularly request of HDFS
+that the token is renewed.
+
+There's an ultimate limit on how long tokens can be renewed for, but it's
+generally 72h or similar, so that medium-life jobs can access services
+and data on behalf of a user.
+
+### Tokens can be Revoked
+
+When tokens are no longer needed, the service can be told to revoke a token.
+Continuing the YARN example, after an application finishes the YARN RM
+can revoke every token marshalled into the application launch request.
+At that point there's no risk associated with that token being
+compromised.
+
+
+*This is all how "real" Hadoop tokens work*
+
+The S3A Delegation Tokens are subtly different.
+
+The S3A DTs actually include the AWS credentials within the token
+data marshalled and shared across the cluster. The credentials can be one
+of:
+
+* The Full AWS (`fs.s3a.access.key`, `fs.s3a.secret.key`) login.
+* A set of AWS session credentials
+ (`fs.s3a.access.key`, `fs.s3a.secret.key`, `fs.s3a.session.token`).
+
+These credentials are obtained from the AWS Secure Token Service (STS) when the token is issued.
+* A set of AWS session credentials binding the user to a specific AWS IAM Role,
+further restricted to only access the S3 bucket and matching S3Guard DynamoDB table.
+Again, these credentials are requested when the token is issued.
+
+
+*Tokens can be issued*
+
+When an S3A Filesystem instance is asked to issue a token it can simply package
+up the login secrets (the "Full" tokens), or talk to the AWS STS service
+to get a set of session/assumed role credentials. These are marshalled within
+the overall token, and then passed onwards to applications.
+
+*Tokens can be marshalled*
+
+The AWS secrets are held in a subclass of `org.apache.hadoop.security.token.TokenIdentifier`.
+This class gets serialized to a byte array when the whole token is marshalled, and deserialized
+when the token is loaded.
+
+*Tokens can be used to authenticate callers*
+
+The S3A FS does not hand the token to AWS services to authenticate the caller.
+Instead it takes the AWS credentials included in the token identifier +and uses them to sign the requests. + +*Tokens cannot be renewed* + +The tokens contain the credentials; you cant use them to ask AWS for more. + +For full credentials that is moot, but for the session and role credentials, +they will expire. At which point the application will be unable to +talk to the AWS infrastructure. + +*Tokens cannot be revoked* + +The AWS STS APIs don't let you revoke a single set of session credentials. + +## Background: How Tokens are collected in MapReduce jobs + + +### `org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal()` + +1. Calls `org.apache.hadoop.mapreduce.security.TokenCache.obtainTokensForNamenodes()` +for the job submission dir on the cluster FS (i.e. `fs.defaultFS`). +1. Reads in the property `mapreduce.job.hdfs-servers` and extracts DTs from them, +1. Tells the `FileInputFormat` and `FileOutputFormat` subclasses of the job +to collect their source and dest FS tokens. + +All token collection is via `TokenCache.obtainTokensForNamenodes()` + +### `TokenCache.obtainTokensForNamenodes(Credentials, Path[], Configuration) ` + +1. Returns immediately if security is off. +1. Retrieves all the filesystems in the list of paths. +1. Retrieves a token from each unless it is in the list of filesystems in `mapreduce.job.hdfs-servers.token-renewal.exclude` +1. Merges in any DTs stored in the file referenced under: `mapreduce.job.credentials.binary` +1. Calls `FileSystem.collectDelegationTokens()`, which, if there isn't any token already in the credential list, issues and adds a new token. *There is no check to see if that existing credential has expired*. + + +### `FileInputFormat.listStatus(JobConf job): FileStatus[]` + +Enumerates source paths in (`mapreduce.input.fileinputformat.inputdir`) ; uses `TokenCache.obtainTokensForNamenodes()` +to collate a token for all of these paths. + +This operation is called by the public interface method `FileInputFormat.getSplits()`. + +### `FileOutputFormat.checkOutputSpecs()` + +Calls `getOutputPath(job)` and asks for the DTs of that output path FS. + + +## Architecture of the S3A Delegation Token Support + + + +1. The S3A FS client has the ability to be configured with a delegation +token binding, the "DT Binding", a class declared in the option `fs.s3a.delegation.token.binding`. +1. If set, when a filesystem is instantiated it asks the DT binding for its list of AWS credential providers. +(the list in `fs.s3a.aws.credentials.provider` are only used if the DT binding wishes to). +1. The DT binding scans for the current principal (`UGI.getCurrentUser()`/"the Owner") to see if they +have any token in their credential cache whose service name matches the URI of the filesystem. +1. If one is found, it is unmarshalled and then used to authenticate the caller via +some AWS Credential provider returned to the S3A FileSystem instance. +1. If none is found, the Filesystem is considered to have been deployed "Unbonded". +The DT binding has to return a list of the AWS credential providers to use. + +When requests are made of AWS services, the created credential provider(s) are +used to sign requests. + +When the filesystem is asked for a delegation token, the +DT binding will generate a token identifier containing the marshalled tokens. + +If the Filesystem was deployed with a DT, that is, it was deployed "bonded", that existing +DT is returned. + +If it was deployed unbonded, the DT Binding is asked to create a new DT. 
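+
+As a rough illustration of this flow from the application side (this sketch is
+not part of the patch; the bucket name, renewer and binding choice are purely
+illustrative), a client can ask a DT-enabled S3A filesystem for a token and
+add it to a set of credentials through the standard `FileSystem` API:
+
+```java
+import java.net.URI;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.security.Credentials;
+import org.apache.hadoop.security.token.Token;
+
+public class FetchS3ADelegationToken {
+  public static void main(String[] args) throws Exception {
+    Configuration conf = new Configuration();
+    // the binding must be set before the filesystem instance is created
+    conf.set("fs.s3a.delegation.token.binding",
+        "org.apache.hadoop.fs.s3a.auth.delegation.SessionTokenBinding");
+    FileSystem fs = FileSystem.get(new URI("s3a://example-bucket/"), conf);
+    // the token's service name is the filesystem URI; the renewer argument
+    // is required by the API, although S3A tokens cannot be renewed
+    Token<?> token = fs.getDelegationToken("yarn");
+    System.out.println("Issued " + token.getKind()
+        + " for " + token.getService());
+    // applications normally collect tokens into a Credentials instance,
+    // which is then marshalled into the job/container launch context
+    Credentials credentials = new Credentials();
+    credentials.addToken(token.getService(), token);
+  }
+}
+```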
+ +It is up to the binding what it includes in the token identifier, and how it obtains them. +This new token identifier is included in a token which has a "canonical service name" of +the URI of the filesystem (e.g "s3a://landsat-pds"). + +The issued/reissued token identifier can be marshalled and reused. + + +### class `org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens` + +This joins up the S3A Filesystem with the pluggable DT binding classes. + +One is instantiated in the S3A Filesystem instance if a DT Binding class +has been instantiated. If so, it is invoked for + +* Building up the authentication chain during filesystem initialization. +* Determining if the FS should declare that it has a canonical name +(in `getCanonicalServiceName()`). +* When asked for a DT (in `getDelegationToken(String renewer)`). + +The `S3ADelegationTokens` has the task of instantiating the actual DT binding, +which must be a subclass of `AbstractDelegationTokenBinding`. + +All the DT bindings, and `S3ADelegationTokens` itself are subclasses of +`org.apache.hadoop.service.AbstractService`; they follow the YARN service lifecycle +of: create -> init -> start -> stop. This means that a DT binding, may, if it chooses, +start worker threads when the service is started (`serviceStart()`); it must +then stop them in the `serviceStop` method. (Anyone doing this must be aware +that the owner FS is not fully initialized in serviceStart: they must not +call into the Filesystem). + +The actions of this class are + +* Lookup of DTs associated with this S3A FS (scanning credentials, unmarshalling). +* initiating the DT binding in bound/unbound state. +* issuing DTs, either serving up the existing one, or requesting the DT Binding for +a new instance of `AbstractS3ATokenIdentifier` and then wrapping a hadoop token +around it. +* General logging, debugging, and metrics. Delegation token metrics are +collected in (`S3AInstrumentation.DelegationTokenStatistics`) + + + + +### class `org.apache.hadoop.fs.s3a.auth.delegation.AbstractS3ATokenIdentifier` + +All tokens returned are a subclass of `AbstractS3ATokenIdentifier`. + +This class contains the following fields: + +```java + /** Canonical URI of the bucket. */ + private URI uri; + + /** + * Encryption secrets to also marshall with any credentials. + * Set during creation to ensure it is never null. + */ + private EncryptionSecrets encryptionSecrets = new EncryptionSecrets(); + + /** + * Timestamp of creation. + * This is set to the current time; it will be overridden when + * deserializing data. + */ + private long created = System.currentTimeMillis(); + + /** + * An origin string for diagnostics. + */ + private String origin = ""; + + /** + * This marshalled UUID can be used in testing to verify transmission, + * and reuse; as it is printed you can see what is happending too. + */ + private String uuid = UUID.randomUUID().toString(); +``` + +The `uuid` field is used for equality tests and debugging; the `origin` and +`created` fields are also for diagnostics. + +The `encryptionSecrets` structure enumerates the AWS encryption mechanism +of the filesystem instance, and any declared key. This allows +the client-side secret for SSE-C encryption to be passed to the filesystem, +or the key name for SSE-KMS. 
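+
+For reference (this example is not part of the patch and the key ARN is a
+placeholder), the client-side options which end up marshalled into these
+encryption secrets are the standard S3A encryption settings, e.g.:
+
+```xml
+<property>
+  <name>fs.s3a.server-side-encryption-algorithm</name>
+  <value>SSE-KMS</value>
+</property>
+
+<property>
+  <name>fs.s3a.server-side-encryption.key</name>
+  <value>arn:aws:kms:eu-west-1:123456789012:key/example-key-id</value>
+</property>
+```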
+
+*The encryption settings and secrets of the S3A filesystem on the client
+are included in the DT, so can be used to encrypt/decrypt data in the cluster.*
+
+### class `SessionTokenIdentifier` extends `AbstractS3ATokenIdentifier`
+
+This holds session tokens, and it also gets used as a superclass of
+the other token identifiers.
+
+It adds a set of `MarshalledCredentials` containing the session secrets.
+
+Every token/token identifier must have a unique *Kind*; this is how token
+identifier deserializers are looked up. For Session Credentials, it is
+`S3ADelegationToken/Session`. Subclasses *must* have a different token kind,
+else the unmarshalling and binding mechanism will fail.
+
+
+### classes `RoleTokenIdentifier` and `FullCredentialsTokenIdentifier`
+
+These are subclasses of `SessionTokenIdentifier` with different token kinds,
+needed for token unmarshalling.
+
+Their kinds are `S3ADelegationToken/Role` and `S3ADelegationToken/Full`
+respectively.
+
+Having different possible token bindings raises the risk that a job is submitted
+with one binding and yet the cluster is expecting another binding.
+Provided the configuration option `fs.s3a.delegation.token.binding` is not
+marked as final in the `core-site.xml` file, the value of that binding
+set in the job should propagate with the binding: the choice of provider
+is automatic. A cluster can even mix bindings across jobs.
+However, if a core-site XML file declares a specific binding for a single bucket and
+the job only has the generic `fs.s3a.delegation.token.binding` binding,
+then there will be a mismatch.
+Each binding must be rigorous about checking the Kind of any found delegation
+token and failing meaningfully here.
+
+
+
+### class `MarshalledCredentials`
+
+Can marshall a set of AWS credentials (access key, secret key, session token)
+as a Hadoop Writable.
+
+These can be given to an instance of class `MarshalledCredentialProvider`
+and used to sign AWS RPC/REST API calls.
+
+## DT Binding: `AbstractDelegationTokenBinding`
+
+The plugin point for this design is the DT binding, which must be a subclass
+of `org.apache.hadoop.fs.s3a.auth.delegation.AbstractDelegationTokenBinding`.
+
+
+This class
+
+* Returns the *Kind* of these tokens.
+* declares whether tokens will actually be issued or not (the TokenIssuingPolicy).
+* can issue a DT in
+
+```java
+ public abstract AWSCredentialProviderList deployUnbonded()
+ throws IOException;
+```
+
+The S3A FS has been brought up with DTs enabled, but none have been found
+for its service name. The DT binding is tasked with coming up with the
+fallback list of AWS credential providers.
+
+```java
+public abstract AWSCredentialProviderList bindToTokenIdentifier(
+ AbstractS3ATokenIdentifier retrievedIdentifier)
+ throws IOException;
+```
+
+A DT for this FS instance has been found. Bind to it and extract enough information
+to authenticate with AWS. Return the list of providers to use.
+
+```java
+public abstract AbstractS3ATokenIdentifier createEmptyIdentifier();
+```
+
+Return an empty identifier.
+
+
+```java
+public abstract AbstractS3ATokenIdentifier createTokenIdentifier(
+ Optional<RoleModel.Policy> policy,
+ EncryptionSecrets encryptionSecrets)
+```
+
+This is the big one: create a new Token Identifier for this filesystem, one
+which must include the encryption secrets, and which may make use of
+the role policy.
+
+## Token issuing
+
+### How Full Delegation Tokens are issued.
+
+If the client is only logged in with session credentials: fail.
+ +Else: take the AWS access/secret key, store them in the MarshalledCredentials +in a new `FullCredentialsTokenIdentifier`, and return. + + +### How Session Delegation Tokens are issued. + +If the client is only logged in with session credentials: return these. + +This is taken from the Yahoo! patch: if a user is logged +in with a set of session credentials (including those from some 2FA login), +they just get wrapped up and passed in. + +There's no clue as to how long they will last, so there's a warning printed. + +If there is a full set of credentials, then an `SessionTokenBinding.maybeInitSTS()` +creates an STS client set up to communicate with the (configured) STS endpoint, +retrying with the same retry policy as the filesystem. + +This client is then used to request a set of session credentials. + +### How Role Delegation Tokens are issued. + +If the client is only logged in with session credentials: fail. + +We don't know whether this is a full user session or some role session, +and rather than pass in some potentially more powerful secrets with the job, +just fail. + +Else: as with session delegation tokens, an STS client is created. This time +`assumeRole()` is invoked with the ARN of the role and an extra AWS role policy +set to restrict access to: + +* CRUD access the specific bucket a token is being requested for +* CRUD access to the contents of any S3Guard DDB used (not admin rights though). +* access to all KMS keys (assumption: AWS KMS is where restrictions are set up) + +*Example Generated Role Policy* + + +```json +{ + "Version" : "2012-10-17", + "Statement" : [ { + "Sid" : "7", + "Effect" : "Allow", + "Action" : [ "s3:GetBucketLocation", "s3:ListBucket*" ], + "Resource" : "arn:aws:s3:::example-bucket" + }, { + "Sid" : "8", + "Effect" : "Allow", + "Action" : [ "s3:Get*", "s3:PutObject", "s3:DeleteObject", "s3:AbortMultipartUpload" ], + "Resource" : "arn:aws:s3:::example-bucket/*" + }, { + "Sid" : "1", + "Effect" : "Allow", + "Action" : [ "kms:Decrypt", "kms:GenerateDataKey" ], + "Resource" : "arn:aws:kms:*" + }, { + "Sid" : "9", + "Effect" : "Allow", + "Action" : [ "dynamodb:BatchGetItem", "dynamodb:BatchWriteItem", "dynamodb:DeleteItem", "dynamodb:DescribeTable", "dynamodb:GetItem", "dynamodb:PutItem", "dynamodb:Query", "dynamodb:UpdateItem" ], + "Resource" : "arn:aws:dynamodb:eu-west-1:980678866fff:table/example-bucket" + } ] +} +``` + +These permissions are sufficient for all operations the S3A client currently +performs on a bucket. If those requirements are expanded, these policies +may change. + + +## Testing. + +Look in `org.apache.hadoop.fs.s3a.auth.delegation` + + +It's proven impossible to generate a full end-to-end test in an MR job. + +1. MapReduce only collects DTs when kerberos is enabled in the cluster. +1. A Kerberized MiniYARN cluster refuses to start on a local file:// fs without the +native libraries, so it can set directory permissions. +1. A Kerberized MiniHDFS cluster and MiniYARN cluster refuse to talk to each +other reliably, at least in the week or so of trying. + +The `ITestDelegatedMRJob` test works around this by using Mockito to mock +the actual YARN job submit operation in `org.apache.hadoop.mapreduce.protocol.ClientProtocol`. +The MR code does all the work of collecting tokens and attaching them to +the launch context, "submits" the job, which then immediately succeeds. +The job context is examined to verify that the source and destination filesystem +DTs were extracted. 
+
+To test beyond this requires a real Kerberized cluster, or someone able to fix
+up Mini* clusters to run kerberized.
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_tokens.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_tokens.md
new file mode 100644
index 0000000000000..30226f85eb9b7
--- /dev/null
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_tokens.md
@@ -0,0 +1,870 @@
+
+
+# Working with Delegation Tokens
+
+
+
+## Introducing S3A Delegation Tokens.
+
+The S3A filesystem client supports `Hadoop Delegation Tokens`.
+This allows YARN applications like MapReduce, DistCp, Apache Flink and Apache Spark to
+obtain credentials to access S3 buckets and pass these credentials to
+jobs/queries, granting them access to the service with the same access
+permissions as the user.
+
+Three different token types are offered.
+
+*Full Delegation Tokens:* include the full login values of `fs.s3a.access.key`
+and `fs.s3a.secret.key` in the token, so the recipient has access to
+the data as the submitting user, with unlimited duration.
+These tokens do not involve communication with the AWS STS service, so
+can be used with other S3 installations.
+
+*Session Delegation Tokens:* These contain an "STS Session Token" requested by
+the S3A client from the AWS STS service. They have a limited duration,
+so restrict how long an application can access AWS on behalf of a user.
+Clients with this token have the full permissions of the user.
+
+*Role Delegation Tokens:* These contain an "STS Session Token" requested via the
+STS "Assume Role" API, granting the caller the right to interact with S3 as a specific AWS
+role, *with permissions restricted to purely accessing the S3 bucket
+and associated S3Guard data*.
+
+Role Delegation Tokens are the most powerful. By restricting the access rights
+of the granted STS token, no process receiving the token may perform
+any operations in the AWS infrastructure other than those for the S3 bucket,
+and then only as restricted by the rights of the requested role ARN.
+
+All three tokens also marshall the encryption settings: the encryption mechanism
+to use and the KMS key ID or SSE-C client secret. This allows encryption
+policy and secrets to be passed from the client to the services.
+
+This document covers how to use these tokens. For details on the implementation
+see [S3A Delegation Token Architecture](delegation_token_architecture.html).
+
+## Background: Hadoop Delegation Tokens.
+
+A Hadoop Delegation Token is a byte array of data which is submitted to
+a Hadoop service as proof that the caller has the permissions to perform
+the operation which it is requesting —
+and which can be passed between applications to *delegate* those permissions.
+
+Tokens are opaque to clients: a client simply gets a byte array
+of data which it must provide to a service when required.
+This normally contains encrypted data for use by the service.
+
+The service, which holds the password to encrypt/decrypt this data,
+can decrypt the byte array and read the contents,
+knowing that it has not been tampered with, then
+use the presence of a valid token as evidence the caller has
+at least temporary permissions to perform the requested operation.
+
+Tokens have a limited lifespan.
+They may be renewed, with the client making an IPC/HTTP request of a renewer service.
+This renewal service can also be executed on behalf of the caller by
+some other Hadoop cluster service, such as the YARN Resource Manager.
+
+After use, tokens may be revoked: this relies on services holding tables of
+valid tokens, either in memory or, for any HA service, in Apache Zookeeper or
+similar. Revoking tokens is used to clean up after jobs complete.
+
+Delegation support is tightly integrated with YARN: requests to launch
+containers and applications can include a list of delegation tokens to
+pass along. These tokens are serialized with the request, saved to a file
+on the node launching the container, and then loaded into the credentials
+of the active user. Normally a token for the HDFS cluster is one of the tokens used here,
+added to the credentials through a call to `FileSystem.getDelegationToken()`
+(usually via `FileSystem.addDelegationTokens()`).
+
+Delegation Tokens are also supported with applications such as Hive: a query
+issued to a shared (long-lived) Hive cluster can include the delegation
+tokens required to access specific filesystems *with the rights of the user
+submitting the query*.
+
+All these applications normally only retrieve delegation tokens when security
+is enabled. This is why the cluster configuration needs to enable Kerberos.
+Production Hadoop clusters need Kerberos for security anyway.
+
+
+## S3A Delegation Tokens.
+
+S3A now supports delegation tokens, allowing a caller to acquire tokens
+from a local S3A Filesystem connector instance and pass them on to
+applications to grant them equivalent or restricted access.
+
+These S3A Delegation Tokens are special in that they do not contain
+password-protected data opaque to clients; they contain the secrets needed
+to access the relevant S3 buckets and associated services.
+
+They are obtained by requesting a delegation token from the S3A filesystem client.
+Issued tokens may be included in job submissions, passed to running applications,
+etc. Each token is specific to an individual bucket; all buckets which a client
+wishes to work with must have a separate delegation token issued.
+
+S3A implements Delegation Tokens in its `org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens`
+class, which then supports multiple "bindings" behind it, so supporting
+different variants of S3A Delegation Tokens.
+
+Because applications only collect Delegation Tokens in secure clusters,
+this means that to be able to submit delegation tokens in transient
+cloud-hosted Hadoop clusters, _these clusters must also have Kerberos enabled_.
+
+
+### S3A Session Delegation Tokens
+
+A Session Delegation Token is created by asking the AWS
+[Security Token Service](http://docs.aws.amazon.com/STS/latest/APIReference/Welcome.html)
+to issue an AWS session password and identifier for a limited duration.
+These AWS session credentials are valid until the end of that time period.
+They are marshalled into the S3A Delegation Token.
+
+Other S3A connectors can extract these credentials and use them to
+talk to S3 and related services.
+
+Issued tokens cannot be renewed or revoked.
+
+See [GetSessionToken](http://docs.aws.amazon.com/STS/latest/APIReference/API_GetSessionToken.html)
+for specific details on the (current) token lifespan.
+
+### S3A Role Delegation Tokens
+
+A Role Delegation Token is created by asking the AWS
+[Security Token Service](http://docs.aws.amazon.com/STS/latest/APIReference/Welcome.html)
+for a set of "Assumed Role" credentials, bound to an AWS account-specific role, for a limited duration.
+This role is restricted to granting access only to the S3 bucket, the S3Guard table
+and the relevant KMS keys.
+The credentials are marshalled into the S3A Delegation Token.
+
+Other S3A connectors can extract these credentials and use them to
+talk to S3 and related services.
+They may only work with the explicit AWS resources identified when the token was generated.
+
+Issued tokens cannot be renewed or revoked.
+
+
+### S3A Full-Credential Delegation Tokens
+
+Full Credential Delegation Tokens contain the full AWS login details
+(access key and secret key) needed to access a bucket.
+
+They never expire, so are the equivalent of storing the AWS account credentials
+in a Hadoop, Hive, Spark configuration or similar.
+
+The differences are:
+
+1. They are automatically passed from the client/user to the application.
+A remote application can use them to access data on behalf of the user.
+1. When a remote application destroys the filesystem connector instances and
+tokens of a user, the secrets are destroyed too.
+1. Secrets in the `AWS_` environment variables on the client will be picked up
+and automatically propagated.
+1. They do not use the AWS STS service, so may work against third-party implementations
+of the S3 protocol.
+
+
+## Using S3A Delegation Tokens
+
+A prerequisite to using S3A filesystem delegation tokens is to run with
+Hadoop security enabled —which inevitably means with Kerberos.
+Even though S3A delegation tokens do not use Kerberos, the code in
+applications which fetch DTs is normally only executed when the cluster is
+running in secure mode; somewhere where the `core-site.xml` configuration
+sets `hadoop.security.authentication` to `kerberos` or another valid
+authentication mechanism.
+
+*Without enabling security at this level, delegation tokens will not
+be collected.*
+
+Once Kerberos is enabled, the process for acquiring tokens is as follows:
+
+1. Enable Delegation token support by setting `fs.s3a.delegation.token.binding`
+to the classname of the token binding to use.
+1. Add any other binding-specific settings (STS endpoint, IAM role, etc.)
+1. Make sure the settings are the same in the service as well as the client.
+1. In the client, switch to using a [Hadoop Credential Provider](hadoop-project-dist/hadoop-common/CredentialProviderAPI.html)
+for storing your local credentials, *with a local filesystem store
+ (`localjceks:` or `jceks://file`)*, so as to keep the full secrets out of any
+ job configurations.
+1. Execute the client from a Kerberos-authenticated account, in an
+application configured with the login credentials for an AWS account able to issue session tokens.
+
+### Configuration Parameters
+
+
+| **Key** | **Meaning** | **Default** |
+| --- | --- | --- |
+| `fs.s3a.delegation.token.binding` | delegation token binding class | `` |
+
+### Warnings
+
+##### Use Hadoop Credential Providers to keep secrets out of job configurations.
+
+Hadoop MapReduce jobs copy their client-side configurations with the job.
+If your AWS login secrets are set in an XML file then they are picked up
+and passed in with the job, _even if delegation tokens are used to propagate
+session or role secrets_.
+
+Spark-submit will take any credentials in the `spark-defaults.conf` file
+and again, spread them across the cluster.
+It will also pick up any `AWS_` environment variables and convert them into
+`fs.s3a.access.key`, `fs.s3a.secret.key` and `fs.s3a.session.token` configuration
+options.
+
+To guarantee that the secrets are not passed in, keep your secrets in
+a [hadoop credential provider file on the local filesystem](index.html#hadoop_credential_providers).
+Secrets stored here will not be propagated: the delegation tokens collected
+during job submission will be the sole AWS secrets passed in.
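+
+As an illustrative sketch (the keystore path below is an arbitrary example, not
+a required location), the client-side configuration could point at a local-only
+credential store holding the login secrets:
+
+```xml
+<property>
+  <name>hadoop.security.credential.provider.path</name>
+  <value>localjceks://file/home/alice/.hadoop/s3a-login.jceks</value>
+</property>
+```
+
+Because a `localjceks:` store lives on the local filesystem, its contents are not
+copied into the job configuration shipped with the work.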
+
+##### Token Life
+
+* S3A Delegation tokens cannot be renewed.
+
+* S3A Delegation tokens cannot be revoked. It is possible for an administrator
+to terminate *all AWS sessions using a specific role*
+[from the AWS IAM console](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_temp_control-access_disable-perms.html),
+if desired.
+
+* The lifespan of Session Delegation Tokens is limited to that of AWS sessions:
+a maximum of 36 hours.
+
+* The lifespan of a Role Delegation Token is limited to 1 hour by default;
+a longer duration of up to 12 hours can be enabled in the AWS console for
+the specific role being used.
+
+* The lifespan of Full Delegation tokens is unlimited: the secret needs
+to be reset in the AWS Admin console to revoke it.
+
+##### Service Load on the AWS Secure Token Service
+
+All delegation tokens are issued on a bucket-by-bucket basis: clients
+must request a delegation token from every S3A filesystem to which they desire
+access.
+
+For Session and Role Delegation Tokens, this places load on the AWS STS service,
+which may trigger throttling amongst all users within the same AWS account using
+the same STS endpoint.
+
+* In experiments, a few hundred requests per second are needed to trigger throttling,
+so this is very unlikely to surface in production systems.
+* The S3A filesystem connector retries all throttled requests to AWS services, including STS.
+* Other S3 clients which use the AWS SDK will, if configured, also retry throttled requests.
+
+Overall, the risk of triggering STS throttling appears low, and most applications
+will recover; STS is generally an intermittently used AWS service.
+
+### Enabling Session Delegation Tokens
+
+For session tokens, set `fs.s3a.delegation.token.binding`
+to `org.apache.hadoop.fs.s3a.auth.delegation.SessionTokenBinding`.
+
+
+| **Key** | **Value** |
+| --- | --- |
+| `fs.s3a.delegation.token.binding` | `org.apache.hadoop.fs.s3a.auth.delegation.SessionTokenBinding` |
+
+There are some further configuration options.
+
+| **Key** | **Meaning** | **Default** |
+| --- | --- | --- |
+| `fs.s3a.assumed.role.session.duration` | Duration of delegation tokens | `1h` |
+| `fs.s3a.assumed.role.sts.endpoint` | URL to service issuing tokens | (undefined) |
+| `fs.s3a.assumed.role.sts.endpoint.region` | region for issued tokens | (undefined) |
+
+The XML settings needed to enable session tokens are:
+
+```xml
+<property>
+  <name>fs.s3a.delegation.token.binding</name>
+  <value>org.apache.hadoop.fs.s3a.auth.delegation.SessionTokenBinding</value>
+</property>
+
+<property>
+  <name>fs.s3a.assumed.role.session.duration</name>
+  <value>1h</value>
+</property>
+```
+
+1. If the application requesting a token has full AWS credentials for the
+relevant bucket, then a new session token will be issued.
+1. If the application requesting a token is itself authenticating with
+a session delegation token, then the existing token will be forwarded.
+The life of the token will not be extended.
+1. If the application requesting a token does not have either of these,
+then tokens cannot be issued: the operation will fail with an error.
+
+
+The endpoint for STS requests is set by the same configuration
+property as for the `AssumedRole` credential provider and for Role Delegation
+tokens.
+
+```xml
+<property>
+  <name>fs.s3a.assumed.role.sts.endpoint</name>
+  <value>sts.amazonaws.com</value>
+</property>
+
+<property>
+  <name>fs.s3a.assumed.role.sts.endpoint.region</name>
+  <value>us-west-1</value>
+</property>
+```
+
+If the `fs.s3a.assumed.role.sts.endpoint` option is set to anything
+other than the central `sts.amazonaws.com` endpoint, then the region property
+*must* be set.
+
+
+Both the Session and the Role Delegation Token bindings use the option
+`fs.s3a.aws.credentials.provider` to define the credential providers
+to authenticate to the AWS STS with.
+
+Here is the effective list of providers if none are declared:
+
+```xml
+<property>
+  <name>fs.s3a.aws.credentials.provider</name>
+  <value>
+    org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider,
+    org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider,
+    com.amazonaws.auth.EnvironmentVariableCredentialsProvider,
+    org.apache.hadoop.fs.s3a.auth.IAMInstanceCredentialsProvider
+  </value>
+</property>
+```
+
+Not all these authentication mechanisms provide the full set of credentials
+STS needs. The session token provider will simply forward any session credentials
+it is authenticated with; the role token binding will fail.
+
+#### Forwarding of existing AWS Session credentials.
+
+When the AWS credentials supplied to the Session Delegation Token binding
+through `fs.s3a.aws.credentials.provider` are themselves a set of
+session credentials, the generated delegation tokens will simply contain these
+existing session credentials, not a new set of credentials obtained from STS.
+This is because the STS service does not let
+callers authenticated with session/role credentials request new sessions.
+
+This feature is useful when generating tokens from an EC2 VM instance in one IAM
+role and forwarding them over to VMs which are running in a different IAM role.
+The tokens will grant the permissions of the original VM's IAM role.
+
+The duration of the forwarded tokens will be exactly that of the current set of
+tokens, which may be very limited in lifespan. A warning will appear
+in the logs declaring this.
+
+Note: Role Delegation tokens do not support this forwarding of session credentials,
+because there's no way to explicitly change roles in the process.
+
+
+### Enabling Role Delegation Tokens
+
+For role delegation tokens, set `fs.s3a.delegation.token.binding`
+to `org.apache.hadoop.fs.s3a.auth.delegation.RoleTokenBinding`.
+
+| **Key** | **Value** |
+| --- | --- |
+| `fs.s3a.delegation.token.binding` | `org.apache.hadoop.fs.s3a.auth.delegation.RoleTokenBinding` |
+
+
+There are some further configuration options:
+
+| **Key** | **Meaning** | **Default** |
+| --- | --- | --- |
+| `fs.s3a.assumed.role.session.duration` | Duration of delegation tokens | `1h` |
+| `fs.s3a.assumed.role.arn` | ARN for role to request | (undefined) |
+| `fs.s3a.assumed.role.sts.endpoint.region` | region for issued tokens | (undefined) |
+
+The option `fs.s3a.assumed.role.arn` must be set to a role which the
+user can assume. It must have permissions to access the bucket, any
+associated S3Guard table and any KMS encryption keys. The actual
+requested role will be this role, explicitly restricted to the specific
+bucket and S3Guard table.
+
+The XML settings needed to enable role tokens are:
+
+```xml
+<property>
+  <name>fs.s3a.delegation.token.binding</name>
+  <value>org.apache.hadoop.fs.s3a.auth.delegation.RoleTokenBinding</value>
+</property>
+
+<property>
+  <name>fs.s3a.assumed.role.arn</name>
+  <description>ARN of role to request</description>
+  <value>REQUIRED ARN</value>
+</property>
+
+<property>
+  <name>fs.s3a.assumed.role.session.duration</name>
+  <value>1h</value>
+</property>
+```
+
+A JSON role policy for the role/session will automatically be generated which will
+consist of:
+1. Full access to the S3 bucket for all operations used by the S3A client
+(read, write, list, multipart operations, get bucket location, etc).
+1. Full user access to any S3Guard DynamoDB table used by the bucket.
+1. Full user access to KMS keys. This is to be able to decrypt any data
+in the bucket encrypted with SSE-KMS, as well as encrypt new data if that
+is the encryption policy.
+
+If the client doesn't have S3Guard enabled, but the remote application does,
+the issued role tokens will not have permission to access the S3Guard table.
+
+### Enabling Full Delegation Tokens
+
+This passes the full credentials in, falling back to any session credentials
+which were used to configure the S3A FileSystem instance.
+
+For Full Credential Delegation tokens, set `fs.s3a.delegation.token.binding`
+to `org.apache.hadoop.fs.s3a.auth.delegation.FullCredentialsTokenBinding`.
+
+| **Key** | **Value** |
+| --- | --- |
+| `fs.s3a.delegation.token.binding` | `org.apache.hadoop.fs.s3a.auth.delegation.FullCredentialsTokenBinding` |
+
+There are no other configuration options.
+
+```xml
+<property>
+  <name>fs.s3a.delegation.token.binding</name>
+  <value>org.apache.hadoop.fs.s3a.auth.delegation.FullCredentialsTokenBinding</value>
+</property>
+```
+
+Key points:
+
+1. If the application requesting a token has full AWS credentials for the
+relevant bucket, then a full credential token will be issued.
+1. If the application requesting a token is itself authenticating with
+a session delegation token, then the existing token will be forwarded.
+The life of the token will not be extended.
+1. If the application requesting a token does not have either of these,
+then tokens cannot be issued: the operation will fail with an error.
+
+## Managing the Delegation Tokens Duration
+
+Full Credentials have an unlimited lifespan.
+
+Session and role credentials have a lifespan defined by the duration
+property `fs.s3a.assumed.role.session.duration`.
+
+This can have a maximum value of "36h" for session delegation tokens.
+
+For Role Delegation Tokens, the maximum duration of a token is
+that of the role itself: 1h by default, though this can be changed to
+12h [in the IAM Console](https://console.aws.amazon.com/iam/home#/roles),
+or from the AWS CLI.
+
+*Without increasing the duration of the role, one hour is the maximum value*;
+the error message `The requested DurationSeconds exceeds the MaxSessionDuration set for this role`
+is returned if the requested duration of a Role Delegation Token is greater
+than that available for the role.
+
+
+## Testing Delegation Token Support
+
+The easiest way to test that delegation support is configured is to use
+the `hdfs fetchdt` command, which can fetch tokens from S3A, Azure ABFS
+and any other filesystem which can issue tokens, as well as HDFS itself.
+
+This will fetch the token and save it to the named file (here, `tokens.bin`),
+even if Kerberos is disabled.
+
+```bash
+# Fetch a token for the AWS landsat-pds bucket and save it to tokens.bin
+$ hdfs fetchdt --webservice s3a://landsat-pds/ tokens.bin
+```
+
+If the command fails with `ERROR: Failed to fetch token` it means the
+filesystem does not have delegation tokens enabled.
+
+If it fails for other reasons, the likely causes are configuration problems or
+connectivity to the AWS STS server.
+
+Once collected, the token can be printed. This will show
+the type of token, details about encryption and expiry, and the
+host on which it was created.
+ +```bash +$ bin/hdfs fetchdt --print tokens.bin + +Token (S3ATokenIdentifier{S3ADelegationToken/Session; uri=s3a://landsat-pds; +timestamp=1541683947569; encryption=EncryptionSecrets{encryptionMethod=SSE_S3}; +Created on vm1.local/192.168.99.1 at time 2018-11-08T13:32:26.381Z.}; +Session credentials for user AAABWL expires Thu Nov 08 14:02:27 GMT 2018; (valid)) +for s3a://landsat-pds +``` +The "(valid)" annotation means that the AWS credentials are considered "valid": +there is both a username and a secret. + +You can use the `s3guard bucket-info` command to see what the delegation +support for a specific bucket is. +If delegation support is enabled, it also prints the current +hadoop security level. + +```bash +$ hadoop s3guard bucket-info s3a://landsat-pds/ + +Filesystem s3a://landsat-pds +Location: us-west-2 +Filesystem s3a://landsat-pds is not using S3Guard +The "magic" committer is supported + +S3A Client + Endpoint: fs.s3a.endpoint=s3.amazonaws.com + Encryption: fs.s3a.server-side-encryption-algorithm=none + Input seek policy: fs.s3a.experimental.input.fadvise=normal +Delegation Support enabled: token kind = S3ADelegationToken/Session +Hadoop security mode: SIMPLE +``` + +Although the S3A delegation tokens do not depend upon Kerberos, +MapReduce and other applications only request tokens from filesystems when +security is enabled in Hadoop. + + +## Troubleshooting S3A Delegation Tokens + +The `hadoop s3guard bucket-info` command will print information about +the delegation state of a bucket. + +Consult [troubleshooting Assumed Roles](assumed_roles.html#troubleshooting) +for details on AWS error messages related to AWS IAM roles. + +The [cloudstore](https://github.com/steveloughran/cloudstore) module's StoreDiag +utility can also be used to explore delegation token support + + +### Submitted job cannot authenticate + +There are many causes for this; delegation tokens add some more. + +### Tokens are not issued + + +* This user is not `kinit`-ed in to Kerberos. Use `klist` and +`hadoop kdiag` to see the Kerberos authentication state of the logged in user. +* The filesystem instance on the client has not had a token binding set in +`fs.s3a.delegation.token.binding`, so does not attempt to issue any. +* The job submission is not aware that access to the specific S3 buckets +are required. Review the application's submission mechanism to determine +how to list source and destination paths. For example, for MapReduce, +tokens for the cluster filesystem (`fs.defaultFS`) and all filesystems +referenced as input and output paths will be queried for +delegation tokens. + +For Apache Spark, the cluster filesystem and any filesystems listed in the +property `spark.yarn.access.hadoopFileSystems` are queried for delegation +tokens in secure clusters. +See [Running on Yarn](https://spark.apache.org/docs/latest/running-on-yarn.html). + + +### Error `No AWS login credentials` + +The client does not have any valid credentials to request a token +from the Amazon STS service. + +### Tokens Expire before job completes + +The default duration of session and role tokens as set in +`fs.s3a.assumed.role.session.duration` is one hour, "1h". + +For session tokens, this can be increased to any time up to 36 hours. + +For role tokens, it can be increased up to 12 hours, *but only if +the role is configured in the AWS IAM Console to have a longer lifespan*. 
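+
+As a sketch, a deployment using session tokens whose jobs run for most of a day
+could raise the duration (the value below is only an example; it must stay
+within the 36 hour session limit):
+
+```xml
+<property>
+  <name>fs.s3a.assumed.role.session.duration</name>
+  <value>24h</value>
+</property>
+```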
+ + +### Error `DelegationTokenIOException: Token mismatch` + +``` +org.apache.hadoop.fs.s3a.auth.delegation.DelegationTokenIOException: + Token mismatch: expected token for s3a://example-bucket + of type S3ADelegationToken/Session but got a token of type S3ADelegationToken/Full + + at org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens.lookupToken(S3ADelegationTokens.java:379) + at org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens.selectTokenFromActiveUser(S3ADelegationTokens.java:300) + at org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens.bindToExistingDT(S3ADelegationTokens.java:160) + at org.apache.hadoop.fs.s3a.S3AFileSystem.bindAWSClient(S3AFileSystem.java:423) + at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:265) +``` + +The value of `fs.s3a.delegation.token.binding` is different in the remote +service than in the local client. As a result, the remote service +cannot use the token supplied by the client to authenticate. + +Fix: reference the same token binding class at both ends. + + +### Warning `Forwarding existing session credentials` + +This message is printed when an S3A filesystem instance has been asked +for a Session Delegation Token, and it is itself only authenticated with +a set of AWS session credentials (such as those issued by the IAM metadata +service). + +The created token will contain these existing credentials, credentials which +can be used until the existing session expires. + +The duration of this existing session is unknown: the message is warning +you that it may expire without warning. + +### Error `Cannot issue S3A Role Delegation Tokens without full AWS credentials` + +An S3A filesystem instance has been asked for a Role Delegation Token, +but the instance is only authenticated with session tokens. +This means that a set of role tokens cannot be requested. + +Note: no attempt is made to convert the existing set of session tokens into +a delegation token, unlike the Session Delegation Tokens. This is because +the role of the current session (if any) is unknown. + + +## Implementation Details + +### Architecture + +Concepts: + +1. The S3A FileSystem can create delegation tokens when requested. +1. These can be marshalled as per other Hadoop Delegation Tokens. +1. At the far end, they can be retrieved, unmarshalled and used to authenticate callers. +1. DT binding plugins can then use these directly, or, somehow, +manage authentication and token issue through other services +(for example: Kerberos) +1. Token Renewal and Revocation are not supported. + + +There's support for different back-end token bindings through the +`org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokenManager` + +Every implementation of this must return a subclass of +`org.apache.hadoop.fs.s3a.auth.delegation.AbstractS3ATokenIdentifier` +when asked to create a delegation token; this subclass must be registered +in `META-INF/services/org.apache.hadoop.security.token.TokenIdentifier` +for unmarshalling. + +This identifier must contain all information needed at the far end to +authenticate the caller with AWS services used by the S3A client: AWS S3 and +potentially AWS KMS (for SSE-KMS) and AWS DynamoDB (for S3Guard). + +It must have its own unique *Token Kind*, to ensure that it can be distinguished +from the other token identifiers when tokens are being unmarshalled. 
+
+| Kind | Token class |
+|------|--------------|
+| `S3ADelegationToken/Full` | `org.apache.hadoop.fs.s3a.auth.delegation.FullCredentialsTokenIdentifier` |
+| `S3ADelegationToken/Session` | `org.apache.hadoop.fs.s3a.auth.delegation.SessionTokenIdentifier` |
+| `S3ADelegationToken/Role` | `org.apache.hadoop.fs.s3a.auth.delegation.RoleTokenIdentifier` |
+
+If implementing an external binding:
+
+1. Follow the security requirements below.
+1. Define a new token identifier; there is no requirement for the `S3ADelegationToken/`
+prefix —but it is useful for debugging.
+1. Token Renewal and Revocation are not integrated with the binding mechanism;
+if the operations are supported, implementation is left as an exercise.
+1. Be aware of the stability guarantees of the module "LimitedPrivate/Unstable".
+
+### Security
+
+S3A DTs contain secrets valuable for a limited period (session secrets) or
+long-lived secrets with no explicit time limit.
+
+* The `toString()` operations on token identifiers MUST NOT print secrets; this
+is needed to keep them out of logs.
+* Secrets MUST NOT be logged, even at debug level.
+* Prefer short-lived session secrets over long-term secrets.
+* Try to restrict the permissions to what a client with the delegated token
+  may perform to those needed to access data in the S3 bucket. This potentially
+  includes a DynamoDB table, KMS access, etc.
+* Implementations need to be resistant to attacks which pass in invalid data as
+their token identifier: validate the types of the unmarshalled data; set limits
+on the size of all strings and other arrays to read in, etc.
+
+### Resilience
+
+Implementations need to handle transient failures of any remote authentication
+service, and the risk of a large-cluster startup overloading it.
+
+* All get/renew/cancel operations should be considered idempotent.
+* Clients should retry with backoff & jitter on recoverable connectivity failures.
+* They should fail fast on unrecoverable failures (DNS, authentication).
+
+### Scalability limits of AWS STS service
+
+There is currently no documented rate limit for token requests against the AWS
+STS service.
+
+We have two tests which attempt to generate enough requests for
+delegation tokens that the AWS STS service will throttle requests for
+tokens by that AWS account for that specific STS endpoint
+(`ILoadTestRoleCredentials` and `ILoadTestSessionCredentials`).
+
+In the initial results of these tests:
+
+* A few hundred requests a second can be made before STS blocks the caller.
+* The throttling does not last very long (seconds).
+* It does not appear to affect any other STS endpoints.
+
+If developers wish to experiment with these tests and provide more detailed
+analysis, we would welcome this. Do bear in mind that all users of the
+same AWS account in that region will be throttled. Your colleagues may
+notice, especially if the applications they are running do not retry on
+throttle responses from STS (it's not a common occurrence after all...).
+
+## Implementing your own Delegation Token Binding
+
+The DT binding mechanism is designed to be extensible: if you have an alternate
+authentication mechanism, such as an S3-compatible object store with
+Kerberos support —S3A Delegation tokens should support it.
+
+*If it can't, that's a bug in the implementation which needs to be corrected*.
+
+### Steps
+
+1. Come up with a token "Kind"; a unique name for the delegation token identifier.
+1. Implement a subclass of `AbstractS3ATokenIdentifier` which adds all information which
+is marshalled from client to remote services. This must override the `Writable` methods to read
+and write the data to a data stream: these overrides must call the superclass methods first.
+1. Add a resource `META-INF/services/org.apache.hadoop.security.token.TokenIdentifier`
+and list in it the classname of your new identifier.
+1. Implement a subclass of `AbstractDelegationTokenBinding`.
+
+### Implementing `AbstractS3ATokenIdentifier`
+
+Look at the other examples to see what to do; `SessionTokenIdentifier` does
+most of the work.
+
+Having a `toString()` method which is informative is ideal for the `hdfs creds`
+command as well as debugging: *but do not print secrets*.
+
+*Important*: Add no references to any AWS SDK class, to
+ensure the identifier can be safely deserialized whenever the relevant token
+identifier is examined. Best practice is to avoid any references to
+classes which may not be on the classpath of core Hadoop services,
+especially the YARN Resource Manager and Node Managers.
+
+### `AWSCredentialProviderList deployUnbonded()`
+
+1. Perform all initialization needed on an "unbonded" deployment to authenticate with the store.
+1. Return a list of AWS Credential providers which can be used to authenticate the caller.
+
+**Tip**: consider *not* doing all the checks to verify that DTs can be issued.
+That can be postponed until a DT is issued: in any deployment where a DT is not actually
+needed, failing at this point is overkill. As an example, `RoleTokenBinding` cannot issue
+DTs if it only has a set of session credentials, but it will deploy without them, so allowing
+`hadoop fs` commands to work on an EC2 VM with IAM role credentials.
+
+**Tip**: The class `org.apache.hadoop.fs.s3a.auth.MarshalledCredentials` holds a set of
+marshalled credentials and so can be used within your own Token Identifier if you want
+to include a set of full/session AWS credentials in your token identifier.
+
+### `AWSCredentialProviderList bindToTokenIdentifier(AbstractS3ATokenIdentifier id)`
+
+The identifier passed in will be the one for the current filesystem URI and of your token kind.
+
+1. Use `convertTokenIdentifier` to cast it to your DT type, or fail with a meaningful `IOException`.
+1. Extract the secrets needed to authenticate with the object store (or whatever service issues
+object store credentials).
+1. Return a list of AWS Credential providers which can be used to authenticate the caller with
+the extracted secrets.
+
+### `AbstractS3ATokenIdentifier createEmptyIdentifier()`
+
+Return an empty instance of your token identifier.
+
+### `AbstractS3ATokenIdentifier createTokenIdentifier(Optional policy, EncryptionSecrets secrets)`
+
+Create the delegation token.
+
+If non-empty, the `policy` argument contains an AWS policy model to grant access to:
+
+* The target S3 bucket.
+* Any S3Guard DDB table it is bonded to.
+* KMS key `kms:GenerateDataKey` and `kms:Decrypt` permissions for all KMS keys.
+
+This can be converted to a string and passed to the AWS `assumeRole` operation.
+
+The `secrets` argument contains encryption policy and secrets:
+this should be passed to the superclass constructor as is; it is retrieved and used
+to set the encryption policy on the newly created filesystem.
+
+
+*Tip*: Use `AbstractS3ATokenIdentifier.createDefaultOriginMessage()` to create an initial
+message for the origin of the token —this is useful for diagnostics.
+ + +#### Token Renewal + +There's no support in the design for token renewal; it would be very complex +to make it pluggable, and as all the bundled mechanisms don't support renewal, +untestable and unjustifiable. + +Any token binding which wants to add renewal support will have to implement +it directly. + +### Testing + +Use the tests `org.apache.hadoop.fs.s3a.auth.delegation` as examples. You'll have to +copy and paste some of the test base classes over; `hadoop-common`'s test JAR is published +to Maven Central, but not the S3A one (a fear of leaking AWS credentials). + + +#### Unit Test `TestS3ADelegationTokenSupport` + +This tests marshalling and unmarshalling of tokens identifiers. +*Test that every field is preserved.* + + +#### Integration Test `ITestSessionDelegationTokens` + +Tests the lifecycle of session tokens. + +#### Integration Test `ITestSessionDelegationInFileystem`. + +This collects DTs from one filesystem, and uses that to create a new FS instance and +then perform filesystem operations. A miniKDC is instantiated + +* Take care to remove all login secrets from the environment, so as to make sure that +the second instance is picking up the DT information. +* `UserGroupInformation.reset()` can be used to reset user secrets after every test +case (e.g. teardown), so that issued DTs from one test case do not contaminate the next. +* its subclass, `ITestRoleDelegationInFileystem` adds a check that the current credentials +in the DT cannot be used to access data on other buckets —that is, the active +session really is restricted to the target bucket. + + +#### Integration Test `ITestDelegatedMRJob` + +It's not easy to bring up a YARN cluster with a secure HDFS and miniKDC controller in +test cases —this test, the closest there is to an end-to-end test, +uses mocking to mock the RPC calls to the YARN AM, and then verifies that the tokens +have been collected in the job context, + +#### Load Test `ILoadTestSessionCredentials` + +This attempts to collect many, many delegation tokens simultaneously and sees +what happens. + +Worth doing if you have a new authentication service provider, or +implementing custom DT support. +Consider also something for going from DT to +AWS credentials if this is also implemented by your own service. +This is left as an exercise for the developer. + +**Tip**: don't go overboard here, especially against AWS itself. diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md index 5c409e4d9a499..aec6500fe9860 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md @@ -50,6 +50,8 @@ Please use `s3a:` as the connector to data hosted in S3 with Apache Hadoop.** * [Committing work to S3 with the "S3A Committers"](./committers.html) * [S3A Committers Architecture](./committer_architecture.html) * [Working with IAM Assumed Roles](./assumed_roles.html) +* [S3A Delegation Token Support](./delegation_tokens.html) +* [S3A Delegation Token Architecture](delegation_token_architecture.html). * [Testing](./testing.html) ## Overview @@ -357,15 +359,20 @@ on the hosts/processes where the work is executed. ### Changing Authentication Providers -The standard way to authenticate is with an access key and secret key using the -properties in the configuration file. +The standard way to authenticate is with an access key and secret key set in +the Hadoop configuration files. 
-The S3A client follows the following authentication chain:
+By default, the S3A client follows the following authentication chain:
+1. The options `fs.s3a.access.key`, `fs.s3a.secret.key` and `fs.s3a.session.token`
+are looked for in the Hadoop XML configuration/Hadoop credential providers,
+returning a set of session credentials if all three are defined.
 1. The `fs.s3a.access.key` and `fs.s3a.secret.key` are looked for in the Hadoop
-XML configuration.
+XML configuration/Hadoop credential providers, returning a set of long-lived
+credentials if they are defined.
 1. The [AWS environment variables](http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html#cli-environment),
-are then looked for.
+are then looked for: these will return session or full credentials depending
+on which values are set.
 1. An attempt is made to query the Amazon EC2 Instance Metadata Service to
  retrieve credentials published to EC2 VMs.
 
@@ -381,13 +388,19 @@ AWS Credential Providers are classes which can be used by the Amazon AWS SDK to
 obtain an AWS login from a different source in the system, including
 environment variables, JVM properties and configuration files.
 
-There are three AWS Credential Providers inside the `hadoop-aws` JAR:
+All Hadoop `fs.s3a.` options used to store login details can be secured
+in [Hadoop credential providers](../../../hadoop-project-dist/hadoop-common/CredentialProviderAPI.html);
+this is advised as a more secure way to store valuable secrets.
+
+There are a number of AWS Credential Providers inside the `hadoop-aws` JAR:
 
 | classname | description |
 |-----------|-------------|
 | `org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider`| Session Credentials |
 | `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider`| Simple name/secret credentials |
 | `org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider`| Anonymous Login |
+| `org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider`| [Assumed Role credentials](assumed_roles.html) |
+
 
 There are also many in the Amazon SDKs,
 in particular two which are automatically set up in the
 authentication chain:
@@ -502,10 +515,52 @@ This means that the default S3A authentication chain can be defined as
 
   fs.s3a.aws.credentials.provider
   
-    org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider,
-    com.amazonaws.auth.EnvironmentVariableCredentialsProvider,
-    com.amazonaws.auth.InstanceProfileCredentialsProvider
+    org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider,
+    org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider,
+    com.amazonaws.auth.EnvironmentVariableCredentialsProvider,
+    org.apache.hadoop.fs.s3a.auth.IAMInstanceCredentialsProvider
+  
+  
+    Comma-separated class names of credential provider classes which implement
+    com.amazonaws.auth.AWSCredentialsProvider.
+
+    When S3A delegation tokens are not enabled, this list will be used
+    to directly authenticate with S3 and DynamoDB services.
+    When S3A Delegation tokens are enabled, depending upon the delegation
+    token binding it may be used
+    to communicate with the STS endpoint to request session/role
+    credentials.
+
+    These are loaded and queried in sequence for a valid set of credentials.
+ Each listed class must implement one of the following means of + construction, which are attempted in order: + * a public constructor accepting java.net.URI and + org.apache.hadoop.conf.Configuration, + * a public constructor accepting org.apache.hadoop.conf.Configuration, + * a public static method named getInstance that accepts no + arguments and returns an instance of + com.amazonaws.auth.AWSCredentialsProvider, or + * a public default constructor. + + Specifying org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider allows + anonymous access to a publicly accessible S3 bucket without any credentials. + Please note that allowing anonymous access to an S3 bucket compromises + security and therefore is unsuitable for most use cases. It can be useful + for accessing public data sets without requiring AWS credentials. + + If unspecified, then the default list of credential provider classes, + queried in sequence, is: + * org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider: looks + for session login secrets in the Hadoop configuration. + * org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider: + Uses the values of fs.s3a.access.key and fs.s3a.secret.key. + * com.amazonaws.auth.EnvironmentVariableCredentialsProvider: supports + configuration of AWS access key ID and secret access key in + environment variables named AWS_ACCESS_KEY_ID and + AWS_SECRET_ACCESS_KEY, as documented in the AWS SDK. + * com.amazonaws.auth.InstanceProfileCredentialsProvider: supports use + of instance profile credentials if running in an EC2 VM. + ``` @@ -520,9 +575,6 @@ and significantly damage your organisation. 1. Never commit your secrets into an SCM repository. The [git secrets](https://github.com/awslabs/git-secrets) can help here. -1. Avoid using s3a URLs which have key and secret in the URL. This -is dangerous as the secrets leak into the logs. - 1. Never include AWS credentials in bug reports, files attached to them, or similar. @@ -543,20 +595,23 @@ The command line of any launched program is visible to all users on a Unix syste management: a specific S3A connection can be made with a different assumed role and permissions from the primary user account. -1. Consider a workflow in which usera and applications are issued with short-lived +1. Consider a workflow in which users and applications are issued with short-lived session credentials, configuring S3A to use these through the `TemporaryAWSCredentialsProvider`. 1. Have a secure process in place for cancelling and re-issuing credentials for users and applications. Test it regularly by using it to refresh credentials. +1. In installations where Kerberos is enabled, [S3A Delegation Tokens](delegation_tokens.html) +can be used to acquire short-lived session/role credentials and then pass them +into the shared application. This can ensure that the long-lived secrets stay +on the local system. + When running in EC2, the IAM EC2 instance credential provider will automatically obtain the credentials needed to access AWS services in the role the EC2 VM was deployed as. -This credential provider is enabled in S3A by default. +This AWS credential provider is enabled in S3A by default. -The safest way to keep the AWS login keys a secret within Hadoop is to use -Hadoop Credentials. 
## Storing secrets with Hadoop Credential Providers diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md index 058fb35e259c5..e3f227de220dc 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md @@ -508,10 +508,11 @@ exception. ### Testing Session Credentials -The test `TestS3ATemporaryCredentials` requests a set of temporary -credentials from the STS service, then uses them to authenticate with S3. +Some tests requests a session credentials and assumed role credentials from the +AWS Secure Token Service, then use them to authenticate with S3 either directly +or via delegation tokens. -If an S3 implementation does not support STS, then the functional test +If an S3 implementation does not support STS, then these functional test cases must be disabled: ```xml @@ -519,18 +520,30 @@ cases must be disabled: test.fs.s3a.sts.enabled false + ``` These tests request a temporary set of credentials from the STS service endpoint. -An alternate endpoint may be defined in `test.fs.s3a.sts.endpoint`. +An alternate endpoint may be defined in `fs.s3a.assumed.role.sts.endpoint`. +If this is set, a delegation token region must also be defined: +in `fs.s3a.assumed.role.sts.endpoint.region`. +This is useful not just for testing alternative infrastructures, +but to reduce latency on tests executed away from the central +service. ```xml - test.fs.s3a.sts.endpoint - https://sts.example.org/ + fs.s3a.delegation.token.endpoint + fs.s3a.assumed.role.sts.endpoint + + + fs.s3a.assumed.role.sts.endpoint.region + eu-west-2 ``` -The default is ""; meaning "use the amazon default value". +The default is ""; meaning "use the amazon default endpoint" (`sts.amazonaws.com`). +Consult the [AWS documentation](https://docs.aws.amazon.com/general/latest/gr/rande.html#sts_region) +for the full list of locations. ## Debugging Test failures @@ -1148,16 +1161,25 @@ This is not for use in production. Tests for the AWS Assumed Role credential provider require an assumed role to request. -If this role is not set, the tests which require it will be skipped. +If this role is not declared in `fs.s3a.assumed.role.arn`, +the tests which require it will be skipped. -To run the tests in `ITestAssumeRole`, you need: +The specific tests an Assumed Role ARN is required for are + +- `ITestAssumeRole`. +- `ITestRoleDelegationTokens`. +- One of the parameterized test cases in `ITestDelegatedMRJob`. + +To run these tests you need: 1. A role in your AWS account will full read and write access rights to -the S3 bucket used in the tests, and ideally DynamoDB, for S3Guard. +the S3 bucket used in the tests, DynamoDB, for S3Guard, and KMS for any +SSE-KMS tests. + If your bucket is set up by default to use S3Guard, the role must have access to that service. -1. Your IAM User to have the permissions to adopt that role. +1. Your IAM User to have the permissions to "assume" that role. 1. The role ARN must be set in `fs.s3a.assumed.role.arn`. 
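+
+For example (the ARN below is a placeholder rather than a real role), the test
+configuration could declare the role as:
+
+```xml
+<property>
+  <name>fs.s3a.assumed.role.arn</name>
+  <value>arn:aws:iam::123456789012:role/s3a-test-role</value>
+</property>
+```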
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md index 805c6f723e6ad..3123221bd8293 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md @@ -235,7 +235,23 @@ As an example, the endpoint for S3 Frankfurt is `s3.eu-central-1.amazonaws.com`: ``` -## `AccessDeniedException` "Access Denied" +## "The security token included in the request is invalid" + +You are trying to use session/temporary credentials and the session token +supplied is considered invalid. + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: initTable on bucket: + com.amazonaws.services.dynamodbv2.model.AmazonDynamoDBException: + The security token included in the request is invalid + (Service: AmazonDynamoDBv2; Status Code: 400; Error Code: UnrecognizedClientException) +``` + +This can surface if your configuration is setting the `fs.s3a.secret.key`, +`fs.s3a.access.key` and `fs.s3a.session.key` correctly, but the +AWS credential provider list set in `AWS_CREDENTIALS_PROVIDER` does not include +`org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider`. + ### AccessDeniedException "The AWS Access Key Id you provided does not exist in our records." diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractS3ATestBase.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractS3ATestBase.java index f22af4963596b..484d2dcfb3768 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractS3ATestBase.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractS3ATestBase.java @@ -133,10 +133,10 @@ protected void writeThenReadFile(Path path, int len) throws IOException { * Assert that an exception failed with a specific status code. * @param e exception * @param code expected status code - * @throws AWSS3IOException rethrown if the status code does not match. + * @throws AWSServiceIOException rethrown if the status code does not match. */ - protected void assertStatusCode(AWSS3IOException e, int code) - throws AWSS3IOException { + protected void assertStatusCode(AWSServiceIOException e, int code) + throws AWSServiceIOException { if (e.getStatusCode() != code) { throw e; } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSEKMSUserDefinedKeyBlockOutputStream.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSEKMSUserDefinedKeyBlockOutputStream.java deleted file mode 100644 index c1708305ece5a..0000000000000 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSEKMSUserDefinedKeyBlockOutputStream.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.fs.s3a; - -import static org.apache.hadoop.fs.contract.ContractTestUtils.skip; - -import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.conf.Configuration; - -/** - * Run the encryption tests against the Fast output stream. - * This verifies that both file writing paths can encrypt their data. This - * requires the SERVER_SIDE_ENCRYPTION_KEY to be set in auth-keys.xml for it - * to run. - */ -public class ITestS3AEncryptionSSEKMSUserDefinedKeyBlockOutputStream - extends AbstractTestS3AEncryption { - - @Override - protected Configuration createConfiguration() { - Configuration conf = super.createConfiguration(); - if(StringUtils.isBlank(conf.get(Constants.SERVER_SIDE_ENCRYPTION_KEY))){ - skip(Constants.SERVER_SIDE_ENCRYPTION_KEY+ " is not set for " + - S3AEncryptionMethods.SSE_KMS.getMethod()); - } - conf.set(Constants.FAST_UPLOAD_BUFFER, Constants.FAST_UPLOAD_BYTEBUFFER); - return conf; - } - - @Override - protected S3AEncryptionMethods getSSEAlgorithm() { - return S3AEncryptionMethods.SSE_KMS; - } -} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ATemporaryCredentials.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ATemporaryCredentials.java index afc4086344f3a..a0573c001ea45 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ATemporaryCredentials.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ATemporaryCredentials.java @@ -19,49 +19,89 @@ package org.apache.hadoop.fs.s3a; import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.AccessDeniedException; +import java.time.Duration; +import java.time.OffsetDateTime; +import java.util.concurrent.TimeUnit; +import com.amazonaws.ClientConfiguration; import com.amazonaws.services.securitytoken.AWSSecurityTokenService; import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClientBuilder; -import com.amazonaws.services.securitytoken.model.GetSessionTokenRequest; -import com.amazonaws.services.securitytoken.model.GetSessionTokenResult; import com.amazonaws.services.securitytoken.model.Credentials; +import org.hamcrest.Matchers; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials; import org.apache.hadoop.fs.s3a.auth.STSClientFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.auth.delegation.SessionTokenIdentifier; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.test.LambdaTestUtils; -import org.junit.Test; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import static org.apache.hadoop.fs.contract.ContractTestUtils.*; import static org.apache.hadoop.fs.s3a.Constants.*; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeSessionTestsEnabled; +import static 
org.apache.hadoop.fs.s3a.S3ATestUtils.requestSessionCredentials; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.unsetHadoopCredentialProviders; +import static org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding.fromSTSCredentials; +import static org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding.toAWSCredentials; +import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.assertCredentialsEqual; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.*; +import static org.apache.hadoop.fs.s3a.auth.delegation.SessionTokenBinding.CREDENTIALS_CONVERTED_TO_DELEGATION_TOKEN; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.hamcrest.Matchers.containsString; /** * Tests use of temporary credentials (for example, AWS STS & S3). - * This test extends a class that "does things to the root directory", and - * should only be used against transient filesystems where you don't care about - * the data. + * + * The property {@link Constants#ASSUMED_ROLE_STS_ENDPOINT} can be set to + * point this at different STS endpoints. + * This test will use the AWS credentials (if provided) for + * S3A tests to request temporary credentials, then attempt to use those + * credentials instead. */ public class ITestS3ATemporaryCredentials extends AbstractS3ATestBase { private static final Logger LOG = LoggerFactory.getLogger(ITestS3ATemporaryCredentials.class); - private static final String PROVIDER_CLASS + private static final String TEMPORARY_AWS_CREDENTIALS = TemporaryAWSCredentialsProvider.NAME; private static final long TEST_FILE_SIZE = 1024; + public static final String STS_LONDON = "sts.eu-west-2.amazonaws.com"; + + public static final String EU_IRELAND = "eu-west-1"; + private AWSCredentialProviderList credentials; + @Override + public void setup() throws Exception { + super.setup(); + assumeSessionTestsEnabled(getConfiguration()); + } + @Override public void teardown() throws Exception { S3AUtils.closeAutocloseables(LOG, credentials); super.teardown(); } + @Override + protected Configuration createConfiguration() { + Configuration conf = super.createConfiguration(); + conf.set(DELEGATION_TOKEN_BINDING, + DELEGATION_TOKEN_SESSION_BINDING); + return conf; + } + /** * Test use of STS for requesting temporary credentials. 
* @@ -75,9 +115,6 @@ public void teardown() throws Exception { @Test public void testSTS() throws IOException { Configuration conf = getContract().getConf(); - if (!conf.getBoolean(TEST_STS_ENABLED, true)) { - skip("STS functional tests disabled"); - } S3AFileSystem testFS = getFileSystem(); credentials = testFS.shareCredentials("testSTS"); @@ -86,18 +123,15 @@ public void testSTS() throws IOException { conf, bucket, credentials, - conf.getTrimmed(TEST_STS_ENDPOINT, ""), ""); - AWSSecurityTokenService stsClient = builder.build(); - - if (!conf.getTrimmed(TEST_STS_ENDPOINT, "").isEmpty()) { - LOG.debug("STS Endpoint ={}", conf.getTrimmed(TEST_STS_ENDPOINT, "")); - stsClient.setEndpoint(conf.getTrimmed(TEST_STS_ENDPOINT, "")); - } - GetSessionTokenRequest sessionTokenRequest = new GetSessionTokenRequest(); - sessionTokenRequest.setDurationSeconds(900); - GetSessionTokenResult sessionTokenResult; - sessionTokenResult = stsClient.getSessionToken(sessionTokenRequest); - Credentials sessionCreds = sessionTokenResult.getCredentials(); + getStsEndpoint(conf), + getStsRegion(conf)); + STSClientFactory.STSClient clientConnection = + STSClientFactory.createClientConnection( + builder.build(), + new Invoker(new S3ARetryPolicy(conf), Invoker.LOG_EVENT)); + Credentials sessionCreds = clientConnection + .requestSessionCredentials(TEST_SESSION_TOKEN_DURATION_SECONDS, + TimeUnit.SECONDS); // clone configuration so changes here do not affect the base FS. Configuration conf2 = new Configuration(conf); @@ -106,11 +140,10 @@ public void testSTS() throws IOException { S3AUtils.clearBucketOption(conf2, bucket, SECRET_KEY); S3AUtils.clearBucketOption(conf2, bucket, SESSION_TOKEN); - conf2.set(ACCESS_KEY, sessionCreds.getAccessKeyId()); - conf2.set(SECRET_KEY, sessionCreds.getSecretAccessKey()); - conf2.set(SESSION_TOKEN, sessionCreds.getSessionToken()); + MarshalledCredentials mc = fromSTSCredentials(sessionCreds); + updateConfigWithSessionCreds(conf2, mc); - conf2.set(AWS_CREDENTIALS_PROVIDER, PROVIDER_CLASS); + conf2.set(AWS_CREDENTIALS_PROVIDER, TEMPORARY_AWS_CREDENTIALS); // with valid credentials, we can set properties. try(S3AFileSystem fs = S3ATestUtils.createTestFileSystem(conf2)) { @@ -130,6 +163,16 @@ public void testSTS() throws IOException { } } + protected String getStsEndpoint(final Configuration conf) { + return conf.getTrimmed(ASSUMED_ROLE_STS_ENDPOINT, + DEFAULT_ASSUMED_ROLE_STS_ENDPOINT); + } + + protected String getStsRegion(final Configuration conf) { + return conf.getTrimmed(ASSUMED_ROLE_STS_ENDPOINT_REGION, + ASSUMED_ROLE_STS_ENDPOINT_REGION_DEFAULT); + } + @Test public void testTemporaryCredentialValidation() throws Throwable { Configuration conf = new Configuration(); @@ -139,4 +182,265 @@ public void testTemporaryCredentialValidation() throws Throwable { LambdaTestUtils.intercept(CredentialInitializationException.class, () -> new TemporaryAWSCredentialsProvider(conf).getCredentials()); } + + /** + * Test that session tokens are propagated, with the origin string + * declaring this. 
+ */ + @Test + public void testSessionTokenPropagation() throws Exception { + Configuration conf = new Configuration(getContract().getConf()); + MarshalledCredentials sc = requestSessionCredentials(conf, + getFileSystem().getBucket()); + updateConfigWithSessionCreds(conf, sc); + conf.set(AWS_CREDENTIALS_PROVIDER, TEMPORARY_AWS_CREDENTIALS); + + try (S3AFileSystem fs = S3ATestUtils.createTestFileSystem(conf)) { + createAndVerifyFile(fs, path("testSTS"), TEST_FILE_SIZE); + SessionTokenIdentifier identifier + = (SessionTokenIdentifier) fs.getDelegationToken("") + .decodeIdentifier(); + String ids = identifier.toString(); + assertThat("origin in " + ids, + identifier.getOrigin(), + containsString(CREDENTIALS_CONVERTED_TO_DELEGATION_TOKEN)); + + // and validate the AWS bits to make sure everything has come across. + assertCredentialsEqual("Reissued credentials in " + ids, + sc, + identifier.getMarshalledCredentials()); + } + } + + /** + * Examine the returned expiry time and validate it against expectations. + * Allows for some flexibility in local clock, but not much. + */ + @Test + public void testSessionTokenExpiry() throws Exception { + Configuration conf = new Configuration(getContract().getConf()); + MarshalledCredentials sc = requestSessionCredentials(conf, + getFileSystem().getBucket()); + long permittedExpiryOffset = 60; + OffsetDateTime expirationTimestamp = sc.getExpirationDateTime().get(); + OffsetDateTime localTimestamp = OffsetDateTime.now(); + assertTrue("local time of " + localTimestamp + + " is after expiry time of " + expirationTimestamp, + localTimestamp.isBefore(expirationTimestamp)); + + // what is the interval + Duration actualDuration = Duration.between(localTimestamp, + expirationTimestamp); + Duration offset = actualDuration.minus(TEST_SESSION_TOKEN_DURATION); + + assertThat( + "Duration of session " + actualDuration + + " out of expected range of with " + offset + + " this host's clock may be wrong.", + offset.getSeconds(), + Matchers.lessThanOrEqualTo(permittedExpiryOffset)); + } + + protected void updateConfigWithSessionCreds(final Configuration conf, + final MarshalledCredentials sc) { + unsetHadoopCredentialProviders(conf); + sc.setSecretsInConfiguration(conf); + } + + /** + * Create an invalid session token and verify that it is rejected. + */ + @Test + public void testInvalidSTSBinding() throws Exception { + Configuration conf = new Configuration(getContract().getConf()); + + MarshalledCredentials sc = requestSessionCredentials(conf, + getFileSystem().getBucket()); + toAWSCredentials(sc, + MarshalledCredentials.CredentialTypeRequired.AnyNonEmpty, ""); + updateConfigWithSessionCreds(conf, sc); + + conf.set(AWS_CREDENTIALS_PROVIDER, TEMPORARY_AWS_CREDENTIALS); + conf.set(SESSION_TOKEN, "invalid-" + sc.getSessionToken()); + S3AFileSystem fs = null; + + try { + // this may throw an exception, which is an acceptable outcome. + // it must be in the try/catch clause. 
+ fs = S3ATestUtils.createTestFileSystem(conf); + Path path = path("testSTSInvalidToken"); + createAndVerifyFile(fs, + path, + TEST_FILE_SIZE); + // this is a failure path, so fail with a meaningful error + fail("request to create a file should have failed"); + } catch (AWSBadRequestException expected){ + // likely at two points in the operation, depending on + // S3Guard state + } finally { + IOUtils.closeStream(fs); + } + } + + + @Test + public void testSessionCredentialsBadRegion() throws Throwable { + describe("Create a session with a bad region and expect failure"); + expectedSessionRequestFailure( + IllegalArgumentException.class, + DEFAULT_DELEGATION_TOKEN_ENDPOINT, + "us-west-12", + ""); + } + + @Test + public void testSessionCredentialsWrongRegion() throws Throwable { + describe("Create a session with the wrong region and expect failure"); + expectedSessionRequestFailure( + AccessDeniedException.class, + STS_LONDON, + EU_IRELAND, + ""); + } + + @Test + public void testSessionCredentialsWrongCentralRegion() throws Throwable { + describe("Create a session sts.amazonaws.com; region='us-west-1'"); + expectedSessionRequestFailure( + IllegalArgumentException.class, + "sts.amazonaws.com", + "us-west-1", + ""); + } + + @Test + public void testSessionCredentialsRegionNoEndpoint() throws Throwable { + describe("Create a session with a bad region and expect fast failure"); + expectedSessionRequestFailure( + IllegalArgumentException.class, + "", + EU_IRELAND, + EU_IRELAND); + } + + @Test + public void testSessionCredentialsRegionBadEndpoint() throws Throwable { + describe("Create a session with a bad region and expect fast failure"); + IllegalArgumentException ex + = expectedSessionRequestFailure( + IllegalArgumentException.class, + " ", + EU_IRELAND, + ""); + LOG.info("Outcome: ", ex); + if (!(ex.getCause() instanceof URISyntaxException)) { + throw ex; + } + } + + @Test + public void testSessionCredentialsEndpointNoRegion() throws Throwable { + expectedSessionRequestFailure( + IllegalArgumentException.class, + STS_LONDON, + "", + STS_LONDON); + } + + /** + * Expect an attempt to create a session or request credentials to fail + * with a specific exception class, optionally text. + * @param clazz exact class of exception. + * @param endpoint value for the sts endpoint option. + * @param region signing region. + * @param exceptionText text or "" in the exception. + * @param type of exception. + * @return the caught exception. + * @throws Exception any unexpected exception. + */ + public E expectedSessionRequestFailure( + final Class clazz, + final String endpoint, + final String region, + final String exceptionText) throws Exception { + try(AWSCredentialProviderList parentCreds = + getFileSystem().shareCredentials("test"); + DurationInfo ignored = new DurationInfo(LOG, "requesting credentials")) { + Configuration conf = new Configuration(getContract().getConf()); + ClientConfiguration awsConf = + S3AUtils.createAwsConf(conf, null); + return intercept(clazz, exceptionText, + () -> { + AWSSecurityTokenService tokenService = + STSClientFactory.builder(parentCreds, + awsConf, + endpoint, + region) + .build(); + Invoker invoker = new Invoker(new S3ARetryPolicy(conf), + LOG_AT_ERROR); + + STSClientFactory.STSClient stsClient + = STSClientFactory.createClientConnection(tokenService, + invoker); + + return stsClient.requestSessionCredentials(30, TimeUnit.MINUTES); + }); + } + } + + /** + * Log retries at debug. 
+ */ + public static final Invoker.Retried LOG_AT_ERROR = + (text, exception, retries, idempotent) -> { + LOG.error("{}", text, exception); + }; + + @Test + public void testTemporaryCredentialValidationOnLoad() throws Throwable { + Configuration conf = new Configuration(); + unsetHadoopCredentialProviders(conf); + conf.set(ACCESS_KEY, "aaa"); + conf.set(SECRET_KEY, "bbb"); + conf.set(SESSION_TOKEN, ""); + final MarshalledCredentials sc = MarshalledCredentialBinding.fromFileSystem( + null, conf); + intercept(IOException.class, + MarshalledCredentials.INVALID_CREDENTIALS, + () -> { + sc.validate("", + MarshalledCredentials.CredentialTypeRequired.SessionOnly); + return sc.toString(); + }); + } + @Test + public void testEmptyTemporaryCredentialValidation() throws Throwable { + Configuration conf = new Configuration(); + unsetHadoopCredentialProviders(conf); + conf.set(ACCESS_KEY, ""); + conf.set(SECRET_KEY, ""); + conf.set(SESSION_TOKEN, ""); + final MarshalledCredentials sc = MarshalledCredentialBinding.fromFileSystem( + null, conf); + intercept(IOException.class, + MarshalledCredentialBinding.NO_AWS_CREDENTIALS, + () -> { + sc.validate("", + MarshalledCredentials.CredentialTypeRequired.SessionOnly); + return sc.toString(); + }); + } + + /** + * Verify that the request mechanism is translating exceptions. + * @throws Exception on a failure + */ + @Test + public void testSessionRequestExceptionTranslation() throws Exception { + intercept(IOException.class, + () -> requestSessionCredentials(getConfiguration(), + getFileSystem().getBucket(), 10)); + } + } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3AFileSystem.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3AFileSystem.java index 1062a12077954..51ff299e7be08 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3AFileSystem.java @@ -37,6 +37,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.fs.s3a.auth.delegation.EncryptionSecrets; import org.apache.hadoop.fs.s3a.commit.staging.StagingTestBase; import org.apache.hadoop.util.Progressable; @@ -71,6 +72,8 @@ public class MockS3AFileSystem extends S3AFileSystem { /** Log the entire stack of where operations are called: {@value}. */ public static final int LOG_STACK = 2; + private final Path root; + /** * This can be edited to set the log level of events through the * mock FS. @@ -85,8 +88,10 @@ public MockS3AFileSystem(S3AFileSystem mock, Pair outcome) { this.mock = mock; this.outcome = outcome; - setUri(FS_URI); + setUri(FS_URI, false); setBucket(BUCKET); + setEncryptionSecrets(new EncryptionSecrets()); + root = new Path(FS_URI.toString()); } public Pair @@ -118,9 +123,19 @@ private void event(String format, Object... 
args) { } } + @Override + public URI getUri() { + return FS_URI; + } + @Override public Path getWorkingDirectory() { - return new Path("s3a://" + BUCKET + "/work"); + return new Path(root, "work"); + } + + @Override + public Path qualify(final Path path) { + return path.makeQualified(FS_URI, getWorkingDirectory()); } @Override diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3ClientFactory.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3ClientFactory.java index dbf228d4c7f33..0403d36c69022 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3ClientFactory.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3ClientFactory.java @@ -37,7 +37,8 @@ public class MockS3ClientFactory implements S3ClientFactory { @Override public AmazonS3 createS3Client(URI name, final String bucket, - final AWSCredentialsProvider credentialSet) { + final AWSCredentialsProvider credentialSet, + final String userAgentSuffix) { AmazonS3 s3 = mock(AmazonS3.class); when(s3.doesBucketExist(bucket)).thenReturn(true); // this listing is used in startup if purging is enabled, so diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java index ce2a98ecb232f..5f28c3012e788 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestConstants.java @@ -18,6 +18,8 @@ package org.apache.hadoop.fs.s3a; +import java.time.Duration; + /** * Constants for S3A Testing. */ @@ -137,6 +139,12 @@ public interface S3ATestConstants { */ String TEST_UNIQUE_FORK_ID = "test.unique.fork.id"; String TEST_STS_ENABLED = "test.fs.s3a.sts.enabled"; + + /** + * Endpoint for STS testing. + * @deprecated : Use {@link Constants#ASSUMED_ROLE_STS_ENDPOIN} + */ + @Deprecated String TEST_STS_ENDPOINT = "test.fs.s3a.sts.endpoint"; /** @@ -173,4 +181,16 @@ public interface S3ATestConstants { */ String FS_S3A_IMPL_DISABLE_CACHE = "fs.s3a.impl.disable.cache"; + + /** + * Duration in seconds for role/session token requests: {@value}. + */ + int TEST_SESSION_TOKEN_DURATION_SECONDS = 900; + + /** + * Test session duration as a java 8 Duration. 
+ */ + Duration TEST_SESSION_TOKEN_DURATION = Duration.ofSeconds( + TEST_SESSION_TOKEN_DURATION_SECONDS); + } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java index 097b482d334a0..484f079e3e6d6 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java @@ -47,6 +47,7 @@ import org.hamcrest.core.Is; import org.junit.Assert; import org.junit.Assume; +import org.junit.internal.AssumptionViolatedException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -62,7 +63,6 @@ import java.util.concurrent.Callable; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_CREDENTIAL_PROVIDER_PATH; -import static org.apache.commons.lang3.StringUtils.isNotEmpty; import static org.apache.hadoop.fs.contract.ContractTestUtils.skip; import static org.apache.hadoop.fs.s3a.FailureInjectionPolicy.*; import static org.apache.hadoop.fs.s3a.S3ATestConstants.*; @@ -144,6 +144,7 @@ public static S3AFileSystem createTestFileSystem(Configuration conf) * @param purge flag to enable Multipart purging * @return the FS * @throws IOException IO Problems + * @throws AssumptionViolatedException if the FS is not named */ public static S3AFileSystem createTestFileSystem(Configuration conf, boolean purge) @@ -157,10 +158,12 @@ public static S3AFileSystem createTestFileSystem(Configuration conf, testURI = URI.create(fsname); liveTest = testURI.getScheme().equals(Constants.FS_S3A); } - // This doesn't work with our JUnit 3 style test cases, so instead we'll - // make this whole class not run by default - Assume.assumeTrue("No test filesystem in " + TEST_FS_S3A_NAME, - liveTest); + if (!liveTest) { + // This doesn't work with our JUnit 3 style test cases, so instead we'll + // make this whole class not run by default + throw new AssumptionViolatedException( + "No test filesystem in " + TEST_FS_S3A_NAME); + } // patch in S3Guard options maybeEnableS3Guard(conf); S3AFileSystem fs1 = new S3AFileSystem(); @@ -189,6 +192,7 @@ public static void enableMultipartPurge(Configuration conf, int seconds) { * @param conf configuration * @return the FS * @throws IOException IO Problems + * @throws AssumptionViolatedException if the FS is not named */ public static FileContext createTestFileContext(Configuration conf) throws IOException { @@ -200,10 +204,12 @@ public static FileContext createTestFileContext(Configuration conf) testURI = URI.create(fsname); liveTest = testURI.getScheme().equals(Constants.FS_S3A); } - // This doesn't work with our JUnit 3 style test cases, so instead we'll - // make this whole class not run by default - Assume.assumeTrue("No test filesystem in " + TEST_FS_S3A_NAME, - liveTest); + if (!liveTest) { + // This doesn't work with our JUnit 3 style test cases, so instead we'll + // make this whole class not run by default + throw new AssumptionViolatedException("No test filesystem in " + + TEST_FS_S3A_NAME); + } // patch in S3Guard options maybeEnableS3Guard(conf); FileContext fc = FileContext.getFileContext(testURI, conf); @@ -321,56 +327,10 @@ public static String getTestProperty(Configuration conf, String defVal) { String confVal = conf != null ? 
conf.getTrimmed(key, defVal) : defVal; String propval = System.getProperty(key); - return isNotEmpty(propval) && !UNSET_PROPERTY.equals(propval) + return StringUtils.isNotEmpty(propval) && !UNSET_PROPERTY.equals(propval) ? propval : confVal; } - /** - * Get the test CSV file; assume() that it is not empty. - * @param conf test configuration - * @return test file. - */ - public static String getCSVTestFile(Configuration conf) { - String csvFile = conf - .getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE); - Assume.assumeTrue("CSV test file is not the default", - isNotEmpty(csvFile)); - return csvFile; - } - - /** - * Get the test CSV path; assume() that it is not empty. - * @param conf test configuration - * @return test file as a path. - */ - public static Path getCSVTestPath(Configuration conf) { - return new Path(getCSVTestFile(conf)); - } - - /** - * Get the test CSV file; assume() that it is not modified (i.e. we haven't - * switched to a new storage infrastructure where the bucket is no longer - * read only). - * @return test file. - * @param conf test configuration - */ - public static String getLandsatCSVFile(Configuration conf) { - String csvFile = getCSVTestFile(conf); - Assume.assumeTrue("CSV test file is not the default", - DEFAULT_CSVTEST_FILE.equals(csvFile)); - return csvFile; - } - /** - * Get the test CSV file; assume() that it is not modified (i.e. we haven't - * switched to a new storage infrastructure where the bucket is no longer - * read only). - * @param conf test configuration - * @return test file as a path. - */ - public static Path getLandsatCSVPath(Configuration conf) { - return new Path(getLandsatCSVFile(conf)); - } - /** * Verify the class of an exception. If it is not as expected, rethrow it. * Comparison is on the exact class, not subclass-of inference as @@ -1168,12 +1128,9 @@ public static void skipDuringFaultInjection(S3AFileSystem fs) { * Skip a test if the FS isn't marked as supporting magic commits. * @param fs filesystem */ - public static void assumeMagicCommitEnabled(S3AFileSystem fs) - throws IOException { + public static void assumeMagicCommitEnabled(S3AFileSystem fs) { assume("Magic commit option disabled on " + fs, - fs.hasPathCapability( - fs.getWorkingDirectory(), - CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER)); + fs.hasCapability(CommitConstants.STORE_CAPABILITY_MAGIC_COMMITTER)); } /** diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java index e7f836be728a6..3822ee781dcc8 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java @@ -23,12 +23,14 @@ import java.net.URI; import java.nio.file.AccessDeniedException; import java.util.Arrays; +import java.util.Collections; import java.util.List; import com.amazonaws.auth.AWSCredentials; import com.amazonaws.auth.AWSCredentialsProvider; import com.amazonaws.auth.EnvironmentVariableCredentialsProvider; import com.amazonaws.auth.InstanceProfileCredentialsProvider; +import com.google.common.collect.Sets; import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; @@ -51,18 +53,24 @@ */ public class TestS3AAWSCredentialsProvider { + /** + * URI of the landsat images. 
+ */ + private static final URI TESTFILE_URI = new Path( + DEFAULT_CSVTEST_FILE).toUri(); + @Rule public ExpectedException exception = ExpectedException.none(); @Test public void testProviderWrongClass() throws Exception { - expectProviderInstantiationFailure(this.getClass().getName(), + expectProviderInstantiationFailure(this.getClass(), NOT_AWS_PROVIDER); } @Test public void testProviderAbstractClass() throws Exception { - expectProviderInstantiationFailure(AbstractProvider.class.getName(), + expectProviderInstantiationFailure(AbstractProvider.class, ABSTRACT_PROVIDER); } @@ -75,30 +83,29 @@ public void testProviderNotAClass() throws Exception { @Test public void testProviderConstructorError() throws Exception { expectProviderInstantiationFailure( - ConstructorSignatureErrorProvider.class.getName(), + ConstructorSignatureErrorProvider.class, CONSTRUCTOR_EXCEPTION); } @Test public void testProviderFailureError() throws Exception { expectProviderInstantiationFailure( - ConstructorFailureProvider.class.getName(), + ConstructorFailureProvider.class, INSTANTIATION_EXCEPTION); } @Test public void testInstantiationChain() throws Throwable { - Configuration conf = new Configuration(); + Configuration conf = new Configuration(false); conf.set(AWS_CREDENTIALS_PROVIDER, TemporaryAWSCredentialsProvider.NAME + ", \t" + SimpleAWSCredentialsProvider.NAME + " ,\n " + AnonymousAWSCredentialsProvider.NAME); Path testFile = getCSVTestPath(conf); - URI uri = testFile.toUri(); - AWSCredentialProviderList list = S3AUtils.createAWSCredentialProviderSet( - uri, conf); - List> expectedClasses = + AWSCredentialProviderList list = createAWSCredentialProviderSet( + testFile.toUri(), conf); + List> expectedClasses = Arrays.asList( TemporaryAWSCredentialsProvider.class, SimpleAWSCredentialsProvider.class, @@ -109,60 +116,73 @@ public void testInstantiationChain() throws Throwable { @Test public void testDefaultChain() throws Exception { URI uri1 = new URI("s3a://bucket1"), uri2 = new URI("s3a://bucket2"); - Configuration conf = new Configuration(); + Configuration conf = new Configuration(false); // use the default credential provider chain conf.unset(AWS_CREDENTIALS_PROVIDER); - AWSCredentialProviderList list1 = S3AUtils.createAWSCredentialProviderSet( + AWSCredentialProviderList list1 = createAWSCredentialProviderSet( uri1, conf); - AWSCredentialProviderList list2 = S3AUtils.createAWSCredentialProviderSet( + AWSCredentialProviderList list2 = createAWSCredentialProviderSet( uri2, conf); - List> expectedClasses = - Arrays.asList( - SimpleAWSCredentialsProvider.class, - EnvironmentVariableCredentialsProvider.class, - InstanceProfileCredentialsProvider.class); + List> expectedClasses = STANDARD_AWS_PROVIDERS; assertCredentialProviders(expectedClasses, list1); assertCredentialProviders(expectedClasses, list2); - assertSameInstanceProfileCredentialsProvider(list1.getProviders().get(2), - list2.getProviders().get(2)); + } + + @Test + public void testDefaultChainNoURI() throws Exception { + Configuration conf = new Configuration(false); + // use the default credential provider chain + conf.unset(AWS_CREDENTIALS_PROVIDER); + assertCredentialProviders(STANDARD_AWS_PROVIDERS, + createAWSCredentialProviderSet(null, conf)); } @Test public void testConfiguredChain() throws Exception { URI uri1 = new URI("s3a://bucket1"), uri2 = new URI("s3a://bucket2"); - Configuration conf = new Configuration(); - List> expectedClasses = + List> expectedClasses = Arrays.asList( EnvironmentVariableCredentialsProvider.class, 
InstanceProfileCredentialsProvider.class, AnonymousAWSCredentialsProvider.class); - conf.set(AWS_CREDENTIALS_PROVIDER, buildClassListString(expectedClasses)); - AWSCredentialProviderList list1 = S3AUtils.createAWSCredentialProviderSet( + Configuration conf = + createProviderConfiguration(buildClassListString(expectedClasses)); + AWSCredentialProviderList list1 = createAWSCredentialProviderSet( uri1, conf); - AWSCredentialProviderList list2 = S3AUtils.createAWSCredentialProviderSet( + AWSCredentialProviderList list2 = createAWSCredentialProviderSet( uri2, conf); assertCredentialProviders(expectedClasses, list1); assertCredentialProviders(expectedClasses, list2); - assertSameInstanceProfileCredentialsProvider(list1.getProviders().get(1), - list2.getProviders().get(1)); } @Test public void testConfiguredChainUsesSharedInstanceProfile() throws Exception { URI uri1 = new URI("s3a://bucket1"), uri2 = new URI("s3a://bucket2"); - Configuration conf = new Configuration(); - List> expectedClasses = - Arrays.>asList( + Configuration conf = new Configuration(false); + List> expectedClasses = + Arrays.asList( InstanceProfileCredentialsProvider.class); conf.set(AWS_CREDENTIALS_PROVIDER, buildClassListString(expectedClasses)); - AWSCredentialProviderList list1 = S3AUtils.createAWSCredentialProviderSet( + AWSCredentialProviderList list1 = createAWSCredentialProviderSet( uri1, conf); - AWSCredentialProviderList list2 = S3AUtils.createAWSCredentialProviderSet( + AWSCredentialProviderList list2 = createAWSCredentialProviderSet( uri2, conf); assertCredentialProviders(expectedClasses, list1); assertCredentialProviders(expectedClasses, list2); - assertSameInstanceProfileCredentialsProvider(list1.getProviders().get(0), - list2.getProviders().get(0)); + } + + @Test + public void testFallbackToDefaults() throws Throwable { + // build up the base provider + final AWSCredentialProviderList credentials = buildAWSProviderList( + new URI("s3a://bucket1"), + createProviderConfiguration(" "), + ASSUMED_ROLE_CREDENTIALS_PROVIDER, + Arrays.asList( + EnvironmentVariableCredentialsProvider.class), + Sets.newHashSet()); + assertTrue("empty credentials", credentials.size() > 0); + } /** @@ -212,14 +232,110 @@ public void refresh() { } } + @Test + public void testAWSExceptionTranslation() throws Throwable { + IOException ex = expectProviderInstantiationFailure( + AWSExceptionRaisingFactory.class, + AWSExceptionRaisingFactory.NO_AUTH); + if (!(ex instanceof AccessDeniedException)) { + throw ex; + } + } + + static class AWSExceptionRaisingFactory implements AWSCredentialsProvider { + + public static final String NO_AUTH = "No auth"; + + public static AWSCredentialsProvider getInstance() { + throw new NoAuthWithAWSException(NO_AUTH); + } + + @Override + public AWSCredentials getCredentials() { + return null; + } + + @Override + public void refresh() { + + } + } + + @Test + public void testFactoryWrongType() throws Throwable { + expectProviderInstantiationFailure( + FactoryOfWrongType.class, + CONSTRUCTOR_EXCEPTION); + } + + static class FactoryOfWrongType implements AWSCredentialsProvider { + + public static final String NO_AUTH = "No auth"; + + public static String getInstance() { + return "oops"; + } + + @Override + public AWSCredentials getCredentials() { + return null; + } + + @Override + public void refresh() { + + } + } + + /** + * Expect a provider to raise an exception on failure. + * @param option aws provider option string. 
+ * @param expectedErrorText error text to expect + * @return the exception raised + * @throws Exception any unexpected exception thrown. + */ private IOException expectProviderInstantiationFailure(String option, String expectedErrorText) throws Exception { - Configuration conf = new Configuration(); - conf.set(AWS_CREDENTIALS_PROVIDER, option); - Path testFile = new Path( - conf.getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE)); return intercept(IOException.class, expectedErrorText, - () -> S3AUtils.createAWSCredentialProviderSet(testFile.toUri(), conf)); + () -> createAWSCredentialProviderSet( + TESTFILE_URI, + createProviderConfiguration(option))); + } + + /** + * Expect a provider to raise an exception on failure. + * @param aClass class to use + * @param expectedErrorText error text to expect + * @return the exception raised + * @throws Exception any unexpected exception thrown. + */ + private IOException expectProviderInstantiationFailure(Class aClass, + String expectedErrorText) throws Exception { + return expectProviderInstantiationFailure( + buildClassListString(Collections.singletonList(aClass)), + expectedErrorText); + } + + /** + * Create a configuration with a specific provider. + * @param providerOption option for the aws credential provider option. + * @return a configuration to use in test cases + */ + private Configuration createProviderConfiguration( + final String providerOption) { + Configuration conf = new Configuration(false); + conf.set(AWS_CREDENTIALS_PROVIDER, providerOption); + return conf; + } + + /** + * Create a configuration with a specific class. + * @param aClass class to use + * @return a configuration to use in test cases + */ + public Configuration createProviderConfiguration(final Class aClass) { + return createProviderConfiguration(buildClassListString( + Collections.singletonList(aClass))); } /** @@ -228,13 +344,13 @@ private IOException expectProviderInstantiationFailure(String option, * @param list providers to check */ private static void assertCredentialProviders( - List> expectedClasses, + List> expectedClasses, AWSCredentialProviderList list) { assertNotNull(list); List providers = list.getProviders(); assertEquals(expectedClasses.size(), providers.size()); for (int i = 0; i < expectedClasses.size(); ++i) { - Class expectedClass = + Class expectedClass = expectedClasses.get(i); AWSCredentialsProvider provider = providers.get(i); assertNotNull( @@ -247,23 +363,6 @@ private static void assertCredentialProviders( } } - /** - * Asserts that two different references point to the same shared instance of - * InstanceProfileCredentialsProvider using a descriptive assertion message. - * @param provider1 provider to check - * @param provider2 provider to check - */ - private static void assertSameInstanceProfileCredentialsProvider( - AWSCredentialsProvider provider1, AWSCredentialsProvider provider2) { - assertNotNull(provider1); - assertInstanceOf(InstanceProfileCredentialsProvider.class, provider1); - assertNotNull(provider2); - assertInstanceOf(InstanceProfileCredentialsProvider.class, provider2); - assertSame("Expected all usage of InstanceProfileCredentialsProvider to " - + "share a singleton instance, but found unique instances.", - provider1, provider2); - } - /** * This is here to check up on the S3ATestUtils probes themselves. * @see S3ATestUtils#authenticationContains(Configuration, String). 
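The two factory cases above (AWSExceptionRaisingFactory and FactoryOfWrongType) exercise the static getInstance() entry point which the S3A provider-loading code accepts as an alternative to a constructor. A minimal sketch of a well-formed provider built around that entry point follows; the class name and the hard-coded key strings are placeholders, not part of this patch:

    import com.amazonaws.auth.AWSCredentials;
    import com.amazonaws.auth.AWSCredentialsProvider;
    import com.amazonaws.auth.BasicAWSCredentials;

    /**
     * Illustrative provider: created through a public static getInstance()
     * factory method rather than a constructor.
     */
    public class StaticFactoryCredentialsProvider
        implements AWSCredentialsProvider {

      private final AWSCredentials credentials;

      private StaticFactoryCredentialsProvider(AWSCredentials credentials) {
        this.credentials = credentials;
      }

      /** Factory entry point; must return an AWSCredentialsProvider. */
      public static AWSCredentialsProvider getInstance() {
        // Placeholder keys; a real provider would resolve secrets from
        // configuration, the environment or a credential store.
        return new StaticFactoryCredentialsProvider(
            new BasicAWSCredentials("placeholder-access", "placeholder-secret"));
      }

      @Override
      public AWSCredentials getCredentials() {
        return credentials;
      }

      @Override
      public void refresh() {
        // Static credentials: nothing to refresh.
      }
    }

Wiring such a provider in follows the same pattern as createProviderConfiguration() above: set fs.s3a.aws.credentials.provider to the class name.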
@@ -290,7 +389,7 @@ public void testExceptionLogic() throws Throwable { // but that it closes safely providers.close(); - S3ARetryPolicy retryPolicy = new S3ARetryPolicy(new Configuration()); + S3ARetryPolicy retryPolicy = new S3ARetryPolicy(new Configuration(false)); assertEquals("Expected no retry on auth failure", RetryPolicy.RetryAction.FAIL.action, retryPolicy.shouldRetry(noAuth, 0, 0, true).action); @@ -355,6 +454,9 @@ public void testIOEInConstructorPropagation() throws Throwable { } } + /** + * Credential provider which raises an IOE when constructed. + */ private static class IOERaisingProvider implements AWSCredentialsProvider { public IOERaisingProvider(URI uri, Configuration conf) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestSSEConfiguration.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestSSEConfiguration.java index 050f0a7197c58..a664a8bd3f204 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestSSEConfiguration.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestSSEConfiguration.java @@ -280,4 +280,29 @@ public void testGetBucketPasswordFromProviderShort() throws Throwable { assertSecretKeyEquals(conf, bucketURI.getHost(), "overidden", "overidden"); } + @Test + public void testUnknownEncryptionMethod() throws Throwable { + intercept(IOException.class, UNKNOWN_ALGORITHM, + () -> S3AEncryptionMethods.getMethod("SSE-ROT13")); + } + + @Test + public void testClientEncryptionMethod() throws Throwable { + S3AEncryptionMethods method = getMethod("CSE-KMS"); + assertEquals(CSE_KMS, method); + assertFalse("shouldn't be server side " + method, method.isServerSide()); + } + + @Test + public void testCSEKMSEncryptionMethod() throws Throwable { + S3AEncryptionMethods method = getMethod("CSE-CUSTOM"); + assertEquals(CSE_CUSTOM, method); + assertFalse("shouldn't be server side " + method, method.isServerSide()); + } + + @Test + public void testNoEncryptionMethod() throws Throwable { + assertEquals(NONE, getMethod(" ")); + } + } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java index 9981c9a6766a1..1ac52c4e3a239 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumeRole.java @@ -30,6 +30,7 @@ import com.amazonaws.auth.AWSCredentials; import com.amazonaws.services.securitytoken.model.AWSSecurityTokenServiceException; +import com.fasterxml.jackson.core.JsonProcessingException; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -61,6 +62,7 @@ import static org.apache.hadoop.fs.s3a.auth.RoleModel.*; import static org.apache.hadoop.fs.s3a.auth.RolePolicies.*; import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.forbidden; +import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.newAssumedRoleConfig; import static org.apache.hadoop.test.GenericTestUtils.assertExceptionContains; import static org.apache.hadoop.test.LambdaTestUtils.*; @@ -76,6 +78,9 @@ public class ITestAssumeRole extends AbstractS3ATestBase { private static final Path ROOT = new Path("/"); + private static final Statement STATEMENT_ALL_BUCKET_READ_ACCESS + = statement(true, S3_ALL_BUCKETS, S3_BUCKET_READ_OPERATIONS); + /** * test URI, built in setup. 
*/ @@ -135,6 +140,34 @@ private E expectFileSystemCreateFailure( public void testCreateCredentialProvider() throws IOException { describe("Create the credential provider"); + Configuration conf = createValidRoleConf(); + try (AssumedRoleCredentialProvider provider + = new AssumedRoleCredentialProvider(uri, conf)) { + LOG.info("Provider is {}", provider); + AWSCredentials credentials = provider.getCredentials(); + assertNotNull("Null credentials from " + provider, credentials); + } + } + + @Test + public void testCreateCredentialProviderNoURI() throws IOException { + describe("Create the credential provider"); + + Configuration conf = createValidRoleConf(); + try (AssumedRoleCredentialProvider provider + = new AssumedRoleCredentialProvider(null, conf)) { + LOG.info("Provider is {}", provider); + AWSCredentials credentials = provider.getCredentials(); + assertNotNull("Null credentials from " + provider, credentials); + } + } + + /** + * Create a valid role configuration. + * @return a configuration set to use to the role ARN. + * @throws JsonProcessingException problems working with JSON policies. + */ + protected Configuration createValidRoleConf() throws JsonProcessingException { String roleARN = getAssumedRoleARN(); Configuration conf = new Configuration(getContract().getConf()); @@ -143,12 +176,7 @@ public void testCreateCredentialProvider() throws IOException { conf.set(ASSUMED_ROLE_SESSION_NAME, "valid"); conf.set(ASSUMED_ROLE_SESSION_DURATION, "45m"); bindRolePolicy(conf, RESTRICTED_POLICY); - try (AssumedRoleCredentialProvider provider - = new AssumedRoleCredentialProvider(uri, conf)) { - LOG.info("Provider is {}", provider); - AWSCredentials credentials = provider.getCredentials(); - assertNotNull("Null credentials from " + provider, credentials); - } + return conf; } @Test @@ -205,11 +233,12 @@ public void testAssumeRoleCannotAuthAssumedRole() throws Exception { describe("Assert that you can't use assumed roles to auth assumed roles"); Configuration conf = createAssumedRoleConfig(); + unsetHadoopCredentialProviders(conf); conf.set(ASSUMED_ROLE_CREDENTIALS_PROVIDER, AssumedRoleCredentialProvider.NAME); expectFileSystemCreateFailure(conf, IOException.class, - AssumedRoleCredentialProvider.E_FORBIDDEN_PROVIDER); + E_FORBIDDEN_AWS_PROVIDER); } @Test @@ -217,6 +246,7 @@ public void testAssumeRoleBadInnerAuth() throws Exception { describe("Try to authenticate with a keypair with spaces"); Configuration conf = createAssumedRoleConfig(); + unsetHadoopCredentialProviders(conf); conf.set(ASSUMED_ROLE_CREDENTIALS_PROVIDER, SimpleAWSCredentialsProvider.NAME); conf.set(ACCESS_KEY, "not valid"); @@ -232,6 +262,7 @@ public void testAssumeRoleBadInnerAuth2() throws Exception { describe("Try to authenticate with an invalid keypair"); Configuration conf = createAssumedRoleConfig(); + unsetHadoopCredentialProviders(conf); conf.set(ASSUMED_ROLE_CREDENTIALS_PROVIDER, SimpleAWSCredentialsProvider.NAME); conf.set(ACCESS_KEY, "notvalid"); @@ -461,7 +492,7 @@ public void testRestrictedWriteSubdir() throws Throwable { bindRolePolicyStatements(conf, STATEMENT_S3GUARD_CLIENT, - statement(true, S3_ALL_BUCKETS, S3_ROOT_READ_OPERATIONS), + STATEMENT_ALL_BUCKET_READ_ACCESS, STATEMENT_ALLOW_SSE_KMS_RW, new Statement(Effects.Allow) .addActions(S3_ALL_OPERATIONS) @@ -525,7 +556,7 @@ public void executeRestrictedRename(final Configuration conf) bindRolePolicyStatements(conf, STATEMENT_S3GUARD_CLIENT, STATEMENT_ALLOW_SSE_KMS_RW, - statement(true, S3_ALL_BUCKETS, S3_ROOT_READ_OPERATIONS), + 
STATEMENT_ALL_BUCKET_READ_ACCESS, new Statement(Effects.Allow) .addActions(S3_PATH_RW_OPERATIONS) .addResources(directory(restrictedDir)) @@ -617,8 +648,8 @@ public void executeRenameReadOnlyData(final Configuration conf) bindRolePolicyStatements(conf, STATEMENT_S3GUARD_CLIENT, - statement(true, S3_ALL_BUCKETS, S3_ROOT_READ_OPERATIONS), - new Statement(Effects.Allow) + STATEMENT_ALL_BUCKET_READ_ACCESS, + new Statement(Effects.Allow) .addActions(S3_PATH_RW_OPERATIONS) .addResources(directory(destDir)) ); @@ -698,7 +729,7 @@ public void testRestrictedCommitActions() throws Throwable { bindRolePolicyStatements(conf, STATEMENT_S3GUARD_CLIENT, STATEMENT_ALLOW_SSE_KMS_RW, - statement(true, S3_ALL_BUCKETS, S3_ROOT_READ_OPERATIONS), + STATEMENT_ALL_BUCKET_READ_ACCESS, new Statement(Effects.Allow) .addActions(S3_PATH_RW_OPERATIONS) .addResources(directory(writeableDir)) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumedRoleCommitOperations.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumedRoleCommitOperations.java index 834826e447c49..6b55b1b4c327d 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumedRoleCommitOperations.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumedRoleCommitOperations.java @@ -74,7 +74,7 @@ public void setup() throws Exception { bindRolePolicyStatements(conf, STATEMENT_S3GUARD_CLIENT, STATEMENT_ALLOW_SSE_KMS_RW, - statement(true, S3_ALL_BUCKETS, S3_ROOT_READ_OPERATIONS), + statement(true, S3_ALL_BUCKETS, S3_BUCKET_READ_OPERATIONS), new RoleModel.Statement(RoleModel.Effects.Allow) .addActions(S3_PATH_RW_OPERATIONS) .addResources(directory(restrictedDir)) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/RoleTestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/RoleTestUtils.java index 6e70fc6934857..dbbaee5f8a9d1 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/RoleTestUtils.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/RoleTestUtils.java @@ -22,6 +22,7 @@ import java.util.concurrent.Callable; import com.fasterxml.jackson.core.JsonProcessingException; +import org.junit.Assume; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,7 +31,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants; import static org.apache.hadoop.fs.contract.ContractTestUtils.touch; import static org.apache.hadoop.fs.s3a.Constants.*; @@ -38,6 +39,8 @@ import static org.apache.hadoop.fs.s3a.auth.RoleModel.*; import static org.apache.hadoop.fs.s3a.auth.RolePolicies.*; import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; /** * Helper class for testing roles. 
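The assume-role hunks above repeatedly combine a bucket-wide read statement with read/write access scoped to a single directory. A compressed sketch of that binding, using only names visible in this patch (the target path is illustrative and the surrounding test fixture is assumed):

    // Sketch: assumes the static imports used by ITestAssumeRole
    // (RoleModel.*, RolePolicies.*, RoleTestUtils.*) and an
    // ITestAssumeRole-style fixture providing createAssumedRoleConfig()
    // and path().
    Configuration conf = createAssumedRoleConfig();
    Path restrictedDir = path("restricted");        // illustrative write target
    bindRolePolicyStatements(conf,
        STATEMENT_S3GUARD_CLIENT,                   // S3Guard table access
        STATEMENT_ALLOW_SSE_KMS_RW,                 // SSE-KMS encrypt/decrypt
        STATEMENT_ALL_BUCKET_READ_ACCESS,           // read anywhere in the bucket
        new Statement(Effects.Allow)                // write only under restrictedDir
            .addActions(S3_PATH_RW_OPERATIONS)
            .addResources(directory(restrictedDir)));

With that policy bound, reads succeed across the bucket while writes outside the restricted directory are expected to be forbidden, which is what tests such as testRestrictedWriteSubdir assert.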
@@ -150,6 +153,7 @@ public static Configuration newAssumedRoleConfig( conf.set(ASSUMED_ROLE_ARN, roleARN); conf.set(ASSUMED_ROLE_SESSION_NAME, "test"); conf.set(ASSUMED_ROLE_SESSION_DURATION, "15m"); + conf.unset(DelegationConstants.DELEGATION_TOKEN_BINDING); disableFilesystemCaching(conf); return conf; } @@ -170,4 +174,39 @@ public static AccessDeniedException forbidden( contained, eval); } + /** + * Get the Assumed role referenced by ASSUMED_ROLE_ARN; + * skip the test if it is unset. + * @param conf config + * @return the string + */ + public static String probeForAssumedRoleARN(Configuration conf) { + String arn = conf.getTrimmed(ASSUMED_ROLE_ARN, ""); + Assume.assumeTrue("No ARN defined in " + ASSUMED_ROLE_ARN, + !arn.isEmpty()); + return arn; + } + + /** + * Assert that credentials are equal without printing secrets. + * Different assertions will have different message details. + * @param message message to use as base of error. + * @param expected expected credentials + * @param actual actual credentials. + */ + public static void assertCredentialsEqual(final String message, + final MarshalledCredentials expected, + final MarshalledCredentials actual) { + // DO NOT use assertEquals() here, as that could print a secret to + // the test report. + assertEquals(message + ": access key", + expected.getAccessKey(), + actual.getAccessKey()); + assertTrue(message + ": secret key", + expected.getSecretKey().equals(actual.getSecretKey())); + assertEquals(message + ": session token", + expected.getSessionToken(), + actual.getSessionToken()); + + } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/TestMarshalledCredentials.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/TestMarshalledCredentials.java new file mode 100644 index 0000000000000..c5ed9dbaac429 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/TestMarshalledCredentials.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth; + +import java.net.URI; +import java.net.URISyntaxException; + +import com.amazonaws.auth.AWSCredentials; +import org.junit.Before; +import org.junit.Test; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.S3AEncryptionMethods; +import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.auth.delegation.EncryptionSecrets; +import org.apache.hadoop.test.HadoopTestBase; + +import static org.apache.hadoop.test.LambdaTestUtils.intercept; + +/** + * Unit test of marshalled credential support. 
+ */ +public class TestMarshalledCredentials extends HadoopTestBase { + + private MarshalledCredentials credentials; + + private int expiration; + + private URI bucketURI; + + @Before + public void createSessionToken() throws URISyntaxException { + bucketURI = new URI("s3a://bucket1"); + credentials = new MarshalledCredentials("accessKey", + "secretKey", "sessionToken"); + credentials.setRoleARN("roleARN"); + expiration = 1970; + credentials.setExpiration(expiration); + } + + @Test + public void testRoundTrip() throws Throwable { + MarshalledCredentials c2 = S3ATestUtils.roundTrip(this.credentials, + new Configuration()); + assertEquals(credentials, c2); + assertEquals("accessKey", c2.getAccessKey()); + assertEquals("secretKey", c2.getSecretKey()); + assertEquals("sessionToken", c2.getSessionToken()); + assertEquals(expiration, c2.getExpiration()); + assertEquals(credentials, c2); + } + + @Test + public void testRoundTripNoSessionData() throws Throwable { + MarshalledCredentials c = new MarshalledCredentials(); + c.setAccessKey("A"); + c.setSecretKey("K"); + MarshalledCredentials c2 = S3ATestUtils.roundTrip(c, + new Configuration()); + assertEquals(c, c2); + } + + @Test + public void testRoundTripEncryptionData() throws Throwable { + EncryptionSecrets secrets = new EncryptionSecrets( + S3AEncryptionMethods.SSE_KMS, + "key"); + EncryptionSecrets result = S3ATestUtils.roundTrip(secrets, + new Configuration()); + assertEquals("round trip", secrets, result); + } + + @Test + public void testMarshalledCredentialProviderSession() throws Throwable { + MarshalledCredentialProvider provider + = new MarshalledCredentialProvider("test", + bucketURI, + new Configuration(false), + credentials, + MarshalledCredentials.CredentialTypeRequired.SessionOnly); + AWSCredentials aws = provider.getCredentials(); + assertEquals(credentials.toString(), + credentials.getAccessKey(), + aws.getAWSAccessKeyId()); + assertEquals(credentials.toString(), + credentials.getSecretKey(), + aws.getAWSSecretKey()); + // because the credentials are set to full only, creation will fail + } + + /** + * Create with a mismatch of type and supplied credentials. + * Verify that the operation fails, but only when credentials + * are actually requested. + */ + @Test + public void testCredentialTypeMismatch() throws Throwable { + MarshalledCredentialProvider provider + = new MarshalledCredentialProvider("test", + bucketURI, + new Configuration(false), + credentials, + MarshalledCredentials.CredentialTypeRequired.FullOnly); + // because the credentials are set to full only, creation will fail + intercept(NoAuthWithAWSException.class, "test", + () -> provider.getCredentials()); + } + + /** + * This provider fails fast if there's no URL. 
+ */ + @Test + public void testCredentialProviderNullURI() throws Throwable { + intercept(NullPointerException.class, "", + () -> + new MarshalledCredentialProvider("test", + null, + new Configuration(false), + credentials, + MarshalledCredentials.CredentialTypeRequired.FullOnly)); + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDelegationIT.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDelegationIT.java new file mode 100644 index 0000000000000..7651e24a69219 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDelegationIT.java @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.net.URI; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.AbstractS3ATestBase; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.Token; + +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.fs.s3a.Constants.AWS_CREDENTIALS_PROVIDER; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_BINDING; +import static org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens.lookupS3ADelegationToken; + +/** + * superclass class for DT tests. + */ +public abstract class AbstractDelegationIT extends AbstractS3ATestBase { + + protected static final String YARN_RM = "yarn-rm@EXAMPLE"; + + private static final Logger LOG = + LoggerFactory.getLogger(AbstractDelegationIT.class); + + /** + * Look up a token from the submitted credentials. + * @param submittedCredentials credentials + * @param uri URI of the FS + * @param kind required kind of the token (which is asserted on) + * @return the token + * @throws IOException IO failure + */ + public static AbstractS3ATokenIdentifier lookupToken( + Credentials submittedCredentials, + URI uri, + Text kind) throws IOException { + final Token token = + requireNonNull( + lookupS3ADelegationToken(submittedCredentials, uri), + "No Token for " + uri); + assertEquals("Kind of token " + token, + kind, + token.getKind()); + return token.decodeIdentifier(); + } + + /** + * Create credentials with the DTs of the given FS. 
+ * @param fs filesystem + * @return a non-empty set of credentials. + * @throws IOException failure to create. + */ + protected static Credentials mkTokens(final S3AFileSystem fs) + throws IOException { + Credentials cred = new Credentials(); + fs.addDelegationTokens(AbstractDelegationIT.YARN_RM, cred); + return cred; + } + + /** + * Create and Init an FS instance. + * @param uri URI + * @param conf config to use + * @return the instance + * @throws IOException failure to create/init + */ + protected static S3AFileSystem newS3AInstance(final URI uri, + final Configuration conf) + throws IOException { + S3AFileSystem fs = new S3AFileSystem(); + fs.initialize(uri, conf); + return fs; + } + + /** + * Assert that a filesystem is bound to a DT; that is: it is a delegate FS. + * @param fs filesystem + * @param tokenKind the kind of the token to require + */ + protected static void assertBoundToDT(final S3AFileSystem fs, + final Text tokenKind) { + final S3ADelegationTokens dtSupport = fs.getDelegationTokens().get(); + assertTrue("Expected bound to a delegation token: " + dtSupport, + dtSupport.isBoundToDT()); + assertEquals("Wrong token kind", + tokenKind, dtSupport.getBoundDT().get().getKind()); + } + + /** + * Assert that the number of tokens created by an FS matches the + * expected value. + * @param fs filesystem + * @param expected expected creation count. + */ + protected static void assertTokenCreationCount(final S3AFileSystem fs, + final int expected) { + assertEquals("DT creation count from " + fs.getDelegationTokens().get(), + expected, + getTokenCreationCount(fs)); + } + + /** + * Get the token creation count of a filesystem. + * @param fs FS + * @return creation count + */ + private static int getTokenCreationCount(final S3AFileSystem fs) { + return fs.getDelegationTokens() + .map(S3ADelegationTokens::getCreationCount) + .get(); + } + + /** + * Patch the current config with the DT binding. + * @param conf configuration to patch + * @param binding binding to use + */ + protected void enableDelegationTokens(Configuration conf, String binding) { + LOG.info("Enabling delegation token support for {}", binding); + conf.set(DELEGATION_TOKEN_BINDING, binding); + } + + /** + * Reset UGI info. + */ + protected void resetUGI() { + UserGroupInformation.reset(); + } + + /** + * Bind the provider list to the args supplied. + * At least one must be provided, to stop the default list being + * picked up. + * @param config configuration to patch. + * @param bucket bucket to clear. + * @param providerClassnames providers + */ + protected void bindProviderList(String bucket, + Configuration config, + String... providerClassnames) { + removeBaseAndBucketOverrides(bucket, config, AWS_CREDENTIALS_PROVIDER); + assertTrue("No providers to bind to", providerClassnames.length > 0); + config.setStrings(AWS_CREDENTIALS_PROVIDER, providerClassnames); + } + + /** + * Save a DT to a file. + * @param tokenFile destination file + * @param token token to save + * @throws IOException failure + */ + protected void saveDT(final File tokenFile, final Token token) + throws IOException { + requireNonNull(token, "Null token"); + Credentials cred = new Credentials(); + cred.addToken(token.getService(), token); + + try(DataOutputStream out = new DataOutputStream( + new FileOutputStream(tokenFile))) { + cred.writeTokenStorageToStream(out); + } + } + + /** + * Create and init an S3a DT instance, but don't start it. 
+ * @param conf conf to use + * @return a new instance + * @throws IOException IOE + */ + public S3ADelegationTokens instantiateDTSupport(Configuration conf) + throws IOException { + S3AFileSystem fs = getFileSystem(); + S3ADelegationTokens tokens = new S3ADelegationTokens(); + tokens.bindToFileSystem(fs.getCanonicalUri(), fs); + tokens.init(conf); + return tokens; + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/CountInvocationsProvider.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/CountInvocationsProvider.java new file mode 100644 index 0000000000000..3a7d78d68f7d5 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/CountInvocationsProvider.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.util.concurrent.atomic.AtomicLong; + +import com.amazonaws.auth.AWSCredentials; +import com.amazonaws.auth.AWSCredentialsProvider; + +import org.apache.hadoop.fs.s3a.CredentialInitializationException; + +/** + * Simple AWS credential provider which counts how often it is invoked. + */ +public class CountInvocationsProvider + implements AWSCredentialsProvider { + + public static final String NAME = CountInvocationsProvider.class.getName(); + + public static final AtomicLong COUNTER = new AtomicLong(0); + + @Override + public AWSCredentials getCredentials() { + COUNTER.incrementAndGet(); + throw new CredentialInitializationException("no credentials"); + } + + @Override + public void refresh() { + + } + + public static long getInvocationCount() { + return COUNTER.get(); + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/Csvout.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/Csvout.java new file mode 100644 index 0000000000000..95a6a6936fc16 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/Csvout.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.Closeable; +import java.io.IOException; +import java.io.Writer; + +/** + * This is a small utility class to write out rows to a CSV/TSV file. + * It does not do any escaping of written text, so don't write entries + * containing separators. + * Quoting must be done external to this class. + */ +public final class Csvout implements Closeable { + + private final Writer out; + + private final String separator; + + private final String eol; + + private boolean isStartOfLine = true; + + /** + * Instantiate. + * @param out output stream. + * @param separator field separator. + * @param eol end of line sequence + */ + public Csvout(final Writer out, + final String separator, + final String eol) { + this.out = out; + this.separator = separator; + this.eol = eol; + } + + /** + * Close the output stream. + * @throws IOException IO failure. + */ + @Override + public void close() throws IOException { + out.close(); + } + + /** + * Write a single object's string value. + * @param o object to write. + * @return this instance + * @throws IOException IO failure. + */ + public Csvout write(Object o) throws IOException { + if (isStartOfLine) { + isStartOfLine = false; + } else { + out.write(separator); + } + out.write(o.toString()); + return this; + } + + /** + * Write a newline. + * @return this instance + * @throws IOException IO failure. + */ + public Csvout newline() throws IOException { + out.write(eol); + isStartOfLine = true; + return this; + } + + /** + * Write a collection of objects. + * @param objects varags list of objects to write + * @return this instance. + * @throws IOException IO failure. + */ + public Csvout write(Object... objects) throws IOException { + for (Object object : objects) { + write(object); + } + return this; + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSES3BlockOutputStream.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ILoadTestRoleCredentials.java similarity index 55% rename from hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSES3BlockOutputStream.java rename to hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ILoadTestRoleCredentials.java index ff9c07a7d5a25..ffcb2fb902b7e 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSES3BlockOutputStream.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ILoadTestRoleCredentials.java @@ -16,29 +16,23 @@ * limitations under the License. */ -package org.apache.hadoop.fs.s3a; +package org.apache.hadoop.fs.s3a.auth.delegation; -import org.apache.hadoop.conf.Configuration; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_ROLE_BINDING; /** - * Run the encryption tests against the block output stream. + * This looks at the cost of assume role, to see if it is more expensive + * than creating simple session credentials. 
*/ -public class ITestS3AEncryptionSSES3BlockOutputStream - extends AbstractTestS3AEncryption { +public class ILoadTestRoleCredentials extends ILoadTestSessionCredentials { @Override - protected Configuration createConfiguration() { - Configuration conf = super.createConfiguration(); - conf.set(Constants.FAST_UPLOAD_BUFFER, - Constants.FAST_UPLOAD_BYTEBUFFER); - //must specify encryption key as empty because SSE-S3 does not allow it, - //nor can it be null. - conf.set(Constants.SERVER_SIDE_ENCRYPTION_KEY, ""); - return conf; + protected String getDelegationBinding() { + return DELEGATION_TOKEN_ROLE_BINDING; } @Override - protected S3AEncryptionMethods getSSEAlgorithm() { - return S3AEncryptionMethods.SSE_S3; + protected String getFilePrefix() { + return "role"; } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ILoadTestSessionCredentials.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ILoadTestSessionCredentials.java new file mode 100644 index 0000000000000..7b3912bf61cd5 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ILoadTestSessionCredentials.java @@ -0,0 +1,295 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.ExecutorService; + +import com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.ContractTestUtils; +import org.apache.hadoop.fs.s3a.Constants; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.scale.NanoTimerStats; +import org.apache.hadoop.fs.s3a.scale.S3AScaleTestBase; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.util.concurrent.HadoopExecutors; + +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeSessionTestsEnabled; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_BINDING; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_SESSION_BINDING; + +/** + * This test has a unique name as it is designed to do something special: + * generate enough load on the AWS STS service to get some + * statistics on its throttling. 
+ * This isn't documented anywhere, and for DT support it's + * important to know how much effort it takes to overload the service. + * + * Important + * + * If this test does trigger STS throttling, then all users in the same + * AWS account will experience throttling. This may be observable, + * in delays and, if the applications in use are not resilient to + * throttling events in STS, from application failures. + * + * Use with caution. + *
+ * <ol>
+ *   <li>Don't run it on an AWS endpoint which other users in a
+ *   shared AWS account are actively using.</li>
+ *   <li>Don't run it on the same AWS account which is being used for
+ *   any production service.</li>
+ *   <li>And choose a time (weekend, etc) where the account is under-used.</li>
+ *   <li>Warn your fellow users.</li>
+ * </ol>
      + * + * In experiments, the throttling recovers fast and appears restricted + * to the single STS service which the test overloads. + * + * @see + * AWS STS login throttling statistics + */ +public class ILoadTestSessionCredentials extends S3AScaleTestBase { + + private static final Logger LOG = + LoggerFactory.getLogger(ILoadTestSessionCredentials.class); + + protected static final int THREADS = 100; + + private final ExecutorService executor = + HadoopExecutors.newFixedThreadPool( + THREADS, + new ThreadFactoryBuilder() + .setNameFormat("DelegationTokenFetcher #%d") + .build()); + + private final CompletionService + completionService = + new ExecutorCompletionService<>(executor); + + private File dataDir; + + @Override + protected Configuration createScaleConfiguration() { + Configuration conf = super.createScaleConfiguration(); + conf.set(DELEGATION_TOKEN_BINDING, + getDelegationBinding()); + conf.setInt(Constants.MAXIMUM_CONNECTIONS, + Math.max(THREADS, Constants.DEFAULT_MAXIMUM_CONNECTIONS)); + conf.setInt(Constants.MAX_ERROR_RETRIES, 0); + return conf; + } + + /** + * Which DT binding class to use. + * @return the binding config option. + */ + protected String getDelegationBinding() { + return DELEGATION_TOKEN_SESSION_BINDING; + } + + @Override + public void setup() throws Exception { + super.setup(); + assumeSessionTestsEnabled(getConfiguration()); + S3AFileSystem fileSystem = getFileSystem(); + assertNotNull( + "No delegation tokens in FS", + fileSystem.getCanonicalServiceName()); + dataDir = GenericTestUtils.getTestDir("kerberos"); + dataDir.mkdirs(); + } + + protected String getFilePrefix() { + return "session"; + } + + @Test + public void testCreate10Tokens() throws Throwable { + File file = fetchTokens(10); + String csv = FileUtils.readFileToString(file, "UTF-8"); + LOG.info("CSV data\n{}", csv); + } + + @Test + public void testCreateManyTokens() throws Throwable { + fetchTokens(50000); + } + + /** + * Fetch tokens. + * @param tokens number of tokens. + * @return file the timings were + * @throws Exception failure + */ + private File fetchTokens(final int tokens) + throws Exception { + + File filename = new File(dataDir, getFilePrefix() + "-" + tokens + ".csv"); + fetchTokens(tokens, filename); + return filename; + } + + /** + * Fetch tokens. + * @param tokens number of tokens. + * @param csvFile file to save this to. 
+ * @throws Exception failure + */ + private void fetchTokens(final int tokens, final File csvFile) + throws Exception { + describe("Fetching %d tokens, saving log to %s", tokens, csvFile); + + final FileWriter out = new FileWriter(csvFile); + Csvout csvout = new Csvout(out, "\t", "\n"); + Outcome.writeSchema(csvout); + + + final S3AFileSystem fileSystem = getFileSystem(); + final ContractTestUtils.NanoTimer jobTimer = + new ContractTestUtils.NanoTimer(); + + + for (int i = 0; i < tokens; i++) { + final int id = i; + completionService.submit(() -> { + final long startTime = System.currentTimeMillis(); + final ContractTestUtils.NanoTimer timer = + new ContractTestUtils.NanoTimer(); + Exception ex = null; + try { + fileSystem.getDelegationToken("Count "); + } catch (IOException e) { + ex = e; + } + timer.end("Request"); + return new Outcome(id, startTime, timer, ex); + }); + } + + NanoTimerStats stats = new NanoTimerStats("Overall"); + NanoTimerStats success = new NanoTimerStats("Successful"); + NanoTimerStats throttled = new NanoTimerStats("Throttled"); + List throttledEvents = new ArrayList<>(); + for (int i = 0; i < tokens; i++) { + Outcome outcome = completionService.take().get(); + ContractTestUtils.NanoTimer timer = outcome.timer; + Exception ex = outcome.exception; + outcome.writeln(csvout); + stats.add(timer); + if (ex != null) { + // throttling event occurred. + LOG.info("Throttled at event {}", i, ex); + throttled.add(timer); + throttledEvents.add(outcome); + } else { + success.add(timer); + } + } + + csvout.close(); + + jobTimer.end("Execution of fetch calls"); + // now print the stats + LOG.info("Summary file is " + csvFile); + LOG.info("Fetched {} tokens with {} throttle events\n: {}\n{}\n{}", + tokens, + throttled.getCount(), + stats, + throttled, + success); + + double duration = jobTimer.duration(); + double iops = tokens * 1.0e9 / duration; + LOG.info( + String.format("Effective IO rate is %3f operations/second", iops)); + // log at debug + if (LOG.isDebugEnabled()) { + throttledEvents.stream().forEach((outcome -> { + LOG.debug("{}: duration: {}", + outcome.id, outcome.timer.elapsedTimeMs()); + })); + } + } + + /** + * Outcome of one of the load operations. + */ + private static class Outcome { + + private final int id; + + private final long startTime; + + private final ContractTestUtils.NanoTimer timer; + + private final Exception exception; + + Outcome(final int id, + final long startTime, + final ContractTestUtils.NanoTimer timer, + final Exception exception) { + this.id = id; + this.startTime = startTime; + this.timer = timer; + this.exception = exception; + } + + + /** + * Write this record. + * @param out the csvout to write through. + * @return the csvout instance + * @throws IOException IO failure. + */ + public Csvout writeln(Csvout out) throws IOException { + return out.write( + id, + startTime, + exception == null ? 1: 0, + timer.getStartTime(), + timer.getEndTime(), + timer.duration(), + '"' + (exception == null ? "" : exception.getMessage()) + '"') + .newline(); + } + + /** + * Write the schema of the outcome records. + * @param out CSV destinatin + * @throws IOException IO failure. 
+ */ + public static void writeSchema(Csvout out) throws IOException { + out.write("id", "starttime", "success", "started", "ended", + "duration", "error"); + } + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestDelegatedMRJob.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestDelegatedMRJob.java new file mode 100644 index 0000000000000..2170e53103c63 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestDelegatedMRJob.java @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.util.Arrays; +import java.util.Collection; + +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.examples.WordCount; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.JobStatus; +import org.apache.hadoop.mapreduce.MockJob; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.v2.MiniMRYarnCluster; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.hadoop.yarn.conf.YarnConfiguration; + +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeSessionTestsEnabled; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.deployService; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.disableFilesystemCaching; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestPropertyInt; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.terminateService; +import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.probeForAssumedRoleARN; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.*; +import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.assertSecurityEnabled; +import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.closeUserFileSystems; + +/** + * Submit a job with S3 delegation tokens. 
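At submission time the MapReduce client asks each filesystem referenced by the job for delegation tokens and stores them in the job's Credentials; that handshake is what this suite exercises. Below is a minimal sketch of the collection step, using only the public Hadoop APIs the tests themselves call (TokenCache.obtainTokensForNamenodes and FileSystem.addDelegationTokens); the bucket URI and renewer name are placeholders, not values from this patch:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.security.TokenCache;
    import org.apache.hadoop.security.Credentials;
    import org.apache.hadoop.security.token.Token;

    public class TokenCollectionSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Credentials credentials = new Credentials();
        Path input = new Path("s3a://example-bucket/input");  // placeholder path

        // What the job client does for each input/output path: ask the owning
        // FileSystem for delegation tokens and cache them in the credentials.
        TokenCache.obtainTokensForNamenodes(credentials, new Path[] {input}, conf);

        // The equivalent lower-level call against a single filesystem;
        // "yarn-rm" is a placeholder renewer name.
        FileSystem fs = input.getFileSystem(conf);
        Token<?>[] issued = fs.addDelegationTokens("yarn-rm", credentials);
        System.out.println("tokens collected: " + issued.length);
      }
    }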
+ * + * YARN will not collect DTs unless it is running secure, and turning + * security on complicates test setup "significantly". + * Specifically: bits of MR refuse to work on a local FS unless the + * native libraries are loaded and it can use lower level POSIX APIs + * for creating files and directories with specific permissions. + * In production, this is a good thing. In tests, this is not. + * + * To address this, Job to YARN communications are mocked. + * The client-side job submission is as normal, but the implementation + * of org.apache.hadoop.mapreduce.protocol.ClientProtocol is mocked. + * + * It's still an ITest though, as it does use S3A as the source and + * dest, so as to collect URLs. + */ +@RunWith(Parameterized.class) +public class ITestDelegatedMRJob extends AbstractDelegationIT { + + private static final Logger LOG = + LoggerFactory.getLogger(ITestDelegatedMRJob.class); + + /** + * Created in static {@link #setupCluster()} call. + */ + @SuppressWarnings("StaticNonFinalField") + private static MiniKerberizedHadoopCluster cluster; + + private final String name; + + private final String tokenBinding; + + private final Text tokenKind; + + /** + * Created in test setup. + */ + private MiniMRYarnCluster yarn; + + private Path destPath; + + /** + * Test array for parameterized test runs. + * @return a list of parameter tuples. + */ + @Parameterized.Parameters + public static Collection params() { + return Arrays.asList(new Object[][]{ + {"session", DELEGATION_TOKEN_SESSION_BINDING, SESSION_TOKEN_KIND}, + {"full", DELEGATION_TOKEN_FULL_CREDENTIALS_BINDING, FULL_TOKEN_KIND}, + {"role", DELEGATION_TOKEN_ROLE_BINDING, ROLE_TOKEN_KIND}, + }); + } + + public ITestDelegatedMRJob(String name, String tokenBinding, Text tokenKind) { + this.name = name; + this.tokenBinding = tokenBinding; + this.tokenKind = tokenKind; + } + + /** + * Set up the clusters. + */ + @BeforeClass + public static void setupCluster() throws Exception { + JobConf conf = new JobConf(); + assumeSessionTestsEnabled(conf); + disableFilesystemCaching(conf); + cluster = deployService(conf, new MiniKerberizedHadoopCluster()); + } + + /** + * Tear down the cluster.
+ */ + @AfterClass + public static void teardownCluster() throws Exception { + cluster = terminateService(cluster); + } + + @Override + protected YarnConfiguration createConfiguration() { + Configuration parent = super.createConfiguration(); + YarnConfiguration conf = new YarnConfiguration(parent); + cluster.patchConfigWithYARNBindings(conf); + + // fail fairly fast + conf.setInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS, + 100); + conf.setInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, + 10_000); + + // set up DTs + enableDelegationTokens(conf, tokenBinding); + return conf; + } + + @Override + protected YarnConfiguration getConfiguration() { + return (YarnConfiguration) super.getConfiguration(); + } + + @Override + public void setup() throws Exception { + cluster.loginPrincipal(); + super.setup(); + Configuration conf = getConfiguration(); + + if (DELEGATION_TOKEN_ROLE_BINDING.equals(tokenBinding)) { + // get the ARN or skip the test + probeForAssumedRoleARN(getConfiguration()); + } + + // filesystems are cached across the test so that + // instrumentation fields can be asserted on + + UserGroupInformation.setConfiguration(conf); + assertSecurityEnabled(); + + LOG.info("Starting MiniMRCluster"); + yarn = deployService(conf, + new MiniMRYarnCluster("ITestDelegatedMRJob", 1)); + + } + + @Override + public void teardown() throws Exception { + describe("Teardown operations"); + S3AFileSystem fs = getFileSystem(); + if (fs != null && destPath != null) { + fs.delete(destPath, true); + } + yarn = terminateService(yarn); + super.teardown(); + closeUserFileSystems(UserGroupInformation.getCurrentUser()); + } + + + /** + * Get the test timeout in seconds. + * @return the test timeout as set in system properties or the default. + */ + protected int getTestTimeoutSeconds() { + return getTestPropertyInt(new Configuration(), + KEY_TEST_TIMEOUT, + SCALE_TEST_TIMEOUT_SECONDS); + } + + @Override + protected int getTestTimeoutMillis() { + return getTestTimeoutSeconds() * 1000; + } + + @Test + public void testJobSubmissionCollectsTokens() throws Exception { + describe("Mock Job test"); + JobConf conf = new JobConf(getConfiguration()); + + // the input here is the landsat file; which lets + // us differentiate source URI from dest URI + Path input = new Path(DEFAULT_CSVTEST_FILE); + final FileSystem sourceFS = input.getFileSystem(conf); + + + // output is in the writable test FS. 
+ final S3AFileSystem fs = getFileSystem(); + + destPath = path(getMethodName()); + fs.delete(destPath, true); + fs.mkdirs(destPath); + Path output = new Path(destPath, "output/"); + output = output.makeQualified(fs.getUri(), fs.getWorkingDirectory()); + + MockJob job = new MockJob(conf, "word count"); + job.setJarByClass(WordCount.class); + job.setMapperClass(WordCount.TokenizerMapper.class); + job.setCombinerClass(WordCount.IntSumReducer.class); + job.setReducerClass(WordCount.IntSumReducer.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(IntWritable.class); + FileInputFormat.addInputPath(job, input); + FileOutputFormat.setOutputPath(job, output); + job.setMaxMapAttempts(1); + job.setMaxReduceAttempts(1); + + describe("Executing Mock Job Submission to %s", output); + + job.submit(); + final JobStatus status = job.getStatus(); + assertEquals("not a mock job", + MockJob.NAME, status.getSchedulingInfo()); + assertEquals("Job State", + JobStatus.State.RUNNING, status.getState()); + + final Credentials submittedCredentials = + requireNonNull(job.getSubmittedCredentials(), + "job submitted credentials"); + final Collection> tokens + = submittedCredentials.getAllTokens(); + + // log all the tokens for debugging failed test runs + LOG.info("Token Count = {}", tokens.size()); + for (Token token : tokens) { + LOG.info("{}", token); + } + + // verify the source token exists + lookupToken(submittedCredentials, sourceFS.getUri(), tokenKind); + // look up the destination token + lookupToken(submittedCredentials, fs.getUri(), tokenKind); + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestRoleDelegationInFileystem.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestRoleDelegationInFileystem.java new file mode 100644 index 0000000000000..daf037f3bb682 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestRoleDelegationInFileystem.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.nio.file.AccessDeniedException; + +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.io.Text; + +import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.probeForAssumedRoleARN; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_ROLE_BINDING; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.ROLE_TOKEN_KIND; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; + +/** + * Subclass of the session test which checks roles; only works if + * a role ARN has been declared. 
+ */ +public class ITestRoleDelegationInFileystem extends + ITestSessionDelegationInFileystem { + + @Override + public void setup() throws Exception { + super.setup(); + probeForAssumedRoleARN(getConfiguration()); + } + + @Override + protected String getDelegationBinding() { + return DELEGATION_TOKEN_ROLE_BINDING; + } + + @Override + public Text getTokenKind() { + return ROLE_TOKEN_KIND; + } + + /** + * This verifies that the granted credentials only access the target bucket + * by using the credentials in a new S3 client to query the AWS-owned landsat + * bucket. + * @param delegatedFS delegated FS with role-restricted access. + * @throws Exception failure + */ + @Override + protected void verifyRestrictedPermissions(final S3AFileSystem delegatedFS) + throws Exception { + intercept(AccessDeniedException.class, + () -> readLandsatMetadata(delegatedFS)); + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestRoleDelegationTokens.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestRoleDelegationTokens.java new file mode 100644 index 0000000000000..991e09fc64921 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestRoleDelegationTokens.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.util.EnumSet; +import java.util.List; + +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials; +import org.apache.hadoop.fs.s3a.auth.RoleModel; +import org.apache.hadoop.io.Text; + +import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.probeForAssumedRoleARN; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_ROLE_BINDING; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.E_NO_SESSION_TOKENS_FOR_ROLE_BINDING; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.ROLE_TOKEN_KIND; +import static org.apache.hadoop.fs.s3a.auth.delegation.RoleTokenBinding.E_NO_ARN; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; + +/** + * Rerun the session token tests with a role binding. + * Some tests will fail as role bindings prevent certain operations. 
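The role-restricted suites assert failure rather than success: operations the role policy forbids are wrapped in LambdaTestUtils.intercept(), which fails unless the expected exception surfaces. A minimal, self-contained sketch of that pattern, with the restricted call stubbed out because the real one needs live AWS credentials:

    import java.nio.file.AccessDeniedException;

    import org.apache.hadoop.test.LambdaTestUtils;

    public class InterceptSketch {

      // Stand-in for a role-restricted S3 call such as readLandsatMetadata().
      private static String restrictedCall() throws AccessDeniedException {
        throw new AccessDeniedException("s3a://restricted/object");
      }

      public static void main(String[] args) throws Exception {
        // intercept() runs the closure, fails if no exception (or the wrong type)
        // surfaces, and hands back the caught exception for further assertions.
        AccessDeniedException ex = LambdaTestUtils.intercept(
            AccessDeniedException.class,
            () -> restrictedCall());
        System.out.println("denied as expected: " + ex.getMessage());
      }
    }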
+ */ +public class ITestRoleDelegationTokens extends ITestSessionDelegationTokens { + + private static final Logger LOG = + LoggerFactory.getLogger(ITestRoleDelegationTokens.class); + @Override + protected String getDelegationBinding() { + return DELEGATION_TOKEN_ROLE_BINDING; + } + + @Override + public Text getTokenKind() { + return ROLE_TOKEN_KIND; + } + + @Override + public void setup() throws Exception { + super.setup(); + probeForAssumedRoleARN(getConfiguration()); + } + + /** + * Session credentials will not propagate with role tokens, + * so the superclass's method will fail. + * This subclass intercepts the exception which is expected. + * @param fs base FS to bond to. + * @param marshalledCredentials session credentials from first DT. + * @param conf config to use + * @return null + * @throws Exception failure + */ + @Override + protected AbstractS3ATokenIdentifier verifyCredentialPropagation( + final S3AFileSystem fs, + final MarshalledCredentials marshalledCredentials, + final Configuration conf) throws Exception { + intercept(DelegationTokenIOException.class, + E_NO_SESSION_TOKENS_FOR_ROLE_BINDING, + () -> super.verifyCredentialPropagation(fs, + marshalledCredentials, conf)); + return null; + } + + @Test + public void testBindingWithoutARN() throws Throwable { + describe("verify that a role binding only needs a role ARN when creating" + + " a new token"); + + Configuration conf = new Configuration(getConfiguration()); + conf.unset(DelegationConstants.DELEGATION_TOKEN_ROLE_ARN); + try (S3ADelegationTokens delegationTokens2 = new S3ADelegationTokens()) { + final S3AFileSystem fs = getFileSystem(); + delegationTokens2.bindToFileSystem(fs.getUri(), fs); + delegationTokens2.init(conf); + delegationTokens2.start(); + + // cannot create a DT at this point + intercept(IllegalStateException.class, + E_NO_ARN, + () -> delegationTokens2.createDelegationToken( + new EncryptionSecrets())); + } + } + + @Test + public void testCreateRoleModel() throws Throwable { + describe("self contained role model retrieval"); + EnumSet access + = EnumSet.of( + AWSPolicyProvider.AccessLevel.READ, + AWSPolicyProvider.AccessLevel.WRITE); + S3AFileSystem fs = getFileSystem(); + List rules = fs.listAWSPolicyRules( + access); + assertTrue("No AWS policy rules from FS", !rules.isEmpty()); + String ruleset = new RoleModel().toJson(new RoleModel.Policy(rules)); + LOG.info("Access policy for {}\n{}", fs.getUri(), ruleset); + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestSessionDelegationInFileystem.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestSessionDelegationInFileystem.java new file mode 100644 index 0000000000000..66a1fc4ea8c40 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestSessionDelegationInFileystem.java @@ -0,0 +1,727 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.net.URI; +import java.nio.file.AccessDeniedException; + +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.ObjectMetadata; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.contract.ContractTestUtils; +import org.apache.hadoop.fs.s3a.AWSCredentialProviderList; +import org.apache.hadoop.fs.s3a.DefaultS3ClientFactory; +import org.apache.hadoop.fs.s3a.Invoker; +import org.apache.hadoop.fs.s3a.S3AEncryptionMethods; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.Statistic; +import org.apache.hadoop.hdfs.tools.DelegationTokenFetcher; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.security.TokenCache; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.DtUtilShell; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.security.token.TokenIdentifier; +import org.apache.hadoop.service.ServiceOperations; +import org.apache.hadoop.service.ServiceStateException; +import org.apache.hadoop.util.ExitUtil; +import org.apache.hadoop.util.ToolRunner; +import org.apache.hadoop.yarn.conf.YarnConfiguration; + +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION; +import static org.apache.hadoop.fs.s3a.Constants.*; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeSessionTestsEnabled; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.disableFilesystemCaching; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.unsetHadoopCredentialProviders; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.*; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationTokenIOException.TOKEN_MISMATCH; +import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.ALICE; +import static org.apache.hadoop.fs.s3a.auth.delegation.MiniKerberizedHadoopCluster.assertSecurityEnabled; +import static org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens.lookupS3ADelegationToken; +import static org.apache.hadoop.test.LambdaTestUtils.doAs; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.collection.IsCollectionWithSize.hasSize; + +/** + * Tests use of Hadoop delegation tokens within the FS itself. 
+ * This instantiates a MiniKDC as some of the operations tested require + * UGI to be initialized with security enabled. + */ +@SuppressWarnings("StaticNonFinalField") +public class ITestSessionDelegationInFileystem extends AbstractDelegationIT { + + private static final Logger LOG = + LoggerFactory.getLogger(ITestSessionDelegationInFileystem.class); + + private static MiniKerberizedHadoopCluster cluster; + + private UserGroupInformation bobUser; + + private UserGroupInformation aliceUser; + + private S3ADelegationTokens delegationTokens; + + /*** + * Set up a mini Cluster with two users in the keytab. + */ + @BeforeClass + public static void setupCluster() throws Exception { + cluster = new MiniKerberizedHadoopCluster(); + cluster.init(new Configuration()); + cluster.start(); + } + + /** + * Tear down the Cluster. + */ + @SuppressWarnings("ThrowableNotThrown") + @AfterClass + public static void teardownCluster() throws Exception { + ServiceOperations.stopQuietly(LOG, cluster); + } + + protected static MiniKerberizedHadoopCluster getCluster() { + return cluster; + } + + /** + * Get the delegation token binding for this test suite. + * @return which DT binding to use. + */ + protected String getDelegationBinding() { + return DELEGATION_TOKEN_SESSION_BINDING; + } + + /** + * Get the kind of the tokens which are generated. + * @return the kind of DT + */ + public Text getTokenKind() { + return SESSION_TOKEN_KIND; + } + + @Override + protected Configuration createConfiguration() { + Configuration conf = super.createConfiguration(); + // disable if assume role opts are off + assumeSessionTestsEnabled(conf); + disableFilesystemCaching(conf); + conf.set(HADOOP_SECURITY_AUTHENTICATION, + UserGroupInformation.AuthenticationMethod.KERBEROS.name()); + enableDelegationTokens(conf, getDelegationBinding()); + conf.set(AWS_CREDENTIALS_PROVIDER, " "); + // switch to SSE_S3. + if (conf.getBoolean(KEY_ENCRYPTION_TESTS, true)) { + conf.set(SERVER_SIDE_ENCRYPTION_ALGORITHM, + S3AEncryptionMethods.SSE_S3.getMethod()); + } + // set the YARN RM up for YARN tests. + conf.set(YarnConfiguration.RM_PRINCIPAL, YARN_RM); + return conf; + } + + + @Override + public void setup() throws Exception { + // clear any existing tokens from the FS + resetUGI(); + UserGroupInformation.setConfiguration(createConfiguration()); + + aliceUser = cluster.createAliceUser(); + bobUser = cluster.createBobUser(); + + UserGroupInformation.setLoginUser(aliceUser); + assertSecurityEnabled(); + // only now do the setup, so that any FS created is secure + super.setup(); + S3AFileSystem fs = getFileSystem(); + // make sure there aren't any tokens + assertNull("Unexpectedly found an S3A token", + lookupS3ADelegationToken( + UserGroupInformation.getCurrentUser().getCredentials(), + fs.getUri())); + + // DTs are inited but not started. + delegationTokens = instantiateDTSupport(getConfiguration()); + } + + @SuppressWarnings("ThrowableNotThrown") + @Override + public void teardown() throws Exception { + super.teardown(); + ServiceOperations.stopQuietly(LOG, delegationTokens); + FileSystem.closeAllForUGI(UserGroupInformation.getCurrentUser()); + MiniKerberizedHadoopCluster.closeUserFileSystems(aliceUser); + MiniKerberizedHadoopCluster.closeUserFileSystems(bobUser); + cluster.resetUGI(); + } + + /** + * Are encryption tests enabled? + * @return true if encryption is turned on. 
+ */ + protected boolean encryptionTestEnabled() { + return getConfiguration().getBoolean(KEY_ENCRYPTION_TESTS, true); + } + + @Test + public void testGetDTfromFileSystem() throws Throwable { + describe("Enable delegation tokens and request one"); + delegationTokens.start(); + S3AFileSystem fs = getFileSystem(); + assertNotNull("No tokens from " + fs, + fs.getCanonicalServiceName()); + S3ATestUtils.MetricDiff invocationDiff = new S3ATestUtils.MetricDiff(fs, + Statistic.INVOCATION_GET_DELEGATION_TOKEN); + S3ATestUtils.MetricDiff issueDiff = new S3ATestUtils.MetricDiff(fs, + Statistic.DELEGATION_TOKENS_ISSUED); + Token token = + requireNonNull(fs.getDelegationToken(""), + "no token from filesystem " + fs); + assertEquals("token kind", getTokenKind(), token.getKind()); + assertTokenCreationCount(fs, 1); + final String fsInfo = fs.toString(); + invocationDiff.assertDiffEquals("getDelegationToken() in " + fsInfo, + 1); + issueDiff.assertDiffEquals("DTs issued in " + delegationTokens, + 1); + + Text service = delegationTokens.getService(); + assertEquals("service name", service, token.getService()); + Credentials creds = new Credentials(); + creds.addToken(service, token); + assertEquals("retrieve token from " + creds, + token, creds.getToken(service)); + } + + @Test + public void testAddTokensFromFileSystem() throws Throwable { + describe("verify FileSystem.addDelegationTokens() collects tokens"); + S3AFileSystem fs = getFileSystem(); + Credentials cred = new Credentials(); + Token[] tokens = fs.addDelegationTokens(YARN_RM, cred); + assertEquals("Number of tokens", 1, tokens.length); + Token token = requireNonNull(tokens[0], "token"); + LOG.info("FS token is {}", token); + Text service = delegationTokens.getService(); + Token retrieved = requireNonNull( + cred.getToken(service), + "retrieved token with key " + service + "; expected " + token); + delegationTokens.start(); + // this only sneaks in because there isn't a state check here + delegationTokens.resetTokenBindingToDT( + (Token) retrieved); + assertTrue("bind to existing DT failed", + delegationTokens.isBoundToDT()); + AWSCredentialProviderList providerList = requireNonNull( + delegationTokens.getCredentialProviders(), "providers"); + + providerList.getCredentials(); + } + + @Test + public void testCanRetrieveTokenFromCurrentUserCreds() throws Throwable { + describe("Create a DT, add it to the current UGI credentials," + + " then retrieve"); + delegationTokens.start(); + Credentials cred = createDelegationTokens(); + UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); + ugi.addCredentials(cred); + Token[] tokens = cred.getAllTokens().toArray(new Token[0]); + Token token0 = tokens[0]; + Text service = token0.getService(); + LOG.info("Token = " + token0); + Token token1 = requireNonNull( + ugi.getCredentials().getToken(service), "Token from " + service); + assertEquals("retrieved token", token0, token1); + assertNotNull("token identifier of " + token1, + token1.getIdentifier()); + } + + @Test + public void testDTCredentialProviderFromCurrentUserCreds() throws Throwable { + describe("Add credentials to the current user, " + + "then verify that they can be found when S3ADelegationTokens binds"); + Credentials cred = createDelegationTokens(); + assertThat("Token size", cred.getAllTokens(), hasSize(1)); + UserGroupInformation.getCurrentUser().addCredentials(cred); + delegationTokens.start(); + assertTrue("bind to existing DT failed", + delegationTokens.isBoundToDT()); + } + + /** + * Create credentials with the DTs of the current 
FS. + * @return a non-empty set of credentials. + * @throws IOException failure to create. + */ + protected Credentials createDelegationTokens() throws IOException { + return mkTokens(getFileSystem()); + } + + /** + * Create a FS with a delegated token, verify it works as a filesystem, + * and that you can pick up the same DT from that FS too. + */ + @Test + public void testDelegatedFileSystem() throws Throwable { + describe("Delegation tokens can be passed to a new filesystem;" + + " if role restricted, permissions are tightened."); + S3AFileSystem fs = getFileSystem(); + readLandsatMetadata(fs); + + URI uri = fs.getUri(); + // create delegation tokens from the test suites FS. + Credentials creds = createDelegationTokens(); + final Text tokenKind = getTokenKind(); + AbstractS3ATokenIdentifier origTokenId = requireNonNull( + lookupToken( + creds, + uri, + tokenKind), "original"); + // attach to the user, so that when tokens are looked for, they get picked + // up + final UserGroupInformation currentUser + = UserGroupInformation.getCurrentUser(); + currentUser.addCredentials(creds); + // verify that the tokens went over + requireNonNull(lookupToken( + currentUser.getCredentials(), + uri, + tokenKind), "user credentials"); + Configuration conf = new Configuration(getConfiguration()); + String bucket = fs.getBucket(); + disableFilesystemCaching(conf); + unsetHadoopCredentialProviders(conf); + // remove any secrets we don't want the delegated FS to accidentally + // pick up. + // this is to simulate better a remote deployment. + removeBaseAndBucketOverrides(bucket, conf, + ACCESS_KEY, SECRET_KEY, SESSION_TOKEN, + SERVER_SIDE_ENCRYPTION_ALGORITHM, + DELEGATION_TOKEN_ROLE_ARN, + DELEGATION_TOKEN_ENDPOINT); + // this is done to make sure you cannot create an STS session no + // matter how you pick up credentials. + conf.set(DELEGATION_TOKEN_ENDPOINT, "http://localhost:8080/"); + bindProviderList(bucket, conf, CountInvocationsProvider.NAME); + long originalCount = CountInvocationsProvider.getInvocationCount(); + + // create a new FS instance, which is expected to pick up the + // existing token + Path testPath = path("testDTFileSystemClient"); + try (S3AFileSystem delegatedFS = newS3AInstance(uri, conf)) { + LOG.info("Delegated filesystem is: {}", delegatedFS); + assertBoundToDT(delegatedFS, tokenKind); + if (encryptionTestEnabled()) { + assertEquals("Encryption propagation failed", + S3AEncryptionMethods.SSE_S3, + delegatedFS.getServerSideEncryptionAlgorithm()); + } + verifyRestrictedPermissions(delegatedFS); + + executeDelegatedFSOperations(delegatedFS, testPath); + delegatedFS.mkdirs(testPath); + + S3ATestUtils.MetricDiff issueDiff = new S3ATestUtils.MetricDiff( + delegatedFS, + Statistic.DELEGATION_TOKENS_ISSUED); + + // verify that the FS returns the existing token when asked + // so that chained deployments will work + AbstractS3ATokenIdentifier tokenFromDelegatedFS + = requireNonNull(delegatedFS.getDelegationToken(""), + "New token").decodeIdentifier(); + assertEquals("Newly issued token != old one", + origTokenId, + tokenFromDelegatedFS); + issueDiff.assertDiffEquals("DTs issued in " + delegatedFS, + 0); + } + // the DT auth chain should override the original one. 
+ assertEquals("invocation count", + originalCount, + CountInvocationsProvider.getInvocationCount()); + + // create a second instance, which will pick up the same value + try (S3AFileSystem secondDelegate = newS3AInstance(uri, conf)) { + assertBoundToDT(secondDelegate, tokenKind); + if (encryptionTestEnabled()) { + assertEquals("Encryption propagation failed", + S3AEncryptionMethods.SSE_S3, + secondDelegate.getServerSideEncryptionAlgorithm()); + } + ContractTestUtils.assertDeleted(secondDelegate, testPath, true); + assertNotNull("unbounded DT", + secondDelegate.getDelegationToken("")); + } + } + + /** + * Override/extension point: run operations within a delegated FS. + * @param delegatedFS filesystem. + * @param testPath path to work on. + * @throws IOException failures + */ + protected void executeDelegatedFSOperations(final S3AFileSystem delegatedFS, + final Path testPath) throws Exception { + ContractTestUtils.assertIsDirectory(delegatedFS, new Path("/")); + ContractTestUtils.touch(delegatedFS, testPath); + ContractTestUtils.assertDeleted(delegatedFS, testPath, false); + delegatedFS.mkdirs(testPath); + ContractTestUtils.assertIsDirectory(delegatedFS, testPath); + Path srcFile = new Path(testPath, "src.txt"); + Path destFile = new Path(testPath, "dest.txt"); + ContractTestUtils.touch(delegatedFS, srcFile); + ContractTestUtils.rename(delegatedFS, srcFile, destFile); + // this file is deleted afterwards, so leave alone + ContractTestUtils.assertIsFile(delegatedFS, destFile); + ContractTestUtils.assertDeleted(delegatedFS, testPath, true); + } + + /** + * Session tokens can read the landsat bucket without problems. + * @param delegatedFS delegated FS + * @throws Exception failure + */ + protected void verifyRestrictedPermissions(final S3AFileSystem delegatedFS) + throws Exception { + readLandsatMetadata(delegatedFS); + } + + @Test + public void testDelegationBindingMismatch1() throws Throwable { + describe("Verify that when the DT client and remote bindings are different," + + " the failure is meaningful"); + S3AFileSystem fs = getFileSystem(); + URI uri = fs.getUri(); + UserGroupInformation.getCurrentUser().addCredentials( + createDelegationTokens()); + + // create the remote FS with a full credential binding + Configuration conf = new Configuration(getConfiguration()); + String bucket = fs.getBucket(); + removeBaseAndBucketOverrides(bucket, conf, + ACCESS_KEY, SECRET_KEY, SESSION_TOKEN); + conf.set(ACCESS_KEY, "aaaaa"); + conf.set(SECRET_KEY, "bbbb"); + bindProviderList(bucket, conf, CountInvocationsProvider.NAME); + conf.set(DELEGATION_TOKEN_BINDING, + DELEGATION_TOKEN_FULL_CREDENTIALS_BINDING); + ServiceStateException e = intercept( + ServiceStateException.class, + TOKEN_MISMATCH, + () -> { + S3AFileSystem remote = newS3AInstance(uri, conf); + // if we get this far, provide info for the exception which will + // be raised. 
+ String s = remote.toString(); + remote.close(); + return s; + }); + if (!(e.getCause() instanceof DelegationTokenIOException)) { + throw e; + } + } + + @Test + public void testDelegationBindingMismatch2() throws Throwable { + describe("assert mismatch reported when client DT is a " + + "subclass of the remote one"); + S3AFileSystem fs = getFileSystem(); + URI uri = fs.getUri(); + + // create the remote FS with a full credential binding + Configuration conf = new Configuration(getConfiguration()); + String bucket = fs.getBucket(); + enableDelegationTokens(conf, DELEGATION_TOKEN_FULL_CREDENTIALS_BINDING); + + // create a new FS with Full tokens + Credentials fullTokens; + Token firstDT; + try (S3AFileSystem fullFS = newS3AInstance(uri, conf)) { + // add the tokens to the user group + fullTokens = mkTokens(fullFS); + assertTokenCreationCount(fullFS, 1); + firstDT = fullFS.getDelegationToken( + "first"); + assertTokenCreationCount(fullFS, 2); + Token secondDT = fullFS.getDelegationToken( + "second"); + assertTokenCreationCount(fullFS, 3); + assertNotEquals("DT identifiers", + firstDT.getIdentifier(), secondDT.getIdentifier()); + } + + // expect a token + AbstractS3ATokenIdentifier origTokenId = requireNonNull( + lookupToken( + fullTokens, + uri, + FULL_TOKEN_KIND), "token from credentials"); + UserGroupInformation.getCurrentUser().addCredentials( + fullTokens); + + // a remote FS with those tokens + try (S3AFileSystem delegatedFS = newS3AInstance(uri, conf)) { + assertBoundToDT(delegatedFS, FULL_TOKEN_KIND); + delegatedFS.getFileStatus(new Path("/")); + SessionTokenIdentifier tokenFromDelegatedFS + = (SessionTokenIdentifier) requireNonNull( + delegatedFS.getDelegationToken(""), "New token") + .decodeIdentifier(); + assertTokenCreationCount(delegatedFS, 0); + assertEquals("Newly issued token != old one", + origTokenId, + tokenFromDelegatedFS); + } + + // now create a configuration which expects a session token. + Configuration conf2 = new Configuration(getConfiguration()); + removeBaseAndBucketOverrides(bucket, conf2, + ACCESS_KEY, SECRET_KEY, SESSION_TOKEN); + conf.set(DELEGATION_TOKEN_BINDING, + getDelegationBinding()); + ServiceStateException e = intercept(ServiceStateException.class, + TOKEN_MISMATCH, + () -> { + S3AFileSystem remote = newS3AInstance(uri, conf); + // if we get this far, provide info for the exception which will + // be raised. + String s = remote.toString(); + remote.close(); + return s; + }); + if (!(e.getCause() instanceof DelegationTokenIOException)) { + throw e; + } + } + + /** + * This verifies that the granted credentials only access the target bucket + * by using the credentials in a new S3 client to query the AWS-owned landsat + * bucket. + * @param delegatedFS delegated FS with role-restricted access. + * @throws AccessDeniedException if the delegated FS's credentials can't + * access the bucket. 
+ * @return result of the HEAD + * @throws Exception failure + */ + protected ObjectMetadata readLandsatMetadata(final S3AFileSystem delegatedFS) + throws Exception { + AWSCredentialProviderList testing + = delegatedFS.shareCredentials("testing"); + + URI landsat = new URI(DEFAULT_CSVTEST_FILE); + DefaultS3ClientFactory factory + = new DefaultS3ClientFactory(); + Configuration conf = new Configuration(delegatedFS.getConf()); + conf.set(ENDPOINT, ""); + factory.setConf(conf); + String host = landsat.getHost(); + AmazonS3 s3 = factory.createS3Client(landsat, host, testing, + "ITestSessionDelegationInFileystem"); + + return Invoker.once("HEAD", host, + () -> s3.getObjectMetadata(host, landsat.getPath().substring(1))); + } + + /** + * YARN job submission uses + * {@link TokenCache#obtainTokensForNamenodes(Credentials, Path[], Configuration)} + * for token retrieval: call it here to verify it works. + */ + @Test + public void testYarnCredentialPickup() throws Throwable { + describe("Verify tokens are picked up by the YARN" + + " TokenCache.obtainTokensForNamenodes() API Call"); + Credentials cred = new Credentials(); + Path yarnPath = path("testYarnCredentialPickup"); + Path[] paths = new Path[] {yarnPath}; + Configuration conf = getConfiguration(); + S3AFileSystem fs = getFileSystem(); + TokenCache.obtainTokensForNamenodes(cred, paths, conf); + assertNotNull("No Token in credentials file", + lookupToken( + cred, + fs.getUri(), + getTokenKind())); + } + + /** + * Test the {@code hdfs fetchdt} command works with S3A tokens. + */ + @Test + public void testHDFSFetchDTCommand() throws Throwable { + describe("Use the HDFS fetchdt CLI to fetch a token"); + + ExitUtil.disableSystemExit(); + S3AFileSystem fs = getFileSystem(); + Configuration conf = fs.getConf(); + + URI fsUri = fs.getUri(); + String fsurl = fsUri.toString(); + File tokenfile = createTempTokenFile(); + + // this will create (& leak) a new FS instance as caching is disabled. + // but as teardown destroys all filesystems for this user, it + // gets cleaned up at the end of the test + String tokenFilePath = tokenfile.getAbsolutePath(); + + + // create the tokens as Bob. + doAs(bobUser, + () -> DelegationTokenFetcher.main(conf, + args("--webservice", fsurl, tokenFilePath))); + assertTrue("token file was not created: " + tokenfile, + tokenfile.exists()); + + // print to stdout + String s = DelegationTokenFetcher.printTokensToString(conf, + new Path(tokenfile.toURI()), + false); + LOG.info("Tokens: {}", s); + DelegationTokenFetcher.main(conf, + args("--print", tokenFilePath)); + DelegationTokenFetcher.main(conf, + args("--print", "--verbose", tokenFilePath)); + + // read in and retrieve token + Credentials creds = Credentials.readTokenStorageFile(tokenfile, conf); + AbstractS3ATokenIdentifier identifier = requireNonNull( + lookupToken( + creds, + fsUri, + getTokenKind()), "Token lookup"); + assertEquals("encryption secrets", + fs.getEncryptionSecrets(), + identifier.getEncryptionSecrets()); + assertEquals("Username of decoded token", + bobUser.getUserName(), identifier.getUser().getUserName()); + + // renew + DelegationTokenFetcher.main(conf, args("--renew", tokenFilePath)); + + // cancel + DelegationTokenFetcher.main(conf, args("--cancel", tokenFilePath)); + } + + protected File createTempTokenFile() throws IOException { + File tokenfile = File.createTempFile("tokens", ".bin", + cluster.getWorkDir()); + tokenfile.delete(); + return tokenfile; + } + + /** + * Convert a vargs list to an array. 
+ * @param args vararg list of arguments + * @return the generated array. + */ + private String[] args(String...args) { + return args; + } + + /** + * This test looks at the identity which goes with a DT. + * It assumes that the username of a token == the user who created it. + * Some tokens may change that in future (maybe use Role ARN?). + */ + @Test + public void testFileSystemBoundToCreator() throws Throwable { + describe("Run tests to verify the DT Setup is bound to the creator"); + + // quick sanity check to make sure alice and bob are different + assertNotEquals("Alice and Bob logins", + aliceUser.getUserName(), bobUser.getUserName()); + + final S3AFileSystem fs = getFileSystem(); + assertEquals("FS username in doAs()", + ALICE, + doAs(bobUser, () -> fs.getUsername())); + + UserGroupInformation fsOwner = doAs(bobUser, + () -> fs.getDelegationTokens().get().getOwner()); + assertEquals("username mismatch", + aliceUser.getUserName(), fsOwner.getUserName()); + + Token dt = fs.getDelegationToken(ALICE); + AbstractS3ATokenIdentifier identifier + = dt.decodeIdentifier(); + UserGroupInformation user = identifier.getUser(); + assertEquals("User in DT", + aliceUser.getUserName(), user.getUserName()); + } + + + protected String dtutil(int expected, String...args) throws Exception { + final ByteArrayOutputStream dtUtilContent = new ByteArrayOutputStream(); + DtUtilShell dt = new DtUtilShell(); + dt.setOut(new PrintStream(dtUtilContent)); + dtUtilContent.reset(); + int r = doAs(aliceUser, + () ->ToolRunner.run(getConfiguration(), dt, args)); + String s = dtUtilContent.toString(); + LOG.info("\n{}", s); + assertEquals(expected, r); + return s; + } + + @Test + public void testDTUtilShell() throws Throwable { + describe("Verify the dtutil shell command can fetch tokens"); + File tokenfile = createTempTokenFile(); + + String tfs = tokenfile.toString(); + String fsURI = getFileSystem().getCanonicalUri().toString(); + dtutil(0, + "get", fsURI, + "-format", "protobuf", + tfs); + assertTrue("not created: " + tokenfile, + tokenfile.exists()); + assertTrue("File is empty" + tokenfile, + tokenfile.length() > 0); + assertTrue("File only contains header" + tokenfile, + tokenfile.length() > 6); + + String printed = dtutil(0, "print", tfs); + assertThat(printed, containsString(fsURI)); + assertThat(printed, containsString(getTokenKind().toString())); + + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestSessionDelegationTokens.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestSessionDelegationTokens.java new file mode 100644 index 0000000000000..d24373acf0561 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/ITestSessionDelegationTokens.java @@ -0,0 +1,282 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.File; +import java.io.IOException; +import java.net.URI; + +import com.amazonaws.auth.AWSCredentials; +import com.amazonaws.auth.AWSSessionCredentials; +import org.hamcrest.Matchers; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.S3AEncryptionMethods; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.security.token.TokenIdentifier; + +import static java.util.Objects.requireNonNull; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assumeSessionTestsEnabled; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.roundTrip; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.unsetHadoopCredentialProviders; +import static org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding.fromAWSCredentials; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_CREDENTIALS_PROVIDER; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DELEGATION_TOKEN_SESSION_BINDING; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.SESSION_TOKEN_KIND; +import static org.apache.hadoop.fs.s3a.auth.delegation.SessionTokenBinding.CREDENTIALS_CONVERTED_TO_DELEGATION_TOKEN; + +/** + * Tests use of Hadoop delegation tokens to marshall S3 credentials. + */ +public class ITestSessionDelegationTokens extends AbstractDelegationIT { + + private static final Logger LOG = + LoggerFactory.getLogger(ITestSessionDelegationTokens.class); + + public static final String KMS_KEY = "arn:kms:key"; + + private S3ADelegationTokens delegationTokens; + + /** + * Get the delegation token binding for this test suite. + * @return which DT binding to use. + */ + protected String getDelegationBinding() { + return DELEGATION_TOKEN_SESSION_BINDING; + } + + public Text getTokenKind() { + return SESSION_TOKEN_KIND; + } + + @Override + protected Configuration createConfiguration() { + Configuration conf = super.createConfiguration(); + enableDelegationTokens(conf, getDelegationBinding()); + return conf; + } + + @Override + public void setup() throws Exception { + super.setup(); + assumeSessionTestsEnabled(getConfiguration()); + resetUGI(); + delegationTokens = instantiateDTSupport(getConfiguration()); + delegationTokens.start(); + } + + @Override + public void teardown() throws Exception { + IOUtils.cleanupWithLogger(LOG, delegationTokens); + resetUGI(); + super.teardown(); + } + + /** + * Checks here to catch any regressions in canonicalization + * logic. 
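Token lookup is keyed on the canonical service name the filesystem reports, so a canonicalization regression means a stored token silently stops being found. A small sketch of that lookup contract, with a placeholder service string standing in for fs.getCanonicalServiceName():

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.security.Credentials;
    import org.apache.hadoop.security.token.Token;
    import org.apache.hadoop.security.token.TokenIdentifier;

    public class ServiceNameLookupSketch {
      public static void main(String[] args) {
        Credentials credentials = new Credentials();

        // Whatever fs.getCanonicalServiceName() returns is the key the token is
        // stored under; "s3a://example-bucket" is a placeholder value here.
        Text service = new Text("s3a://example-bucket");
        Token<TokenIdentifier> token = new Token<>();
        credentials.addToken(service, token);

        // Retrieval uses the same key, so if canonicalization changes between
        // store and lookup, this returns null even though a token exists.
        Token<? extends TokenIdentifier> found = credentials.getToken(service);
        System.out.println("token found: " + (found != null));
      }
    }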
+ */ + @Test + public void testCanonicalization() throws Throwable { + S3AFileSystem fs = getFileSystem(); + assertEquals("Default port has changed", + 0, fs.getDefaultPort()); + URI uri = fs.getCanonicalUri(); + String service = fs.getCanonicalServiceName(); + assertEquals("canonical URI and service name mismatch", + uri, new URI(service)); + } + + @Test + public void testSaveLoadTokens() throws Throwable { + File tokenFile = File.createTempFile("token", "bin"); + EncryptionSecrets encryptionSecrets = new EncryptionSecrets( + S3AEncryptionMethods.SSE_KMS, KMS_KEY); + Token dt + = delegationTokens.createDelegationToken(encryptionSecrets); + final SessionTokenIdentifier origIdentifier + = (SessionTokenIdentifier) dt.decodeIdentifier(); + assertEquals("kind in " + dt, getTokenKind(), dt.getKind()); + Configuration conf = getConfiguration(); + saveDT(tokenFile, dt); + assertTrue("Empty token file", tokenFile.length() > 0); + Credentials creds = Credentials.readTokenStorageFile(tokenFile, conf); + Text serviceId = delegationTokens.getService(); + Token token = requireNonNull( + creds.getToken(serviceId), + () -> "No token for \"" + serviceId + "\" in: " + creds.getAllTokens()); + SessionTokenIdentifier decoded = + (SessionTokenIdentifier) token.decodeIdentifier(); + decoded.validate(); + assertEquals("token identifier ", origIdentifier, decoded); + assertEquals("Origin in " + decoded, + origIdentifier.getOrigin(), decoded.getOrigin()); + assertEquals("Expiry time", + origIdentifier.getExpiryTime(), decoded.getExpiryTime()); + assertEquals("Encryption Secrets", + encryptionSecrets, decoded.getEncryptionSecrets()); + } + + /** + * This creates a DT from a set of credentials, then verifies + * that you can use the round-tripped credentials as a source of + * authentication for another DT binding, and when + * that is asked for a DT token, the secrets it returns are + * the same as the original. + * + * That is different from DT propagation, as here the propagation + * is by setting the fs.s3a session/secret/id keys from the marshalled + * values, and using session token auth. + * This verifies that session token authentication can be used + * for DT credential auth, and that new tokens aren't created. + * + * From a testing perspective, this is not as "good" as having + * separate tests, but given the effort to create session tokens + * is all hidden in the first FS, it is actually easier to write + * and now forms an extra test on those generated tokens as well + * as the marshalling. 
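The round trip described above ultimately amounts to writing the session triple back into a Configuration and switching authentication to the temporary-credentials provider. A hedged sketch of that wiring using the standard fs.s3a property names (the secret values are fake placeholders):

    import org.apache.hadoop.conf.Configuration;

    public class SessionSecretsSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();

        // The session triple carried by the token, written back as plain
        // fs.s3a options; the values here are fake placeholders.
        conf.set("fs.s3a.access.key", "ASIAEXAMPLEACCESSKEY");
        conf.set("fs.s3a.secret.key", "example-secret-key");
        conf.set("fs.s3a.session.token", "example-session-token");

        // Authenticate with the temporary-credentials provider, so the session
        // token is actually used rather than ignored.
        conf.set("fs.s3a.aws.credentials.provider",
            "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider");

        // A FileSystem created from this configuration now signs requests with
        // the marshalled session credentials, which is what the test verifies.
      }
    }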
+ */ + @Test + public void testCreateAndUseDT() throws Throwable { + describe("Create a Delegation Token, round trip then reuse"); + + final S3AFileSystem fs = getFileSystem(); + final Configuration conf = fs.getConf(); + + assertNull("Current User has delegation token", + delegationTokens.selectTokenFromFSOwner()); + EncryptionSecrets secrets = new EncryptionSecrets( + S3AEncryptionMethods.SSE_KMS, KMS_KEY); + Token originalDT + = delegationTokens.createDelegationToken(secrets); + assertEquals("Token kind mismatch", getTokenKind(), originalDT.getKind()); + + // decode to get the binding info + SessionTokenIdentifier issued = + requireNonNull( + (SessionTokenIdentifier) originalDT.decodeIdentifier(), + () -> "no identifier in " + originalDT); + issued.validate(); + + final MarshalledCredentials creds; + try(S3ADelegationTokens dt2 = instantiateDTSupport(getConfiguration())) { + dt2.start(); + + dt2.resetTokenBindingToDT(originalDT); + final AWSSessionCredentials awsSessionCreds + = verifySessionCredentials( + dt2.getCredentialProviders().getCredentials()); + final MarshalledCredentials origCreds = fromAWSCredentials( + awsSessionCreds); + + Token boundDT = + dt2.getBoundOrNewDT(secrets); + assertEquals("Delegation Tokens", originalDT, boundDT); + // simulate marshall and transmission + creds = roundTrip(origCreds, conf); + SessionTokenIdentifier reissued + = (SessionTokenIdentifier) dt2.createDelegationToken(secrets) + .decodeIdentifier(); + reissued.validate(); + String userAgentField = dt2.getUserAgentField(); + assertThat("UA field does not contain UUID", + userAgentField, + Matchers.containsString(issued.getUuid())); + } + + // now use those chained credentials to create a new FS instance + // and then get a session DT from it and expect equality + verifyCredentialPropagation(fs, creds, new Configuration(conf)); + } + + /** + * This verifies that AWS Session credentials can be picked up and + * returned in a DT. + * With a session binding, this holds; for role binding it will fail. + * @param fs base FS to bond to. + * @param session session credentials from first DT. + * @param conf config to use + * @return the retrieved DT. This is only for error reporting. + * @throws IOException failure. + */ + @SuppressWarnings("OptionalGetWithoutIsPresent") + protected AbstractS3ATokenIdentifier verifyCredentialPropagation( + final S3AFileSystem fs, + final MarshalledCredentials session, + final Configuration conf) + throws Exception { + describe("Verify Token Propagation"); + // clear any credential paths to ensure they don't get picked up and used + // for authentication. 
+ unsetHadoopCredentialProviders(conf); + conf.set(DELEGATION_TOKEN_CREDENTIALS_PROVIDER, + TemporaryAWSCredentialsProvider.NAME); + session.setSecretsInConfiguration(conf); + try(S3ADelegationTokens delegationTokens2 = new S3ADelegationTokens()) { + delegationTokens2.bindToFileSystem(fs.getCanonicalUri(), fs); + delegationTokens2.init(conf); + delegationTokens2.start(); + + final Token newDT + = delegationTokens2.getBoundOrNewDT(new EncryptionSecrets()); + delegationTokens2.resetTokenBindingToDT(newDT); + final AbstractS3ATokenIdentifier boundId + = delegationTokens2.getDecodedIdentifier().get(); + + LOG.info("Regenerated DT is {}", newDT); + final MarshalledCredentials creds2 = fromAWSCredentials( + verifySessionCredentials( + delegationTokens2.getCredentialProviders().getCredentials())); + assertEquals("Credentials", session, creds2); + assertTrue("Origin in " + boundId, + boundId.getOrigin() + .contains(CREDENTIALS_CONVERTED_TO_DELEGATION_TOKEN)); + return boundId; + } + } + + private AWSSessionCredentials verifySessionCredentials( + final AWSCredentials creds) { + AWSSessionCredentials session = (AWSSessionCredentials) creds; + assertNotNull("access key", session.getAWSAccessKeyId()); + assertNotNull("secret key", session.getAWSSecretKey()); + assertNotNull("session token", session.getSessionToken()); + return session; + } + + @Test + public void testDBindingReentrancyLock() throws Throwable { + describe("Verify that S3ADelegationTokens cannot be bound twice when there" + + " is no token"); + S3ADelegationTokens delegation = instantiateDTSupport(getConfiguration()); + delegation.start(); + assertFalse("Delegation is bound to a DT: " + delegation, + delegation.isBoundToDT()); + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/MiniKerberizedHadoopCluster.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/MiniKerberizedHadoopCluster.java new file mode 100644 index 0000000000000..c42372a4b3087 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/MiniKerberizedHadoopCluster.java @@ -0,0 +1,378 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.io.File; +import java.io.IOException; +import java.util.Properties; + +import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.HdfsConfiguration; +import org.apache.hadoop.http.HttpConfig; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.MRJobConfig; +import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig; +import org.apache.hadoop.minikdc.MiniKdc; +import org.apache.hadoop.security.KDiag; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.ssl.KeyStoreTestUtil; +import org.apache.hadoop.service.CompositeService; +import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.yarn.conf.YarnConfiguration; + +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION; +import static org.apache.hadoop.hdfs.DFSConfigKeys.*; +import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.DFS_DATA_TRANSFER_PROTECTION_KEY; +import static org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig.DEFAULT_MR_HISTORY_PORT; +import static org.apache.hadoop.security.UserGroupInformation.loginUserFromKeytabAndReturnUGI; +import static org.apache.hadoop.yarn.conf.YarnConfiguration.*; +import static org.junit.Assert.assertTrue; + +/** + * This is intended to support setting up an mini-secure Hadoop + YARN + MR + * cluster. + * It does not do this, yet, for the following reason: things don't work. + * It is designed to be started/stopped at the class level. + * however, users should be logged in in test cases, so that their local state + * (credentials etc) are reset in every test. + */ +public class MiniKerberizedHadoopCluster extends CompositeService { + + private static final Logger LOG = + LoggerFactory.getLogger(MiniKerberizedHadoopCluster.class); + + public static final String ALICE = "alice"; + + public static final String BOB = "bob"; + + public static final String HTTP_LOCALHOST = "HTTP/localhost@$LOCALHOST"; + + /** + * The hostname is dynamically determined based on OS, either + * "localhost" (non-windows) or 127.0.0.1 (windows). + */ + public static final String LOCALHOST_NAME = Path.WINDOWS + ? "127.0.0.1" + : "localhost"; + + private MiniKdc kdc; + + private File keytab; + + private File workDir; + + private String krbInstance; + + private String loginUsername; + + private String loginPrincipal; + + private String sslConfDir; + + private String clientSSLConfigFileName; + + private String serverSSLConfigFileName; + + private String alicePrincipal; + + private String bobPrincipal; + + /** + * Create the cluster. + * If this class's log is at DEBUG level, this also turns + * Kerberos diagnostics on in the JVM. + */ + public MiniKerberizedHadoopCluster() { + super("MiniKerberizedHadoopCluster"); + // load all the configs to force in the -default.xml files + new HdfsConfiguration(); + new YarnConfiguration(); + new JobConf(); + if (LOG.isDebugEnabled()) { + // turn on kerberos logging @ debug. 
+ System.setProperty(KDiag.SUN_SECURITY_KRB5_DEBUG, "true"); + System.setProperty(KDiag.SUN_SECURITY_SPNEGO_DEBUG, "true"); + } + + } + + public MiniKdc getKdc() { + return kdc; + } + + public File getKeytab() { + return keytab; + } + + public String getKeytabPath() { + return keytab.getAbsolutePath(); + } + + public UserGroupInformation createBobUser() throws IOException { + return loginUserFromKeytabAndReturnUGI(bobPrincipal, + keytab.getAbsolutePath()); + } + + public UserGroupInformation createAliceUser() throws IOException { + return loginUserFromKeytabAndReturnUGI(alicePrincipal, + keytab.getAbsolutePath()); + } + + public File getWorkDir() { + return workDir; + } + + public String getKrbInstance() { + return krbInstance; + } + + public String getLoginUsername() { + return loginUsername; + } + + public String getLoginPrincipal() { + return loginPrincipal; + } + + public String withRealm(String user) { + return user + "@EXAMPLE.COM"; + } + + /** + * Service init creates the KDC. + * @param conf configuration + */ + @Override + protected void serviceInit(final Configuration conf) throws Exception { + patchConfigAtInit(conf); + super.serviceInit(conf); + Properties kdcConf = MiniKdc.createConf(); + workDir = GenericTestUtils.getTestDir("kerberos"); + workDir.mkdirs(); + kdc = new MiniKdc(kdcConf, workDir); + + krbInstance = LOCALHOST_NAME; + } + + /** + * Start the KDC, create the keytab and the alice and bob users, + * and UGI instances of them logged in from the keytab. + */ + @Override + protected void serviceStart() throws Exception { + super.serviceStart(); + kdc.start(); + keytab = new File(workDir, "keytab.bin"); + loginUsername = UserGroupInformation.getLoginUser().getShortUserName(); + loginPrincipal = loginUsername + "/" + krbInstance; + + alicePrincipal = ALICE + "/" + krbInstance; + bobPrincipal = BOB + "/" + krbInstance; + kdc.createPrincipal(keytab, + alicePrincipal, + bobPrincipal, + "HTTP/" + krbInstance, + HTTP_LOCALHOST, + loginPrincipal); + final File keystoresDir = new File(workDir, "ssl"); + keystoresDir.mkdirs(); + sslConfDir = KeyStoreTestUtil.getClasspathDir( + this.getClass()); + KeyStoreTestUtil.setupSSLConfig(keystoresDir.getAbsolutePath(), + sslConfDir, getConfig(), false); + clientSSLConfigFileName = KeyStoreTestUtil.getClientSSLConfigFileName(); + serverSSLConfigFileName = KeyStoreTestUtil.getServerSSLConfigFileName(); + } + + + @Override + protected void serviceStop() throws Exception { + super.serviceStop(); + // this can throw an exception, but it will get caught by the superclass. + kdc.stop(); + } + + + protected void patchConfigAtInit(final Configuration conf) { + + // turn off some noise during debugging + int timeout = 60 * 60_1000; + conf.setInt("jvm.pause.info-threshold.ms", timeout); + conf.setInt("jvm.pause.warn-threshold.ms", timeout); + } + + /** + * Set up HDFS to run securely. + * In secure mode, HDFS goes out of its way to refuse to start if it + * doesn't consider the configuration safe. + * This is good in production, and it stops an HDFS cluster coming + * up where things can't reliably talk to each other. + * But it does complicate test setup. + * Look at {@code org.apache.hadoop.hdfs.TestDFSInotifyEventInputStreamKerberized} + * and {@code org.apache.hadoop.hdfs.qjournal.TestSecureNNWithQJM} + * for the details on what options to set here. + * @param conf configuration to patch. 
+ */ + protected void patchConfigWithHDFSBindings(final Configuration conf) { + Preconditions.checkState(isInState(STATE.STARTED)); + enableKerberos(conf); + + String path = getKeytabPath(); + String spnegoPrincipal = "*"; + String localhost = LOCALHOST_NAME; + String instance = getKrbInstance(); + String hdfsPrincipal = getLoginPrincipal(); + patchConfigAtInit(conf); + + conf.setLong(CommonConfigurationKeys.FS_DU_INTERVAL_KEY, Long.MAX_VALUE); + + conf.set(DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY, hdfsPrincipal); + conf.set(DFS_NAMENODE_KEYTAB_FILE_KEY, path); + conf.set(DFS_DATANODE_KERBEROS_PRINCIPAL_KEY, hdfsPrincipal); + conf.set(DFS_DATANODE_KEYTAB_FILE_KEY, path); + conf.set(DFS_WEB_AUTHENTICATION_KERBEROS_PRINCIPAL_KEY, spnegoPrincipal); + conf.set(DFS_JOURNALNODE_KEYTAB_FILE_KEY, path); + conf.set(DFS_JOURNALNODE_KERBEROS_PRINCIPAL_KEY, hdfsPrincipal); + conf.set(DFS_JOURNALNODE_KERBEROS_INTERNAL_SPNEGO_PRINCIPAL_KEY, + spnegoPrincipal); + conf.setBoolean(DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY, true); + conf.set(DFS_DATA_TRANSFER_PROTECTION_KEY, "authentication"); + + conf.set(DFS_HTTP_POLICY_KEY, HttpConfig.Policy.HTTPS_ONLY.name()); + conf.set(DFS_NAMENODE_HTTPS_ADDRESS_KEY, "localhost:0"); + conf.set(DFS_DATANODE_HTTPS_ADDRESS_KEY, "localhost:0"); + conf.set(DFS_HTTP_POLICY_KEY, HttpConfig.Policy.HTTPS_ONLY.name()); + conf.set(DFS_CLIENT_HTTPS_KEYSTORE_RESOURCE_KEY, + KeyStoreTestUtil.getClientSSLConfigFileName()); + conf.set(DFS_SERVER_HTTPS_KEYSTORE_RESOURCE_KEY, + KeyStoreTestUtil.getServerSSLConfigFileName()); + } + + /** + * Patch the YARN settings. + * Note how the yarn principal has to include the realm. + * @param conf configuration to patch. + */ + protected void patchConfigWithYARNBindings(final Configuration conf) { + Preconditions.checkState(isInState(STATE.STARTED)); + enableKerberos(conf); + patchConfigAtInit(conf); + String path = getKeytabPath(); + String localhost = LOCALHOST_NAME; + String yarnPrincipal = withRealm(getLoginPrincipal()); + conf.set(RM_PRINCIPAL, yarnPrincipal); + + conf.set(RM_KEYTAB, path); + conf.set(RM_HOSTNAME, localhost); + conf.set(RM_BIND_HOST, localhost); + conf.set(RM_ADDRESS, + localhost + ":" + DEFAULT_RM_PORT); + + conf.set(NM_PRINCIPAL, yarnPrincipal); + conf.set(NM_KEYTAB, path); + conf.set(NM_ADDRESS, + localhost + ":" + DEFAULT_NM_PORT); + conf.setBoolean(TIMELINE_SERVICE_ENABLED, false); + conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_EMIT_TIMELINE_DATA, false); + + conf.set(JHAdminConfig.MR_HISTORY_KEYTAB, path); + conf.set(JHAdminConfig.MR_HISTORY_PRINCIPAL, yarnPrincipal); + conf.set(JHAdminConfig.MR_HISTORY_ADDRESS, + localhost + ":" + DEFAULT_MR_HISTORY_PORT); + conf.setBoolean(JHAdminConfig.MR_HISTORY_CLEANER_ENABLE, false); + + conf.setInt(RM_AM_MAX_ATTEMPTS, 1); + conf.setInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS, + 100); + conf.setInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, + 10_000); + } + + + public void resetUGI() { + UserGroupInformation.reset(); + } + + /** + * Given a shortname, built a long name with the krb instance and realm info. + * @param shortname short name of the user + * @return a long name + */ + private String userOnHost(final String shortname) { + return shortname + "/" + krbInstance + "@" + getRealm(); + } + + public String getRealm() { + return kdc.getRealm(); + } + + /** + * Log in a user to UGI.currentUser. 
+ * @param user user to log in from + * @throws IOException failure + */ + public void loginUser(final String user) throws IOException { + UserGroupInformation.loginUserFromKeytab(user, getKeytabPath()); + } + + /** + * Log in the login principal as the current user. + * @throws IOException failure + */ + public void loginPrincipal() throws IOException { + loginUser(getLoginPrincipal()); + } + + /** + * General assertion that security is turred on for a cluster. + */ + public static void assertSecurityEnabled() { + assertTrue("Security is needed for this test", + UserGroupInformation.isSecurityEnabled()); + } + + + /** + * Close filesystems for a user, downgrading a null user to a no-op. + * @param ugi user + * @throws IOException if a close operation raised one. + */ + public static void closeUserFileSystems(UserGroupInformation ugi) + throws IOException { + if (ugi != null) { + FileSystem.closeAllForUGI(ugi); + } + } + + /** + * Modify a configuration to use Kerberos as the auth method. + * @param conf configuration to patch. + */ + public static void enableKerberos(Configuration conf) { + conf.set(HADOOP_SECURITY_AUTHENTICATION, + UserGroupInformation.AuthenticationMethod.KERBEROS.name()); + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/TestS3ADelegationTokenSupport.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/TestS3ADelegationTokenSupport.java new file mode 100644 index 0000000000000..0c9dd842bf40d --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/delegation/TestS3ADelegationTokenSupport.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.auth.delegation; + +import java.net.URI; + +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.hadoop.fs.s3a.S3AEncryptionMethods; +import org.apache.hadoop.fs.s3a.S3ATestConstants; +import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentialBinding; +import org.apache.hadoop.fs.s3a.auth.MarshalledCredentials; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.security.token.SecretManager; +import org.apache.hadoop.security.token.Token; + +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.FULL_TOKEN_KIND; +import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.SESSION_TOKEN_KIND; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +/** + * Unit tests related to S3A DT support. 
+ */ +public class TestS3ADelegationTokenSupport { + + private static URI landsatUri; + + @BeforeClass + public static void classSetup() throws Exception { + landsatUri = new URI(S3ATestConstants.DEFAULT_CSVTEST_FILE); + } + + @Test + public void testSessionTokenKind() throws Throwable { + AbstractS3ATokenIdentifier identifier + = new SessionTokenIdentifier(); + assertEquals(SESSION_TOKEN_KIND, identifier.getKind()); + } + + @Test + public void testSessionTokenDecode() throws Throwable { + Text alice = new Text("alice"); + AbstractS3ATokenIdentifier identifier + = new SessionTokenIdentifier(SESSION_TOKEN_KIND, + alice, + new URI("s3a://landsat-pds/"), + new MarshalledCredentials("a", "b", ""), + new EncryptionSecrets(S3AEncryptionMethods.SSE_S3, ""), + "origin"); + Token t1 = + new Token<>(identifier, + new SessionSecretManager()); + AbstractS3ATokenIdentifier decoded = t1.decodeIdentifier(); + decoded.validate(); + MarshalledCredentials creds + = ((SessionTokenIdentifier) decoded).getMarshalledCredentials(); + assertNotNull("credentials", + MarshalledCredentialBinding.toAWSCredentials(creds, + MarshalledCredentials.CredentialTypeRequired.AnyNonEmpty, "")); + assertEquals(alice, decoded.getOwner()); + UserGroupInformation decodedUser = decoded.getUser(); + assertEquals("name of " + decodedUser, + "alice", + decodedUser.getUserName()); + assertEquals("Authentication method of " + decodedUser, + UserGroupInformation.AuthenticationMethod.TOKEN, + decodedUser.getAuthenticationMethod()); + assertEquals("origin", decoded.getOrigin()); + } + + @Test + public void testFullTokenKind() throws Throwable { + AbstractS3ATokenIdentifier identifier + = new FullCredentialsTokenIdentifier(); + assertEquals(FULL_TOKEN_KIND, identifier.getKind()); + } + + @Test + public void testSessionTokenIdentifierRoundTrip() throws Throwable { + SessionTokenIdentifier id = new SessionTokenIdentifier( + SESSION_TOKEN_KIND, + new Text(), + landsatUri, + new MarshalledCredentials("a", "b", "c"), + new EncryptionSecrets(), ""); + + SessionTokenIdentifier result = S3ATestUtils.roundTrip(id, null); + String ids = id.toString(); + assertEquals("URI in " + ids, id.getUri(), result.getUri()); + assertEquals("credentials in " + ids, + id.getMarshalledCredentials(), + result.getMarshalledCredentials()); + } + + @Test + public void testRoleTokenIdentifierRoundTrip() throws Throwable { + RoleTokenIdentifier id = new RoleTokenIdentifier( + landsatUri, + new Text(), + new MarshalledCredentials("a", "b", "c"), + new EncryptionSecrets(), ""); + + RoleTokenIdentifier result = S3ATestUtils.roundTrip(id, null); + String ids = id.toString(); + assertEquals("URI in " + ids, id.getUri(), result.getUri()); + assertEquals("credentials in " + ids, + id.getMarshalledCredentials(), + result.getMarshalledCredentials()); + } + + @Test + public void testFullTokenIdentifierRoundTrip() throws Throwable { + FullCredentialsTokenIdentifier id = new FullCredentialsTokenIdentifier( + landsatUri, + new Text(), + new MarshalledCredentials("a", "b", ""), + new EncryptionSecrets(), ""); + + FullCredentialsTokenIdentifier result = S3ATestUtils.roundTrip(id, null); + String ids = id.toString(); + assertEquals("URI in " + ids, id.getUri(), result.getUri()); + assertEquals("credentials in " + ids, + id.getMarshalledCredentials(), + result.getMarshalledCredentials()); + } + + /** + * The secret manager always uses the same secret; the + * factory for new identifiers is that of the token manager. 
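+   *
+   * Returning a fixed password keeps the encode/decode round trips in these
+   * tests deterministic; it provides no security and is not intended to.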
+ */ + private class SessionSecretManager + extends SecretManager { + + @Override + protected byte[] createPassword(AbstractS3ATokenIdentifier identifier) { + return "PASSWORD".getBytes(); + } + + @Override + public byte[] retrievePassword(AbstractS3ATokenIdentifier identifier) + throws InvalidToken { + return "PASSWORD".getBytes(); + } + + @Override + public AbstractS3ATokenIdentifier createIdentifier() { + return new SessionTokenIdentifier(); + } + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/StagingTestBase.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/StagingTestBase.java index d81c747fcea00..2bee08c5f6ed6 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/StagingTestBase.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/StagingTestBase.java @@ -97,9 +97,16 @@ public class StagingTestBase { public static final String BUCKET = MockS3AFileSystem.BUCKET; public static final String OUTPUT_PREFIX = "output/path"; - public static final Path OUTPUT_PATH = + /** The raw bucket URI Path before any canonicalization. */ + public static final Path RAW_BUCKET_PATH = + new Path("s3a://" + BUCKET + "/"); + /** The raw bucket URI Path before any canonicalization. */ + public static final URI RAW_BUCKET_URI = + RAW_BUCKET_PATH.toUri(); + public static Path outputPath = new Path("s3a://" + BUCKET + "/" + OUTPUT_PREFIX); - public static final URI OUTPUT_PATH_URI = OUTPUT_PATH.toUri(); + public static URI outputPathUri = outputPath.toUri(); + public static Path root; protected StagingTestBase() { } @@ -119,8 +126,11 @@ protected static S3AFileSystem createAndBindMockFSInstance(Configuration conf, throws IOException { S3AFileSystem mockFs = mockS3AFileSystemRobustly(); MockS3AFileSystem wrapperFS = new MockS3AFileSystem(mockFs, outcome); - URI uri = OUTPUT_PATH_URI; + URI uri = RAW_BUCKET_URI; wrapperFS.initialize(uri, conf); + root = wrapperFS.makeQualified(new Path("/")); + outputPath = new Path(root, OUTPUT_PREFIX); + outputPathUri = outputPath.toUri(); FileSystemTestHelper.addFileSystemForTesting(uri, conf, wrapperFS); return mockFs; } @@ -142,7 +152,7 @@ private static S3AFileSystem mockS3AFileSystemRobustly() { */ public static MockS3AFileSystem lookupWrapperFS(Configuration conf) throws IOException { - return (MockS3AFileSystem) FileSystem.get(OUTPUT_PATH_URI, conf); + return (MockS3AFileSystem) FileSystem.get(outputPathUri, conf); } public static void verifyCompletion(FileSystem mockS3) throws IOException { @@ -157,13 +167,13 @@ public static void verifyDeleted(FileSystem mockS3, Path path) public static void verifyDeleted(FileSystem mockS3, String child) throws IOException { - verifyDeleted(mockS3, new Path(OUTPUT_PATH, child)); + verifyDeleted(mockS3, new Path(outputPath, child)); } public static void verifyCleanupTempFiles(FileSystem mockS3) throws IOException { verifyDeleted(mockS3, - new Path(OUTPUT_PATH, CommitConstants.TEMPORARY)); + new Path(outputPath, CommitConstants.TEMPORARY)); } protected static void assertConflictResolution( @@ -177,7 +187,7 @@ protected static void assertConflictResolution( public static void pathsExist(FileSystem mockS3, String... 
children) throws IOException { for (String child : children) { - pathExists(mockS3, new Path(OUTPUT_PATH, child)); + pathExists(mockS3, new Path(outputPath, child)); } } @@ -194,7 +204,7 @@ public static void pathDoesNotExist(FileSystem mockS3, Path path) public static void canDelete(FileSystem mockS3, String... children) throws IOException { for (String child : children) { - canDelete(mockS3, new Path(OUTPUT_PATH, child)); + canDelete(mockS3, new Path(outputPath, child)); } } @@ -206,7 +216,7 @@ public static void canDelete(FileSystem mockS3, Path f) throws IOException { public static void verifyExistenceChecked(FileSystem mockS3, String child) throws IOException { - verifyExistenceChecked(mockS3, new Path(OUTPUT_PATH, child)); + verifyExistenceChecked(mockS3, new Path(outputPath, child)); } public static void verifyExistenceChecked(FileSystem mockS3, Path path) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingCommitter.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingCommitter.java index 2c348f5245dc2..2f7e8e0520b49 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingCommitter.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingCommitter.java @@ -157,7 +157,7 @@ public void setupCommitter() throws Exception { this.tac = new TaskAttemptContextImpl( new Configuration(job.getConfiguration()), AID); - this.jobCommitter = new MockedStagingCommitter(OUTPUT_PATH, tac); + this.jobCommitter = new MockedStagingCommitter(outputPath, tac); jobCommitter.setupJob(job); // get the task's configuration copy so modifications take effect @@ -172,7 +172,7 @@ public void setupCommitter() throws Exception { this.conf.set(BUFFER_DIR, String.format("%s/local-0/, %s/local-1 ", tmp, tmp)); - this.committer = new MockedStagingCommitter(OUTPUT_PATH, tac); + this.committer = new MockedStagingCommitter(outputPath, tac); Paths.resetTempFolderCache(); } @@ -608,7 +608,7 @@ private Set runTasks(JobContext jobContext, TaskAttemptContext attempt = new TaskAttemptContextImpl( new Configuration(jobContext.getConfiguration()), attemptID); MockedStagingCommitter taskCommitter = new MockedStagingCommitter( - OUTPUT_PATH, attempt); + outputPath, attempt); commitTask(taskCommitter, attempt, numFiles); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingDirectoryOutputCommitter.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingDirectoryOutputCommitter.java index f5b8d1f872a83..b511293e55d4d 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingDirectoryOutputCommitter.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingDirectoryOutputCommitter.java @@ -36,7 +36,7 @@ public class TestStagingDirectoryOutputCommitter @Override DirectoryStagingCommitter newJobCommitter() throws Exception { - return new DirectoryStagingCommitter(OUTPUT_PATH, + return new DirectoryStagingCommitter(outputPath, createTaskAttemptForJob()); } @@ -64,7 +64,7 @@ public void testFailConflictResolution() throws Exception { protected void verifyFailureConflictOutcome() throws Exception { FileSystem mockS3 = getMockS3A(); - pathExists(mockS3, OUTPUT_PATH); + pathExists(mockS3, outputPath); final DirectoryStagingCommitter committer = newJobCommitter(); // this should fail @@ 
-77,14 +77,14 @@ protected void verifyFailureConflictOutcome() throws Exception { committer.commitJob(getJob()); reset(mockS3); - pathDoesNotExist(mockS3, OUTPUT_PATH); + pathDoesNotExist(mockS3, outputPath); committer.setupJob(getJob()); - verifyExistenceChecked(mockS3, OUTPUT_PATH); + verifyExistenceChecked(mockS3, outputPath); verifyNoMoreInteractions(mockS3); reset(mockS3); - pathDoesNotExist(mockS3, OUTPUT_PATH); + pathDoesNotExist(mockS3, outputPath); committer.commitJob(getJob()); verifyCompletion(mockS3); } @@ -93,7 +93,7 @@ protected void verifyFailureConflictOutcome() throws Exception { public void testAppendConflictResolution() throws Exception { FileSystem mockS3 = getMockS3A(); - pathExists(mockS3, OUTPUT_PATH); + pathExists(mockS3, outputPath); getJob().getConfiguration().set( FS_S3A_COMMITTER_STAGING_CONFLICT_MODE, CONFLICT_MODE_APPEND); @@ -104,7 +104,7 @@ public void testAppendConflictResolution() throws Exception { verifyNoMoreInteractions(mockS3); Mockito.reset(mockS3); - pathExists(mockS3, OUTPUT_PATH); + pathExists(mockS3, outputPath); committer.commitJob(getJob()); verifyCompletion(mockS3); @@ -114,7 +114,7 @@ public void testAppendConflictResolution() throws Exception { public void testReplaceConflictResolution() throws Exception { FileSystem mockS3 = getMockS3A(); - pathExists(mockS3, OUTPUT_PATH); + pathExists(mockS3, outputPath); getJob().getConfiguration().set( FS_S3A_COMMITTER_STAGING_CONFLICT_MODE, CONFLICT_MODE_REPLACE); @@ -125,11 +125,11 @@ public void testReplaceConflictResolution() throws Exception { verifyNoMoreInteractions(mockS3); Mockito.reset(mockS3); - pathExists(mockS3, OUTPUT_PATH); - canDelete(mockS3, OUTPUT_PATH); + pathExists(mockS3, outputPath); + canDelete(mockS3, outputPath); committer.commitJob(getJob()); - verifyDeleted(mockS3, OUTPUT_PATH); + verifyDeleted(mockS3, outputPath); verifyCompletion(mockS3); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingPartitionedFileListing.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingPartitionedFileListing.java index 139b4e36c59e7..cb332b89489c0 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingPartitionedFileListing.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingPartitionedFileListing.java @@ -51,13 +51,13 @@ public class TestStagingPartitionedFileListing @Override PartitionedStagingCommitter newJobCommitter() throws IOException { - return new PartitionedStagingCommitter(OUTPUT_PATH, + return new PartitionedStagingCommitter(outputPath, createTaskAttemptForJob()); } @Override PartitionedStagingCommitter newTaskCommitter() throws IOException { - return new PartitionedStagingCommitter(OUTPUT_PATH, getTAC()); + return new PartitionedStagingCommitter(outputPath, getTAC()); } private FileSystem attemptFS; diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingPartitionedJobCommit.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingPartitionedJobCommit.java index 55e4dc717a4c4..e7410e33fba94 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingPartitionedJobCommit.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingPartitionedJobCommit.java @@ -66,7 +66,7 @@ private static final class 
PartitionedStagingCommitterForTesting private PartitionedStagingCommitterForTesting(TaskAttemptContext context) throws IOException { - super(OUTPUT_PATH, context); + super(StagingTestBase.outputPath, context); } @Override @@ -219,7 +219,7 @@ public void testReplaceWithDeleteFailure() throws Exception { pathsExist(mockS3, "dateint=20161116/hour=14"); when(mockS3 .delete( - new Path(OUTPUT_PATH, "dateint=20161116/hour=14"), + new Path(outputPath, "dateint=20161116/hour=14"), true)) .thenThrow(new PathCommitException("fake", "Fake IOException for delete")); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingPartitionedTaskCommit.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingPartitionedTaskCommit.java index ddcb56e81f239..2409b681e16bd 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingPartitionedTaskCommit.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/TestStagingPartitionedTaskCommit.java @@ -47,13 +47,13 @@ public class TestStagingPartitionedTaskCommit @Override PartitionedStagingCommitter newJobCommitter() throws IOException { - return new PartitionedStagingCommitter(OUTPUT_PATH, + return new PartitionedStagingCommitter(outputPath, createTaskAttemptForJob()); } @Override PartitionedStagingCommitter newTaskCommitter() throws Exception { - return new PartitionedStagingCommitter(OUTPUT_PATH, getTAC()); + return new PartitionedStagingCommitter(outputPath, getTAC()); } // The set of files used by this test @@ -94,12 +94,17 @@ public void testDefault() throws Exception { // test failure when one partition already exists reset(mockS3); - pathExists(mockS3, new Path(OUTPUT_PATH, relativeFiles.get(0)).getParent()); + Path exists = new Path(outputPath, relativeFiles.get(0)).getParent(); + pathExists(mockS3, exists); intercept(PathExistsException.class, InternalCommitterConstants.E_DEST_EXISTS, - "Expected a PathExistsException as a partition already exists", - () -> committer.commitTask(getTAC())); + "Expected a PathExistsException as a partition" + + " already exists:" + exists, + () -> { + committer.commitTask(getTAC()); + mockS3.getFileStatus(exists); + }); // test success reset(mockS3); @@ -134,10 +139,11 @@ public void testFail() throws Exception { // test failure when one partition already exists reset(mockS3); - pathExists(mockS3, new Path(OUTPUT_PATH, relativeFiles.get(1)).getParent()); + Path existsPath = new Path(outputPath, relativeFiles.get(1)).getParent(); + pathExists(mockS3, existsPath); intercept(PathExistsException.class, "", - "Should complain because a partition already exists", + "Should complain because a partition already exists: " + existsPath, () -> committer.commitTask(getTAC())); // test success @@ -173,7 +179,7 @@ public void testAppend() throws Exception { // test success when one partition already exists reset(mockS3); - pathExists(mockS3, new Path(OUTPUT_PATH, relativeFiles.get(2)).getParent()); + pathExists(mockS3, new Path(outputPath, relativeFiles.get(2)).getParent()); committer.commitTask(getTAC()); Set files = Sets.newHashSet(); @@ -207,7 +213,7 @@ public void testReplace() throws Exception { // test success when one partition already exists reset(mockS3); - pathExists(mockS3, new Path(OUTPUT_PATH, relativeFiles.get(3)).getParent()); + pathExists(mockS3, new Path(outputPath, relativeFiles.get(3)).getParent()); committer.commitTask(getTAC()); Set files = Sets.newHashSet(); 
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/fileContext/ITestS3AFileContextStatistics.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/fileContext/ITestS3AFileContextStatistics.java index e493818ffb84a..5dc8be06b21c4 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/fileContext/ITestS3AFileContextStatistics.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/fileContext/ITestS3AFileContextStatistics.java @@ -14,11 +14,18 @@ package org.apache.hadoop.fs.s3a.fileContext; import java.net.URI; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FCStatisticsBaseTest; import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.auth.STSClientFactory; + import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -28,20 +35,25 @@ */ public class ITestS3AFileContextStatistics extends FCStatisticsBaseTest { + private static final Logger LOG = + LoggerFactory.getLogger(STSClientFactory.class); + + private Path testRootPath; + @Before public void setUp() throws Exception { Configuration conf = new Configuration(); fc = S3ATestUtils.createTestFileContext(conf); - fc.mkdir(fileContextTestHelper.getTestRootPath(fc, "test"), + testRootPath = fileContextTestHelper.getTestRootPath(fc, "test"); + fc.mkdir(testRootPath, FileContext.DEFAULT_PERM, true); FileContext.clearStatistics(); } @After public void tearDown() throws Exception { - if (fc != null) { - fc.delete(fileContextTestHelper.getTestRootPath(fc, "test"), true); - } + S3ATestUtils.callQuietly(LOG, + () -> fc != null && fc.delete(testRootPath, true)); } @Override diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/NanoTimerStats.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/NanoTimerStats.java new file mode 100644 index 0000000000000..c5b6660fcd23e --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/NanoTimerStats.java @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.scale; + +import org.apache.hadoop.fs.contract.ContractTestUtils; + +/** + * Collect statistics from duration data from + * {@link ContractTestUtils.NanoTimer} values. + * + * The mean and standard deviation is built up as the stats are collected, + * using "Welford's Online algorithm" for the variance. + * Trends in statistics (e.g. slowing down) are not tracked. + * Not synchronized. 
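+ *
+ * In outline, {@link #add(long)} applies the Welford update for each new
+ * sample {@code x}:
+ * {@code delta = x - mean; mean += delta / count; m2 += delta * (x - mean);}
+ * and {@link #getVariance()} then reports {@code m2 / (count - 1)}.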
+ */ +public class NanoTimerStats { + + private static final double ONE_NS = 1.0e9; + + private final String operation; + + private int count; + + private double sum; + + private double min; + + private double max; + + private double mean; + + private double m2; + + /** + * Construct statistics for a given operation. + * @param operation operation + */ + public NanoTimerStats(String operation) { + this.operation = operation; + reset(); + } + + /** + * construct from another stats entry; + * all value are copied. + * @param that the source statistics + */ + public NanoTimerStats(NanoTimerStats that) { + operation = that.operation; + count = that.count; + sum = that.sum; + min = that.min; + max = that.max; + mean = that.mean; + m2 = that.m2; + } + + /** + * Add a duration. + * @param duration the new duration + */ + public void add(ContractTestUtils.NanoTimer duration) { + add(duration.elapsedTime()); + } + + /** + * Add a number. + * @param x the number + */ + public void add(long x) { + count++; + sum += x; + double delta = x - mean; + mean += delta / count; + double delta2 = x - mean; + m2 += delta * delta2; + if (min < 0 || x < min) { + min = x; + } + if (x > max) { + max = x; + } + } + + /** + * Reset the data. + */ + public void reset() { + count = 0; + sum = 0; + sum = 0; + min = -1; + max = 0; + mean = 0; + m2 = 0; + } + + /** + * Get the number of entries sampled. + * @return the number of durations added + */ + public int getCount() { + return count; + } + + /** + * Get the sum of all durations. + * @return all the durations + */ + public double getSum() { + return sum; + } + + /** + * Get the arithmetic mean of the aggregate statistics. + * @return the arithmetic mean + */ + public double getArithmeticMean() { + return mean; + } + + /** + * Variance, {@code sigma^2}. + * @return variance, or, if no samples are there, 0. + */ + public double getVariance() { + return count > 0 ? (m2 / (count - 1)) : + Double.NaN; + } + + /** + * Get the std deviation, sigma. + * @return the stddev, 0 may mean there are no samples. + */ + public double getDeviation() { + double variance = getVariance(); + return (!Double.isNaN(variance) && variance > 0) ? Math.sqrt(variance) : 0; + } + + private double toSeconds(double nano) { + return nano / ONE_NS; + } + + /** + * Covert to a useful string. 
+ * @return a human readable summary + */ + @Override + public String toString() { + return String.format( + "%s count=%d total=%.3fs mean=%.3fs stddev=%.3fs min=%.3fs max=%.3fs", + operation, + count, + toSeconds(sum), + toSeconds(mean), + getDeviation() / ONE_NS, + toSeconds(min), + toSeconds(max)); + } + + public String getOperation() { + return operation; + } + + public double getMin() { + return min; + } + + public double getMax() { + return max; + } + + public double getMean() { + return mean; + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/yarn/ITestS3AMiniYarnCluster.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/yarn/ITestS3AMiniYarnCluster.java index 6db4ebaf1d3b8..87160937a1825 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/yarn/ITestS3AMiniYarnCluster.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/yarn/ITestS3AMiniYarnCluster.java @@ -27,35 +27,50 @@ import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.s3a.AbstractS3ATestBase; import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.commit.files.SuccessData; +import org.apache.hadoop.fs.s3a.commit.staging.StagingCommitter; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.MiniYARNCluster; import org.junit.Test; +import static org.apache.hadoop.fs.s3a.commit.CommitConstants.FS_S3A_COMMITTER_NAME; +import static org.apache.hadoop.fs.s3a.commit.CommitConstants.FS_S3A_COMMITTER_STAGING_UNIQUE_FILENAMES; +import static org.apache.hadoop.fs.s3a.commit.CommitConstants._SUCCESS; + /** * Tests that S3A is usable through a YARN application. */ public class ITestS3AMiniYarnCluster extends AbstractS3ATestBase { - private final Configuration conf = new YarnConfiguration(); - private S3AFileSystem fs; private MiniYARNCluster yarnCluster; private Path rootPath; + @Override + protected Configuration createConfiguration() { + Configuration conf = super.createConfiguration(); + // and set up commit code + conf.setBoolean(FS_S3A_COMMITTER_STAGING_UNIQUE_FILENAMES, + false); + conf.set(FS_S3A_COMMITTER_NAME, StagingCommitter.NAME); + return conf; + } + @Override public void setup() throws Exception { super.setup(); - fs = S3ATestUtils.createTestFileSystem(conf); + S3AFileSystem fs = getFileSystem(); + Configuration conf = getConfiguration(); rootPath = path("MiniClusterWordCount"); Path workingDir = path("working"); fs.setWorkingDirectory(workingDir); @@ -79,6 +94,9 @@ public void teardown() throws Exception { @Test public void testWithMiniCluster() throws Exception { + S3AFileSystem fs = getFileSystem(); + Configuration conf = getConfiguration(); + Path input = new Path(rootPath, "input/in.txt"); input = input.makeQualified(fs.getUri(), fs.getWorkingDirectory()); Path output = new Path(rootPath, "output/"); @@ -99,7 +117,13 @@ public void testWithMiniCluster() throws Exception { int exitCode = (job.waitForCompletion(true) ? 
0 : 1); assertEquals("Returned error code.", 0, exitCode); - assertTrue(fs.exists(new Path(output, "_SUCCESS"))); + Path success = new Path(output, _SUCCESS); + FileStatus status = fs.getFileStatus(success); + assertTrue("0 byte success file - not a s3guard committer " + success, + status.getLen() > 0); + SuccessData successData = SuccessData.load(fs, success); + String commitDetails = successData.toString(); + LOG.info("Committer details \n{}", commitDetails); String outputAsStr = readStringFromFile(new Path(output, "part-r-00000")); Map resAsMap = getResultAsMap(outputAsStr); @@ -130,6 +154,8 @@ private Map getResultAsMap(String outputAsStr) * helper method. */ private void writeStringToFile(Path path, String string) throws IOException { + Configuration conf = getConfiguration(); + FileContext fc = S3ATestUtils.createTestFileContext(conf); try (FSDataOutputStream file = fc.create(path, EnumSet.of(CreateFlag.CREATE))) { @@ -141,6 +167,8 @@ private void writeStringToFile(Path path, String string) throws IOException { * helper method. */ private String readStringFromFile(Path path) throws IOException { + S3AFileSystem fs = getFileSystem(); + try (FSDataInputStream in = fs.open(path)) { long bytesLen = fs.getFileStatus(path).getLen(); byte[] buffer = new byte[(int) bytesLen]; diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/mapreduce/MockJob.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/mapreduce/MockJob.java new file mode 100644 index 0000000000000..7886c92467159 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/mapreduce/MockJob.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.mapreduce; + +import java.io.IOException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.protocol.ClientProtocol; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.authorize.AccessControlList; + +import static org.mockito.Matchers.*; +import static org.mockito.Mockito.*; + +/** + * This is a mock job which doesn't talk to YARN. + * It's in this package as the JobSubmitter API is package-scoped. 
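+ *
+ * The mocked {@code ClientProtocol} captures the {@code Credentials} handed
+ * to {@code submitJob()}, so a test can call {@link #getSubmittedCredentials()}
+ * afterwards to verify what would have accompanied a real submission.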
+ */ +public class MockJob extends Job { + + private static final Logger LOG = + LoggerFactory.getLogger(MockJob.class); + + public static final String NAME = "mock"; + + private final ClientProtocol mockClient; + private static int jobIdCounter; + + private static String trackerId = Long.toString(System.currentTimeMillis()); + + private Credentials submittedCredentials; + + public MockJob(final JobConf conf, final String jobName) + throws IOException, InterruptedException { + super(conf); + setJobName(jobName); + mockClient = mock(ClientProtocol.class); + init(); + } + + public void init() throws IOException, InterruptedException { + when(mockClient.submitJob(any(JobID.class), + any(String.class), + any(Credentials.class))) + .thenAnswer(invocation -> { + + final Object[] args = invocation.getArguments(); + String name = (String) args[1]; + LOG.info("Submitted Job {}", name); + submittedCredentials = (Credentials) args[2]; + final JobStatus status = new JobStatus(); + status.setState(JobStatus.State.RUNNING); + status.setSchedulingInfo(NAME); + status.setTrackingUrl("http://localhost:8080/"); + return status; + }); + + when(mockClient.getNewJobID()) + .thenReturn( + new JobID(trackerId, jobIdCounter++)); + + when(mockClient.getQueueAdmins(any(String.class))) + .thenReturn( + new AccessControlList(AccessControlList.WILDCARD_ACL_VALUE)); + } + + @Override + public boolean isSuccessful() throws IOException { + return true; + } + + /** Only for mocking via unit tests. */ + @InterfaceAudience.Private + JobSubmitter getJobSubmitter(FileSystem fs, + ClientProtocol submitClient) throws IOException { + + return new JobSubmitter(fs, mockClient); + } + + @Override + synchronized void connect() + throws IOException, InterruptedException, ClassNotFoundException { + super.connect(); + } + + public Credentials getSubmittedCredentials() { + return submittedCredentials; + } + + @Override + synchronized void updateStatus() throws IOException { + // no-op + } +} From a185bf6f22da6389f89b3323e04273091cd7330f Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Wed, 22 Nov 2017 20:58:12 +0530 Subject: [PATCH 33/40] HADOOP-13786 Add S3A committer for zero-rename commits to S3 endpoints. Contributed by Steve Loughran and Ryan Blue. 
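A quick sketch of how a job binds to one of these committers, as the new integration tests below do: the choice is driven entirely by configuration. This is a minimal illustration only; it assumes a MapReduce Job named "job", the static imports of the CommitConstants keys used by those tests, and that the S3A committer factory is already wired up.

    // bind the job to the directory staging committer, with the
    // "append" conflict policy for an existing destination
    Configuration conf = job.getConfiguration();
    conf.set(FS_S3A_COMMITTER_NAME, DirectoryStagingCommitter.NAME);
    conf.set(FS_S3A_COMMITTER_STAGING_CONFLICT_MODE, CONFLICT_MODE_APPEND);

    // alternatively, the magic committer, which also needs its
    // filesystem-side switch enabled
    conf.set(FS_S3A_COMMITTER_NAME, MagicS3GuardCommitter.NAME);
    conf.setBoolean(MAGIC_COMMITTER_ENABLED, true);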
--- .../hadoop/fs/s3a/S3ObjectAttributes.java | 10 +-- .../fs/s3a/ITestS3AFileOperationCost.java | 2 + .../s3a/commit/magic/ITMagicCommitMRJob.java | 70 +++++++++++++++++++ .../integration/ITDirectoryCommitMRJob.java | 33 +++++++++ .../integration/ITPartitionCommitMRJob.java | 33 +++++++++ .../integration/ITStagingCommitMRJob.java | 66 +++++++++++++++++ 6 files changed, 209 insertions(+), 5 deletions(-) create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/magic/ITMagicCommitMRJob.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/integration/ITDirectoryCommitMRJob.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/integration/ITPartitionCommitMRJob.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/integration/ITStagingCommitMRJob.java diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java index d67e3e1e8cbc6..19c810683d86d 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java @@ -35,7 +35,7 @@ public class S3ObjectAttributes { private final S3AEncryptionMethods serverSideEncryptionAlgorithm; private final String serverSideEncryptionKey; - public S3ObjectAttributes( + S3ObjectAttributes( String bucket, String key, S3AEncryptionMethods serverSideEncryptionAlgorithm, @@ -46,19 +46,19 @@ public S3ObjectAttributes( this.serverSideEncryptionKey = serverSideEncryptionKey; } - public String getBucket() { + String getBucket() { return bucket; } - public String getKey() { + String getKey() { return key; } - public S3AEncryptionMethods getServerSideEncryptionAlgorithm() { + S3AEncryptionMethods getServerSideEncryptionAlgorithm() { return serverSideEncryptionAlgorithm; } - public String getServerSideEncryptionKey() { + String getServerSideEncryptionKey() { return serverSideEncryptionKey; } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileOperationCost.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileOperationCost.java index e147c6a64806e..b3a077ed82f35 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileOperationCost.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFileOperationCost.java @@ -200,6 +200,8 @@ public void testFakeDirectoryDeletion() throws Throwable { // before the internal behavior w/ or w/o metadata store. // assumeFalse(fs.hasMetadataStore()); + skipDuringFaultInjection(fs); + Path srcBaseDir = path("src"); mkdirs(srcBaseDir); MetricDiff deleteRequests = diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/magic/ITMagicCommitMRJob.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/magic/ITMagicCommitMRJob.java new file mode 100644 index 0000000000000..57eb8b226faef --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/magic/ITMagicCommitMRJob.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.commit.magic; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.commit.AbstractITCommitMRJob; +import org.apache.hadoop.fs.s3a.commit.files.SuccessData; + +import static org.apache.hadoop.fs.s3a.commit.CommitConstants.*; + +/** + * Full integration test for the Magic Committer. + * + * There's no need to disable the committer setting for the filesystem here, + * because the committers are being instantiated in their own processes; + * the settings in {@link #applyCustomConfigOptions(Configuration)} are + * passed down to these processes. + */ +public class ITMagicCommitMRJob extends AbstractITCommitMRJob { + + /** + * Need consistency here. + * @return false + */ + @Override + public boolean useInconsistentClient() { + return false; + } + + @Override + protected String committerName() { + return MagicS3GuardCommitter.NAME; + } + + /** + * Turn on the magic commit support for the FS, else nothing will work. + * @param conf configuration + */ + @Override + protected void applyCustomConfigOptions(Configuration conf) { + conf.setBoolean(MAGIC_COMMITTER_ENABLED, true); + } + + /** + * Check that the magic dir was cleaned up. + * {@inheritDoc} + */ + @Override + protected void customPostExecutionValidation(Path destPath, + SuccessData successData) throws Exception { + assertPathDoesNotExist("No cleanup", new Path(destPath, MAGIC)); + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/integration/ITDirectoryCommitMRJob.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/integration/ITDirectoryCommitMRJob.java new file mode 100644 index 0000000000000..c10ebed98d0c1 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/integration/ITDirectoryCommitMRJob.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3a.commit.staging.integration; + +import org.apache.hadoop.fs.s3a.commit.AbstractITCommitMRJob; +import org.apache.hadoop.fs.s3a.commit.staging.DirectoryStagingCommitter; + +/** + * Full integration test for the directory committer. + */ +public class ITDirectoryCommitMRJob extends AbstractITCommitMRJob { + + @Override + protected String committerName() { + return DirectoryStagingCommitter.NAME; + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/integration/ITPartitionCommitMRJob.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/integration/ITPartitionCommitMRJob.java new file mode 100644 index 0000000000000..1c19a952081a8 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/integration/ITPartitionCommitMRJob.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.commit.staging.integration; + +import org.apache.hadoop.fs.s3a.commit.AbstractITCommitMRJob; +import org.apache.hadoop.fs.s3a.commit.staging.PartitionedStagingCommitter; + +/** + * Full integration test for the partition committer. + */ +public class ITPartitionCommitMRJob extends AbstractITCommitMRJob { + + @Override + protected String committerName() { + return PartitionedStagingCommitter.NAME; + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/integration/ITStagingCommitMRJob.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/integration/ITStagingCommitMRJob.java new file mode 100644 index 0000000000000..76ad4645a2060 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/staging/integration/ITStagingCommitMRJob.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3a.commit.staging.integration; + +import org.junit.Test; + +import org.hamcrest.core.StringContains; +import org.hamcrest.core.StringEndsWith; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.commit.AbstractITCommitMRJob; +import org.apache.hadoop.fs.s3a.commit.CommitConstants; +import org.apache.hadoop.fs.s3a.commit.staging.StagingCommitter; +import org.apache.hadoop.fs.s3a.commit.staging.StagingCommitterConstants; +import org.apache.hadoop.security.UserGroupInformation; + +import static org.apache.hadoop.fs.s3a.commit.staging.Paths.getMultipartUploadCommitsDirectory; + +/** + * Full integration test for the staging committer. + */ +public class ITStagingCommitMRJob extends AbstractITCommitMRJob { + + @Override + protected String committerName() { + return StagingCommitter.NAME; + } + + /** + * Verify that staging commit dirs are made absolute under the user's + * home directory, so, in a secure cluster, private. + */ + @Test + public void testStagingDirectory() throws Throwable { + FileSystem hdfs = getDFS(); + Configuration conf = hdfs.getConf(); + conf.set(CommitConstants.FS_S3A_COMMITTER_STAGING_TMP_PATH, + "private"); + Path dir = getMultipartUploadCommitsDirectory(conf, "UUID"); + assertThat(dir.toString(), StringEndsWith.endsWith( + "UUID/" + + StagingCommitterConstants.STAGING_UPLOADS)); + assertTrue("path unqualified", dir.isAbsolute()); + String self = UserGroupInformation.getCurrentUser().getShortUserName(); + assertThat(dir.toString(), + StringContains.containsString("/user/" + self + "/private")); + } + +} From d630ea661ba42c9b5cf1c60cda7228a7e5c898bf Mon Sep 17 00:00:00 2001 From: Abhishek Modi Date: Tue, 26 Feb 2019 22:40:12 +0530 Subject: [PATCH 34/40] HADOOP-16093. Move DurationInfo from hadoop-aws to hadoop-common org.apache.hadoop.util. 
Contributed by Abhishek Modi --- .../org/apache/hadoop/util}/DurationInfo.java | 6 ++- .../apache/hadoop/util/OperationDuration.java | 11 ++-- .../apache/hadoop/util/TestDurationInfo.java | 54 +++++++++++++++++++ .../hadoop/fs/s3a/WriteOperationHelper.java | 2 +- .../AbstractDelegationTokenBinding.java | 2 +- .../auth/delegation/S3ADelegationTokens.java | 2 +- .../fs/s3a/commit/AbstractS3ACommitter.java | 1 + .../commit/magic/MagicS3GuardCommitter.java | 2 +- .../s3a/commit/staging/StagingCommitter.java | 2 +- .../hadoop/fs/s3a/select/SelectTool.java | 10 ++-- .../fs/s3a/ITestS3ATemporaryCredentials.java | 2 +- .../s3a/commit/AbstractITCommitProtocol.java | 2 +- .../fs/s3a/select/AbstractS3SelectTest.java | 2 +- .../hadoop/fs/s3a/select/ITestS3Select.java | 2 +- .../fs/s3a/select/ITestS3SelectCLI.java | 4 +- .../fs/s3a/select/ITestS3SelectLandsat.java | 2 +- .../fs/s3a/select/ITestS3SelectMRJob.java | 2 +- 17 files changed, 85 insertions(+), 23 deletions(-) rename {hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit => hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util}/DurationInfo.java (93%) rename hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/Duration.java => hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/OperationDuration.java (84%) create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDurationInfo.java diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/DurationInfo.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DurationInfo.java similarity index 93% rename from hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/DurationInfo.java rename to hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DurationInfo.java index 69f90cb651632..9dd75db27c733 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/DurationInfo.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DurationInfo.java @@ -16,11 +16,12 @@ * limitations under the License. */ -package org.apache.hadoop.fs.s3a.commit; +package org.apache.hadoop.util; import org.slf4j.Logger; import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; /** * A duration with logging of final state at info or debug @@ -29,7 +30,8 @@ * duration automatically logged. */ @InterfaceAudience.Private -public class DurationInfo extends Duration +@InterfaceStability.Unstable +public class DurationInfo extends OperationDuration implements AutoCloseable { private final String text; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/Duration.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/OperationDuration.java similarity index 84% rename from hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/Duration.java rename to hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/OperationDuration.java index c44a90b8aed25..3276d2138bbfc 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/Duration.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/OperationDuration.java @@ -16,17 +16,22 @@ * limitations under the License. 
*/ -package org.apache.hadoop.fs.s3a.commit; +package org.apache.hadoop.util; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; /** * Little duration counter. */ -public class Duration { +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class OperationDuration { private final long started; private long finished; - public Duration() { + public OperationDuration() { started = time(); finished = started; } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDurationInfo.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDurationInfo.java new file mode 100644 index 0000000000000..d1fa70319eb84 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestDurationInfo.java @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.util; + +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * The class to test DurationInfo. 
+ */ +public class TestDurationInfo { + private final Logger log = LoggerFactory.getLogger(TestDurationInfo.class); + + @Test + public void testDurationInfoCreation() throws Exception { + DurationInfo info = new DurationInfo(log, "test"); + Assert.assertTrue(info.value() >= 0); + Thread.sleep(1000); + info.finished(); + Assert.assertTrue(info.value() > 0); + } + + @Test + public void testDurationInfoWithMultipleClose() throws Exception { + DurationInfo info = new DurationInfo(log, "test"); + Thread.sleep(1000); + info.close(); + info.close(); + Assert.assertTrue(info.value() > 0); + } + + @Test(expected = NullPointerException.class) + public void testDurationInfoCreationWithNullMsg() { + DurationInfo info = new DurationInfo(log, null); + info.close(); + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java index e16f7229ac18c..8a1599ad8ecf4 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java @@ -48,8 +48,8 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.s3a.commit.DurationInfo; import org.apache.hadoop.fs.s3a.select.SelectBinding; +import org.apache.hadoop.util.DurationInfo; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDelegationTokenBinding.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDelegationTokenBinding.java index 73660ea88b41d..01f90ff7ce24c 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDelegationTokenBinding.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/AbstractDelegationTokenBinding.java @@ -30,10 +30,10 @@ import org.apache.hadoop.fs.s3a.AWSCredentialProviderList; import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.auth.RoleModel; -import org.apache.hadoop.fs.s3a.commit.DurationInfo; import org.apache.hadoop.io.Text; import org.apache.hadoop.security.token.SecretManager; import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.util.DurationInfo; import static java.util.Objects.requireNonNull; import static org.apache.hadoop.fs.s3a.auth.delegation.DelegationConstants.DURATION_LOG_AT_INFO; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/S3ADelegationTokens.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/S3ADelegationTokens.java index b8eeca135079f..50726a33d2857 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/S3ADelegationTokens.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/auth/delegation/S3ADelegationTokens.java @@ -37,12 +37,12 @@ import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.S3AInstrumentation; import org.apache.hadoop.fs.s3a.auth.RoleModel; -import org.apache.hadoop.fs.s3a.commit.DurationInfo; import org.apache.hadoop.io.Text; import org.apache.hadoop.security.Credentials; import 
org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.service.ServiceOperations; +import org.apache.hadoop.util.DurationInfo; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/AbstractS3ACommitter.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/AbstractS3ACommitter.java index d2501da6aad35..ed608cb983186 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/AbstractS3ACommitter.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/AbstractS3ACommitter.java @@ -48,6 +48,7 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.output.PathOutputCommitter; import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.util.DurationInfo; import static org.apache.hadoop.fs.s3a.Invoker.ignoreIOExceptions; import static org.apache.hadoop.fs.s3a.S3AUtils.*; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/magic/MagicS3GuardCommitter.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/magic/MagicS3GuardCommitter.java index c956a9806417f..813b9a77460e3 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/magic/MagicS3GuardCommitter.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/magic/MagicS3GuardCommitter.java @@ -35,12 +35,12 @@ import org.apache.hadoop.fs.s3a.commit.CommitOperations; import org.apache.hadoop.fs.s3a.commit.CommitConstants; import org.apache.hadoop.fs.s3a.commit.CommitUtilsWithMR; -import org.apache.hadoop.fs.s3a.commit.DurationInfo; import org.apache.hadoop.fs.s3a.commit.files.PendingSet; import org.apache.hadoop.fs.s3a.commit.files.SinglePendingCommit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.TaskAttemptID; +import org.apache.hadoop.util.DurationInfo; import static org.apache.hadoop.fs.s3a.S3AUtils.*; import static org.apache.hadoop.fs.s3a.commit.CommitUtils.*; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/staging/StagingCommitter.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/staging/StagingCommitter.java index 5f5b8ec826e44..cd6d13d642c3b 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/staging/StagingCommitter.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/commit/staging/StagingCommitter.java @@ -41,7 +41,6 @@ import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.commit.AbstractS3ACommitter; import org.apache.hadoop.fs.s3a.commit.CommitConstants; -import org.apache.hadoop.fs.s3a.commit.DurationInfo; import org.apache.hadoop.fs.s3a.commit.InternalCommitterConstants; import org.apache.hadoop.fs.s3a.commit.Tasks; import org.apache.hadoop.fs.s3a.commit.files.PendingSet; @@ -50,6 +49,7 @@ import org.apache.hadoop.mapreduce.JobID; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; +import org.apache.hadoop.util.DurationInfo; import static com.google.common.base.Preconditions.*; import static org.apache.hadoop.fs.s3a.Constants.*; diff --git 
a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java index c89cc287f2eaf..4b362c667ece6 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java @@ -41,11 +41,11 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.fs.s3a.S3AFileSystem; -import org.apache.hadoop.fs.s3a.commit.Duration; -import org.apache.hadoop.fs.s3a.commit.DurationInfo; import org.apache.hadoop.fs.s3a.s3guard.S3GuardTool; import org.apache.hadoop.fs.shell.CommandFormat; +import org.apache.hadoop.util.DurationInfo; import org.apache.hadoop.util.ExitUtil; +import org.apache.hadoop.util.OperationDuration; import static org.apache.commons.lang3.StringUtils.isNotEmpty; import static org.apache.hadoop.io.IOUtils.cleanupWithLogger; @@ -102,7 +102,7 @@ public class SelectTool extends S3GuardTool { static final String SELECT_IS_DISABLED = "S3 Select is disabled"; - private Duration selectDuration; + private OperationDuration selectDuration; private long bytesRead; @@ -130,7 +130,7 @@ public String getUsage() { return USAGE; } - public Duration getSelectDuration() { + public OperationDuration getSelectDuration() { return selectDuration; } @@ -241,7 +241,7 @@ public int run(String[] args, PrintStream out) } linesRead = 0; - selectDuration = new Duration(); + selectDuration = new OperationDuration(); // open and scan the stream. final FutureDataInputStreamBuilder builder = fs.openFile(path) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ATemporaryCredentials.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ATemporaryCredentials.java index a0573c001ea45..4f2d731aecbbf 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ATemporaryCredentials.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ATemporaryCredentials.java @@ -40,9 +40,9 @@ import org.apache.hadoop.fs.s3a.auth.STSClientFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.s3a.auth.delegation.SessionTokenIdentifier; -import org.apache.hadoop.fs.s3a.commit.DurationInfo; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.test.LambdaTestUtils; +import org.apache.hadoop.util.DurationInfo; import static org.apache.hadoop.fs.contract.ContractTestUtils.*; import static org.apache.hadoop.fs.s3a.Constants.*; diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractITCommitProtocol.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractITCommitProtocol.java index 5ae8f54522724..027bcb7a93c7a 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractITCommitProtocol.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractITCommitProtocol.java @@ -33,7 +33,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.FileSystemTestHelper; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.contract.ContractTestUtils; @@ -59,6 +58,7 @@ import org.apache.hadoop.mapreduce.task.JobContextImpl; import 
org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; import org.apache.hadoop.mapreduce.v2.util.MRBuilderUtils; +import org.apache.hadoop.util.DurationInfo; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.concurrent.HadoopExecutors; diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/AbstractS3SelectTest.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/AbstractS3SelectTest.java index 18138a616bbe4..56d99d1abe391 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/AbstractS3SelectTest.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/AbstractS3SelectTest.java @@ -47,7 +47,6 @@ import org.apache.hadoop.fs.s3a.AbstractS3ATestBase; import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.commit.AbstractCommitITest; -import org.apache.hadoop.fs.s3a.commit.DurationInfo; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.PassthroughCodec; @@ -59,6 +58,7 @@ import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; +import org.apache.hadoop.util.DurationInfo; import static org.apache.hadoop.fs.impl.FutureIOSupport.awaitFuture; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getLandsatCSVPath; diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java index 3dc2c6125762d..64974db5a466c 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java @@ -50,7 +50,6 @@ import org.apache.hadoop.fs.s3a.S3AInstrumentation; import org.apache.hadoop.fs.s3a.S3ATestUtils; import org.apache.hadoop.fs.s3a.Statistic; -import org.apache.hadoop.fs.s3a.commit.DurationInfo; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.JobConf; @@ -62,6 +61,7 @@ import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.task.JobContextImpl; +import org.apache.hadoop.util.DurationInfo; import static org.apache.hadoop.fs.s3a.Constants.INPUT_FADVISE; import static org.apache.hadoop.fs.s3a.Constants.INPUT_FADV_NORMAL; diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java index c04cf8bff76c1..fccf708fef4e8 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java @@ -34,9 +34,9 @@ import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.S3ATestUtils; import org.apache.hadoop.fs.s3a.Statistic; -import org.apache.hadoop.fs.s3a.commit.Duration; import org.apache.hadoop.fs.s3a.s3guard.S3GuardTool; import org.apache.hadoop.util.ExitUtil; +import org.apache.hadoop.util.OperationDuration; import org.apache.hadoop.util.ToolRunner; import static com.google.common.base.Preconditions.checkNotNull; @@ -165,7 +165,7 @@ D, v(CSV_OUTPUT_QUOTE_FIELDS, 
CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED), LOG.info("Result from select:\n{}", lines.get(0)); assertEquals(lineCount, lines.size()); selectCount.assertDiffEquals("select count", 1); - Duration duration = selectTool.getSelectDuration(); + OperationDuration duration = selectTool.getSelectDuration(); assertTrue("Select duration was not measured", duration.value() > 0); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java index 780040e6a48a3..78f3a6d1fe558 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java @@ -36,8 +36,8 @@ import org.apache.hadoop.fs.s3a.S3AInstrumentation; import org.apache.hadoop.fs.s3a.S3ATestUtils; import org.apache.hadoop.fs.s3a.Statistic; -import org.apache.hadoop.fs.s3a.commit.DurationInfo; import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.util.DurationInfo; import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestPropertyBool; diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java index 86d1590fce6d0..ee7de8c7ac2f2 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java @@ -31,7 +31,6 @@ import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.S3ATestUtils; import org.apache.hadoop.fs.s3a.S3AUtils; -import org.apache.hadoop.fs.s3a.commit.DurationInfo; import org.apache.hadoop.fs.s3a.commit.files.SuccessData; import org.apache.hadoop.fs.s3a.commit.staging.StagingCommitter; import org.apache.hadoop.io.IOUtils; @@ -41,6 +40,7 @@ import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.util.DurationInfo; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.MiniYARNCluster; From b169df3d4784cd01ab895c7104110420bf709a7c Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Tue, 12 Nov 2019 23:47:02 +0530 Subject: [PATCH 35/40] HADOOP-16665. Filesystems to be closed if they failed during initialize(). Contributed by Steve Loughran. This FileSystem instantiation so if an IOException or RuntimeException is raised in the invocation of FileSystem.initialize() then a best-effort attempt is made to close the FS instance; exceptions raised that there are swallowed. The S3AFileSystem is also modified to do its own cleanup if an IOException is raised during its initialize() process, it being the FS we know has the "potential" to leak threads, especially in extension points (e.g AWS Authenticators) which spawn threads. 
Change-Id: Ib84073a606c9d53bf53cbfca4629876a03894f04 --- .../java/org/apache/hadoop/fs/FileSystem.java | 20 ++- .../fs/TestFileSystemInitialization.java | 125 ++++++++++++++++++ .../hadoop/test/AbstractHadoopTestBase.java | 110 +++++++++++++++ .../apache/hadoop/test/HadoopTestBase.java | 4 +- .../hadoop/fs/s3a/S3ABlockOutputStream.java | 9 +- .../apache/hadoop/fs/s3a/S3ADataBlocks.java | 4 +- .../ITestAssumedRoleCommitOperations.java | 4 +- 7 files changed, 264 insertions(+), 12 deletions(-) create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/AbstractHadoopTestBase.java diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java index 95850eb0760d5..adc5c9231d309 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java @@ -64,6 +64,7 @@ import org.apache.hadoop.fs.permission.FsAction; import org.apache.hadoop.fs.permission.FsCreateModes; import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.MultipleIOException; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.security.AccessControlException; @@ -3390,9 +3391,22 @@ private static FileSystem createFileSystem(URI uri, Configuration conf) Tracer tracer = FsTracer.get(conf); try(TraceScope scope = tracer.newScope("FileSystem#createFileSystem")) { scope.addKVAnnotation("scheme", uri.getScheme()); - Class clazz = getFileSystemClass(uri.getScheme(), conf); - FileSystem fs = (FileSystem)ReflectionUtils.newInstance(clazz, conf); - fs.initialize(uri, conf); + Class clazz = + getFileSystemClass(uri.getScheme(), conf); + FileSystem fs = ReflectionUtils.newInstance(clazz, conf); + try { + fs.initialize(uri, conf); + } catch (IOException | RuntimeException e) { + // exception raised during initialization. + // log summary at warn and full stack at debug + LOGGER.warn("Failed to initialize fileystem {}: {}", + uri, e.toString()); + LOGGER.debug("Failed to initialize fileystem", e); + // then (robustly) close the FS, so as to invoke any + // cleanup code. + IOUtils.cleanupWithLogger(LOGGER, fs); + throw e; + } return fs; } } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestFileSystemInitialization.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestFileSystemInitialization.java index 4d627a5e8e256..10ad8a14487ef 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestFileSystemInitialization.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestFileSystemInitialization.java @@ -18,14 +18,24 @@ package org.apache.hadoop.fs; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.util.Progressable; +import java.io.FileNotFoundException; import java.io.IOException; +import java.net.URI; import java.net.URL; import java.util.ServiceConfigurationError; import org.junit.Test; + +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.*; +/** + * Tests related to filesystem creation and lifecycle. 
+ */ public class TestFileSystemInitialization { /** @@ -55,4 +65,119 @@ public void testMissingLibraries() { } catch (Exception | ServiceConfigurationError expected) { } } + + @Test + public void testNewInstanceFailure() throws Throwable { + intercept(IOException.class, FailingFileSystem.INITIALIZE, () -> + FileSystem.newInstance(new URI("failing://localhost"), FailingFileSystem + .failingConf())); + assertThat(FailingFileSystem.initCount).describedAs("init count") + .isEqualTo(1); + assertThat(FailingFileSystem.closeCount).describedAs("close count") + .isEqualTo(1); + } + + /** + * An FS which will fail on both init and close, and update + * counters of invocations as it does so. + */ + public static class FailingFileSystem extends FileSystem { + + public static final String INITIALIZE = "initialize()"; + + public static final String CLOSE = "close()"; + + private static int initCount; + + private static int closeCount; + + private static Configuration failingConf() { + final Configuration conf = new Configuration(false); + conf.setClass("fs.failing.impl", FailingFileSystem.class, + FileSystem.class); + return conf; + } + + @Override + public void initialize(final URI name, final Configuration conf) + throws IOException { + super.initialize(name, conf); + initCount++; + throw new IOException(INITIALIZE); + } + + @Override + public void close() throws IOException { + closeCount++; + throw new IOException(CLOSE); + } + + @Override + public URI getUri() { + return null; + } + + @Override + public FSDataInputStream open(final Path f, final int bufferSize) + throws IOException { + return null; + } + + @Override + public FSDataOutputStream create(final Path f, + final FsPermission permission, + final boolean overwrite, + final int bufferSize, + final short replication, + final long blockSize, + final Progressable progress) throws IOException { + return null; + } + + @Override + public FSDataOutputStream append(final Path f, + final int bufferSize, + final Progressable progress) throws IOException { + return null; + } + + @Override + public boolean rename(final Path src, final Path dst) throws IOException { + return false; + } + + @Override + public boolean delete(final Path f, final boolean recursive) + throws IOException { + return false; + } + + @Override + public FileStatus[] listStatus(final Path f) + throws FileNotFoundException, IOException { + return new FileStatus[0]; + } + + @Override + public void setWorkingDirectory(final Path new_dir) { + + } + + @Override + public Path getWorkingDirectory() { + return null; + } + + @Override + public boolean mkdirs(final Path f, final FsPermission permission) + throws IOException { + return false; + } + + @Override + public FileStatus getFileStatus(final Path f) throws IOException { + return null; + } + } + } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/AbstractHadoopTestBase.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/AbstractHadoopTestBase.java new file mode 100644 index 0000000000000..e18119ccafcb8 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/AbstractHadoopTestBase.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.test; + +import java.util.concurrent.TimeUnit; + +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.rules.TestName; +import org.junit.rules.Timeout; + +/** + * A base class for JUnit5+ tests that sets a default timeout for all tests + * that subclass this test. + * + * Threads are named to the method being executed, for ease of diagnostics + * in logs and thread dumps. + * + * Unlike {@link HadoopTestBase} this class does not extend JUnit Assert + * so is easier to use with AssertJ. + */ +public abstract class AbstractHadoopTestBase { + + /** + * System property name to set the test timeout: {@value}. + */ + public static final String PROPERTY_TEST_DEFAULT_TIMEOUT = + "test.default.timeout"; + + /** + * The default timeout (in milliseconds) if the system property + * {@link #PROPERTY_TEST_DEFAULT_TIMEOUT} + * is not set: {@value}. + */ + public static final int TEST_DEFAULT_TIMEOUT_VALUE = 100000; + + /** + * The JUnit rule that sets the default timeout for tests. + */ + @Rule + public Timeout defaultTimeout = retrieveTestTimeout(); + + /** + * Retrieve the test timeout from the system property + * {@link #PROPERTY_TEST_DEFAULT_TIMEOUT}, falling back to + * the value in {@link #TEST_DEFAULT_TIMEOUT_VALUE} if the + * property is not defined. + * @return the recommended timeout for tests + */ + public static Timeout retrieveTestTimeout() { + String propval = System.getProperty(PROPERTY_TEST_DEFAULT_TIMEOUT, + Integer.toString( + TEST_DEFAULT_TIMEOUT_VALUE)); + int millis; + try { + millis = Integer.parseInt(propval); + } catch (NumberFormatException e) { + //fall back to the default value, as the property cannot be parsed + millis = TEST_DEFAULT_TIMEOUT_VALUE; + } + return new Timeout(millis, TimeUnit.MILLISECONDS); + } + + /** + * The method name. + */ + @Rule + public TestName methodName = new TestName(); + + /** + * Get the method name; defaults to the value of {@link #methodName}. + * Subclasses may wish to override it, which will tune the thread naming. + * @return the name of the method. + */ + protected String getMethodName() { + return methodName.getMethodName(); + } + + /** + * Static initializer names this thread "JUnit". + */ + @BeforeClass + public static void nameTestThread() { + Thread.currentThread().setName("JUnit"); + } + + /** + * Before each method, the thread is renamed to match the method name. 
+ */ + @Before + public void nameThreadToMethod() { + Thread.currentThread().setName("JUnit-" + getMethodName()); + } +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/HadoopTestBase.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/HadoopTestBase.java index cb7df4b011a2f..23f3531a41e28 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/HadoopTestBase.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/HadoopTestBase.java @@ -17,6 +17,8 @@ */ package org.apache.hadoop.test; +import java.util.concurrent.TimeUnit; + import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; @@ -70,7 +72,7 @@ public static Timeout retrieveTestTimeout() { //fall back to the default value, as the property cannot be parsed millis = TEST_DEFAULT_TIMEOUT_VALUE; } - return new Timeout(millis); + return new Timeout(millis, TimeUnit.MILLISECONDS); } /** diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java index 662c24504c498..d20c6f1c1ea28 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java @@ -53,6 +53,7 @@ import static org.apache.hadoop.fs.s3a.S3AUtils.*; import static org.apache.hadoop.fs.s3a.Statistic.*; +import static org.apache.hadoop.io.IOUtils.cleanupWithLogger; /** * Upload files/parts directly via different buffering mechanisms: @@ -396,9 +397,9 @@ public void close() throws IOException { writeOperationHelper.writeFailed(ioe); throw ioe; } finally { - closeAll(LOG, block, blockFactory); + cleanupWithLogger(LOG, block, blockFactory); LOG.debug("Statistics: {}", statistics); - closeAll(LOG, statistics); + cleanupWithLogger(LOG, statistics); clearActiveBlock(); } // Note end of write. This does not change the state of the remote FS. @@ -437,7 +438,7 @@ private int putObject() throws IOException { // stream afterwards. 
return writeOperationHelper.putObject(putObjectRequest); } finally { - closeAll(LOG, uploadData, block); + cleanupWithLogger(LOG, uploadData, block); } }); clearActiveBlock(); @@ -614,7 +615,7 @@ private void uploadBlockAsync(final S3ADataBlocks.DataBlock block) return partETag; } finally { // close the stream and block - closeAll(LOG, uploadData, block); + cleanupWithLogger(LOG, uploadData, block); } }); partETagsFutures.add(partETagFuture); diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ADataBlocks.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ADataBlocks.java index 0e3bca57b2596..156defb7ca031 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ADataBlocks.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ADataBlocks.java @@ -40,7 +40,7 @@ import org.apache.hadoop.util.DirectBufferPool; import static org.apache.hadoop.fs.s3a.S3ADataBlocks.DataBlock.DestState.*; -import static org.apache.hadoop.fs.s3a.S3AUtils.closeAll; +import static org.apache.hadoop.io.IOUtils.cleanupWithLogger; /** * Set of classes to support output streaming into blocks which are then @@ -155,7 +155,7 @@ InputStream getUploadStream() { */ @Override public void close() throws IOException { - closeAll(LOG, uploadStream); + cleanupWithLogger(LOG, uploadStream); } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumedRoleCommitOperations.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumedRoleCommitOperations.java index 6b55b1b4c327d..853810602baec 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumedRoleCommitOperations.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/auth/ITestAssumedRoleCommitOperations.java @@ -26,7 +26,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.s3a.S3AFileSystem; -import org.apache.hadoop.fs.s3a.S3AUtils; import org.apache.hadoop.fs.s3a.commit.ITestCommitOperations; import static org.apache.hadoop.fs.s3a.Constants.ASSUMED_ROLE_ARN; @@ -34,6 +33,7 @@ import static org.apache.hadoop.fs.s3a.auth.RoleModel.*; import static org.apache.hadoop.fs.s3a.auth.RolePolicies.*; import static org.apache.hadoop.fs.s3a.auth.RoleTestUtils.*; +import static org.apache.hadoop.io.IOUtils.cleanupWithLogger; /** * Verify that the commit operations work with a restricted set of operations. @@ -84,7 +84,7 @@ public void setup() throws Exception { @Override public void teardown() throws Exception { - S3AUtils.closeAll(LOG, roleFS); + cleanupWithLogger(LOG, roleFS); // switches getFileSystem() back to the full FS. roleFS = null; super.teardown(); From 938ef7df9fd9f589e7fba20f773382c0ee8d0884 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Mon, 9 Mar 2020 20:13:47 +0530 Subject: [PATCH 36/40] HADOOP-14630 Contract Tests to verify create, mkdirs and rename under a file is forbidden Contributed by Steve Loughran. Not all stores do complete validation here; in particular the S3A Connector does not: checking up the entire directory tree to see if a path matches is a file significantly slows things down. This check does take place in S3A mkdirs(), which walks backwards up the list of parent paths until it finds a directory (success) or a file (failure). 
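A minimal sketch of that ancestor walk follows; the helper and its name are hypothetical, illustrating the technique rather than the actual S3AFileSystem code:

    import java.io.FileNotFoundException;
    import java.io.IOException;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.ParentNotDirectoryException;
    import org.apache.hadoop.fs.Path;

    final class AncestorCheckSketch {
      // Walk backwards up the parents of 'dir' until something exists:
      // a directory means the tree is valid, a file means mkdirs() must fail.
      static void checkNoAncestorIsFile(FileSystem fs, Path dir)
          throws IOException {
        Path ancestor = dir.getParent();
        while (ancestor != null && !ancestor.isRoot()) {
          try {
            FileStatus st = fs.getFileStatus(ancestor);
            if (st.isDirectory()) {
              return;   // found a directory; stop walking
            }
            throw new ParentNotDirectoryException(ancestor.toString());
          } catch (FileNotFoundException e) {
            ancestor = ancestor.getParent();   // nothing here yet; keep climbing
          }
        }
      }
    }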
In practice production applications invariably create destination directories before writing 1+ file into them -restricting check purely to the mkdirs() call deliver significant speed up while implicitly including the checks. Change-Id: I2c9df748e92b5655232e7d888d896f1868806eb0 --- .../site/markdown/filesystem/filesystem.md | 21 ++- .../contract/AbstractContractCreateTest.java | 128 ++++++++++++++++-- .../contract/AbstractContractRenameTest.java | 70 ++++++++-- .../contract/AbstractFSContractTestBase.java | 9 ++ .../hadoop/fs/contract/ContractOptions.java | 9 ++ .../org/apache/hadoop/hdfs/DFSClient.java | 3 +- .../src/test/resources/contract/s3a.xml | 10 ++ .../adl/live/TestAdlContractRenameLive.java | 15 ++ ...estNativeAzureFileSystemMetricsSystem.java | 35 +++-- .../swift/snative/SwiftNativeFileSystem.java | 39 +++++- .../snative/SwiftNativeFileSystemStore.java | 7 +- 11 files changed, 300 insertions(+), 46 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md index eac33702f011c..0585d64de161b 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md @@ -507,11 +507,11 @@ running out of memory as it calculates the partitions. Any FileSystem that does not actually break files into blocks SHOULD return a number for this that results in efficient processing. -A FileSystem MAY make this user-configurable (the S3 and Swift filesystem clients do this). +A FileSystem MAY make this user-configurable (the object store connectors usually do this). ### `long getDefaultBlockSize(Path p)` -Get the "default" block size for a path —that is, the block size to be used +Get the "default" block size for a path --that is, the block size to be used when writing objects to a path in the filesystem. #### Preconditions @@ -560,14 +560,21 @@ on the filesystem. ### `boolean mkdirs(Path p, FsPermission permission)` -Create a directory and all its parents +Create a directory and all its parents. #### Preconditions +The path must either be a directory or not exist + if exists(FS, p) and not isDir(FS, p) : raise [ParentNotDirectoryException, FileAlreadyExistsException, IOException] +No ancestor may be a file + + forall d = ancestors(FS, p) : + if exists(FS, d) and not isDir(FS, d) : + raise [ParentNotDirectoryException, FileAlreadyExistsException, IOException] #### Postconditions @@ -607,6 +614,11 @@ Writing to or overwriting a directory must fail. if isDir(FS, p) : raise {FileAlreadyExistsException, FileNotFoundException, IOException} +No ancestor may be a file + + forall d = ancestors(FS, p) : + if exists(FS, d) and not isDir(FS, d) : + raise [ParentNotDirectoryException, FileAlreadyExistsException, IOException] FileSystems may reject the request for other reasons, such as the FS being read-only (HDFS), @@ -614,7 +626,8 @@ the block size being below the minimum permitted (HDFS), the replication count being out of range (HDFS), quotas on namespace or filesystem being exceeded, reserved names, etc. All rejections SHOULD be `IOException` or a subclass thereof -and MAY be a `RuntimeException` or subclass. For instance, HDFS may raise a `InvalidPathException`. +and MAY be a `RuntimeException` or subclass. +For instance, HDFS may raise an `InvalidPathException`. 
#### Postconditions diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractCreateTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractCreateTest.java index 07c99e0b6a528..79222ce67d6cf 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractCreateTest.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractCreateTest.java @@ -22,11 +22,11 @@ import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.ParentNotDirectoryException; import org.apache.hadoop.fs.Path; import org.junit.Test; -import org.junit.internal.AssumptionViolatedException; +import org.junit.AssumptionViolatedException; -import java.io.FileNotFoundException; import java.io.IOException; import static org.apache.hadoop.fs.contract.ContractTestUtils.dataset; @@ -40,7 +40,7 @@ * Test creating files, overwrite options etc. */ public abstract class AbstractContractCreateTest extends - AbstractFSContractTestBase { + AbstractFSContractTestBase { /** * How long to wait for a path to become visible. @@ -113,7 +113,6 @@ private void testOverwriteExistingFile(boolean useBuilder) throws Throwable { * This test catches some eventual consistency problems that blobstores exhibit, * as we are implicitly verifying that updates are consistent. This * is why different file lengths and datasets are used - * @throws Throwable */ @Test public void testOverwriteExistingFile() throws Throwable { @@ -137,10 +136,6 @@ private void testOverwriteEmptyDirectory(boolean useBuilder) } catch (FileAlreadyExistsException expected) { //expected handleExpectedException(expected); - } catch (FileNotFoundException e) { - handleRelaxedException("overwriting a dir with a file ", - "FileAlreadyExistsException", - e); } catch (IOException e) { handleRelaxedException("overwriting a dir with a file ", "FileAlreadyExistsException", @@ -189,10 +184,6 @@ private void testOverwriteNonEmptyDirectory(boolean useBuilder) } catch (FileAlreadyExistsException expected) { //expected handleExpectedException(expected); - } catch (FileNotFoundException e) { - handleRelaxedException("overwriting a dir with a file ", - "FileAlreadyExistsException", - e); } catch (IOException e) { handleRelaxedException("overwriting a dir with a file ", "FileAlreadyExistsException", @@ -332,4 +323,117 @@ public void testCreateMakesParentDirs() throws Throwable { assertTrue("Grandparent directory does not appear to be a directory", fs.getFileStatus(grandparent).isDirectory()); } + + @Test + public void testCreateFileUnderFile() throws Throwable { + describe("Verify that it is forbidden to create file/file"); + if (isSupported(CREATE_FILE_UNDER_FILE_ALLOWED)) { + // object store or some file systems: downgrade to a skip so that the + // failure is visible in test results + skip("This filesystem supports creating files under files"); + } + Path grandparent = methodPath(); + Path parent = new Path(grandparent, "parent"); + expectCreateUnderFileFails( + "creating a file under a file", + grandparent, + parent); + } + + @Test + public void testCreateUnderFileSubdir() throws Throwable { + describe("Verify that it is forbidden to create file/dir/file"); + if (isSupported(CREATE_FILE_UNDER_FILE_ALLOWED)) { + // object store or some file systems: downgrade to a skip so that the + // failure is 
visible in test results + skip("This filesystem supports creating files under files"); + } + Path grandparent = methodPath(); + Path parent = new Path(grandparent, "parent"); + Path child = new Path(parent, "child"); + expectCreateUnderFileFails( + "creating a file under a subdirectory of a file", + grandparent, + child); + } + + + @Test + public void testMkdirUnderFile() throws Throwable { + describe("Verify that it is forbidden to create file/dir"); + Path grandparent = methodPath(); + Path parent = new Path(grandparent, "parent"); + expectMkdirsUnderFileFails("mkdirs() under a file", + grandparent, parent); + } + + @Test + public void testMkdirUnderFileSubdir() throws Throwable { + describe("Verify that it is forbidden to create file/dir/dir"); + Path grandparent = methodPath(); + Path parent = new Path(grandparent, "parent"); + Path child = new Path(parent, "child"); + expectMkdirsUnderFileFails("mkdirs() file/dir", + grandparent, child); + + try { + // create the child + mkdirs(child); + } catch (FileAlreadyExistsException | ParentNotDirectoryException ex) { + // either of these may be raised. + handleExpectedException(ex); + } catch (IOException e) { + handleRelaxedException("creating a file under a subdirectory of a file ", + "FileAlreadyExistsException", + e); + } + } + + /** + * Expect that touch() will fail because the parent is a file. + * @param action action for message + * @param file filename to create + * @param descendant path under file + * @throws Exception failure + */ + protected void expectCreateUnderFileFails(String action, + Path file, Path descendant) + throws Exception { + createFile(file); + try { + // create the child + createFile(descendant); + } catch (FileAlreadyExistsException | ParentNotDirectoryException ex) { + //expected + handleExpectedException(ex); + } catch (IOException e) { + handleRelaxedException(action, + "ParentNotDirectoryException", + e); + } + } + + protected void expectMkdirsUnderFileFails(String action, + Path file, Path descendant) + throws Exception { + createFile(file); + try { + // now mkdirs + mkdirs(descendant); + } catch (FileAlreadyExistsException | ParentNotDirectoryException ex) { + //expected + handleExpectedException(ex); + } catch (IOException e) { + handleRelaxedException(action, + "ParentNotDirectoryException", + e); + } + } + + private void createFile(Path path) throws IOException { + byte[] data = dataset(256, 'a', 'z'); + FileSystem fs = getFileSystem(); + writeDataset(fs, path, data, data.length, 1024 * 1024, + true); + } } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractRenameTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractRenameTest.java index 5b76a753de170..fd984c0aa63d1 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractRenameTest.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractRenameTest.java @@ -29,10 +29,10 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.*; /** - * Test creating files, overwrite options &c + * Test renaming files. 
*/ public abstract class AbstractContractRenameTest extends - AbstractFSContractTestBase { + AbstractFSContractTestBase { @Test public void testRenameNewFileSameDir() throws Throwable { @@ -83,7 +83,8 @@ public void testRenameNonexistentFile() throws Throwable { "FileNotFoundException", e); } - assertPathDoesNotExist("rename nonexistent file created a destination file", target); + assertPathDoesNotExist("rename nonexistent file created a destination file", + target); } /** @@ -112,7 +113,7 @@ public void testRenameFileOverExistingFile() throws Throwable { // the filesystem supports rename(file, file2) by overwriting file2 assertTrue("Rename returned false", renamed); - destUnchanged = false; + destUnchanged = false; } else { // rename is rejected by returning 'false' or throwing an exception if (renamed && !renameReturnsFalseOnRenameDestExists) { @@ -129,12 +130,13 @@ public void testRenameFileOverExistingFile() throws Throwable { // verify that the destination file is as expected based on the expected // outcome verifyFileContents(getFileSystem(), destFile, - destUnchanged? destData: srcData); + destUnchanged ? destData: srcData); } @Test public void testRenameDirIntoExistingDir() throws Throwable { - describe("Verify renaming a dir into an existing dir puts it underneath" + describe("Verify renaming a dir into an existing dir puts it" + + " underneath" +" and leaves existing files alone"); FileSystem fs = getFileSystem(); String sourceSubdir = "source"; @@ -145,15 +147,15 @@ public void testRenameDirIntoExistingDir() throws Throwable { Path destDir = path("dest"); Path destFilePath = new Path(destDir, "dest-512.txt"); - byte[] destDateset = dataset(512, 'A', 'Z'); - writeDataset(fs, destFilePath, destDateset, destDateset.length, 1024, false); + byte[] destData = dataset(512, 'A', 'Z'); + writeDataset(fs, destFilePath, destData, destData.length, 1024, false); assertIsFile(destFilePath); boolean rename = rename(srcDir, destDir); Path renamedSrc = new Path(destDir, sourceSubdir); assertIsFile(destFilePath); assertIsDirectory(renamedSrc); - verifyFileContents(fs, destFilePath, destDateset); + verifyFileContents(fs, destFilePath, destData); assertTrue("rename returned false though the contents were copied", rename); } @@ -285,4 +287,54 @@ private void validateAncestorsMoved(Path src, Path dst, String nestedPath) } } + @Test + public void testRenameFileUnderFile() throws Exception { + String action = "rename directly under file"; + describe(action); + Path base = methodPath(); + Path grandparent = new Path(base, "file"); + expectRenameUnderFileFails(action, + grandparent, + new Path(base, "testRenameSrc"), + new Path(grandparent, "testRenameTarget")); + } + + @Test + public void testRenameFileUnderFileSubdir() throws Exception { + String action = "rename directly under file/subdir"; + describe(action); + Path base = methodPath(); + Path grandparent = new Path(base, "file"); + Path parent = new Path(grandparent, "parent"); + expectRenameUnderFileFails(action, + grandparent, + new Path(base, "testRenameSrc"), + new Path(parent, "testRenameTarget")); + } + + protected void expectRenameUnderFileFails(String action, + Path file, Path renameSrc, Path renameTarget) + throws Exception { + byte[] data = dataset(256, 'a', 'z'); + FileSystem fs = getFileSystem(); + writeDataset(fs, file, data, data.length, 1024 * 1024, + true); + writeDataset(fs, renameSrc, data, data.length, 1024 * 1024, + true); + String outcome; + boolean renamed; + try { + renamed = rename(renameSrc, renameTarget); + outcome = action 
+ ": rename (" + renameSrc + ", " + renameTarget + + ")= " + renamed; + } catch (IOException e) { + // raw local raises an exception here + renamed = false; + outcome = "rename raised an exception: " + e; + } + assertPathDoesNotExist("after " + outcome, renameTarget); + assertFalse(outcome, renamed); + assertPathExists(action, renameSrc); + } + } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractFSContractTestBase.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractFSContractTestBase.java index 1cd2164fad300..217c3aeb7742a 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractFSContractTestBase.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractFSContractTestBase.java @@ -225,6 +225,15 @@ protected Path path(String filepath) throws IOException { new Path(getContract().getTestPath(), filepath)); } + /** + * Get a path whose name ends with the name of this method. + * @return a path implicitly unique amongst all methods in this class + * @throws IOException IO problems + */ + protected Path methodPath() throws IOException { + return path(methodName.getMethodName()); + } + /** * Take a simple path like "/something" and turn it into * a qualified path against the test FS. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractOptions.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractOptions.java index cca3d4ca36b7a..17cfbf4cdb9a8 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractOptions.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractOptions.java @@ -51,6 +51,15 @@ public interface ContractOptions { */ String CREATE_VISIBILITY_DELAYED = "create-visibility-delayed"; + /** + * Flag to indicate that it is possible to create a file under a file. + * This is a complete violation of the filesystem rules, but it is one + * which object stores have been known to do for performance + * and because nobody has ever noticed. + * {@value} + */ + String CREATE_FILE_UNDER_FILE_ALLOWED = "create-file-under-file-allowed"; + /** * Is a filesystem case sensitive. 
* Some of the filesystems that say "no" here may mean diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClient.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClient.java index ba61e4e6e249a..282e07fa97371 100755 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClient.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSClient.java @@ -1538,7 +1538,8 @@ public boolean rename(String src, String dst) throws IOException { DSQuotaExceededException.class, QuotaByStorageTypeExceededException.class, UnresolvedPathException.class, - SnapshotAccessControlException.class); + SnapshotAccessControlException.class, + ParentNotDirectoryException.class); } } diff --git a/hadoop-tools/hadoop-aws/src/test/resources/contract/s3a.xml b/hadoop-tools/hadoop-aws/src/test/resources/contract/s3a.xml index ec4c54ae3930d..f6b0e406b3bb2 100644 --- a/hadoop-tools/hadoop-aws/src/test/resources/contract/s3a.xml +++ b/hadoop-tools/hadoop-aws/src/test/resources/contract/s3a.xml @@ -122,4 +122,14 @@ false + + fs.contract.supports-unbuffer + true + + + + fs.contract.create-file-under-file-allowed + true + + diff --git a/hadoop-tools/hadoop-azure-datalake/src/test/java/org/apache/hadoop/fs/adl/live/TestAdlContractRenameLive.java b/hadoop-tools/hadoop-azure-datalake/src/test/java/org/apache/hadoop/fs/adl/live/TestAdlContractRenameLive.java index d72d35e92ee0a..3e3a010e17484 100644 --- a/hadoop-tools/hadoop-azure-datalake/src/test/java/org/apache/hadoop/fs/adl/live/TestAdlContractRenameLive.java +++ b/hadoop-tools/hadoop-azure-datalake/src/test/java/org/apache/hadoop/fs/adl/live/TestAdlContractRenameLive.java @@ -19,9 +19,13 @@ package org.apache.hadoop.fs.adl.live; +import org.junit.Test; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.contract.AbstractContractRenameTest; import org.apache.hadoop.fs.contract.AbstractFSContract; +import org.apache.hadoop.security.AccessControlException; +import org.apache.hadoop.test.LambdaTestUtils; /** * Test rename contract test cases on Adl file system. @@ -32,4 +36,15 @@ public class TestAdlContractRenameLive extends AbstractContractRenameTest { protected AbstractFSContract createContract(Configuration configuration) { return new AdlStorageContract(configuration); } + + /** + * ADL throws an Access Control Exception rather than return false. + * This is caught and its error text checked, to catch regressions. + */ + @Test + public void testRenameFileUnderFile() throws Exception { + LambdaTestUtils.intercept(AccessControlException.class, + "Parent path is not a folder.", + super::testRenameFileUnderFile); + } } diff --git a/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azure/metrics/TestNativeAzureFileSystemMetricsSystem.java b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azure/metrics/TestNativeAzureFileSystemMetricsSystem.java index 7820b7e65d51e..aab2607b8f809 100644 --- a/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azure/metrics/TestNativeAzureFileSystemMetricsSystem.java +++ b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azure/metrics/TestNativeAzureFileSystemMetricsSystem.java @@ -38,7 +38,7 @@ private static int getFilesCreated(AzureBlobStorageTestAccount testAccount) { /** * Tests that when we have multiple file systems created/destroyed * metrics from each are published correctly. 
- * @throws Exception + * @throws Exception on a failure */ @Test public void testMetricsAcrossFileSystems() @@ -46,26 +46,37 @@ public void testMetricsAcrossFileSystems() AzureBlobStorageTestAccount a1, a2, a3; a1 = AzureBlobStorageTestAccount.createMock(); - assertEquals(0, getFilesCreated(a1)); + assertFilesCreated(a1, "a1", 0); a2 = AzureBlobStorageTestAccount.createMock(); - assertEquals(0, getFilesCreated(a2)); + assertFilesCreated(a2, "a2", 0); a1.getFileSystem().create(new Path("/foo")).close(); a1.getFileSystem().create(new Path("/bar")).close(); a2.getFileSystem().create(new Path("/baz")).close(); - assertEquals(0, getFilesCreated(a1)); - assertEquals(0, getFilesCreated(a2)); + assertFilesCreated(a1, "a1", 0); + assertFilesCreated(a2, "a2", 0); a1.closeFileSystem(); // Causes the file system to close, which publishes metrics a2.closeFileSystem(); - - assertEquals(2, getFilesCreated(a1)); - assertEquals(1, getFilesCreated(a2)); + + assertFilesCreated(a1, "a1", 2); + assertFilesCreated(a2, "a2", 1); a3 = AzureBlobStorageTestAccount.createMock(); - assertEquals(0, getFilesCreated(a3)); + assertFilesCreated(a3, "a3", 0); a3.closeFileSystem(); - assertEquals(0, getFilesCreated(a3)); + assertFilesCreated(a3, "a3", 0); + } + + /** + * Assert that a specific number of files were created. + * @param account account to examine + * @param name account name (for exception text) + * @param expected expected value + */ + private void assertFilesCreated(AzureBlobStorageTestAccount account, + String name, int expected) { + assertEquals("Files created in account " + name, + expected, getFilesCreated(account)); } - @Test public void testMetricsSourceNames() { String name1 = NativeAzureFileSystem.newMetricsSourceName(); @@ -83,6 +94,6 @@ public void testSkipMetricsCollection() throws Exception { NativeAzureFileSystem.SKIP_AZURE_METRICS_PROPERTY_NAME, true); a.getFileSystem().create(new Path("/foo")).close(); a.closeFileSystem(); // Causes the file system to close, which publishes metrics - assertEquals(0, getFilesCreated(a)); + assertFilesCreated(a, "a", 0); } } diff --git a/hadoop-tools/hadoop-openstack/src/main/java/org/apache/hadoop/fs/swift/snative/SwiftNativeFileSystem.java b/hadoop-tools/hadoop-openstack/src/main/java/org/apache/hadoop/fs/swift/snative/SwiftNativeFileSystem.java index 191d56ff71808..f5d1d5b321be9 100644 --- a/hadoop-tools/hadoop-openstack/src/main/java/org/apache/hadoop/fs/swift/snative/SwiftNativeFileSystem.java +++ b/hadoop-tools/hadoop-openstack/src/main/java/org/apache/hadoop/fs/swift/snative/SwiftNativeFileSystem.java @@ -23,6 +23,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.CreateFlag; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileAlreadyExistsException; @@ -45,6 +46,7 @@ import java.io.OutputStream; import java.net.URI; import java.util.ArrayList; +import java.util.EnumSet; import java.util.List; /** @@ -588,14 +590,12 @@ public boolean rename(Path src, Path dst) throws IOException { store.rename(makeAbsolute(src), makeAbsolute(dst)); //success return true; - } catch (SwiftOperationFailedException e) { - //downgrade to a failure - return false; - } catch (FileAlreadyExistsException e) { - //downgrade to a failure - return false; - } catch (FileNotFoundException e) { + } catch (SwiftOperationFailedException + | FileAlreadyExistsException + | 
FileNotFoundException + | ParentNotDirectoryException e) { //downgrade to a failure + LOG.debug("rename({}, {}) failed",src, dst, e); return false; } } @@ -725,4 +725,29 @@ public static long getBytesUploaded(FSDataOutputStream outputStream) { return snos.getBytesUploaded(); } + /** + * {@inheritDoc} + * @throws FileNotFoundException if the parent directory is not present -or + * is not a directory. + */ + @Override + public FSDataOutputStream createNonRecursive(Path path, + FsPermission permission, + EnumSet flags, + int bufferSize, + short replication, + long blockSize, + Progressable progress) throws IOException { + Path parent = path.getParent(); + if (parent != null) { + // expect this to raise an exception if there is no parent + if (!getFileStatus(parent).isDirectory()) { + throw new FileAlreadyExistsException("Not a directory: " + parent); + } + } + return create(path, permission, + flags.contains(CreateFlag.OVERWRITE), bufferSize, + replication, blockSize, progress); + } + } diff --git a/hadoop-tools/hadoop-openstack/src/main/java/org/apache/hadoop/fs/swift/snative/SwiftNativeFileSystemStore.java b/hadoop-tools/hadoop-openstack/src/main/java/org/apache/hadoop/fs/swift/snative/SwiftNativeFileSystemStore.java index ed7a782284bc3..5e4800900920a 100644 --- a/hadoop-tools/hadoop-openstack/src/main/java/org/apache/hadoop/fs/swift/snative/SwiftNativeFileSystemStore.java +++ b/hadoop-tools/hadoop-openstack/src/main/java/org/apache/hadoop/fs/swift/snative/SwiftNativeFileSystemStore.java @@ -27,6 +27,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.ParentNotDirectoryException; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.swift.exceptions.SwiftConfigurationException; import org.apache.hadoop.fs.swift.exceptions.SwiftException; @@ -562,13 +563,17 @@ public void rename(Path src, Path dst) //parent dir (in which case the dest dir exists), or the destination //directory is root, in which case it must also exist if (dstParent != null && !dstParent.equals(srcParent)) { + SwiftFileStatus fileStatus; try { - getObjectMetadata(dstParent); + fileStatus = getObjectMetadata(dstParent); } catch (FileNotFoundException e) { //destination parent doesn't exist; bail out LOG.debug("destination parent directory " + dstParent + " doesn't exist"); throw e; } + if (!fileStatus.isDir()) { + throw new ParentNotDirectoryException(dstParent.toString()); + } } boolean destExists = dstMetadata != null; From c766e6d5e8cb5f79d7b8a15907ea6fa359bd0101 Mon Sep 17 00:00:00 2001 From: Konstantin V Shvachko Date: Tue, 13 Oct 2020 05:56:24 +0530 Subject: [PATCH 37/40] HDFS-15567. [SBN Read] HDFS should expose msync() API to allow downstream applications call it explicitly. Contributed by Konstantin V Shvachko. 
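An illustrative sketch only, not part of the change itself: a downstream application that wants consistent metadata reads in an HA setting could call the new API defensively, since the default implementation added here throws UnsupportedOperationException. The helper name statusAfterSync below is hypothetical.

    import java.io.IOException;
    import org.apache.hadoop.fs.FileContext;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.Path;

    /** Hypothetical caller of the new API; not part of this patch. */
    static FileStatus statusAfterSync(FileContext fc, Path path) throws IOException {
      try {
        // Ask the client to synchronize its metadata state before reading.
        fc.msync();
      } catch (UnsupportedOperationException e) {
        // Default AbstractFileSystem behaviour: msync is unsupported, so
        // fall through and read without an explicit sync.
      }
      return fc.getFileStatus(path);
    }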
(cherry picked from commit b3786d6c3cc13b0b92b9f42da1731c4ce35c9ded) --- .../org/apache/hadoop/fs/AbstractFileSystem.java | 13 +++++++++++++ .../main/java/org/apache/hadoop/fs/FileContext.java | 10 ++++++++++ .../java/org/apache/hadoop/fs/FilterFileSystem.java | 5 +++++ .../main/java/org/apache/hadoop/fs/FilterFs.java | 5 +++++ 4 files changed, 33 insertions(+) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java index 1df68b647c99a..ccfabe52ecc06 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java @@ -865,6 +865,19 @@ public abstract FileStatus getFileStatus(final Path f) throws AccessControlException, FileNotFoundException, UnresolvedLinkException, IOException; + /** + * Synchronize client metadata state. + *
      In some FileSystem implementations such as HDFS metadata + * synchronization is essential to guarantee consistency of read requests + * particularly in HA setting. + * @throws IOException + * @throws UnsupportedOperationException + */ + public void msync() throws IOException, UnsupportedOperationException { + throw new UnsupportedOperationException(getClass().getCanonicalName() + + " does not support method msync"); + } + /** * The specification of this method matches that of * {@link FileContext#access(Path, FsAction)} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java index 64a347f2b8692..d204bb20b17a2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java @@ -1254,6 +1254,16 @@ public FileStatus next(final AbstractFileSystem fs, final Path p) }.resolve(this, absF); } + /** + * Synchronize client metadata state. + * + * @throws IOException + * @throws UnsupportedOperationException + */ + public void msync() throws IOException, UnsupportedOperationException { + defaultFS.msync(); + } + /** * Checks if the user can access a path. The mode specifies which access * checks to perform. If the requested permissions are granted, then the diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java index ac0ca91f8cfc6..20cbe00cc735d 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java @@ -456,6 +456,11 @@ public FileStatus getFileStatus(Path f) throws IOException { return fs.getFileStatus(f); } + @Override + public void msync() throws IOException, UnsupportedOperationException { + fs.msync(); + } + @Override public void access(Path path, FsAction mode) throws AccessControlException, FileNotFoundException, IOException { diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java index e197506edc88b..629d1e6164a1b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java @@ -124,6 +124,11 @@ public FileStatus getFileStatus(Path f) return myFs.getFileStatus(f); } + @Override + public void msync() throws IOException, UnsupportedOperationException { + myFs.msync(); + } + @Override public void access(Path path, FsAction mode) throws AccessControlException, FileNotFoundException, UnresolvedLinkException, IOException { From 87bb75800c0596bc4ef876b832ea1b571316ff08 Mon Sep 17 00:00:00 2001 From: Deepak Damri Date: Tue, 21 Jan 2025 14:37:03 +0530 Subject: [PATCH 38/40] Add hadoop-shaded-guava jar --- hadoop-common-project/hadoop-common/pom.xml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hadoop-common-project/hadoop-common/pom.xml b/hadoop-common-project/hadoop-common/pom.xml index fec010ce66f4c..75c6e2f493e5d 100644 --- a/hadoop-common-project/hadoop-common/pom.xml +++ b/hadoop-common-project/hadoop-common/pom.xml @@ -44,6 +44,10 @@ hadoop-annotations compile + + 
org.apache.hadoop.thirdparty + hadoop-shaded-guava + com.google.guava guava From af362213100d15480fc9fd05e83d12041b8a8ff4 Mon Sep 17 00:00:00 2001 From: Gabor Bota Date: Sun, 16 Jun 2019 21:35:01 +0530 Subject: [PATCH 39/40] HADOOP-16279. S3Guard: Implement time-based (TTL) expiry for entries (and tombstones). Contributed by Gabor Bota. Change-Id: I73a2d2861901dedfe7a0e783b310fbb95e7c1af9 --- .../src/main/resources/core-default.xml | 385 ++++--- .../org/apache/hadoop/fs/s3a/Constants.java | 168 +-- .../org/apache/hadoop/fs/s3a/Listing.java | 100 +- .../apache/hadoop/fs/s3a/S3AFileSystem.java | 722 ++++++++++--- .../fs/s3a/s3guard/DynamoDBMetadataStore.java | 99 +- .../fs/s3a/s3guard/ITtlTimeProvider.java | 34 + .../fs/s3a/s3guard/LocalMetadataStore.java | 111 +- .../hadoop/fs/s3a/s3guard/MetadataStore.java | 87 +- .../fs/s3a/s3guard/NullMetadataStore.java | 13 +- .../apache/hadoop/fs/s3a/s3guard/S3Guard.java | 220 +++- .../hadoop/fs/s3a/s3guard/S3GuardTool.java | 9 +- .../site/markdown/tools/hadoop-aws/s3guard.md | 600 +++++++++-- .../s3a/ITestS3GuardOutOfBandOperations.java | 975 ++++++++++++++++++ .../apache/hadoop/fs/s3a/ITestS3GuardTtl.java | 289 ++++++ .../s3guard/AbstractS3GuardToolTestBase.java | 2 +- .../s3guard/ITestDynamoDBMetadataStore.java | 10 +- .../ITestDynamoDBMetadataStoreScale.java | 5 +- .../fs/s3a/s3guard/MetadataStoreTestBase.java | 230 ++++- .../s3a/s3guard/TestLocalMetadataStore.java | 7 +- .../fs/s3a/s3guard/TestNullMetadataStore.java | 5 + .../hadoop/fs/s3a/s3guard/TestS3Guard.java | 208 +++- .../AbstractITestS3AMetadataStoreScale.java | 14 +- 22 files changed, 3528 insertions(+), 765 deletions(-) create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/ITtlTimeProvider.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardOutOfBandOperations.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardTtl.java diff --git a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml index 6366cc483f359..7ffc2adb461a6 100644 --- a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml +++ b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml @@ -48,14 +48,6 @@ ordering of the filters. - - hadoop.http.idle_timeout.ms - 60000 - - NN/JN/DN Server connection timeout in milliseconds. - - - @@ -369,6 +361,30 @@ + + hadoop.security.group.mapping.ldap.bind.users + + + Aliases of users to be used to bind as when connecting to the LDAP + server(s). 
Each alias will have to have its distinguished name and + password specified through: + hadoop.security.group.mapping.ldap.bind.user + and a password configuration such as: + hadoop.security.group.mapping.ldap.bind.password.alias + + For example, if: + hadoop.security.group.mapping.ldap.bind.users=alias1,alias2 + + then the following configuration is valid: + hadoop.security.group.mapping.ldap.bind.users.alias1.bind.user=bindUser1 + hadoop.security.group.mapping.ldap.bind.users.alias1.bind.password.alias= + bindPasswordAlias1 + hadoop.security.group.mapping.ldap.bind.users.alias2.bind.user=bindUser2 + hadoop.security.group.mapping.ldap.bind.users.alias2.bind.password.alias= + bindPasswordAlias2 + + + hadoop.security.group.mapping.ldap.bind.user @@ -378,6 +394,16 @@ + + hadoop.security.group.mapping.ldap.bind.password.alias + + + The alias of the bind user to be used to get the password from credential + providers. If the alias is empty, property + hadoop.security.group.mapping.ldap.bind.password is used instead. + + + hadoop.security.group.mapping.ldap.bind.password.file @@ -655,27 +681,6 @@ - - hadoop.security.token.service.use_ip - true - - Controls whether tokens always use IP addresses. - DNS changes will not be detected if this option is enabled. - Existing client connections that break will always reconnect - to the IP of the original host. New clients will connect - to the host's new IP but fail to locate a token. - Disabling this option will allow existing and new clients - to detect an IP change and continue to locate the new host's token. - - In secure multi-homed environments, this parameter will need to - be set to false on both cluster servers and clients (see HADOOP-7733). - If it is not set correctly, the symptom will be inability to - submit an application to YARN from an external client - (with error "client host not a member of the Hadoop cluster"), - or even from an in-cluster client if server failover occurs. - - - hadoop.workaround.non.threadsafe.getpwuid true @@ -709,6 +714,14 @@ + + hadoop.kerberos.keytab.login.autorenewal.enabled + false + Used to enable automatic renewal of keytab based kerberos login. + By default the automatic renewal is disabled for keytab based kerberos login. + + + hadoop.security.auth_to_local @@ -931,124 +944,6 @@ - - fs.viewfs.overload.scheme.target.hdfs.impl - org.apache.hadoop.hdfs.DistributedFileSystem - The DistributedFileSystem for view file system overload scheme - when child file system and ViewFSOverloadScheme's schemes are hdfs. - - - - - fs.viewfs.overload.scheme.target.s3a.impl - org.apache.hadoop.fs.s3a.S3AFileSystem - The S3AFileSystem for view file system overload scheme when - child file system and ViewFSOverloadScheme's schemes are s3a. - - - - fs.viewfs.overload.scheme.target.ofs.impl - org.apache.hadoop.fs.ozone.RootedOzoneFileSystem - The RootedOzoneFileSystem for view file system overload scheme - when child file system and ViewFSOverloadScheme's schemes are ofs. - - - - - fs.viewfs.overload.scheme.target.o3fs.impl - org.apache.hadoop.fs.ozone.OzoneFileSystem - The OzoneFileSystem for view file system overload scheme when - child file system and ViewFSOverloadScheme's schemes are o3fs. - - - - fs.viewfs.overload.scheme.target.ftp.impl - org.apache.hadoop.fs.ftp.FTPFileSystem - The FTPFileSystem for view file system overload scheme when - child file system and ViewFSOverloadScheme's schemes are ftp. 
- - - - - fs.viewfs.overload.scheme.target.webhdfs.impl - org.apache.hadoop.hdfs.web.WebHdfsFileSystem - The WebHdfsFileSystem for view file system overload scheme when - child file system and ViewFSOverloadScheme's schemes are webhdfs. - - - - - fs.viewfs.overload.scheme.target.swebhdfs.impl - org.apache.hadoop.hdfs.web.SWebHdfsFileSystem - The SWebHdfsFileSystem for view file system overload scheme when - child file system and ViewFSOverloadScheme's schemes are swebhdfs. - - - - - fs.viewfs.overload.scheme.target.file.impl - org.apache.hadoop.fs.LocalFileSystem - The LocalFileSystem for view file system overload scheme when - child file system and ViewFSOverloadScheme's schemes are file. - - - - - fs.viewfs.overload.scheme.target.abfs.impl - org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem - The AzureBlobFileSystem for view file system overload scheme - when child file system and ViewFSOverloadScheme's schemes are abfs. - - - - - fs.viewfs.overload.scheme.target.abfss.impl - org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem - The SecureAzureBlobFileSystem for view file system overload - scheme when child file system and ViewFSOverloadScheme's schemes are abfss. - - - - - fs.viewfs.overload.scheme.target.wasb.impl - org.apache.hadoop.fs.azure.NativeAzureFileSystem - The NativeAzureFileSystem for view file system overload scheme - when child file system and ViewFSOverloadScheme's schemes are wasb. - - - - - fs.viewfs.overload.scheme.target.swift.impl - org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem - The SwiftNativeFileSystem for view file system overload scheme - when child file system and ViewFSOverloadScheme's schemes are swift. - - - - - fs.viewfs.overload.scheme.target.oss.impl - org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem - The AliyunOSSFileSystem for view file system overload scheme - when child file system and ViewFSOverloadScheme's schemes are oss. - - - - - fs.viewfs.overload.scheme.target.http.impl - org.apache.hadoop.fs.http.HttpFileSystem - The HttpFileSystem for view file system overload scheme - when child file system and ViewFSOverloadScheme's schemes are http. - - - - - fs.viewfs.overload.scheme.target.https.impl - org.apache.hadoop.fs.http.HttpsFileSystem - The HttpsFileSystem for view file system overload scheme - when child file system and ViewFSOverloadScheme's schemes are https. - - - fs.AbstractFileSystem.ftp.impl org.apache.hadoop.fs.ftp.FtpFs @@ -1105,6 +1000,14 @@ + + fs.ftp.timeout + 0 + + FTP filesystem's timeout in seconds. + + + fs.df.interval 60000 @@ -1145,19 +1048,33 @@ fs.s3a.aws.credentials.provider + + org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider, + org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider, + com.amazonaws.auth.EnvironmentVariableCredentialsProvider, + org.apache.hadoop.fs.s3a.auth.IAMInstanceCredentialsProvider + Comma-separated class names of credential provider classes which implement com.amazonaws.auth.AWSCredentialsProvider. + When S3A delegation tokens are not enabled, this list will be used + to directly authenticate with S3 and DynamoDB services. + When S3A Delegation tokens are enabled, depending upon the delegation + token binding it may be used + to communicate wih the STS endpoint to request session/role + credentials. + These are loaded and queried in sequence for a valid set of credentials. Each listed class must implement one of the following means of construction, which are attempted in order: - 1. 
a public constructor accepting java.net.URI and + * a public constructor accepting java.net.URI and org.apache.hadoop.conf.Configuration, - 2. a public static method named getInstance that accepts no + * a public constructor accepting org.apache.hadoop.conf.Configuration, + * a public static method named getInstance that accepts no arguments and returns an instance of com.amazonaws.auth.AWSCredentialsProvider, or - 3. a public default constructor. + * a public default constructor. Specifying org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider allows anonymous access to a publicly accessible S3 bucket without any credentials. @@ -1167,13 +1084,15 @@ If unspecified, then the default list of credential provider classes, queried in sequence, is: - 1. org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider: + * org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider: looks + for session login secrets in the Hadoop configuration. + * org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider: Uses the values of fs.s3a.access.key and fs.s3a.secret.key. - 2. com.amazonaws.auth.EnvironmentVariableCredentialsProvider: supports + * com.amazonaws.auth.EnvironmentVariableCredentialsProvider: supports configuration of AWS access key ID and secret access key in - environment variables named AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, - and AWS_SESSION_TOKEN as documented in the AWS SDK. - 3. com.amazonaws.auth.InstanceProfileCredentialsProvider: supports use + environment variables named AWS_ACCESS_KEY_ID and + AWS_SECRET_ACCESS_KEY, as documented in the AWS SDK. + * com.amazonaws.auth.InstanceProfileCredentialsProvider: supports use of instance profile credentials if running in an EC2 VM. @@ -1229,7 +1148,7 @@ 30m Duration of assumed roles before a refresh is attempted. - Only used if AssumedRoleCredentialProvider is the AWS credential provider. + Used when session tokens are requested. Range: 15m to 1h @@ -1241,17 +1160,20 @@ AWS Security Token Service Endpoint. If unset, uses the default endpoint. Only used if AssumedRoleCredentialProvider is the AWS credential provider. + Used by the AssumedRoleCredentialProvider and in Session and Role delegation + tokens. fs.s3a.assumed.role.sts.endpoint.region - us-west-1 + AWS Security Token Service Endpoint's region; Needed if fs.s3a.assumed.role.sts.endpoint points to an endpoint other than the default one and the v4 signature is used. - Only used if AssumedRoleCredentialProvider is the AWS credential provider. + Used by the AssumedRoleCredentialProvider and in Session and Role delegation + tokens. @@ -1266,6 +1188,29 @@ + + fs.s3a.delegation.tokens.enabled + false + + + + + fs.s3a.delegation.token.binding + + + The name of a class to provide delegation tokens support in S3A. + If unset: delegation token support is disabled. + + Note: for job submission to actually collect these tokens, + Kerberos must be enabled. + + Options are: + org.apache.hadoop.fs.s3a.auth.delegation.SessionTokenBinding + org.apache.hadoop.fs.s3a.auth.delegation.FullCredentialsTokenBinding + and org.apache.hadoop.fs.s3a.auth.delegation.RoleTokenBinding + + + fs.s3a.connection.maximum 15 @@ -1556,6 +1501,14 @@ + + fs.s3a.metadatastore.metadata.ttl + 15m + + This value sets how long an entry in a MetadataStore is valid. 
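  <!-- Illustrative sketch, not part of this default file: a deployment that
       wants S3Guard entries to expire sooner than the 15 minute default could
       override the TTL in its own configuration, for example:

       <property>
         <name>fs.s3a.metadatastore.metadata.ttl</name>
         <value>5m</value>
       </property>

       The value is parsed as a time duration, so units such as "s", "m" and
       "h" are accepted; the setting only takes effect when a metadata store
       is enabled for the bucket. -->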
+ + + fs.s3a.metadatastore.impl org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore @@ -1567,6 +1520,19 @@ + + fs.s3a.metadatastore.fail.on.write.error + true + + When true (default), FileSystem write operations generate + org.apache.hadoop.fs.s3a.MetadataPersistenceException if the metadata + cannot be saved to the metadata store. When false, failures to save to + metadata store are logged at ERROR level, but the overall FileSystem + write operation succeeds. + + + + fs.s3a.s3guard.cli.prune.age 86400000 @@ -1613,23 +1579,27 @@ fs.s3a.s3guard.ddb.table.capacity.read - 500 + 0 Provisioned throughput requirements for read operations in terms of capacity - units for the DynamoDB table. This config value will only be used when - creating a new DynamoDB table, though later you can manually provision by - increasing or decreasing read capacity as needed for existing tables. - See DynamoDB documents for more information. + units for the DynamoDB table. This config value will only be used when + creating a new DynamoDB table. + If set to 0 (the default), new tables are created with "per-request" capacity. + If a positive integer is provided for this and the write capacity, then + a table with "provisioned capacity" will be created. + You can change the capacity of an existing provisioned-capacity table + through the "s3guard set-capacity" command. fs.s3a.s3guard.ddb.table.capacity.write - 100 + 0 Provisioned throughput requirements for write operations in terms of - capacity units for the DynamoDB table. Refer to related config - fs.s3a.s3guard.ddb.table.capacity.read before usage. + capacity units for the DynamoDB table. + If set to 0 (the default), new tables are created with "per-request" capacity. + Refer to related configuration option fs.s3a.s3guard.ddb.table.capacity.read @@ -1936,15 +1906,15 @@ fs.s3a.change.detection.mode server - Determines how change detection is applied to alert to S3 objects - rewritten while being read. Value 'server' indicates to apply the attribute - constraint directly on GetObject requests to S3. Value 'client' means to do a - client-side comparison of the attribute value returned in the response. Value - 'server' would not work with third-party S3 implementations that do not - support these constraints on GetObject. Values 'server' and 'client' generate - RemoteObjectChangedException when a mismatch is detected. Value 'warn' works - like 'client' but generates only a warning. Value 'none' will ignore change - detection completely. + Determines how change detection is applied to alert to inconsistent S3 + objects read during or after an overwrite. Value 'server' indicates to apply + the attribute constraint directly on GetObject requests to S3. Value 'client' + means to do a client-side comparison of the attribute value returned in the + response. Value 'server' would not work with third-party S3 implementations + that do not support these constraints on GetObject. Values 'server' and + 'client' generate RemoteObjectChangedException when a mismatch is detected. + Value 'warn' works like 'client' but generates only a warning. Value 'none' + will ignore change detection completely. @@ -2103,6 +2073,7 @@ + @@ -2239,6 +2210,14 @@ + + ipc.server.reuseaddr + true + Enables the SO_REUSEADDR TCP option on the server. + Useful if BindException often prevents a certain service to be restarted + because the server side is stuck in TIME_WAIT state. 
+ + @@ -2452,7 +2431,7 @@ ${user.home}/hadoop-http-auth-signature-secret The signature secret for signing the authentication tokens. - A different secret should be used for each service. + The same secret should be used for JT/NN/DN/TT configurations. @@ -2680,11 +2659,20 @@ + + hadoop.ssl.enabled + false + + Deprecated. Use dfs.http.policy and yarn.http.policy instead. + + + hadoop.ssl.enabled.protocols - TLSv1,SSLv2Hello,TLSv1.1,TLSv1.2 + TLSv1.1,TLSv1.2 - The supported SSL protocols. + The supported SSL protocols. The parameter will only used from + DatanodeHttpServer. @@ -2765,14 +2753,6 @@ - - ha.failover-controller.active-standby-elector.zk.op.retries - 3 - - The number of zookeeper operation retry times in ActiveStandbyElector - - - ha.failover-controller.cli-check.rpc-timeout.ms 20000 @@ -2932,7 +2912,7 @@ hadoop.security.secure.random.impl - + org.apache.hadoop.crypto.random.OpensslSecureRandom Implementation of secure random. @@ -3471,31 +3451,12 @@ - fs.getspaceused.classname - - - The class that can tell estimate much space is used in a directory. - There are four impl classes that being supported: - org.apache.hadoop.fs.DU(default), org.apache.hadoop.fs.WindowsGetSpaceUsed - org.apache.hadoop.fs.DFCachingGetSpaceUsed and - org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.ReplicaCachingGetSpaceUsed. - And the ReplicaCachingGetSpaceUsed impl class only used in HDFS module. - - - - - fs.getspaceused.jitterMillis - 60000 - - fs space usage statistics refresh jitter in msec. - - - - - hadoop.http.sni.host.check.enabled - false - - Enable Server Name Indication (SNI) host check for HTTPS enabled server. + hadoop.domainname.resolver.impl + org.apache.hadoop.net.DNSDomainNameResolver + The implementation of DomainNameResolver used for service (NameNodes, + RBF Routers etc) discovery. The default implementation + org.apache.hadoop.net.DNSDomainNameResolver returns all IP addresses associated + with the input domain name of the services by querying the underlying DNS. diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java index 1f15efb7cd942..7334506367a1e 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java @@ -21,6 +21,8 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import java.util.concurrent.TimeUnit; + /** * All the constants used with the {@link S3AFileSystem}. * @@ -348,6 +350,18 @@ private Constants() { "fs.s3a.metadatastore.authoritative"; public static final boolean DEFAULT_METADATASTORE_AUTHORITATIVE = false; + /** + * How long a directory listing in the MS is considered as authoritative. + */ + public static final String METADATASTORE_METADATA_TTL = + "fs.s3a.metadatastore.metadata.ttl"; + + /** + * Default TTL in milliseconds: 15 minutes. + */ + public static final long DEFAULT_METADATASTORE_METADATA_TTL = + TimeUnit.MINUTES.toMillis(15); + /** read ahead buffer size to prevent connection re-establishments. */ public static final String READAHEAD_RANGE = "fs.s3a.readahead.range"; public static final long DEFAULT_READAHEAD_RANGE = 64 * 1024; @@ -405,6 +419,17 @@ private Constants() { public static final String S3_METADATA_STORE_IMPL = "fs.s3a.metadatastore.impl"; + /** + * Whether to fail when there is an error writing to the metadata store. 
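+   * When this is {@code false}, a failure to persist metadata is logged at
+   * ERROR level and the overall write operation still succeeds, matching the
+   * description of this key in core-default.xml.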
+ */ + public static final String FAIL_ON_METADATA_WRITE_ERROR = + "fs.s3a.metadatastore.fail.on.write.error"; + + /** + * Default value ({@value}) for FAIL_ON_METADATA_WRITE_ERROR. + */ + public static final boolean FAIL_ON_METADATA_WRITE_ERROR_DEFAULT = true; + /** Minimum period of time (in milliseconds) to keep metadata (may only be * applied when a prune command is manually run). */ @@ -418,7 +443,6 @@ private Constants() { * This config has no default value. If the user does not set this, the * S3Guard will operate table in the associated S3 bucket region. */ - @InterfaceStability.Unstable public static final String S3GUARD_DDB_REGION_KEY = "fs.s3a.s3guard.ddb.region"; @@ -428,7 +452,6 @@ private Constants() { * This config has no default value. If the user does not set this, the * S3Guard implementation will use the respective S3 bucket name. */ - @InterfaceStability.Unstable public static final String S3GUARD_DDB_TABLE_NAME_KEY = "fs.s3a.s3guard.ddb.table"; @@ -438,36 +461,45 @@ private Constants() { * For example: * fs.s3a.s3guard.ddb.table.tag.mytag */ - @InterfaceStability.Unstable public static final String S3GUARD_DDB_TABLE_TAG = "fs.s3a.s3guard.ddb.table.tag."; - /** - * Test table name to use during DynamoDB integration test. - * - * The table will be modified, and deleted in the end of the tests. - * If this value is not set, the integration tests that would be destructive - * won't run. - */ - @InterfaceStability.Unstable - public static final String S3GUARD_DDB_TEST_TABLE_NAME_KEY = - "fs.s3a.s3guard.ddb.test.table"; - /** * Whether to create the DynamoDB table if the table does not exist. + * Value: {@value}. */ - @InterfaceStability.Unstable public static final String S3GUARD_DDB_TABLE_CREATE_KEY = "fs.s3a.s3guard.ddb.table.create"; - @InterfaceStability.Unstable + /** + * Read capacity when creating a table. + * When it and the write capacity are both "0", a per-request table is + * created. + * Value: {@value}. + */ public static final String S3GUARD_DDB_TABLE_CAPACITY_READ_KEY = "fs.s3a.s3guard.ddb.table.capacity.read"; - public static final long S3GUARD_DDB_TABLE_CAPACITY_READ_DEFAULT = 500; - @InterfaceStability.Unstable + + /** + * Default read capacity when creating a table. + * Value: {@value}. + */ + public static final long S3GUARD_DDB_TABLE_CAPACITY_READ_DEFAULT = 0; + + /** + * Write capacity when creating a table. + * When it and the read capacity are both "0", a per-request table is + * created. + * Value: {@value}. + */ public static final String S3GUARD_DDB_TABLE_CAPACITY_WRITE_KEY = "fs.s3a.s3guard.ddb.table.capacity.write"; - public static final long S3GUARD_DDB_TABLE_CAPACITY_WRITE_DEFAULT = 100; + + /** + * Default write capacity when creating a table. + * Value: {@value}. + */ + public static final long S3GUARD_DDB_TABLE_CAPACITY_WRITE_DEFAULT = 0; /** * The maximum put or delete requests per BatchWriteItem request. 
@@ -476,7 +508,6 @@ private Constants() { */ public static final int S3GUARD_DDB_BATCH_WRITE_REQUEST_LIMIT = 25; - @InterfaceStability.Unstable public static final String S3GUARD_DDB_MAX_RETRIES = "fs.s3a.s3guard.ddb.max.retries"; @@ -488,7 +519,6 @@ private Constants() { public static final int S3GUARD_DDB_MAX_RETRIES_DEFAULT = DEFAULT_MAX_ERROR_RETRIES; - @InterfaceStability.Unstable public static final String S3GUARD_DDB_THROTTLE_RETRY_INTERVAL = "fs.s3a.s3guard.ddb.throttle.retry.interval"; public static final String S3GUARD_DDB_THROTTLE_RETRY_INTERVAL_DEFAULT = @@ -507,7 +537,6 @@ private Constants() { /** * The default "Null" metadata store: {@value}. */ - @InterfaceStability.Unstable public static final String S3GUARD_METASTORE_NULL = "org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore"; @@ -535,12 +564,11 @@ private Constants() { public static final String S3GUARD_METASTORE_LOCAL_ENTRY_TTL = "fs.s3a.s3guard.local.ttl"; public static final int DEFAULT_S3GUARD_METASTORE_LOCAL_ENTRY_TTL - = 10 * 1000; + = 60 * 1000; /** * Use DynamoDB for the metadata: {@value}. */ - @InterfaceStability.Unstable public static final String S3GUARD_METASTORE_DYNAMO = "org.apache.hadoop.fs.s3a.s3guard.DynamoDBMetadataStore"; @@ -711,96 +739,4 @@ private Constants() { * Default change detection require version: true. */ public static final boolean CHANGE_DETECT_REQUIRE_VERSION_DEFAULT = true; - - /** - * Policy for directory markers. - * This is a new feature of HADOOP-13230 which addresses - * some scale, performance and permissions issues -but - * at the risk of backwards compatibility. - *
      - * This Hadoop release only supports the original "delete" - * policy. - */ - public static final String DIRECTORY_MARKER_POLICY = - "fs.s3a.directory.marker.retention"; - - /** - * Delete directory markers. This is the backwards compatible option. - * Value: {@value}. - */ - public static final String DIRECTORY_MARKER_POLICY_DELETE = - "delete"; - - /** - * Retain directory markers (unsupported in this release). - * Value: {@value}. - */ - public static final String DIRECTORY_MARKER_POLICY_KEEP = - "keep"; - - /** - * Retain directory markers in authoritative directory trees only - * (unsupported in this release). - * Value: {@value}. - */ - public static final String DIRECTORY_MARKER_POLICY_AUTHORITATIVE = - "authoritative"; - - /** - * Default retention policy: {@value}. - */ - public static final String DEFAULT_DIRECTORY_MARKER_POLICY = - DIRECTORY_MARKER_POLICY_DELETE; - - - /** - * {@code PathCapabilities} probe to verify that an S3A Filesystem - * has the changes needed to safely work with buckets where - * directoy markers have not been deleted. - * Value: {@value}. - */ - public static final String STORE_CAPABILITY_DIRECTORY_MARKER_AWARE - = "fs.s3a.capability.directory.marker.aware"; - - /** - * {@code PathCapabilities} probe to indicate that the filesystem - * keeps directory markers. - * Value: {@value}. - */ - public static final String STORE_CAPABILITY_DIRECTORY_MARKER_POLICY_KEEP - = "fs.s3a.capability.directory.marker.policy.keep"; - - /** - * {@code PathCapabilities} probe to indicate that the filesystem - * deletes directory markers. - * Value: {@value}. - */ - public static final String STORE_CAPABILITY_DIRECTORY_MARKER_POLICY_DELETE - = "fs.s3a.capability.directory.marker.policy.delete"; - - /** - * {@code PathCapabilities} probe to indicate that the filesystem - * keeps directory markers in authoritative paths only. - * Value: {@value}. - */ - public static final String - STORE_CAPABILITY_DIRECTORY_MARKER_POLICY_AUTHORITATIVE = - "fs.s3a.capability.directory.marker.policy.authoritative"; - - /** - * {@code PathCapabilities} probe to indicate that a path/S3GuardTool - * keeps directory markers. - * Value: {@value}. - */ - public static final String STORE_CAPABILITY_DIRECTORY_MARKER_ACTION_KEEP - = "fs.s3a.capability.directory.marker.action.keep"; - - /** - * {@code PathCapabilities} probe to indicate that a path - * deletes directory markers. - * Value: {@value}. 
- */ - public static final String STORE_CAPABILITY_DIRECTORY_MARKER_ACTION_DELETE - = "fs.s3a.capability.directory.marker.action.delete"; - } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Listing.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Listing.java index b016eadcfbb46..b62c4569b6e62 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Listing.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Listing.java @@ -33,10 +33,11 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; -import java.util.HashSet; +import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.ListIterator; +import java.util.Map; import java.util.NoSuchElementException; import java.util.Set; @@ -68,7 +69,7 @@ public Listing(S3AFileSystem owner) { * @return the file status iterator */ ProvidedFileStatusIterator createProvidedFileStatusIterator( - FileStatus[] fileStatuses, + S3AFileStatus[] fileStatuses, PathFilter filter, FileStatusAcceptor acceptor) { return new ProvidedFileStatusIterator(fileStatuses, filter, acceptor); @@ -114,7 +115,7 @@ FileStatusListingIterator createFileStatusListingIterator( S3ListRequest request, PathFilter filter, Listing.FileStatusAcceptor acceptor, - RemoteIterator providedStatus) throws IOException { + RemoteIterator providedStatus) throws IOException { return new FileStatusListingIterator( new ObjectListingIterator(listPath, request), filter, @@ -129,7 +130,7 @@ FileStatusListingIterator createFileStatusListingIterator( */ @VisibleForTesting LocatedFileStatusIterator createLocatedFileStatusIterator( - RemoteIterator statusIterator) { + RemoteIterator statusIterator) { return new LocatedFileStatusIterator(statusIterator); } @@ -143,7 +144,7 @@ LocatedFileStatusIterator createLocatedFileStatusIterator( */ @VisibleForTesting TombstoneReconcilingIterator createTombstoneReconcilingIterator( - RemoteIterator iterator, Set tombstones) { + RemoteIterator iterator, Set tombstones) { return new TombstoneReconcilingIterator(iterator, tombstones); } @@ -189,19 +190,19 @@ interface FileStatusAcceptor { * iterator returned. */ static final class SingleStatusRemoteIterator - implements RemoteIterator { + implements RemoteIterator { /** * The status to return; set to null after the first iteration. */ - private LocatedFileStatus status; + private S3ALocatedFileStatus status; /** * Constructor. * @param status status value: may be null, in which case * the iterator is empty. */ - public SingleStatusRemoteIterator(LocatedFileStatus status) { + SingleStatusRemoteIterator(S3ALocatedFileStatus status) { this.status = status; } @@ -226,9 +227,9 @@ public boolean hasNext() throws IOException { * to the constructor. */ @Override - public LocatedFileStatus next() throws IOException { + public S3ALocatedFileStatus next() throws IOException { if (hasNext()) { - LocatedFileStatus s = this.status; + S3ALocatedFileStatus s = this.status; status = null; return s; } else { @@ -247,16 +248,16 @@ public LocatedFileStatus next() throws IOException { * There is no remote data to fetch. 
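 * (Illustrative note, not in the original source: given provided statuses for
 * /a and /b plus a path filter that accepts only /a, the iterator returns the
 * /a entry alone and never issues a request to S3.)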
*/ static class ProvidedFileStatusIterator - implements RemoteIterator { - private final ArrayList filteredStatusList; + implements RemoteIterator { + private final ArrayList filteredStatusList; private int index = 0; - ProvidedFileStatusIterator(FileStatus[] fileStatuses, PathFilter filter, + ProvidedFileStatusIterator(S3AFileStatus[] fileStatuses, PathFilter filter, FileStatusAcceptor acceptor) { Preconditions.checkArgument(fileStatuses != null, "Null status list!"); filteredStatusList = new ArrayList<>(fileStatuses.length); - for (FileStatus status : fileStatuses) { + for (S3AFileStatus status : fileStatuses) { if (filter.accept(status.getPath()) && acceptor.accept(status)) { filteredStatusList.add(status); } @@ -270,7 +271,7 @@ public boolean hasNext() throws IOException { } @Override - public FileStatus next() throws IOException { + public S3AFileStatus next() throws IOException { if (!hasNext()) { throw new NoSuchElementException(); } @@ -305,7 +306,7 @@ public FileStatus next() throws IOException { * Thread safety: None. */ class FileStatusListingIterator - implements RemoteIterator { + implements RemoteIterator { /** Source of objects. */ private final ObjectListingIterator source; @@ -316,10 +317,10 @@ class FileStatusListingIterator /** request batch size. */ private int batchSize; /** Iterator over the current set of results. */ - private ListIterator statusBatchIterator; + private ListIterator statusBatchIterator; - private final Set providedStatus; - private Iterator providedStatusIterator; + private final Map providedStatus; + private Iterator providedStatusIterator; /** * Create an iterator over file status entries. @@ -335,15 +336,16 @@ class FileStatusListingIterator FileStatusListingIterator(ObjectListingIterator source, PathFilter filter, FileStatusAcceptor acceptor, - RemoteIterator providedStatus) throws IOException { + RemoteIterator providedStatus) throws IOException { this.source = source; this.filter = filter; this.acceptor = acceptor; - this.providedStatus = new HashSet<>(); + this.providedStatus = new HashMap<>(); for (; providedStatus != null && providedStatus.hasNext();) { - final FileStatus status = providedStatus.next(); - if (filter.accept(status.getPath()) && acceptor.accept(status)) { - this.providedStatus.add(status); + final S3AFileStatus status = providedStatus.next(); + Path path = status.getPath(); + if (filter.accept(path) && acceptor.accept(status)) { + this.providedStatus.put(path, status); } } // build the first set of results. This will not trigger any @@ -376,7 +378,7 @@ private boolean sourceHasNext() throws IOException { // turn to file status that are only in provided list if (providedStatusIterator == null) { LOG.debug("Start iterating the provided status."); - providedStatusIterator = providedStatus.iterator(); + providedStatusIterator = providedStatus.values().iterator(); } return false; } @@ -384,14 +386,21 @@ private boolean sourceHasNext() throws IOException { @Override @Retries.RetryTranslated - public FileStatus next() throws IOException { - final FileStatus status; + public S3AFileStatus next() throws IOException { + final S3AFileStatus status; if (sourceHasNext()) { status = statusBatchIterator.next(); - // We remove from provided list the file status listed by S3 so that + // We remove from provided map the file status listed by S3 so that // this does not return duplicate items. 
- if (providedStatus.remove(status)) { - LOG.debug("Removed the status from provided file status {}", status); + + // The provided status is returned as it is assumed to have the better + // metadata (i.e. the eTag and versionId from S3Guard) + S3AFileStatus provided = providedStatus.remove(status.getPath()); + if (provided != null) { + LOG.debug( + "Removed and returned the status from provided file status {}", + status); + return provided; } } else { if (providedStatusIterator.hasNext()) { @@ -441,7 +450,7 @@ private boolean buildNextStatusBatch(S3ListResult objects) { // counters for debug logs int added = 0, ignored = 0; // list to fill in with results. Initial size will be list maximum. - List stats = new ArrayList<>( + List stats = new ArrayList<>( objects.getObjectSummaries().size() + objects.getCommonPrefixes().size()); // objects @@ -453,8 +462,9 @@ private boolean buildNextStatusBatch(S3ListResult objects) { } // Skip over keys that are ourselves and old S3N _$folder$ files if (acceptor.accept(keyPath, summary) && filter.accept(keyPath)) { - FileStatus status = createFileStatus(keyPath, summary, - owner.getDefaultBlockSize(keyPath), owner.getUsername()); + S3AFileStatus status = createFileStatus(keyPath, summary, + owner.getDefaultBlockSize(keyPath), owner.getUsername(), + summary.getETag(), null); LOG.debug("Adding: {}", status); stats.add(status); added++; @@ -468,7 +478,7 @@ private boolean buildNextStatusBatch(S3ListResult objects) { for (String prefix : objects.getCommonPrefixes()) { Path keyPath = owner.keyToQualifiedPath(prefix); if (acceptor.accept(keyPath, prefix) && filter.accept(keyPath)) { - FileStatus status = new S3AFileStatus(Tristate.FALSE, keyPath, + S3AFileStatus status = new S3AFileStatus(Tristate.FALSE, keyPath, owner.getUsername()); LOG.debug("Adding directory: {}", status); added++; @@ -679,14 +689,14 @@ public boolean accept(FileStatus status) { * return a remote iterator of {@link LocatedFileStatus} instances. */ class LocatedFileStatusIterator - implements RemoteIterator { - private final RemoteIterator statusIterator; + implements RemoteIterator { + private final RemoteIterator statusIterator; /** * Constructor. * @param statusIterator an iterator over the remote status entries */ - LocatedFileStatusIterator(RemoteIterator statusIterator) { + LocatedFileStatusIterator(RemoteIterator statusIterator) { this.statusIterator = statusIterator; } @@ -696,7 +706,7 @@ public boolean hasNext() throws IOException { } @Override - public LocatedFileStatus next() throws IOException { + public S3ALocatedFileStatus next() throws IOException { return owner.toLocatedFileStatus(statusIterator.next()); } } @@ -708,16 +718,16 @@ public LocatedFileStatus next() throws IOException { * remain in the source iterator. 
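 * (Illustrative example, not in the original source: if the wrapped iterator
 * yields /a, /b and /c while the tombstone set contains /b, only /a and /c
 * are returned to the caller.)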
*/ static class TombstoneReconcilingIterator implements - RemoteIterator { - private LocatedFileStatus next = null; - private final RemoteIterator iterator; + RemoteIterator { + private S3ALocatedFileStatus next = null; + private final RemoteIterator iterator; private final Set tombstones; /** * @param iterator Source iterator to filter * @param tombstones set of tombstone markers to filter out of results */ - TombstoneReconcilingIterator(RemoteIterator + TombstoneReconcilingIterator(RemoteIterator iterator, Set tombstones) { this.iterator = iterator; if (tombstones != null) { @@ -729,7 +739,7 @@ static class TombstoneReconcilingIterator implements private boolean fetch() throws IOException { while (next == null && iterator.hasNext()) { - LocatedFileStatus candidate = iterator.next(); + S3ALocatedFileStatus candidate = iterator.next(); if (!tombstones.contains(candidate.getPath())) { next = candidate; return true; @@ -745,9 +755,9 @@ public boolean hasNext() throws IOException { return fetch(); } - public LocatedFileStatus next() throws IOException { + public S3ALocatedFileStatus next() throws IOException { if (hasNext()) { - LocatedFileStatus result = next; + S3ALocatedFileStatus result = next; next = null; fetch(); return result; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index eb055dc6bc334..4bd58d5136860 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -39,6 +39,7 @@ import java.util.Optional; import java.util.Set; import java.util.Objects; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; @@ -48,6 +49,7 @@ import com.amazonaws.AmazonClientException; import com.amazonaws.AmazonServiceException; +import com.amazonaws.SdkBaseException; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.AbortMultipartUploadRequest; import com.amazonaws.services.s3.model.CannedAccessControlList; @@ -73,6 +75,7 @@ import com.amazonaws.services.s3.transfer.TransferManager; import com.amazonaws.services.s3.transfer.TransferManagerConfiguration; import com.amazonaws.services.s3.transfer.Upload; +import com.amazonaws.services.s3.transfer.model.CopyResult; import com.amazonaws.services.s3.transfer.model.UploadResult; import com.amazonaws.event.ProgressListener; import com.google.common.annotations.VisibleForTesting; @@ -87,6 +90,10 @@ import org.apache.hadoop.fs.CreateFlag; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy; +import org.apache.hadoop.fs.s3a.impl.CopyOutcome; +import org.apache.hadoop.fs.s3a.select.InternalSelectConstants; +import org.apache.hadoop.util.LambdaUtils; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -111,11 +118,15 @@ import org.apache.hadoop.fs.s3a.commit.CommitConstants; import org.apache.hadoop.fs.s3a.commit.PutTracker; import org.apache.hadoop.fs.s3a.commit.MagicCommitIntegration; +import org.apache.hadoop.fs.s3a.impl.ChangeTracker; +import org.apache.hadoop.fs.s3a.select.SelectBinding; +import org.apache.hadoop.fs.s3a.select.SelectConstants; import 
org.apache.hadoop.fs.s3a.s3guard.DirListingMetadata; import org.apache.hadoop.fs.s3a.s3guard.MetadataStoreListFilesIterator; import org.apache.hadoop.fs.s3a.s3guard.MetadataStore; import org.apache.hadoop.fs.s3a.s3guard.PathMetadata; import org.apache.hadoop.fs.s3a.s3guard.S3Guard; +import org.apache.hadoop.fs.s3a.s3guard.ITtlTimeProvider; import org.apache.hadoop.fs.s3native.S3xLoginHelper; import org.apache.hadoop.io.retry.RetryPolicies; import org.apache.hadoop.fs.store.EtagChecksum; @@ -126,6 +137,7 @@ import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.SemaphoredDelegatingExecutor; +import static org.apache.hadoop.fs.impl.AbstractFSBuilderImpl.rejectUnknownMandatoryKeys; import static org.apache.hadoop.fs.s3a.Constants.*; import static org.apache.hadoop.fs.s3a.Invoker.*; import static org.apache.hadoop.fs.s3a.S3AUtils.*; @@ -168,6 +180,7 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities, * retryable results in files being deleted. */ public static final boolean DELETE_CONSIDERED_IDEMPOTENT = true; + private URI uri; private Path workingDir; private String username; @@ -196,6 +209,7 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities, LoggerFactory.getLogger("org.apache.hadoop.fs.s3a.S3AFileSystem.Progress"); private LocalDirAllocator directoryAllocator; private CannedAccessControlList cannedACL; + private boolean failOnMetadataWriteError; /** * This must never be null; until initialized it just declares that there @@ -207,6 +221,7 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities, createStorageStatistics(); private long readAhead; private S3AInputPolicy inputPolicy; + private ChangeDetectionPolicy changeDetectionPolicy; private final AtomicBoolean closed = new AtomicBoolean(false); private volatile boolean isClosed = false; private MetadataStore metadataStore; @@ -224,12 +239,13 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities, private S3ADataBlocks.BlockFactory blockFactory; private int blockOutputActiveBlocks; private WriteOperationHelper writeHelper; + private SelectBinding selectBinding; private boolean useListV1; private MagicCommitIntegration committerIntegration; private AWSCredentialProviderList credentials; - private S3Guard.ITtlTimeProvider ttlTimeProvider; + private ITtlTimeProvider ttlTimeProvider; /** Add any deprecated keys. 
*/ @SuppressWarnings("deprecation") @@ -296,6 +312,9 @@ public void initialize(URI name, Configuration originalConf) onRetry); writeHelper = new WriteOperationHelper(this, getConf()); + failOnMetadataWriteError = conf.getBoolean(FAIL_ON_METADATA_WRITE_ERROR, + FAIL_ON_METADATA_WRITE_ERROR_DEFAULT); + maxKeys = intOption(conf, MAX_PAGING_KEYS, DEFAULT_MAX_PAGING_KEYS, 1); listing = new Listing(this); partSize = getMultipartSizeProperty(conf, @@ -310,26 +329,7 @@ public void initialize(URI name, Configuration originalConf) readAhead = longBytesOption(conf, READAHEAD_RANGE, DEFAULT_READAHEAD_RANGE, 0); - int maxThreads = conf.getInt(MAX_THREADS, DEFAULT_MAX_THREADS); - if (maxThreads < 2) { - LOG.warn(MAX_THREADS + " must be at least 2: forcing to 2."); - maxThreads = 2; - } - int totalTasks = intOption(conf, - MAX_TOTAL_TASKS, DEFAULT_MAX_TOTAL_TASKS, 1); - long keepAliveTime = longOption(conf, KEEPALIVE_TIME, - DEFAULT_KEEPALIVE_TIME, 0); - boundedThreadPool = BlockingThreadPoolExecutorService.newInstance( - maxThreads, - maxThreads + totalTasks, - keepAliveTime, TimeUnit.SECONDS, - "s3a-transfer-shared"); - unboundedThreadPool = new ThreadPoolExecutor( - maxThreads, Integer.MAX_VALUE, - keepAliveTime, TimeUnit.SECONDS, - new LinkedBlockingQueue(), - BlockingThreadPoolExecutorService.newDaemonThreadFactory( - "s3a-transfer-unbounded")); + initThreadPools(conf); int listVersion = conf.getInt(LIST_VERSION, DEFAULT_LIST_VERSION); if (listVersion < 1 || listVersion > 2) { @@ -353,6 +353,8 @@ public void initialize(URI name, Configuration originalConf) inputPolicy = S3AInputPolicy.getPolicy( conf.getTrimmed(INPUT_FADVISE, INPUT_FADV_NORMAL)); LOG.debug("Input fadvise policy = {}", inputPolicy); + changeDetectionPolicy = ChangeDetectionPolicy.getPolicy(conf); + LOG.debug("Change detection policy = {}", changeDetectionPolicy); boolean magicCommitterEnabled = conf.getBoolean( CommitConstants.MAGIC_COMMITTER_ENABLED, CommitConstants.DEFAULT_MAGIC_COMMITTER_ENABLED); @@ -361,6 +363,9 @@ public void initialize(URI name, Configuration originalConf) committerIntegration = new MagicCommitIntegration( this, magicCommitterEnabled); + // instantiate S3 Select support + selectBinding = new SelectBinding(writeHelper); + boolean blockUploadEnabled = conf.getBoolean(FAST_UPLOAD, true); if (!blockUploadEnabled) { @@ -384,15 +389,40 @@ public void initialize(URI name, Configuration originalConf) getMetadataStore(), allowAuthoritative); } initMultipartUploads(conf); - long authDirTtl = conf.getLong(METADATASTORE_AUTHORITATIVE_DIR_TTL, - DEFAULT_METADATASTORE_AUTHORITATIVE_DIR_TTL); - ttlTimeProvider = new S3Guard.TtlTimeProvider(authDirTtl); + if (hasMetadataStore()) { + long authDirTtl = conf.getTimeDuration(METADATASTORE_METADATA_TTL, + DEFAULT_METADATASTORE_METADATA_TTL, TimeUnit.MILLISECONDS); + ttlTimeProvider = new S3Guard.TtlTimeProvider(authDirTtl); + } } catch (AmazonClientException e) { throw translateException("initializing ", new Path(name), e); } } + private void initThreadPools(Configuration conf) { + int maxThreads = conf.getInt(MAX_THREADS, DEFAULT_MAX_THREADS); + if (maxThreads < 2) { + LOG.warn(MAX_THREADS + " must be at least 2: forcing to 2."); + maxThreads = 2; + } + int totalTasks = intOption(conf, + MAX_TOTAL_TASKS, DEFAULT_MAX_TOTAL_TASKS, 1); + long keepAliveTime = longOption(conf, KEEPALIVE_TIME, + DEFAULT_KEEPALIVE_TIME, 0); + boundedThreadPool = BlockingThreadPoolExecutorService.newInstance( + maxThreads, + maxThreads + totalTasks, + keepAliveTime, TimeUnit.SECONDS, + 
"s3a-transfer-shared"); + unboundedThreadPool = new ThreadPoolExecutor( + maxThreads, Integer.MAX_VALUE, + keepAliveTime, TimeUnit.SECONDS, + new LinkedBlockingQueue(), + BlockingThreadPoolExecutorService.newDaemonThreadFactory( + "s3a-transfer-unbounded")); + } + /** * Create the storage statistics or bind to an existing one. * @return a storage statistics instance. @@ -633,6 +663,13 @@ protected void setAmazonS3Client(AmazonS3 client) { Preconditions.checkNotNull(client, "client"); LOG.debug("Setting S3 client to {}", client); s3 = client; + + // Need to use a new TransferManager that uses the new client. + // Also, using a new TransferManager requires a new threadpool as the old + // TransferManager will shut the thread pool down when it is garbage + // collected. + initThreadPools(getConf()); + initTransferManager(); } /** @@ -676,6 +713,16 @@ public S3AInputPolicy getInputPolicy() { return inputPolicy; } + /** + * Get the change detection policy for this FS instance. + * Only public to allow access in tests in other packages. + * @return the change detection policy + */ + @VisibleForTesting + public ChangeDetectionPolicy getChangeDetectionPolicy() { + return changeDetectionPolicy; + } + /** * Get the encryption algorithm of this endpoint. * @return the encryption algorithm. @@ -830,31 +877,108 @@ protected URI canonicalizeUri(URI rawUri) { * @param f the file name to open * @param bufferSize the size of the buffer to be used. */ + @Retries.RetryTranslated public FSDataInputStream open(Path f, int bufferSize) throws IOException { + return open(f, Optional.empty()); + } + + /** + * Opens an FSDataInputStream at the indicated Path. + * @param path the file to open + * @param options configuration options if opened with the builder API. + * @throws IOException IO failure. + */ + @Retries.RetryTranslated + private FSDataInputStream open( + final Path path, + final Optional options) + throws IOException { + entryPoint(INVOCATION_OPEN); - LOG.debug("Opening '{}' for reading; input policy = {}", f, inputPolicy); - final FileStatus fileStatus = getFileStatus(f); + final S3AFileStatus fileStatus = (S3AFileStatus) getFileStatus(path); if (fileStatus.isDirectory()) { - throw new FileNotFoundException("Can't open " + f + throw new FileNotFoundException("Can't open " + path + " because it is a directory"); } + S3AReadOpContext readContext; + if (options.isPresent()) { + Configuration o = options.get(); + // normal path. Open the file with the chosen seek policy, if different + // from the normal one. + // and readahead. + S3AInputPolicy policy = S3AInputPolicy.getPolicy( + o.get(INPUT_FADVISE, inputPolicy.toString())); + long readAheadRange2 = o.getLong(READAHEAD_RANGE, readAhead); + // TODO support change detection policy from options? 
+ readContext = createReadContext( + fileStatus, + policy, + changeDetectionPolicy, + readAheadRange2); + } else { + readContext = createReadContext( + fileStatus, + inputPolicy, + changeDetectionPolicy, + readAhead); + } + LOG.debug("Opening '{}'", readContext); + return new FSDataInputStream( - new S3AInputStream(new S3AReadOpContext(hasMetadataStore(), - invoker, - s3guardInvoker, - statistics, - instrumentation, - fileStatus), - new S3ObjectAttributes(bucket, - pathToKey(f), - getServerSideEncryptionAlgorithm(), - encryptionSecrets.getEncryptionKey()), + new S3AInputStream( + readContext, + createObjectAttributes( + path, + fileStatus.getETag(), + fileStatus.getVersionId()), fileStatus.getLen(), - s3, - readAhead, - inputPolicy)); + s3)); + } + + /** + * Create the read context for reading from the referenced file, + * using FS state as well as the status. + * @param fileStatus file status. + * @param seekPolicy input policy for this operation + * @param readAheadRange readahead value. + * @return a context for read and select operations. + */ + private S3AReadOpContext createReadContext( + final FileStatus fileStatus, + final S3AInputPolicy seekPolicy, + final ChangeDetectionPolicy changePolicy, + final long readAheadRange) { + return new S3AReadOpContext(fileStatus.getPath(), + hasMetadataStore(), + invoker, + s3guardInvoker, + statistics, + instrumentation, + fileStatus, + seekPolicy, + changePolicy, + readAheadRange); + } + + /** + * Create the attributes of an object for a get/select request. + * @param f path path of the request. + * @param eTag the eTag of the S3 object + * @param versionId S3 object version ID + * @return attributes to use when building the query. + */ + private S3ObjectAttributes createObjectAttributes( + final Path f, + final String eTag, + final String versionId) { + return new S3ObjectAttributes(bucket, + pathToKey(f), + getServerSideEncryptionAlgorithm(), + encryptionSecrets.getEncryptionKey(), + eTag, + versionId); } /** @@ -1118,19 +1242,27 @@ private boolean innerRename(Path source, Path dest) if (srcStatus.isFile()) { LOG.debug("rename: renaming file {} to {}", src, dst); long length = srcStatus.getLen(); + S3ObjectAttributes objectAttributes = + createObjectAttributes(srcStatus.getPath(), + srcStatus.getETag(), srcStatus.getVersionId()); + S3AReadOpContext readContext = createReadContext(srcStatus, inputPolicy, + changeDetectionPolicy, readAhead); if (dstStatus != null && dstStatus.isDirectory()) { String newDstKey = maybeAddTrailingSlash(dstKey); String filename = srcKey.substring(pathToKey(src.getParent()).length()+1); newDstKey = newDstKey + filename; - copyFile(srcKey, newDstKey, length); + CopyResult copyResult = copyFile(srcKey, newDstKey, length, + objectAttributes, readContext); S3Guard.addMoveFile(metadataStore, srcPaths, dstMetas, src, keyToQualifiedPath(newDstKey), length, getDefaultBlockSize(dst), - username); + username, copyResult.getETag(), copyResult.getVersionId()); } else { - copyFile(srcKey, dstKey, srcStatus.getLen()); + CopyResult copyResult = copyFile(srcKey, dstKey, srcStatus.getLen(), + objectAttributes, readContext); S3Guard.addMoveFile(metadataStore, srcPaths, dstMetas, src, dst, - length, getDefaultBlockSize(dst), username); + length, getDefaultBlockSize(dst), username, + copyResult.getETag(), copyResult.getVersionId()); } innerDelete(srcStatus, false); } else { @@ -1153,10 +1285,10 @@ private boolean innerRename(Path source, Path dest) } Path parentPath = keyToQualifiedPath(srcKey); - RemoteIterator iterator = 
listFilesAndEmptyDirectories( - parentPath, true); + RemoteIterator iterator = + listFilesAndEmptyDirectories(parentPath, true); while (iterator.hasNext()) { - LocatedFileStatus status = iterator.next(); + S3ALocatedFileStatus status = iterator.next(); long length = status.getLen(); String key = pathToKey(status.getPath()); if (status.isDirectory() && !key.endsWith("/")) { @@ -1166,7 +1298,13 @@ private boolean innerRename(Path source, Path dest) .add(new DeleteObjectsRequest.KeyVersion(key)); String newDstKey = dstKey + key.substring(srcKey.length()); - copyFile(key, newDstKey, length); + S3ObjectAttributes objectAttributes = + createObjectAttributes(status.getPath(), + status.getETag(), status.getVersionId()); + S3AReadOpContext readContext = createReadContext(status, inputPolicy, + changeDetectionPolicy, readAhead); + CopyResult copyResult = copyFile(key, newDstKey, length, + objectAttributes, readContext); if (hasMetadataStore()) { // with a metadata store, the object entries need to be updated, @@ -1178,7 +1316,8 @@ private boolean innerRename(Path source, Path dest) childDst, username); } else { S3Guard.addMoveFile(metadataStore, srcPaths, dstMetas, childSrc, - childDst, length, getDefaultBlockSize(childDst), username); + childDst, length, getDefaultBlockSize(childDst), username, + copyResult.getETag(), copyResult.getVersionId()); } // Ancestor directories may not be listed, so we explicitly add them S3Guard.addMoveAncestors(metadataStore, srcPaths, dstMetas, @@ -1205,7 +1344,7 @@ private boolean innerRename(Path source, Path dest) } } - metadataStore.move(srcPaths, dstMetas); + metadataStore.move(srcPaths, dstMetas, ttlTimeProvider); if (!src.getParent().equals(dst.getParent())) { LOG.debug("source & dest parents are different; fix up dir markers"); @@ -1224,10 +1363,28 @@ private boolean innerRename(Path source, Path dest) @VisibleForTesting @Retries.RetryTranslated public ObjectMetadata getObjectMetadata(Path path) throws IOException { + return getObjectMetadata(path, null, invoker, null); + } + + /** + * Low-level call to get at the object metadata. + * @param path path to the object + * @param changeTracker the change tracker to detect version inconsistencies + * @param changeInvoker the invoker providing the retry policy + * @param operation the operation being performed (e.g. "read" or "copy") + * @return metadata + * @throws IOException IO and object access problems. + */ + @VisibleForTesting + @Retries.RetryTranslated + public ObjectMetadata getObjectMetadata(Path path, + ChangeTracker changeTracker, Invoker changeInvoker, String operation) + throws IOException { return once("getObjectMetadata", path.toString(), () -> - // this always does a full HEAD to the object - getObjectMetadata(pathToKey(path))); + // this always does a full HEAD to the object + getObjectMetadata( + pathToKey(path), changeTracker, changeInvoker, operation)); } /** @@ -1252,6 +1409,16 @@ public boolean hasMetadataStore() { return !S3Guard.isNullMetadataStore(metadataStore); } + /** + * Does the filesystem have an authoritative metadata store? + * @return true if there is a metadata store and the authoritative flag + * is set for this filesystem. + */ + @VisibleForTesting + boolean hasAuthoritativeMetadataStore() { + return hasMetadataStore() && allowAuthoritative; + } + /** * Get the metadata store. 
* This will always be non-null, but may be bound to the @@ -1388,14 +1555,41 @@ public S3AStorageStatistics getStorageStatistics() { */ @Retries.RetryRaw protected ObjectMetadata getObjectMetadata(String key) throws IOException { + return getObjectMetadata(key, null, invoker,null); + } + + /** + * Request object metadata; increments counters in the process. + * Retry policy: retry untranslated. + * Uses changeTracker to detect an unexpected file version (eTag or versionId) + * @param key key + * @param changeTracker the change tracker to detect unexpected object version + * @param changeInvoker the invoker providing the retry policy + * @param operation the operation (e.g. "read" or "copy") triggering this call + * @return the metadata + * @throws IOException if the retry invocation raises one (it shouldn't). + * @throws RemoteFileChangedException if an unexpected version is detected + */ + @Retries.RetryRaw + protected ObjectMetadata getObjectMetadata(String key, + ChangeTracker changeTracker, + Invoker changeInvoker, + String operation) throws IOException { GetObjectMetadataRequest request = new GetObjectMetadataRequest(bucket, key); //SSE-C requires to be filled in if enabled for object metadata generateSSECustomerKey().ifPresent(request::setSSECustomerKey); - ObjectMetadata meta = invoker.retryUntranslated("GET " + key, true, + ObjectMetadata meta = changeInvoker.retryUntranslated("GET " + key, true, () -> { incrementStatistic(OBJECT_METADATA_REQUESTS); - return s3.getObjectMetadata(request); + if (changeTracker != null) { + changeTracker.maybeApplyConstraint(request); + } + ObjectMetadata objectMetadata = s3.getObjectMetadata(request); + if (changeTracker != null) { + changeTracker.processMetadata(objectMetadata, operation); + } + return objectMetadata; }); incrementReadOperations(); return meta; @@ -1531,7 +1725,7 @@ void deleteObjectAtPath(Path f, String key, boolean isFile) instrumentation.directoryDeleted(); } deleteObject(key); - metadataStore.delete(f); + metadataStore.delete(f, ttlTimeProvider); } /** @@ -1683,10 +1877,13 @@ public UploadInfo putObject(PutObjectRequest putObjectRequest) { * @param putObjectRequest the request * @return the upload initiated * @throws AmazonClientException on problems + * @throws MetadataPersistenceException if metadata about the write could + * not be saved to the metadata store and + * fs.s3a.metadatastore.fail.on.write.error=true */ - @Retries.OnceRaw("For PUT; post-PUT actions are RetriesExceptionsSwallowed") + @Retries.OnceRaw("For PUT; post-PUT actions are RetryTranslated") PutObjectResult putObjectDirect(PutObjectRequest putObjectRequest) - throws AmazonClientException { + throws AmazonClientException, MetadataPersistenceException { long len = getPutRequestLength(putObjectRequest); LOG.debug("PUT {} bytes to {}", len, putObjectRequest.getKey()); incrementPutStartStatistics(len); @@ -1694,7 +1891,8 @@ PutObjectResult putObjectDirect(PutObjectRequest putObjectRequest) PutObjectResult result = s3.putObject(putObjectRequest); incrementPutCompletedStatistics(true, len); // update metadata - finishedWrite(putObjectRequest.getKey(), len); + finishedWrite(putObjectRequest.getKey(), len, + result.getETag(), result.getVersionId()); return result; } catch (AmazonClientException e) { incrementPutCompletedStatistics(false, len); @@ -1948,7 +2146,7 @@ private boolean innerDelete(S3AFileStatus status, boolean recursive) } } } - metadataStore.deleteSubtree(f); + metadataStore.deleteSubtree(f, ttlTimeProvider); } else { LOG.debug("delete: Path is a file"); 
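The ChangeTracker wired into getObjectMetadata() above is built from the filesystem's ChangeDetectionPolicy. A minimal configuration sketch, assuming the standard fs.s3a.change.detection.* option names; the values shown are examples rather than defaults:

    // Sketch only: detect concurrent overwrites by S3 object version rather than eTag,
    // and fail client-side when a mismatch is seen.
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ChangeDetectionConfigExample {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.s3a.change.detection.source", "versionid"); // or "etag"
        conf.set("fs.s3a.change.detection.mode", "client");      // e.g. "server", "warn", "none"
        FileSystem fs = FileSystem.get(
            new Path("s3a://example-bucket/").toUri(), conf);    // placeholder bucket
        System.out.println("change detection configured for " + fs.getUri());
      }
    }

With "client" mode, a detected version mismatch surfaces as the RemoteFileChangedException documented on the retrying getObjectMetadata() overload above.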
deleteObjectAtPath(f, key, true); @@ -2052,7 +2250,7 @@ public FileStatus[] innerListStatus(Path f) throws FileNotFoundException, LOG.debug("List status for path: {}", path); entryPoint(INVOCATION_LIST_STATUS); - List result; + List result; final FileStatus fileStatus = getFileStatus(path); if (fileStatus.isDirectory()) { @@ -2204,10 +2402,6 @@ private boolean innerMkdirs(Path p, FsPermission permission) LOG.debug("Making directory: {}", f); entryPoint(INVOCATION_MKDIRS); FileStatus fileStatus; - List metadataStoreDirs = null; - if (hasMetadataStore()) { - metadataStoreDirs = new ArrayList<>(); - } try { fileStatus = getFileStatus(f); @@ -2220,9 +2414,6 @@ private boolean innerMkdirs(Path p, FsPermission permission) } catch (FileNotFoundException e) { // Walk path to root, ensuring closest ancestor is a directory, not file Path fPart = f.getParent(); - if (metadataStoreDirs != null) { - metadataStoreDirs.add(f); - } while (fPart != null) { try { fileStatus = getFileStatus(fPart); @@ -2236,11 +2427,6 @@ private boolean innerMkdirs(Path p, FsPermission permission) } } catch (FileNotFoundException fnfe) { instrumentation.errorIgnored(); - // We create all missing directories in MetadataStore; it does not - // infer directories exist by prefix like S3. - if (metadataStoreDirs != null) { - metadataStoreDirs.add(fPart); - } } fPart = fPart.getParent(); } @@ -2283,7 +2469,10 @@ S3AFileStatus innerGetFileStatus(final Path f, LOG.debug("Getting path status for {} ({})", path, key); // Check MetadataStore, if any. - PathMetadata pm = metadataStore.get(path, needEmptyDirectoryFlag); + PathMetadata pm = null; + if (hasMetadataStore()) { + pm = S3Guard.getWithTtl(metadataStore, path, ttlTimeProvider); + } Set tombstones = Collections.emptySet(); if (pm != null) { if (pm.isDeleted()) { @@ -2291,11 +2480,43 @@ S3AFileStatus innerGetFileStatus(final Path f, "deleted by S3Guard"); } - FileStatus msStatus = pm.getFileStatus(); + // if ms is not authoritative, check S3 if there's any recent + // modification - compare the modTime to check if metadata is up to date + // Skip going to s3 if the file checked is a directory. Because if the + // dest is also a directory, there's no difference. + // TODO After HADOOP-16085 the modification detection can be done with + // etags or object version instead of modTime + if (!pm.getFileStatus().isDirectory() && + !allowAuthoritative) { + LOG.debug("Metadata for {} found in the non-auth metastore.", path); + final long msModTime = pm.getFileStatus().getModificationTime(); + + S3AFileStatus s3AFileStatus; + try { + s3AFileStatus = s3GetFileStatus(path, key, tombstones); + } catch (FileNotFoundException fne) { + s3AFileStatus = null; + } + if (s3AFileStatus == null) { + LOG.warn("Failed to find file {}. Either it is not yet visible, or " + + "it has been deleted.", path); + } else { + final long s3ModTime = s3AFileStatus.getModificationTime(); + + if(s3ModTime > msModTime) { + LOG.debug("S3Guard metadata for {} is outdated, updating it", + path); + return S3Guard.putAndReturn(metadataStore, s3AFileStatus, + instrumentation, ttlTimeProvider); + } + } + } + + S3AFileStatus msStatus = pm.getFileStatus(); if (needEmptyDirectoryFlag && msStatus.isDirectory()) { if (pm.isEmptyDirectory() != Tristate.UNKNOWN) { // We have a definitive true / false from MetadataStore, we are done. 
- return S3AFileStatus.fromFileStatus(msStatus, pm.isEmptyDirectory()); + return msStatus; } else { DirListingMetadata children = S3Guard.listChildrenWithTtl(metadataStore, path, ttlTimeProvider); @@ -2306,7 +2527,7 @@ S3AFileStatus innerGetFileStatus(final Path f, } } else { // Either this is not a directory, or we don't care if it is empty - return S3AFileStatus.fromFileStatus(msStatus, pm.isEmptyDirectory()); + return msStatus; } // If the metadata store has no children for it and it's not listed in @@ -2315,15 +2536,18 @@ S3AFileStatus innerGetFileStatus(final Path f, try { s3FileStatus = s3GetFileStatus(path, key, tombstones); } catch (FileNotFoundException e) { - return S3AFileStatus.fromFileStatus(msStatus, Tristate.TRUE); + return S3AFileStatus.fromFileStatus(msStatus, Tristate.TRUE, + null, null); } // entry was found, save in S3Guard - return S3Guard.putAndReturn(metadataStore, s3FileStatus, instrumentation); + return S3Guard.putAndReturn(metadataStore, s3FileStatus, + instrumentation, ttlTimeProvider); } else { // there was no entry in S3Guard // retrieve the data and update the metadata store in the process. return S3Guard.putAndReturn(metadataStore, - s3GetFileStatus(path, key, tombstones), instrumentation); + s3GetFileStatus(path, key, tombstones), instrumentation, + ttlTimeProvider); } } @@ -2354,7 +2578,9 @@ private S3AFileStatus s3GetFileStatus(final Path path, String key, dateToLong(meta.getLastModified()), path, getDefaultBlockSize(path), - username); + username, + meta.getETag(), + meta.getVersionId()); } } catch (AmazonServiceException e) { if (e.getStatusCode() != 404) { @@ -2381,7 +2607,9 @@ private S3AFileStatus s3GetFileStatus(final Path path, String key, dateToLong(meta.getLastModified()), path, getDefaultBlockSize(path), - username); + username, + meta.getETag(), + meta.getVersionId()); } } catch (AmazonServiceException e) { if (e.getStatusCode() != 404) { @@ -2589,11 +2817,14 @@ private void innerCopyFromLocalFile(boolean delSrc, boolean overwrite, * @param progress optional progress callback * @return the upload result * @throws InterruptedIOException if the blocking was interrupted. 
+ * @throws MetadataPersistenceException if metadata about the write could + * not be saved to the metadata store and + * fs.s3a.metadatastore.fail.on.write.error=true */ - @Retries.OnceRaw("For PUT; post-PUT actions are RetriesExceptionsSwallowed") + @Retries.OnceRaw("For PUT; post-PUT actions are RetryTranslated") UploadResult executePut(PutObjectRequest putObjectRequest, Progressable progress) - throws InterruptedIOException { + throws InterruptedIOException, MetadataPersistenceException { String key = putObjectRequest.getKey(); UploadInfo info = putObject(putObjectRequest); Upload upload = info.getUpload(); @@ -2603,7 +2834,8 @@ UploadResult executePut(PutObjectRequest putObjectRequest, UploadResult result = waitForUploadCompletion(key, info); listener.uploadCompleted(); // post-write actions - finishedWrite(key, info.getLength()); + finishedWrite(key, info.getLength(), + result.getETag(), result.getVersionId()); return result; } @@ -2768,12 +3000,15 @@ public List listAWSPolicyRules( * @param srcKey source object path * @param dstKey destination object path * @param size object size - * @throws AmazonClientException on failures inside the AWS SDK + * @param srcAttributes S3 attributes of the source object + * @param readContext the read context + * @return the result of the copy * @throws InterruptedIOException the operation was interrupted * @throws IOException Other IO problems */ - @Retries.RetryMixed - private void copyFile(String srcKey, String dstKey, long size) + @Retries.RetryTranslated + private CopyResult copyFile(String srcKey, String dstKey, long size, + S3ObjectAttributes srcAttributes, S3AReadOpContext readContext) throws IOException, InterruptedIOException { LOG.debug("copyFile {} -> {} ", srcKey, dstKey); @@ -2787,26 +3022,58 @@ private void copyFile(String srcKey, String dstKey, long size) } }; - once("copyFile(" + srcKey + ", " + dstKey + ")", srcKey, + ChangeTracker changeTracker = new ChangeTracker( + keyToQualifiedPath(srcKey).toString(), + changeDetectionPolicy, + readContext.instrumentation.newInputStreamStatistics() + .getVersionMismatchCounter(), + srcAttributes); + + String action = "copyFile(" + srcKey + ", " + dstKey + ")"; + Invoker readInvoker = readContext.getReadInvoker(); + + ObjectMetadata srcom = + once(action, srcKey, + () -> + getObjectMetadata(srcKey, changeTracker, readInvoker, "copy")); + ObjectMetadata dstom = cloneObjectMetadata(srcom); + setOptionalObjectMetadata(dstom); + + return readInvoker.retry( + action, srcKey, + true, () -> { - ObjectMetadata srcom = getObjectMetadata(srcKey); - ObjectMetadata dstom = cloneObjectMetadata(srcom); - setOptionalObjectMetadata(dstom); CopyObjectRequest copyObjectRequest = new CopyObjectRequest(bucket, srcKey, bucket, dstKey); + changeTracker.maybeApplyConstraint(copyObjectRequest); + setOptionalCopyObjectRequestParameters(copyObjectRequest); copyObjectRequest.setCannedAccessControlList(cannedACL); copyObjectRequest.setNewObjectMetadata(dstom); + Optional.ofNullable(srcom.getStorageClass()) + .ifPresent(copyObjectRequest::setStorageClass); Copy copy = transfers.copy(copyObjectRequest); copy.addProgressListener(progressListener); - try { - copy.waitForCopyResult(); - incrementWriteOperations(); - instrumentation.filesCopied(1, size); - } catch (InterruptedException e) { - throw new InterruptedIOException("Interrupted copying " + srcKey - + " to " + dstKey + ", cancelling"); + CopyOutcome copyOutcome = CopyOutcome.waitForCopy(copy); + InterruptedException interruptedException = + 
copyOutcome.getInterruptedException(); + if (interruptedException != null) { + // copy interrupted: convert to an IOException. + throw (IOException)new InterruptedIOException( + "Interrupted copying " + srcKey + + " to " + dstKey + ", cancelling") + .initCause(interruptedException); } + SdkBaseException awsException = copyOutcome.getAwsException(); + if (awsException != null) { + changeTracker.processException(awsException, "copy"); + throw awsException; + } + CopyResult result = copyOutcome.getCopyResult(); + changeTracker.processResponse(result); + incrementWriteOperations(); + instrumentation.filesCopied(1, size); + return result; }); } @@ -2913,10 +3180,17 @@ private Optional generateSSECustomerKey() { *
    * @param key key written to * @param length total length of file written + * @param eTag eTag of the written object + * @param versionId S3 object versionId of the written object + * @throws MetadataPersistenceException if metadata about the write could + * not be saved to the metadata store and + * fs.s3a.metadatastore.fail.on.write.error=true */ @InterfaceAudience.Private - @Retries.RetryExceptionsSwallowed - void finishedWrite(String key, long length) { + @Retries.RetryTranslated("Except if failOnMetadataWriteError=false, in which" + + " case RetryExceptionsSwallowed") + void finishedWrite(String key, long length, String eTag, String versionId) + throws MetadataPersistenceException { LOG.debug("Finished write to {}, len {}", key, length); Path p = keyToQualifiedPath(key); Preconditions.checkArgument(length >= 0, "content length is negative"); @@ -2925,15 +3199,20 @@ void finishedWrite(String key, long length) { // See note about failure semantics in S3Guard documentation try { if (hasMetadataStore()) { - S3Guard.addAncestors(metadataStore, p, username); + S3Guard.addAncestors(metadataStore, p, username, ttlTimeProvider); S3AFileStatus status = createUploadFileStatus(p, S3AUtils.objectRepresentsDirectory(key, length), length, - getDefaultBlockSize(p), username); - S3Guard.putAndReturn(metadataStore, status, instrumentation); + getDefaultBlockSize(p), username, eTag, versionId); + S3Guard.putAndReturn(metadataStore, status, instrumentation, + ttlTimeProvider); } } catch (IOException e) { - LOG.error("S3Guard: Error updating MetadataStore for write to {}:", - key, e); + if (failOnMetadataWriteError) { + throw new MetadataPersistenceException(p.toString(), e); + } else { + LOG.error("S3Guard: Error updating MetadataStore for write to {}", + p, e); + } instrumentation.errorIgnored(); } } @@ -3296,26 +3575,41 @@ public EtagChecksum getFileChecksum(Path f, final long length) @Retries.OnceTranslated public RemoteIterator listFiles(Path f, boolean recursive) throws FileNotFoundException, IOException { - return innerListFiles(f, recursive, - new Listing.AcceptFilesOnly(qualify(f))); + return toLocatedFileStatusIterator(innerListFiles(f, recursive, + new Listing.AcceptFilesOnly(qualify(f)))); + } + + private static RemoteIterator toLocatedFileStatusIterator( + RemoteIterator iterator) { + return new RemoteIterator() { + @Override + public boolean hasNext() throws IOException { + return iterator.hasNext(); + } + + @Override + public LocatedFileStatus next() throws IOException { + return iterator.next(); + } + }; } @Retries.OnceTranslated - public RemoteIterator listFilesAndEmptyDirectories(Path f, - boolean recursive) throws IOException { + public RemoteIterator listFilesAndEmptyDirectories( + Path f, boolean recursive) throws IOException { return innerListFiles(f, recursive, new Listing.AcceptAllButS3nDirs()); } @Retries.OnceTranslated - private RemoteIterator innerListFiles(Path f, boolean + private RemoteIterator innerListFiles(Path f, boolean recursive, Listing.FileStatusAcceptor acceptor) throws IOException { entryPoint(INVOCATION_LIST_FILES); Path path = qualify(f); LOG.debug("listFiles({}, {})", path, recursive); try { // lookup dir triggers existence check - final FileStatus fileStatus = getFileStatus(path); + final S3AFileStatus fileStatus = (S3AFileStatus) getFileStatus(path); if (fileStatus.isFile()) { // simple case: File LOG.debug("Path is a file"); @@ -3327,7 +3621,7 @@ private RemoteIterator innerListFiles(Path f, boolean String delimiter = recursive ? 
null : "/"; LOG.debug("Requesting all entries under {} with delimiter '{}'", key, delimiter); - final RemoteIterator cachedFilesIterator; + final RemoteIterator cachedFilesIterator; final Set tombstones; if (recursive) { final PathMetadata pm = metadataStore.get(path, true); @@ -3399,52 +3693,55 @@ public RemoteIterator listLocatedStatus(final Path f, entryPoint(INVOCATION_LIST_LOCATED_STATUS); Path path = qualify(f); LOG.debug("listLocatedStatus({}, {}", path, filter); - return once("listLocatedStatus", path.toString(), - () -> { - // lookup dir triggers existence check - final FileStatus fileStatus = getFileStatus(path); - if (fileStatus.isFile()) { - // simple case: File - LOG.debug("Path is a file"); - return new Listing.SingleStatusRemoteIterator( - filter.accept(path) ? toLocatedFileStatus(fileStatus) : null); - } else { - // directory: trigger a lookup - final String key = maybeAddTrailingSlash(pathToKey(path)); - final Listing.FileStatusAcceptor acceptor = - new Listing.AcceptAllButSelfAndS3nDirs(path); - DirListingMetadata meta = - S3Guard.listChildrenWithTtl(metadataStore, path, - ttlTimeProvider); - final RemoteIterator cachedFileStatusIterator = - listing.createProvidedFileStatusIterator( - S3Guard.dirMetaToStatuses(meta), filter, acceptor); - return (allowAuthoritative && meta != null - && meta.isAuthoritative()) - ? listing.createLocatedFileStatusIterator( - cachedFileStatusIterator) - : listing.createLocatedFileStatusIterator( - listing.createFileStatusListingIterator(path, - createListObjectsRequest(key, "/"), - filter, - acceptor, - cachedFileStatusIterator)); - } - }); + RemoteIterator iterator = + once("listLocatedStatus", path.toString(), + () -> { + // lookup dir triggers existence check + final S3AFileStatus fileStatus = + (S3AFileStatus) getFileStatus(path); + if (fileStatus.isFile()) { + // simple case: File + LOG.debug("Path is a file"); + return new Listing.SingleStatusRemoteIterator( + filter.accept(path) ? toLocatedFileStatus(fileStatus) : null); + } else { + // directory: trigger a lookup + final String key = maybeAddTrailingSlash(pathToKey(path)); + final Listing.FileStatusAcceptor acceptor = + new Listing.AcceptAllButSelfAndS3nDirs(path); + DirListingMetadata meta = + S3Guard.listChildrenWithTtl(metadataStore, path, + ttlTimeProvider); + final RemoteIterator cachedFileStatusIterator = + listing.createProvidedFileStatusIterator( + S3Guard.dirMetaToStatuses(meta), filter, acceptor); + return (allowAuthoritative && meta != null + && meta.isAuthoritative()) + ? listing.createLocatedFileStatusIterator( + cachedFileStatusIterator) + : listing.createLocatedFileStatusIterator( + listing.createFileStatusListingIterator(path, + createListObjectsRequest(key, "/"), + filter, + acceptor, + cachedFileStatusIterator)); + } + }); + return toLocatedFileStatusIterator(iterator); } /** - * Build a {@link LocatedFileStatus} from a {@link FileStatus} instance. + * Build a {@link S3ALocatedFileStatus} from a {@link FileStatus} instance. * @param status file status * @return a located status with block locations set up from this FS. * @throws IOException IO Problems. */ - LocatedFileStatus toLocatedFileStatus(FileStatus status) + S3ALocatedFileStatus toLocatedFileStatus(S3AFileStatus status) throws IOException { - return new LocatedFileStatus(status, + return new S3ALocatedFileStatus(status, status.isFile() ? 
getFileBlockLocations(status, 0, status.getLen()) - : null); + : null, status.getETag(), status.getVersionId()); } /** @@ -3549,6 +3846,10 @@ public boolean hasCapability(String capability) { // capability depends on FS configuration return isMagicCommitEnabled(); + case SelectConstants.S3_SELECT_CAPABILITY: + // select is only supported if enabled + return selectBinding.isEnabled(); + default: return false; } @@ -3568,12 +3869,133 @@ public AWSCredentialProviderList shareCredentials(final String purpose) { } @VisibleForTesting - protected S3Guard.ITtlTimeProvider getTtlTimeProvider() { + public ITtlTimeProvider getTtlTimeProvider() { return ttlTimeProvider; } @VisibleForTesting - protected void setTtlTimeProvider(S3Guard.ITtlTimeProvider ttlTimeProvider) { + protected void setTtlTimeProvider(ITtlTimeProvider ttlTimeProvider) { this.ttlTimeProvider = ttlTimeProvider; } + + /** + * This is a proof of concept of a select API. + * Once a proper factory mechanism for opening files is added to the + * FileSystem APIs, this will be deleted without any warning. + * @param source path to source data + * @param expression select expression + * @param options request configuration from the builder. + * @return the stream of the results + * @throws IOException IO failure + */ + @Retries.RetryTranslated + private FSDataInputStream select(final Path source, + final String expression, + final Configuration options) + throws IOException { + entryPoint(OBJECT_SELECT_REQUESTS); + requireSelectSupport(source); + final Path path = makeQualified(source); + // call getFileStatus(), which will look at S3Guard first, + // so the operation will fail if it is not there or S3Guard believes it has + // been deleted. + // validation of the file status are delegated to the binding. + final S3AFileStatus fileStatus = (S3AFileStatus) getFileStatus(path); + + // readahead range can be dynamically set + long ra = options.getLong(READAHEAD_RANGE, readAhead); + S3ObjectAttributes objectAttributes = createObjectAttributes( + path, fileStatus.getETag(), fileStatus.getVersionId()); + S3AReadOpContext readContext = createReadContext(fileStatus, inputPolicy, + changeDetectionPolicy, ra); + + if (!fileStatus.isDirectory()) { + // check that the object metadata lines up with what is expected + // based on the object attributes (which may contain an eTag or + // versionId) from S3Guard + ChangeTracker changeTracker = + new ChangeTracker(uri.toString(), + changeDetectionPolicy, + readContext.instrumentation.newInputStreamStatistics() + .getVersionMismatchCounter(), + objectAttributes); + + // will retry internally if wrong version detected + Invoker readInvoker = readContext.getReadInvoker(); + getObjectMetadata(path, changeTracker, readInvoker, "select"); + } + + // build and execute the request + return selectBinding.select( + readContext, + expression, + options, + generateSSECustomerKey(), + objectAttributes); + } + + /** + * Verify the FS supports S3 Select. + * @param source source file. + * @throws UnsupportedOperationException if not. + */ + private void requireSelectSupport(final Path source) throws + UnsupportedOperationException { + if (!selectBinding.isEnabled()) { + throw new UnsupportedOperationException( + SelectConstants.SELECT_UNSUPPORTED); + } + } + + /** + * Initiate the open or select operation. + * This is invoked from both the FileSystem and FileContext APIs + * @param path path to the file + * @param mandatoryKeys set of options declared as mandatory. + * @param options options set during the build sequence. 
+ * @return a future which will evaluate to the opened/selected file. + * @throws IOException failure to resolve the link. + * @throws PathIOException operation is a select request but S3 select is + * disabled + * @throws IllegalArgumentException unknown mandatory key + */ + @Override + @Retries.RetryTranslated + public CompletableFuture openFileWithOptions( + final Path path, + final Set mandatoryKeys, + final Configuration options, + final int bufferSize) throws IOException { + String sql = options.get(SelectConstants.SELECT_SQL, null); + boolean isSelect = sql != null; + // choice of keys depends on open type + if (isSelect) { + rejectUnknownMandatoryKeys( + mandatoryKeys, + InternalSelectConstants.SELECT_OPTIONS, + "for " + path + " in S3 Select operation"); + } else { + rejectUnknownMandatoryKeys( + mandatoryKeys, + InternalConstants.STANDARD_OPENFILE_KEYS, + "for " + path + " in non-select file I/O"); + } + CompletableFuture result = new CompletableFuture<>(); + if (!isSelect) { + // normal path. + unboundedThreadPool.submit(() -> + LambdaUtils.eval(result, + () -> open(path, Optional.of(options)))); + } else { + // it is a select statement. + // fail fast if the method is not present + requireSelectSupport(path); + // submit the query + unboundedThreadPool.submit(() -> + LambdaUtils.eval(result, + () -> select(path, sql, options))); + } + return result; + } + } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java index f0f33e9dc1e81..0a298b4edac4d 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java @@ -179,8 +179,10 @@ * directory helps prevent unnecessary queries during traversal of an entire * sub-tree. * - * Some mutating operations, notably {@link #deleteSubtree(Path)} and - * {@link #move(Collection, Collection)}, are less efficient with this schema. + * Some mutating operations, notably + * {@link MetadataStore#deleteSubtree(Path, ITtlTimeProvider)} and + * {@link MetadataStore#move(Collection, Collection, ITtlTimeProvider)}, + * are less efficient with this schema. * They require mutating multiple items in the DynamoDB table. * * By default, DynamoDB access is performed within the same AWS region as @@ -446,14 +448,15 @@ private void initDataAccessRetries(Configuration config) { @Override @Retries.RetryTranslated - public void delete(Path path) throws IOException { - innerDelete(path, true); + public void delete(Path path, ITtlTimeProvider ttlTimeProvider) + throws IOException { + innerDelete(path, true, ttlTimeProvider); } @Override @Retries.RetryTranslated public void forgetMetadata(Path path) throws IOException { - innerDelete(path, false); + innerDelete(path, false, null); } /** @@ -462,10 +465,13 @@ public void forgetMetadata(Path path) throws IOException { * There is no check as to whether the entry exists in the table first. * @param path path to delete * @param tombstone flag to create a tombstone marker + * @param ttlTimeProvider The time provider to set last_updated. Must not + * be null if tombstone is true. * @throws IOException I/O error. 
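Returning to the select/openFileWithOptions() path added to S3AFileSystem above: the SQL statement travels as a mandatory option on the openFile() builder. A caller-side sketch, assuming S3 Select is enabled for the bucket and that the option key matches the SelectConstants.SELECT_SQL constant referenced above; the path and query are placeholders:

    // Sketch only: run an S3 Select query and stream back the selected records.
    import java.util.concurrent.CompletableFuture;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class S3SelectExample {
      public static void main(String[] args) throws Exception {
        Path csv = new Path("s3a://example-bucket/logs/2019-09.csv");   // placeholder object
        FileSystem fs = csv.getFileSystem(new Configuration());
        CompletableFuture<FSDataInputStream> future = fs.openFile(csv)
            .must("fs.s3a.select.sql",                                  // assumed SELECT_SQL key
                "SELECT * FROM S3OBJECT s WHERE s._1 = 'ERROR'")
            .build();
        try (FSDataInputStream in = future.get()) {
          byte[] buffer = new byte[8192];
          int read;
          while ((read = in.read(buffer)) != -1) {
            System.out.write(buffer, 0, read);
          }
          System.out.flush();
        }
      }
    }

If select is not enabled for the filesystem, requireSelectSupport() rejects the request with UnsupportedOperationException, as shown above.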
*/ @Retries.RetryTranslated - private void innerDelete(final Path path, boolean tombstone) + private void innerDelete(final Path path, boolean tombstone, + ITtlTimeProvider ttlTimeProvider) throws IOException { checkPath(path); LOG.debug("Deleting from table {} in region {}: {}", @@ -480,8 +486,13 @@ private void innerDelete(final Path path, boolean tombstone) // on that of S3A itself boolean idempotent = S3AFileSystem.DELETE_CONSIDERED_IDEMPOTENT; if (tombstone) { + Preconditions.checkArgument(ttlTimeProvider != null, "ttlTimeProvider " + + "must not be null"); + final PathMetadata pmTombstone = PathMetadata.tombstone(path); + // update the last updated field of record when putting a tombstone + pmTombstone.setLastUpdated(ttlTimeProvider.getNow()); Item item = PathMetadataDynamoDBTranslation.pathMetadataToItem( - new DDBPathMetadata(PathMetadata.tombstone(path))); + new DDBPathMetadata(pmTombstone)); writeOp.retry( "Put tombstone", path.toString(), @@ -499,7 +510,8 @@ private void innerDelete(final Path path, boolean tombstone) @Override @Retries.RetryTranslated - public void deleteSubtree(Path path) throws IOException { + public void deleteSubtree(Path path, ITtlTimeProvider ttlTimeProvider) + throws IOException { checkPath(path); LOG.debug("Deleting subtree from table {} in region {}: {}", tableName, region, path); @@ -512,7 +524,7 @@ public void deleteSubtree(Path path) throws IOException { for (DescendantsIterator desc = new DescendantsIterator(this, meta); desc.hasNext();) { - innerDelete(desc.next().getPath(), true); + innerDelete(desc.next().getPath(), true, ttlTimeProvider); } } @@ -682,7 +694,8 @@ Collection completeAncestry( @Override @Retries.RetryTranslated public void move(Collection pathsToDelete, - Collection pathsToCreate) throws IOException { + Collection pathsToCreate, ITtlTimeProvider ttlTimeProvider) + throws IOException { if (pathsToDelete == null && pathsToCreate == null) { return; } @@ -705,7 +718,11 @@ public void move(Collection pathsToDelete, } if (pathsToDelete != null) { for (Path meta : pathsToDelete) { - newItems.add(new DDBPathMetadata(PathMetadata.tombstone(meta))); + Preconditions.checkArgument(ttlTimeProvider != null, "ttlTimeProvider" + + " must not be null"); + final PathMetadata pmTombstone = PathMetadata.tombstone(meta); + pmTombstone.setLastUpdated(ttlTimeProvider.getNow()); + newItems.add(new DDBPathMetadata(pmTombstone)); } } @@ -976,14 +993,37 @@ public void destroy() throws IOException { } @Retries.RetryTranslated - private ItemCollection expiredFiles(long modTime, - String keyPrefix) throws IOException { - String filterExpression = - "mod_time < :mod_time and begins_with(parent, :parent)"; - String projectionExpression = "parent,child"; - ValueMap map = new ValueMap() - .withLong(":mod_time", modTime) - .withString(":parent", keyPrefix); + private ItemCollection expiredFiles(PruneMode pruneMode, + long cutoff, String keyPrefix) throws IOException { + + String filterExpression; + String projectionExpression; + ValueMap map; + + switch (pruneMode) { + case ALL_BY_MODTIME: + filterExpression = + "mod_time < :mod_time and begins_with(parent, :parent)"; + projectionExpression = "parent,child"; + map = new ValueMap() + .withLong(":mod_time", cutoff) + .withString(":parent", keyPrefix); + break; + case TOMBSTONES_BY_LASTUPDATED: + filterExpression = + "last_updated < :last_updated and begins_with(parent, :parent) " + + "and is_deleted = :is_deleted"; + projectionExpression = "parent,child"; + map = new ValueMap() + .withLong(":last_updated", cutoff) + 
.withString(":parent", keyPrefix) + .withBoolean(":is_deleted", true); + break; + default: + throw new UnsupportedOperationException("Unsupported prune mode: " + + pruneMode); + } + return readOp.retry( "scan", keyPrefix, @@ -993,20 +1033,31 @@ private ItemCollection expiredFiles(long modTime, @Override @Retries.RetryTranslated - public void prune(long modTime) throws IOException { - prune(modTime, "/"); + public void prune(PruneMode pruneMode, long cutoff) throws IOException { + prune(pruneMode, cutoff, "/"); } /** * Prune files, in batches. There's a sleep between each batch. - * @param modTime Oldest modification time to allow + * + * @param pruneMode The mode of operation for the prune For details see + * {@link MetadataStore#prune(PruneMode, long)} + * @param cutoff Oldest modification time to allow * @param keyPrefix The prefix for the keys that should be removed * @throws IOException Any IO/DDB failure. * @throws InterruptedIOException if the prune was interrupted */ @Override @Retries.RetryTranslated - public void prune(long modTime, String keyPrefix) throws IOException { + public void prune(PruneMode pruneMode, long cutoff, String keyPrefix) + throws IOException { + final ItemCollection items = + expiredFiles(pruneMode, cutoff, keyPrefix); + innerPrune(items); + } + + private void innerPrune(ItemCollection items) + throws IOException { int itemCount = 0; try { Collection deletionBatch = @@ -1016,7 +1067,7 @@ public void prune(long modTime, String keyPrefix) throws IOException { S3GUARD_DDB_BACKGROUND_SLEEP_MSEC_DEFAULT, TimeUnit.MILLISECONDS); Set parentPathSet = new HashSet<>(); - for (Item item : expiredFiles(modTime, keyPrefix)) { + for (Item item : items) { DDBPathMetadata md = PathMetadataDynamoDBTranslation .itemToPathMetadata(item, username); Path path = md.getFileStatus().getPath(); diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/ITtlTimeProvider.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/ITtlTimeProvider.java new file mode 100644 index 0000000000000..daee6211b41d9 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/ITtlTimeProvider.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.s3guard; + +/** + * This interface is defined for handling TTL expiry of metadata in S3Guard. + * + * TTL can be tested by implementing this interface and setting is as + * {@code S3Guard.ttlTimeProvider}. By doing this, getNow() can return any + * value preferred and flaky tests could be avoided. By default getNow() + * returns the EPOCH in runtime. 
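As that comment notes, tests can supply their own time source. A minimal illustrative stub of the two-method contract (getNow() and getMetadataTtl(), declared just below); the class and field names are invented for the example:

    // Sketch only: a controllable clock so TTL expiry can be exercised without sleeping.
    import org.apache.hadoop.fs.s3a.s3guard.ITtlTimeProvider;

    public class FakeTtlTimeProvider implements ITtlTimeProvider {
      private long now;
      private final long metadataTtl;

      public FakeTtlTimeProvider(long startTimeMillis, long metadataTtlMillis) {
        this.now = startTimeMillis;
        this.metadataTtl = metadataTtlMillis;
      }

      /** Advance the fake clock, e.g. beyond the TTL to force expiry. */
      public void tick(long millis) {
        now += millis;
      }

      @Override
      public long getNow() {
        return now;
      }

      @Override
      public long getMetadataTtl() {
        return metadataTtl;
      }
    }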
+ * + * Time is measured in milliseconds, + */ +public interface ITtlTimeProvider { + long getNow(); + long getMetadataTtl(); +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/LocalMetadataStore.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/LocalMetadataStore.java index 1a7f02896c635..2f7fec6cbb731 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/LocalMetadataStore.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/LocalMetadataStore.java @@ -111,32 +111,34 @@ public String toString() { } @Override - public void delete(Path p) throws IOException { - doDelete(p, false, true); + public void delete(Path p, ITtlTimeProvider ttlTimeProvider) + throws IOException { + doDelete(p, false, true, ttlTimeProvider); } @Override public void forgetMetadata(Path p) throws IOException { - doDelete(p, false, false); + doDelete(p, false, false, null); } @Override - public void deleteSubtree(Path path) throws IOException { - doDelete(path, true, true); + public void deleteSubtree(Path path, ITtlTimeProvider ttlTimeProvider) + throws IOException { + doDelete(path, true, true, ttlTimeProvider); } - private synchronized void doDelete(Path p, boolean recursive, boolean - tombstone) { + private synchronized void doDelete(Path p, boolean recursive, + boolean tombstone, ITtlTimeProvider ttlTimeProvider) { Path path = standardize(p); // Delete entry from file cache, then from cached parent directory, if any - deleteCacheEntries(path, tombstone); + deleteCacheEntries(path, tombstone, ttlTimeProvider); if (recursive) { // Remove all entries that have this dir as path prefix. - deleteEntryByAncestor(path, localCache, tombstone); + deleteEntryByAncestor(path, localCache, tombstone, ttlTimeProvider); } } @@ -190,7 +192,8 @@ public synchronized DirListingMetadata listChildren(Path p) throws @Override public void move(Collection pathsToDelete, - Collection pathsToCreate) throws IOException { + Collection pathsToCreate, + ITtlTimeProvider ttlTimeProvider) throws IOException { LOG.info("Move {} to {}", pathsToDelete, pathsToCreate); Preconditions.checkNotNull(pathsToDelete, "pathsToDelete is null"); @@ -204,7 +207,7 @@ public void move(Collection pathsToDelete, // 1. Delete pathsToDelete for (Path meta : pathsToDelete) { LOG.debug("move: deleting metadata {}", meta); - delete(meta); + delete(meta, ttlTimeProvider); } // 2. 
Create new destination path metadata @@ -322,18 +325,19 @@ public void destroy() throws IOException { } @Override - public void prune(long modTime) throws IOException{ - prune(modTime, ""); + public void prune(PruneMode pruneMode, long cutoff) throws IOException{ + prune(pruneMode, cutoff, ""); } @Override - public synchronized void prune(long modTime, String keyPrefix) { + public synchronized void prune(PruneMode pruneMode, long cutoff, + String keyPrefix) { // prune files // filter path_metadata (files), filter expired, remove expired localCache.asMap().entrySet().stream() .filter(entry -> entry.getValue().hasPathMeta()) - .filter(entry -> expired( - entry.getValue().getFileMeta().getFileStatus(), modTime, keyPrefix)) + .filter(entry -> expired(pruneMode, + entry.getValue().getFileMeta(), cutoff, keyPrefix)) .forEach(entry -> localCache.invalidate(entry.getKey())); @@ -348,28 +352,37 @@ public synchronized void prune(long modTime, String keyPrefix) { Collection newChildren = new LinkedList<>(); for (PathMetadata child : oldChildren) { - FileStatus status = child.getFileStatus(); - if (!expired(status, modTime, keyPrefix)) { + if (!expired(pruneMode, child, cutoff, keyPrefix)) { newChildren.add(child); } } - if (newChildren.size() != oldChildren.size()) { - DirListingMetadata dlm = - new DirListingMetadata(path, newChildren, false); - localCache.put(path, new LocalMetadataEntry(dlm)); - if (!path.isRoot()) { - DirListingMetadata parent = getDirListingMeta(path.getParent()); - if (parent != null) { - parent.setAuthoritative(false); - } - } - } + removeAuthoritativeFromParent(path, oldChildren, newChildren); }); } - private boolean expired(FileStatus status, long expiry, String keyPrefix) { + private void removeAuthoritativeFromParent(Path path, + Collection oldChildren, + Collection newChildren) { + if (newChildren.size() != oldChildren.size()) { + DirListingMetadata dlm = + new DirListingMetadata(path, newChildren, false); + localCache.put(path, new LocalMetadataEntry(dlm)); + if (!path.isRoot()) { + DirListingMetadata parent = getDirListingMeta(path.getParent()); + if (parent != null) { + parent.setAuthoritative(false); + } + } + } + } + + private boolean expired(PruneMode pruneMode, PathMetadata metadata, + long cutoff, String keyPrefix) { + final S3AFileStatus status = metadata.getFileStatus(); + final URI statusUri = status.getPath().toUri(); + // remove the protocol from path string to be able to compare - String bucket = status.getPath().toUri().getHost(); + String bucket = statusUri.getHost(); String statusTranslatedPath = ""; if(bucket != null && !bucket.isEmpty()){ // if there's a bucket, (well defined host in Uri) the pathToParentKey @@ -379,18 +392,33 @@ private boolean expired(FileStatus status, long expiry, String keyPrefix) { } else { // if there's no bucket in the path the pathToParentKey will fail, so // this is the fallback to get the path from status - statusTranslatedPath = status.getPath().toUri().getPath(); + statusTranslatedPath = statusUri.getPath(); + } + + boolean expired; + switch (pruneMode) { + case ALL_BY_MODTIME: + // Note: S3 doesn't track modification time on directories, so for + // consistency with the DynamoDB implementation we ignore that here + expired = status.getModificationTime() < cutoff && !status.isDirectory() + && statusTranslatedPath.startsWith(keyPrefix); + break; + case TOMBSTONES_BY_LASTUPDATED: + expired = metadata.getLastUpdated() < cutoff && metadata.isDeleted() + && statusTranslatedPath.startsWith(keyPrefix); + break; + default: + throw 
new UnsupportedOperationException("Unsupported prune mode: " + + pruneMode); } - // Note: S3 doesn't track modification time on directories, so for - // consistency with the DynamoDB implementation we ignore that here - return status.getModificationTime() < expiry && !status.isDirectory() - && statusTranslatedPath.startsWith(keyPrefix); + return expired; } @VisibleForTesting static void deleteEntryByAncestor(Path ancestor, - Cache cache, boolean tombstone) { + Cache cache, boolean tombstone, + ITtlTimeProvider ttlTimeProvider) { cache.asMap().entrySet().stream() .filter(entry -> isAncestorOf(ancestor, entry.getKey())) @@ -400,7 +428,9 @@ static void deleteEntryByAncestor(Path ancestor, if(meta.hasDirMeta()){ cache.invalidate(path); } else if(tombstone && meta.hasPathMeta()){ - meta.setPathMetadata(PathMetadata.tombstone(path)); + final PathMetadata pmTombstone = PathMetadata.tombstone(path); + pmTombstone.setLastUpdated(ttlTimeProvider.getNow()); + meta.setPathMetadata(pmTombstone); } else { cache.invalidate(path); } @@ -424,7 +454,8 @@ private static boolean isAncestorOf(Path ancestor, Path f) { * Update fileCache and dirCache to reflect deletion of file 'f'. Call with * lock held. */ - private void deleteCacheEntries(Path path, boolean tombstone) { + private void deleteCacheEntries(Path path, boolean tombstone, + ITtlTimeProvider ttlTimeProvider) { LocalMetadataEntry entry = localCache.getIfPresent(path); // If there's no entry, delete should silently succeed // (based on MetadataStoreTestBase#testDeleteNonExisting) @@ -438,6 +469,7 @@ private void deleteCacheEntries(Path path, boolean tombstone) { if(entry.hasPathMeta()){ if (tombstone) { PathMetadata pmd = PathMetadata.tombstone(path); + pmd.setLastUpdated(ttlTimeProvider.getNow()); entry.setPathMetadata(pmd); } else { entry.setPathMetadata(null); @@ -464,6 +496,7 @@ private void deleteCacheEntries(Path path, boolean tombstone) { LOG.debug("removing parent's entry for {} ", path); if (tombstone) { dir.markDeleted(path); + dir.setLastUpdated(ttlTimeProvider.getNow()); } else { dir.remove(path); } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStore.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStore.java index 46f8dd3e51c72..cb50d6a7b2b78 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStore.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStore.java @@ -62,16 +62,23 @@ public interface MetadataStore extends Closeable { * Deletes exactly one path, leaving a tombstone to prevent lingering, * inconsistent copies of it from being listed. * + * Deleting an entry with a tombstone needs a + * {@link org.apache.hadoop.fs.s3a.s3guard.S3Guard.TtlTimeProvider} because + * the lastUpdated field of the record has to be updated to
<i>now</i>
    . + * * @param path the path to delete + * @param ttlTimeProvider the time provider to set last_updated. Must not + * be null. * @throws IOException if there is an error */ - void delete(Path path) throws IOException; + void delete(Path path, ITtlTimeProvider ttlTimeProvider) + throws IOException; /** * Removes the record of exactly one path. Does not leave a tombstone (see - * {@link MetadataStore#delete(Path)}. It is currently intended for testing - * only, and a need to use it as part of normal FileSystem usage is not - * anticipated. + * {@link MetadataStore#delete(Path, ITtlTimeProvider)}. It is currently + * intended for testing only, and a need to use it as part of normal + * FileSystem usage is not anticipated. * * @param path the path to delete * @throws IOException if there is an error @@ -87,10 +94,17 @@ public interface MetadataStore extends Closeable { * implementations must also update any stored {@code DirListingMetadata} * objects which track the parent of this file. * + * Deleting a subtree with a tombstone needs a + * {@link org.apache.hadoop.fs.s3a.s3guard.S3Guard.TtlTimeProvider} because + * the lastUpdated field of all records have to be updated to
<i>now</i>
    . + * * @param path the root of the sub-tree to delete + * @param ttlTimeProvider the time provider to set last_updated. Must not + * be null. * @throws IOException if there is an error */ - void deleteSubtree(Path path) throws IOException; + void deleteSubtree(Path path, ITtlTimeProvider ttlTimeProvider) + throws IOException; /** * Gets metadata for a path. @@ -150,10 +164,13 @@ PathMetadata get(Path path, boolean wantEmptyDirectoryFlag) * @param pathsToCreate Collection of all PathMetadata for the new paths * that were created at the destination of the rename * (). + * @param ttlTimeProvider the time provider to set last_updated. Must not + * be null. * @throws IOException if there is an error */ void move(Collection pathsToDelete, - Collection pathsToCreate) throws IOException; + Collection pathsToCreate, + ITtlTimeProvider ttlTimeProvider) throws IOException; /** * Saves metadata for exactly one path. @@ -210,29 +227,54 @@ void move(Collection pathsToDelete, void destroy() throws IOException; /** - * Clear any metadata older than a specified time from the repository. - * Implementations MUST clear file metadata, and MAY clear directory metadata - * (s3a itself does not track modification time for directories). - * Implementations may also choose to throw UnsupportedOperationException - * istead. Note that modification times should be in UTC, as returned by - * System.currentTimeMillis at the time of modification. + * Prune method with two modes of operation: + *
+ *   <li> + * {@link PruneMode#ALL_BY_MODTIME} + * Clear any metadata older than a specified mod_time from the store. + * Note that this modification time is the S3 modification time from the + * object's metadata - from the object store. + * Implementations MUST clear file metadata, and MAY clear directory + * metadata (s3a itself does not track modification time for directories). + * Implementations may also choose to throw UnsupportedOperationException + * instead. Note that modification times must be in UTC, as returned by + * System.currentTimeMillis at the time of modification. + *   </li>
* - * @param modTime Oldest modification time to allow
+ *   <li> + * {@link PruneMode#TOMBSTONES_BY_LASTUPDATED} + * Clear any tombstone updated earlier than a specified time from the + * store. Note that this last_updated is the time when the metadata + * entry was last updated and maintained by the metadata store. + * Implementations MUST clear file metadata, and MAY clear directory + * metadata (s3a itself does not track modification time for directories). + * Implementations may also choose to throw UnsupportedOperationException + * instead. Note that last_updated must be in UTC, as returned by + * System.currentTimeMillis at the time of modification. + *   </li>
    + * + * @param pruneMode + * @param cutoff Oldest time to allow (UTC) * @throws IOException if there is an error * @throws UnsupportedOperationException if not implemented */ - void prune(long modTime) throws IOException, UnsupportedOperationException; + void prune(PruneMode pruneMode, long cutoff) throws IOException, + UnsupportedOperationException; /** - * Same as {@link MetadataStore#prune(long)}, but with an additional - * keyPrefix parameter to filter the pruned keys with a prefix. + * Same as {@link MetadataStore#prune(PruneMode, long)}, but with an + * additional keyPrefix parameter to filter the pruned keys with a prefix. * - * @param modTime Oldest modification time to allow + * @param pruneMode + * @param cutoff Oldest time to allow (UTC) * @param keyPrefix The prefix for the keys that should be removed * @throws IOException if there is an error * @throws UnsupportedOperationException if not implemented */ - void prune(long modTime, String keyPrefix) + void prune(PruneMode pruneMode, long cutoff, String keyPrefix) throws IOException, UnsupportedOperationException; /** @@ -250,4 +292,13 @@ void prune(long modTime, String keyPrefix) * @throws IOException if there is an error */ void updateParameters(Map parameters) throws IOException; + + /** + * Modes of operation for prune. + * For details see {@link MetadataStore#prune(PruneMode, long)} + */ + enum PruneMode { + ALL_BY_MODTIME, + TOMBSTONES_BY_LASTUPDATED + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/NullMetadataStore.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/NullMetadataStore.java index 04704e7ea73d7..1472ef1a2219f 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/NullMetadataStore.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/NullMetadataStore.java @@ -47,7 +47,8 @@ public void close() throws IOException { } @Override - public void delete(Path path) throws IOException { + public void delete(Path path, ITtlTimeProvider ttlTimeProvider) + throws IOException { } @Override @@ -55,7 +56,8 @@ public void forgetMetadata(Path path) throws IOException { } @Override - public void deleteSubtree(Path path) throws IOException { + public void deleteSubtree(Path path, ITtlTimeProvider ttlTimeProvider) + throws IOException { } @Override @@ -76,7 +78,8 @@ public DirListingMetadata listChildren(Path path) throws IOException { @Override public void move(Collection pathsToDelete, - Collection pathsToCreate) throws IOException { + Collection pathsToCreate, + ITtlTimeProvider ttlTimeProvider) throws IOException { } @Override @@ -96,11 +99,11 @@ public void destroy() throws IOException { } @Override - public void prune(long modTime) { + public void prune(PruneMode pruneMode, long cutoff) { } @Override - public void prune(long modTime, String keyPrefix) { + public void prune(PruneMode pruneMode, long cutoff, String keyPrefix) { } @Override diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3Guard.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3Guard.java index cc55951869abf..933a01ced5f4c 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3Guard.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3Guard.java @@ -24,7 +24,13 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; +import java.util.Map; +import java.util.Objects; import 
java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; @@ -38,11 +44,14 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.s3a.Retries; +import org.apache.hadoop.fs.s3a.Retries.RetryTranslated; import org.apache.hadoop.fs.s3a.S3AFileStatus; import org.apache.hadoop.fs.s3a.S3AInstrumentation; import org.apache.hadoop.fs.s3a.Tristate; import org.apache.hadoop.util.ReflectionUtils; +import static org.apache.hadoop.fs.s3a.Constants.DEFAULT_METADATASTORE_METADATA_TTL; +import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_METADATA_TTL; import static org.apache.hadoop.fs.s3a.Constants.S3_METADATA_STORE_IMPL; import static org.apache.hadoop.fs.s3a.Statistic.S3GUARD_METADATASTORE_PUT_PATH_LATENCY; import static org.apache.hadoop.fs.s3a.Statistic.S3GUARD_METADATASTORE_PUT_PATH_REQUEST; @@ -65,7 +74,7 @@ public final class S3Guard { static final Class S3GUARD_DDB_CLIENT_FACTORY_IMPL_DEFAULT = DynamoDBClientFactory.DefaultDynamoDBClientFactory.class; - private static final FileStatus[] EMPTY_LISTING = new FileStatus[0]; + private static final S3AFileStatus[] EMPTY_LISTING = new S3AFileStatus[0]; // Utility class. All static functions. private S3Guard() { } @@ -139,14 +148,17 @@ static Class getMetadataStoreClass( * @param ms MetadataStore to {@code put()} into. * @param status status to store * @param instrumentation instrumentation of the s3a file system + * @param timeProvider Time provider to use when writing entries * @return The same status as passed in * @throws IOException if metadata store update failed */ + @RetryTranslated public static S3AFileStatus putAndReturn(MetadataStore ms, S3AFileStatus status, - S3AInstrumentation instrumentation) throws IOException { + S3AInstrumentation instrumentation, + ITtlTimeProvider timeProvider) throws IOException { long startTimeNano = System.nanoTime(); - ms.put(new PathMetadata(status)); + S3Guard.putWithTtl(ms, new PathMetadata(status), timeProvider); instrumentation.addValueToQuantiles(S3GUARD_METADATASTORE_PUT_PATH_LATENCY, (System.nanoTime() - startTimeNano)); instrumentation.incrementCounter(S3GUARD_METADATASTORE_PUT_PATH_REQUEST, 1); @@ -160,7 +172,7 @@ public static S3AFileStatus putAndReturn(MetadataStore ms, * @param dirMeta directory listing -may be null * @return a possibly-empty array of file status entries */ - public static FileStatus[] dirMetaToStatuses(DirListingMetadata dirMeta) { + public static S3AFileStatus[] dirMetaToStatuses(DirListingMetadata dirMeta) { if (dirMeta == null) { return EMPTY_LISTING; } @@ -174,7 +186,7 @@ public static FileStatus[] dirMetaToStatuses(DirListingMetadata dirMeta) { } } - return statuses.toArray(new FileStatus[0]); + return statuses.toArray(new S3AFileStatus[0]); } /** @@ -184,17 +196,22 @@ public static FileStatus[] dirMetaToStatuses(DirListingMetadata dirMeta) { * * Also update the MetadataStore to reflect the resulting directory listing. * + * In not authoritative case: update file metadata if mod_time in listing + * of a file is greater then what is currently in the ms + * * @param ms MetadataStore to use. * @param path path to directory * @param backingStatuses Directory listing from the backing store. * @param dirMeta Directory listing from MetadataStore. May be null. 
* @param isAuthoritative State of authoritative mode + * @param timeProvider Time provider to use when updating entries * @return Final result of directory listing. * @throws IOException if metadata store update failed */ public static FileStatus[] dirListingUnion(MetadataStore ms, Path path, - List backingStatuses, DirListingMetadata dirMeta, - boolean isAuthoritative) throws IOException { + List backingStatuses, DirListingMetadata dirMeta, + boolean isAuthoritative, ITtlTimeProvider timeProvider) + throws IOException { // Fast-path for NullMetadataStore if (isNullMetadataStore(ms)) { @@ -217,13 +234,26 @@ public static FileStatus[] dirListingUnion(MetadataStore ms, Path path, // Since the authoritative case is already handled outside this function, // we will basically start with the set of directory entries in the // DirListingMetadata, and add any that only exist in the backingStatuses. - boolean changed = false; - for (FileStatus s : backingStatuses) { + final Map dirMetaMap = dirMeta.getListing().stream() + .collect(Collectors.toMap( + pm -> pm.getFileStatus().getPath(), PathMetadata::getFileStatus) + ); + + for (S3AFileStatus s : backingStatuses) { if (deleted.contains(s.getPath())) { continue; } + if (!isAuthoritative){ + FileStatus status = dirMetaMap.get(s.getPath()); + if (status != null + && s.getModificationTime() > status.getModificationTime()) { + LOG.debug("Update ms with newer metadata of: {}", status); + S3Guard.putWithTtl(ms, new PathMetadata(s), timeProvider); + } + } + // Minor race condition here. Multiple threads could add to this // mutable DirListingMetadata. Since it is backed by a // ConcurrentHashMap, the last put() wins. @@ -241,7 +271,7 @@ public static FileStatus[] dirListingUnion(MetadataStore ms, Path path, if (changed && isAuthoritative) { dirMeta.setAuthoritative(true); // This is the full directory contents - ms.put(dirMeta); + S3Guard.putWithTtl(ms, dirMeta, timeProvider); } return dirMetaToStatuses(dirMeta); @@ -278,11 +308,12 @@ public static boolean isNullMetadataStore(MetadataStore ms) { * dir. * @param owner Hadoop user name. * @param authoritative Whether to mark new directories as authoritative. + * @param timeProvider Time provider for testing. */ @Deprecated @Retries.OnceExceptionsSwallowed public static void makeDirsOrdered(MetadataStore ms, List dirs, - String owner, boolean authoritative) { + String owner, boolean authoritative, ITtlTimeProvider timeProvider) { if (dirs == null) { return; } @@ -300,7 +331,7 @@ public static void makeDirsOrdered(MetadataStore ms, List dirs, * [/a/b/file0, /a/b/file1, /a/b/file2, /a/b/file3], isAuthoritative = * true */ - FileStatus prevStatus = null; + S3AFileStatus prevStatus = null; // Use new batched put to reduce round trips. 
List pathMetas = new ArrayList<>(dirs.size()); @@ -311,8 +342,8 @@ public static void makeDirsOrdered(MetadataStore ms, List dirs, boolean isLeaf = (prevStatus == null); Path f = dirs.get(i); assertQualified(f); - FileStatus status = - createUploadFileStatus(f, true, 0, 0, owner); + S3AFileStatus status = + createUploadFileStatus(f, true, 0, 0, owner, null, null); // We only need to put a DirListingMetadata if we are setting // authoritative bit @@ -326,7 +357,7 @@ public static void makeDirsOrdered(MetadataStore ms, List dirs, children.add(new PathMetadata(prevStatus)); } dirMeta = new DirListingMetadata(f, children, authoritative); - ms.put(dirMeta); + S3Guard.putWithTtl(ms, dirMeta, timeProvider); } pathMetas.add(new PathMetadata(status)); @@ -334,7 +365,7 @@ public static void makeDirsOrdered(MetadataStore ms, List dirs, } // Batched put - ms.put(pathMetas); + S3Guard.putWithTtl(ms, pathMetas, timeProvider); } catch (IOException ioe) { LOG.error("MetadataStore#put() failure:", ioe); } @@ -360,7 +391,8 @@ public static void addMoveDir(MetadataStore ms, Collection srcPaths, } assertQualified(srcPath, dstPath); - FileStatus dstStatus = createUploadFileStatus(dstPath, true, 0, 0, owner); + S3AFileStatus dstStatus = createUploadFileStatus(dstPath, true, 0, + 0, owner, null, null); addMoveStatus(srcPaths, dstMetas, srcPath, dstStatus); } @@ -376,16 +408,18 @@ public static void addMoveDir(MetadataStore ms, Collection srcPaths, * @param size length of file moved * @param blockSize blocksize to associate with destination file * @param owner file owner to use in created records + * @param eTag the s3 object eTag of file moved + * @param versionId the s3 object versionId of file moved */ public static void addMoveFile(MetadataStore ms, Collection srcPaths, Collection dstMetas, Path srcPath, Path dstPath, - long size, long blockSize, String owner) { + long size, long blockSize, String owner, String eTag, String versionId) { if (isNullMetadataStore(ms)) { return; } assertQualified(srcPath, dstPath); - FileStatus dstStatus = createUploadFileStatus(dstPath, false, - size, blockSize, owner); + S3AFileStatus dstStatus = createUploadFileStatus(dstPath, false, + size, blockSize, owner, eTag, versionId); addMoveStatus(srcPaths, dstMetas, srcPath, dstStatus); } @@ -436,28 +470,28 @@ public static void addMoveAncestors(MetadataStore ms, } public static void addAncestors(MetadataStore metadataStore, - Path qualifiedPath, String username) throws IOException { + Path qualifiedPath, String username, ITtlTimeProvider timeProvider) + throws IOException { Collection newDirs = new ArrayList<>(); Path parent = qualifiedPath.getParent(); while (!parent.isRoot()) { PathMetadata directory = metadataStore.get(parent); if (directory == null || directory.isDeleted()) { - FileStatus status = new FileStatus(0, true, 1, 0, 0, 0, null, username, - null, parent); - PathMetadata meta = new PathMetadata(status, Tristate.FALSE, false); + S3AFileStatus s3aStatus = new S3AFileStatus(Tristate.FALSE, parent, username); + PathMetadata meta = new PathMetadata(s3aStatus, Tristate.FALSE, false); newDirs.add(meta); } else { break; } parent = parent.getParent(); } - metadataStore.put(newDirs); + S3Guard.putWithTtl(metadataStore, newDirs, timeProvider); } private static void addMoveStatus(Collection srcPaths, Collection dstMetas, Path srcPath, - FileStatus dstStatus) { + S3AFileStatus dstStatus) { srcPaths.add(srcPath); dstMetas.add(new PathMetadata(dstStatus)); } @@ -487,4 +521,138 @@ public static void assertQualified(Path...paths) { 
assertQualified(path); } } + + /** + * Runtime implementation for TTL Time Provider interface. + */ + public static class TtlTimeProvider implements ITtlTimeProvider { + private long authoritativeDirTtl; + + public TtlTimeProvider(long authoritativeDirTtl) { + this.authoritativeDirTtl = authoritativeDirTtl; + } + + public TtlTimeProvider(Configuration conf) { + this.authoritativeDirTtl = + conf.getTimeDuration(METADATASTORE_METADATA_TTL, + DEFAULT_METADATASTORE_METADATA_TTL, TimeUnit.MILLISECONDS); + } + + @Override + public long getNow() { + return System.currentTimeMillis(); + } + + @Override public long getMetadataTtl() { + return authoritativeDirTtl; + } + + @Override + public boolean equals(final Object o) { + if (this == o) { return true; } + if (o == null || getClass() != o.getClass()) { return false; } + final TtlTimeProvider that = (TtlTimeProvider) o; + return authoritativeDirTtl == that.authoritativeDirTtl; + } + + @Override + public int hashCode() { + return Objects.hash(authoritativeDirTtl); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder( + "TtlTimeProvider{"); + sb.append("authoritativeDirTtl=").append(authoritativeDirTtl); + sb.append(" millis}"); + return sb.toString(); + } + } + + public static void putWithTtl(MetadataStore ms, DirListingMetadata dirMeta, + ITtlTimeProvider timeProvider) + throws IOException { + dirMeta.setLastUpdated(timeProvider.getNow()); + dirMeta.getListing() + .forEach(pm -> pm.setLastUpdated(timeProvider.getNow())); + ms.put(dirMeta); + } + + public static void putWithTtl(MetadataStore ms, PathMetadata fileMeta, + @Nullable ITtlTimeProvider timeProvider) throws IOException { + if (timeProvider != null) { + fileMeta.setLastUpdated(timeProvider.getNow()); + } else { + LOG.debug("timeProvider is null, put {} without setting last_updated", + fileMeta); + } + ms.put(fileMeta); + } + + public static void putWithTtl(MetadataStore ms, + Collection fileMetas, + @Nullable ITtlTimeProvider timeProvider) + throws IOException { + if (timeProvider != null) { + final long now = timeProvider.getNow(); + fileMetas.forEach(fileMeta -> fileMeta.setLastUpdated(now)); + } else { + LOG.debug("timeProvider is null, put {} without setting last_updated", + fileMetas); + } + ms.put(fileMetas); + } + + public static PathMetadata getWithTtl(MetadataStore ms, Path path, + @Nullable ITtlTimeProvider timeProvider) throws IOException { + final PathMetadata pathMetadata = ms.get(path); + // if timeProvider is null let's return with what the ms has + if (timeProvider == null) { + LOG.debug("timeProvider is null, returning pathMetadata as is"); + return pathMetadata; + } + + long ttl = timeProvider.getMetadataTtl(); + + if (pathMetadata != null) { + // Special case: the pathmetadata's last updated is 0. This can happen + // eg. 
with an old db using this implementation + if (pathMetadata.getLastUpdated() == 0) { + LOG.debug("PathMetadata TTL for {} is 0, so it will be returned as " + + "not expired."); + return pathMetadata; + } + + if (!pathMetadata.isExpired(ttl, timeProvider.getNow())) { + return pathMetadata; + } else { + LOG.debug("PathMetadata TTl for {} is expired in metadata store.", + path); + return null; + } + } + + return null; + } + + public static DirListingMetadata listChildrenWithTtl(MetadataStore ms, + Path path, @Nullable ITtlTimeProvider timeProvider) + throws IOException { + DirListingMetadata dlm = ms.listChildren(path); + + if (timeProvider == null) { + LOG.debug("timeProvider is null, returning DirListingMetadata as is"); + return dlm; + } + + long ttl = timeProvider.getMetadataTtl(); + + if (dlm != null && dlm.isAuthoritative() + && dlm.isExpired(ttl, timeProvider.getNow())) { + dlm.setAuthoritative(false); + } + return dlm; + } + } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java index 9df912424c886..318094adb3935 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java @@ -705,7 +705,8 @@ private void putParentsIfNotPresent(FileStatus f) throws IOException { } FileStatus dir = DynamoDBMetadataStore.makeDirStatus(parent, f.getOwner()); - getStore().put(new PathMetadata(dir)); + S3Guard.putWithTtl(getStore(), new PathMetadata(dir), + getFilesystem().getTtlTimeProvider()); dirCache.add(parent); parent = parent.getParent(); } @@ -737,7 +738,8 @@ private long importDir(FileStatus status) throws IOException { located.getOwner()); } putParentsIfNotPresent(child); - getStore().put(new PathMetadata(child)); + S3Guard.putWithTtl(getStore(), new PathMetadata(child), + getFilesystem().getTtlTimeProvider()); items++; } return items; @@ -1068,7 +1070,8 @@ public int run(String[] args, PrintStream out) throws } try { - getStore().prune(divide, keyPrefix); + getStore().prune(MetadataStore.PruneMode.ALL_BY_MODTIME, divide, + keyPrefix); } catch (UnsupportedOperationException e){ errorln("Prune operation not supported in metadata store."); } diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/s3guard.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/s3guard.md index a8c8d6cd2cdbd..337fc95b6c703 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/s3guard.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/s3guard.md @@ -98,15 +98,19 @@ This offers no metadata storage, and effectively disables S3Guard. More settings will may be added in the future. Currently the only Metadata Store-independent setting, besides the -implementation class above, is the *allow authoritative* flag. +implementation class above, are the *allow authoritative* and *fail-on-error* +flags. + +#### Allow Authoritative The _authoritative_ expression in S3Guard is present in two different layers, for two different reasons: * Authoritative S3Guard * S3Guard can be set as authoritative, which means that an S3A client will - avoid round-trips to S3 when **getting directory listings** if there is a fully - cached version of the directory stored in metadata store. 
+ avoid round-trips to S3 when **getting file metadata**, and **getting + directory listings** if there is a fully cached version of the directory + stored in metadata store. * This mode can be set as a configuration property `fs.s3a.metadatastore.authoritative` * All interactions with the S3 bucket(s) must be through S3A clients sharing @@ -128,16 +132,20 @@ two different reasons: More on Authoritative S3Guard: -* It is not treating the MetadataStore (e.g. dynamodb) as the source of truth - in general. -* It is the ability to short-circuit S3 list objects and serve listings from -the MetadataStore in some circumstances. +* This setting is about treating the MetadataStore (e.g. dynamodb) as the source + of truth in general, and also to short-circuit S3 list objects and serve + listings from the MetadataStore in some circumstances. +* For S3A to skip S3's get object metadata, and serve it directly from the +MetadataStore, the following things must all be true: + 1. The S3A client is configured to allow MetadataStore to be authoritative + source of a file metadata (`fs.s3a.metadatastore.authoritative=true`). + 1. The MetadataStore has the file metadata for the path stored in it. * For S3A to skip S3's list objects on some path, and serve it directly from the MetadataStore, the following things must all be true: 1. The MetadataStore implementation persists the bit `DirListingMetadata.isAuthorititative` set when calling `MetadataStore#put` (`DirListingMetadata`) - 1. The S3A client is configured to allow metadatastore to be authoritative + 1. The S3A client is configured to allow MetadataStore to be authoritative source of a directory listing (`fs.s3a.metadatastore.authoritative=true`). 1. The MetadataStore has a **full listing for path** stored in it. This only happens if the FS client (s3a) explicitly has stored a full directory @@ -154,8 +162,9 @@ recommended that you leave the default setting here: ``` -Note that a MetadataStore MAY persist this bit. (Not MUST). Setting this to `true` is currently an experimental feature. +Note that a MetadataStore MAY persist this bit in the directory listings. (Not +MUST). Note that if this is set to true, it may exacerbate or persist existing race conditions around multiple concurrent modifications and listings of a given @@ -165,6 +174,57 @@ In particular: **If the Metadata Store is declared as authoritative, all interactions with the S3 bucket(s) must be through S3A clients sharing the same Metadata Store** +It can be configured how long a directory listing in the MetadataStore is +considered as authoritative. If `((lastUpdated + ttl) <= now)` is false, the +directory listing is no longer considered authoritative, so the flag will be +removed on `S3AFileSystem` level. + +```xml + + fs.s3a.metadatastore.metadata.ttl + 15m + +``` + +#### Fail on Error + +By default, S3AFileSystem write operations will fail when updates to +S3Guard metadata fail. S3AFileSystem first writes the file to S3 and then +updates the metadata in S3Guard. If the metadata write fails, +`MetadataPersistenceException` is thrown. The file in S3 **is not** rolled +back. + +If the write operation cannot be programmatically retried, the S3Guard metadata +for the given file can be corrected with a command like the following: + +```bash +hadoop s3guard import [-meta URI] s3a://my-bucket/file-with-bad-metadata +``` + +Programmatic retries of the original operation would require overwrite=true. +Suppose the original operation was FileSystem.create(myFile, overwrite=false). 
+If this operation failed with `MetadataPersistenceException` a repeat of the +same operation would result in `FileAlreadyExistsException` since the original +operation successfully created the file in S3 and only failed in writing the +metadata to S3Guard. + +Metadata update failures can be downgraded to ERROR logging instead of exception +by setting the following configuration: + +```xml + + fs.s3a.metadatastore.fail.on.write.error + false + +``` + +Setting this false is dangerous as it could result in the type of issue S3Guard +is designed to avoid. For example, a reader may see an inconsistent listing +after a recent write since S3Guard may not contain metadata about the recently +written file due to a metadata write error. + +As with the default setting, the new/updated file is still in S3 and **is not** +rolled back. The S3Guard metadata is likely to be out of sync. ### 3. Configure the Metadata Store. @@ -191,9 +251,11 @@ this sets the table name to `my-ddb-table-name` ``` -It is good to share a table across multiple buckets for multiple reasons. +It is good to share a table across multiple buckets for multiple reasons, +especially if you are *not* using on-demand DynamoDB tables, and instead +prepaying for provisioned I/O capacity. -1. You are billed for the I/O capacity allocated to the table, +1. You are billed for the provisioned I/O capacity allocated to the table, *even when the table is not used*. Sharing capacity can reduce costs. 1. You can share the "provision burden" across the buckets. That is, rather @@ -205,8 +267,13 @@ lower. S3Guard, because there is only one table to review and configure in the AWS management console. +1. When you don't grant the permission to create DynamoDB tables to users. +A single pre-created table for all buckets avoids the needs for an administrator +to create one for every bucket. + When wouldn't you want to share a table? +1. When you are using on-demand DynamoDB and want to keep each table isolated. 1. When you do explicitly want to provision I/O capacity to a specific bucket and table, isolated from others. @@ -255,53 +322,82 @@ Next, you can choose whether or not the table will be automatically created ``` -### 7. If creating a table: Set your DynamoDB IO Capacity +### 7. If creating a table: Choose your billing mode (and perhaps I/O Capacity) + +Next, you need to decide whether to use On-Demand DynamoDB and its +pay-per-request billing (recommended), or to explicitly request a +provisioned IO capacity. -Next, you need to set the DynamoDB read and write throughput requirements you -expect to need for your cluster. Setting higher values will cost you more -money. *Note* that these settings only affect table creation when +Before AWS offered pay-per-request billing, the sole billing mechanism, +was "provisioned capacity". This mechanism requires you to choose +the DynamoDB read and write throughput requirements you +expect to need for your expected uses of the S3Guard table. +Setting higher values cost you more money -*even when the table was idle* + *Note* that these settings only affect table creation when `fs.s3a.s3guard.ddb.table.create` is enabled. To change the throughput for an existing table, use the AWS console or CLI tool. For more details on DynamoDB capacity units, see the AWS page on [Capacity Unit Calculations](http://docs.aws.amazon.com/amazondynamodb/latest/developerguide/WorkingWithTables.html#CapacityUnitCalculations). 
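To make the capacity options concrete, the following minimal sketch (the helper class is hypothetical and not part of this patch) sets the same keys programmatically through a Hadoop `Configuration`. The values mirror the XML defaults quoted below, where `0` selects an on-demand table and positive values select provisioned capacity:

```java
import org.apache.hadoop.conf.Configuration;

/** Hypothetical helper, for illustration only; it is not part of this patch. */
public final class S3GuardTableCapacity {

  private S3GuardTableCapacity() {
  }

  /**
   * Request an on-demand (pay-per-request) table: both capacities are zero,
   * matching the defaults shown in the XML below. Only relevant when
   * fs.s3a.s3guard.ddb.table.create is enabled.
   */
  public static Configuration onDemand(Configuration base) {
    Configuration conf = new Configuration(base);
    conf.setBoolean("fs.s3a.s3guard.ddb.table.create", true);
    conf.setInt("fs.s3a.s3guard.ddb.table.capacity.read", 0);
    conf.setInt("fs.s3a.s3guard.ddb.table.capacity.write", 0);
    return conf;
  }

  /**
   * Request a provisioned-capacity table by supplying positive read and
   * write capacity units; these only take effect at table creation time.
   */
  public static Configuration provisioned(Configuration base,
      int readUnits, int writeUnits) {
    Configuration conf = new Configuration(base);
    conf.setBoolean("fs.s3a.s3guard.ddb.table.create", true);
    conf.setInt("fs.s3a.s3guard.ddb.table.capacity.read", readUnits);
    conf.setInt("fs.s3a.s3guard.ddb.table.capacity.write", writeUnits);
    return conf;
  }
}
```

As the property descriptions below note, these values only matter when the table is created; the capacity of an existing provisioned table is changed with `s3guard set-capacity` or the AWS console.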
-The charges are incurred per hour for the life of the table, *even when the +Provisioned IO capacity is billed per hour for the life of the table, *even when the table and the underlying S3 buckets are not being used*. -There are also charges incurred for data storage and for data IO outside of the +There are also charges incurred for data storage and for data I/O outside of the region of the DynamoDB instance. S3Guard only stores metadata in DynamoDB: path names and summary details of objects —the actual data is stored in S3, so billed at S3 rates. +With provisioned I/O capacity, attempting to perform more I/O than the capacity +requested throttles the operation and may result in operations failing. +Larger I/O capacities cost more. + +With the introduction of On-Demand DynamoDB, you can now avoid paying for +provisioned capacity by creating an on-demand table. +With an on-demand table you are not throttled if your DynamoDB requests exceed +any pre-provisioned limit, nor do you pay per hour even when a table is idle. + +You do, however, pay more per DynamoDB operation. +Even so, the ability to cope with sudden bursts of read or write requests, combined +with the elimination of charges for idle tables, suit the use patterns made of +S3Guard tables by applications interacting with S3. That is: periods when the table +is rarely used, with intermittent high-load operations when directory trees +are scanned (query planning and similar), or updated (rename and delete operations). + + +We recommending using On-Demand DynamoDB for maximum performance in operations +such as query planning, and lowest cost when S3 buckets are not being accessed. + +This is the default, as configured in the default configuration options. + ```xml fs.s3a.s3guard.ddb.table.capacity.read - 500 + 0 Provisioned throughput requirements for read operations in terms of capacity - units for the DynamoDB table. This config value will only be used when - creating a new DynamoDB table, though later you can manually provision by - increasing or decreasing read capacity as needed for existing tables. - See DynamoDB documents for more information. + units for the DynamoDB table. This config value will only be used when + creating a new DynamoDB table. + If set to 0 (the default), new tables are created with "per-request" capacity. + If a positive integer is provided for this and the write capacity, then + a table with "provisioned capacity" will be created. + You can change the capacity of an existing provisioned-capacity table + through the "s3guard set-capacity" command. fs.s3a.s3guard.ddb.table.capacity.write - 100 + 0 Provisioned throughput requirements for write operations in terms of - capacity units for the DynamoDB table. Refer to related config - fs.s3a.s3guard.ddb.table.capacity.read before usage. + capacity units for the DynamoDB table. + If set to 0 (the default), new tables are created with "per-request" capacity. + Refer to related configuration option fs.s3a.s3guard.ddb.table.capacity.read ``` -Attempting to perform more IO than the capacity requested throttles the -IO, and may result in operations failing. Larger IO capacities cost more. -We recommending using small read and write capacities when initially experimenting -with S3Guard. ## Authenticating with S3Guard @@ -309,9 +405,7 @@ The DynamoDB metadata store takes advantage of the fact that the DynamoDB service uses the same authentication mechanisms as S3. S3Guard gets all its credentials from the S3A client that is using it. 
-All existing S3 authentication mechanisms can be used, except for one -exception. Credentials placed in URIs are not supported for S3Guard, for security -reasons. +All existing S3 authentication mechanisms can be used. ## Per-bucket S3Guard configuration @@ -385,6 +479,48 @@ for two buckets with a shared table, while disabling it for the public bucket. +### Out-of-band operations with S3Guard + +We call an operation out-of-band (OOB) when a bucket is used by a client with + S3Guard, and another client runs a write (e.g delete, move, rename, + overwrite) operation on an object in the same bucket without S3Guard. + +The definition of behaviour in S3AFileSystem/MetadataStore in case of OOBs: +* A client with S3Guard +* B client without S3Guard (Directly to S3) + + +* OOB OVERWRITE, authoritative mode: + * A client creates F1 file + * B client overwrites F1 file with F2 (Same, or different file size) + * A client's getFileStatus returns F1 metadata + +* OOB OVERWRITE, NOT authoritative mode: + * A client creates F1 file + * B client overwrites F1 file with F2 (Same, or different file size) + * A client's getFileStatus returns F2 metadata. In not authoritative mode we + check S3 for the file. If the modification time of the file in S3 is greater + than in S3Guard, we can safely return the S3 file metadata and update the + cache. + +* OOB DELETE, authoritative mode: + * A client creates F file + * B client deletes F file + * A client's getFileStatus returns that the file is still there + +* OOB DELETE, NOT authoritative mode: + * A client creates F file + * B client deletes F file + * A client's getFileStatus returns that the file is still there + +Note: authoritative and NOT authoritative mode behaves the same at +OOB DELETE case. + +The behaviour in case of getting directory listings: +* File status in metadata store gets updated during the listing the same way +as in getFileStatus. + + ## S3Guard Command Line Interface (CLI) Note that in some cases an AWS region or `s3a://` URI can be provided. @@ -410,7 +546,13 @@ hadoop s3guard init -meta URI ( -region REGION | s3a://BUCKET ) Creates and initializes an empty metadata store. A DynamoDB metadata store can be initialized with additional parameters -pertaining to [Provisioned Throughput](http://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.ProvisionedThroughput.html): +pertaining to capacity. + +If these values are both zero, then an on-demand DynamoDB table is created; +if positive values then they set the +[Provisioned Throughput](http://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.ProvisionedThroughput.html) +of the table. + ```bash [-write PROVISIONED_WRITES] [-read PROVISIONED_READS] @@ -426,29 +568,31 @@ metadata store will be created with these tags in DynamoDB. Example 1 ```bash -hadoop s3guard init -meta dynamodb://ireland-team -write 5 -read 10 s3a://ireland-1 +hadoop s3guard init -meta dynamodb://ireland-team -write 0 -read 0 s3a://ireland-1 ``` -Creates a table "ireland-team" with a capacity of 5 for writes, 10 for reads, -in the same location as the bucket "ireland-1". +Creates an on-demand table "ireland-team", +in the same location as the S3 bucket "ireland-1". 
Example 2 ```bash -hadoop s3guard init -meta dynamodb://ireland-team -region eu-west-1 +hadoop s3guard init -meta dynamodb://ireland-team -region eu-west-1 --read 0 --write 0 ``` Creates a table "ireland-team" in the region "eu-west-1.amazonaws.com" - Example 3 ```bash hadoop s3guard init -meta dynamodb://ireland-team -tag tag1=first;tag2=second; ``` -Creates a table "ireland-team" with tags "first" and "second". +Creates a table "ireland-team" with tags "first" and "second". The read and +write capacity will be those of the site configuration's values of +`fs.s3a.s3guard.ddb.table.capacity.read` and `fs.s3a.s3guard.ddb.table.capacity.write`; +if these are both zero then it will be an on-demand table. ### Import a bucket: `s3guard import` @@ -486,7 +630,7 @@ hadoop s3guard diff s3a://ireland-1 Prints and optionally checks the s3guard and encryption status of a bucket. ```bash -hadoop s3guard bucket-info [ -guarded ] [-unguarded] [-auth] [-nonauth] [-magic] [-encryption ENCRYPTION] s3a://BUCKET +hadoop s3guard bucket-info [-guarded] [-unguarded] [-auth] [-nonauth] [-magic] [-encryption ENCRYPTION] s3a://BUCKET ``` Options @@ -494,7 +638,7 @@ Options | argument | meaning | |-----------|-------------| | `-guarded` | Require S3Guard to be enabled | -| `-unguarded` | Require S3Guard to be disabled | +| `-unguarded` | Force S3Guard to be disabled | | `-auth` | Require the S3Guard mode to be "authoritative" | | `-nonauth` | Require the S3Guard mode to be "non-authoritative" | | `-magic` | Require the S3 filesystem to be support the "magic" committer | @@ -519,6 +663,7 @@ Filesystem s3a://ireland-1 is using S3Guard with store DynamoDBMetadataStore{reg Authoritative S3Guard: fs.s3a.metadatastore.authoritative=false Metadata Store Diagnostics: ARN=arn:aws:dynamodb:eu-west-1:00000000:table/ireland-1 + billing-mode=provisioned description=S3Guard metadata store in DynamoDB name=ireland-1 read-capacity=20 @@ -541,9 +686,13 @@ Metadata Store Diagnostics: The "magic" committer is supported S3A Client + Signing Algorithm: fs.s3a.signing-algorithm=(unset) Endpoint: fs.s3a.endpoint=s3-eu-west-1.amazonaws.com Encryption: fs.s3a.server-side-encryption-algorithm=none Input seek policy: fs.s3a.experimental.input.fadvise=normal + Change Detection Source: fs.s3a.change.detection.source=etag + Change Detection Mode: fs.s3a.change.detection.mode=server +Delegation token support is disabled ``` This listing includes all the information about the table supplied from @@ -679,9 +828,10 @@ Delete all entries more than 90 minutes old from the table "ireland-team" in the region "eu-west-1". -### Tune the IO capacity of the DynamoDB Table, `s3guard set-capacity` +### Tune the I/O capacity of the DynamoDB Table, `s3guard set-capacity` -Alter the read and/or write capacity of a s3guard table. +Alter the read and/or write capacity of a s3guard table created with provisioned +I/O capacity. ```bash hadoop s3guard set-capacity [--read UNIT] [--write UNIT] ( -region REGION | s3a://BUCKET ) @@ -689,6 +839,9 @@ hadoop s3guard set-capacity [--read UNIT] [--write UNIT] ( -region REGION | s3a: The `--read` and `--write` units are those of `s3guard init`. +It cannot be used to change the I/O capacity of an on demand table (there is +no need), and nor can it be used to convert an existing table to being +on-demand. For that the AWS console must be used. Example @@ -705,6 +858,7 @@ and 20 write. 
(This is a low number, incidentally) 2017-08-30 16:21:26,344 [main] INFO s3guard.DynamoDBMetadataStore (DynamoDBMetadataStore.java:updateParameters(1086)) - Changing capacity of table to read: 20, write: 20 Metadata Store Diagnostics: ARN=arn:aws:dynamodb:eu-west-1:00000000000:table/ireland-1 + billing-mode=provisioned description=S3Guard metadata store in DynamoDB name=ireland-1 read-capacity=25 @@ -726,6 +880,7 @@ write values match that already in use. 2017-08-30 16:24:35,337 [main] INFO s3guard.DynamoDBMetadataStore (DynamoDBMetadataStore.java:updateParameters(1090)) - Table capacity unchanged at read: 20, write: 20 Metadata Store Diagnostics: ARN=arn:aws:dynamodb:eu-west-1:00000000000:table/ireland-1 + billing-mode=provisioned description=S3Guard metadata store in DynamoDB name=ireland-1 read-capacity=20 @@ -821,12 +976,12 @@ are only made after successful file creation, deletion and rename, the store is *unlikely* to get out of sync, it is still something which merits more testing before it could be considered reliable. -## Managing DynamoDB IO Capacity +## Managing DynamoDB I/O Capacity -DynamoDB is not only billed on use (data and IO requests), it is billed -on allocated IO Capacity. +Historically, DynamoDB has been not only billed on use (data and I/O requests) +-but on provisioned I/O Capacity. -When an application makes more requests than +With Provisioned IO, when an application makes more requests than the allocated capacity permits, the request is rejected; it is up to the calling application to detect when it is being so throttled and react. S3Guard does this, but as a result: when the client is being @@ -834,7 +989,7 @@ throttled, operations are slower. This capacity throttling is averaged over a few minutes: a briefly overloaded table will not be throttled, but the rate cannot be sustained. -The load on a table isvisible in the AWS console: go to the +The load on a table is visible in the AWS console: go to the DynamoDB page for the table and select the "metrics" tab. If the graphs of throttled read or write requests show that a lot of throttling has taken place, then there is not @@ -895,22 +1050,220 @@ If operations, especially directory operations, are slow, check the AWS console. It is also possible to set up AWS alerts for capacity limits being exceeded. +### On-Demand Dynamo Capacity + +[Amazon DynamoDB On-Demand](https://aws.amazon.com/blogs/aws/amazon-dynamodb-on-demand-no-capacity-planning-and-pay-per-request-pricing/) +removes the need to pre-allocate I/O capacity for S3Guard tables. +Instead the caller is _only_ charged per I/O Operation. + +* There are no SLA capacity guarantees. This is generally not an issue +for S3Guard applications. +* There's no explicit limit on I/O capacity, so operations which make +heavy use of S3Guard tables (for example: SQL query planning) do not +get throttled. +* You are charged more per DynamoDB API call, in exchange for paying nothing +when you are not interacting with DynamoDB. +* There's no way put a limit on the I/O; you may unintentionally run up +large bills through sustained heavy load. +* The `s3guard set-capacity` command fails: it does not make sense any more. + +When idle, S3Guard tables are only billed for the data stored, not for +any unused capacity. For this reason, there is no performance benefit +from sharing a single S3Guard table across multiple buckets. 
+ +*Creating a S3Guard Table with On-Demand Tables* + +The default settings for S3Guard are to create on-demand tables; this +can also be done explicitly in the `s3guard init` command by setting the +read and write capacities to zero. + + +```bash +hadoop s3guard init -meta dynamodb://ireland-team -write 0 -read 0 s3a://ireland-1 +``` + +*Enabling DynamoDB On-Demand for an existing S3Guard table* + +You cannot currently convert an existing S3Guard table to being an on-demand +table through the `s3guard` command. + +It can be done through the AWS console or [the CLI](https://docs.aws.amazon.com/cli/latest/reference/dynamodb/update-table.html). +From the Web console or the command line, switch the billing to pay-per-request. + +Once enabled, the read and write capacities of the table listed in the +`hadoop s3guard bucket-info` command become "0", and the "billing-mode" +attribute changes to "per-request": + +``` +> hadoop s3guard bucket-info s3a://example-bucket/ + +Filesystem s3a://example-bucket +Location: eu-west-1 +Filesystem s3a://example-bucket is using S3Guard with store + DynamoDBMetadataStore{region=eu-west-1, tableName=example-bucket, + tableArn=arn:aws:dynamodb:eu-west-1:11111122223333:table/example-bucket} +Authoritative S3Guard: fs.s3a.metadatastore.authoritative=false +Metadata Store Diagnostics: + ARN=arn:aws:dynamodb:eu-west-1:11111122223333:table/example-bucket + billing-mode=per-request + description=S3Guard metadata store in DynamoDB + name=example-bucket + persist.authoritative.bit=true + read-capacity=0 + region=eu-west-1 + retryPolicy=ExponentialBackoffRetry(maxRetries=9, sleepTime=250 MILLISECONDS) + size=66797 + status=ACTIVE + table={AttributeDefinitions: + [{AttributeName: child,AttributeType: S}, + {AttributeName: parent,AttributeType: S}], + TableName: example-bucket, + KeySchema: [{ + AttributeName: parent,KeyType: HASH}, + {AttributeName: child,KeyType: RANGE}], + TableStatus: ACTIVE, + CreationDateTime: Thu Oct 11 18:51:14 BST 2018, + ProvisionedThroughput: { + LastIncreaseDateTime: Tue Oct 30 16:48:45 GMT 2018, + LastDecreaseDateTime: Tue Oct 30 18:00:03 GMT 2018, + NumberOfDecreasesToday: 0, + ReadCapacityUnits: 0, + WriteCapacityUnits: 0}, + TableSizeBytes: 66797, + ItemCount: 415, + TableArn: arn:aws:dynamodb:eu-west-1:11111122223333:table/example-bucket, + TableId: a7b0728a-f008-4260-b2a0-aaaaabbbbb,} + write-capacity=0 +The "magic" committer is supported +``` + +### Autoscaling (Provisioned Capacity) S3Guard tables. + [DynamoDB Auto Scaling](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/AutoScaling.html) can automatically increase and decrease the allocated capacity. -This is good for keeping capacity high when needed, but avoiding large -bills when it is not. + +Before DynamoDB On-Demand was introduced, autoscaling was the sole form +of dynamic scaling. Experiments with S3Guard and DynamoDB Auto Scaling have shown that any Auto Scaling operation will only take place after callers have been throttled for a period of time. The clients will still need to be configured to retry when overloaded until any extra capacity is allocated. Furthermore, as this retrying will -block the threads from performing other operations -including more IO, the +block the threads from performing other operations -including more I/O, the the autoscale may not scale fast enough. 
-We recommend experimenting with this, based on usage information collected -from previous days, and and choosing a combination of -retry counts and an interval which allow for the clients to cope with -some throttling, but not to time out other applications. +This is why the DynamoDB On-Demand appears is a better option for +workloads with Hadoop, Spark, Hive and other applications. + +If autoscaling is to be used, we recommend experimenting with the option, +based on usage information collected from previous days, and choosing a +combination of retry counts and an interval which allow for the clients to cope with +some throttling, but not to time-out other applications. + +## Read-After-Overwrite Consistency + +S3Guard provides read-after-overwrite consistency through ETags (default) or +object versioning checked either on the server (default) or client. This works +such that a reader reading a file after an overwrite either sees the new version +of the file or an error. Without S3Guard, new readers may see the original +version. Once S3 reaches eventual consistency, new readers will see the new +version. + +Readers using S3Guard will usually see the new file version, but may +in rare cases see `RemoteFileChangedException` instead. This would occur if +an S3 object read cannot provide the version tracked in S3Guard metadata. + +S3Guard achieves this behavior by storing ETags and object version IDs in the +S3Guard metadata store (e.g. DynamoDB). On opening a file, S3AFileSystem +will look in S3 for the version of the file indicated by the ETag or object +version ID stored in the metadata store. If that version is unavailable, +`RemoteFileChangedException` is thrown. Whether ETag or version ID and +server or client mode is used is determed by the +[fs.s3a.change.detection configuration options](./index.html#Handling_Read-During-Overwrite). + +### No Versioning Metadata Available + +When the first S3AFileSystem clients are upgraded to a version of +`S3AFileSystem` that contains these change tracking features, any existing +S3Guard metadata will not contain ETags or object version IDs. Reads of files +tracked in such S3Guard metadata will access whatever version of the file is +available in S3 at the time of read. Only if the file is subsequently updated +will S3Guard start tracking ETag and object version ID and as such generating +`RemoteFileChangedException` if an inconsistency is detected. + +Similarly, when S3Guard metadata is pruned, S3Guard will no longer be able to +detect an inconsistent read. S3Guard metadata should be retained for at least +as long as the perceived possible read-after-overwrite temporary inconsistency +window. That window is expected to be short, but there are no guarantees so it +is at the administrator's discretion to weigh the risk. + +### Known Limitations + +#### S3 Select + +S3 Select does not provide a capability for server-side ETag or object +version ID qualification. Whether `fs.s3a.change.detection.mode` is "client" or +"server", S3Guard will cause a client-side check of the file version before +opening the file with S3 Select. If the current version does not match the +version tracked in S3Guard, `RemoteFileChangedException` is thrown. + +It is still possible that the S3 Select read will access a different version of +the file, if the visible file version changes between the version check and +the opening of the file. This can happen due to eventual consistency or +an overwrite of the file between the version check and the open of the file. 
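To show where this consistency checking surfaces to applications, here is a minimal sketch (the class and method are hypothetical, not part of this patch) of a read that reacts to the `RemoteFileChangedException` described above:

```java
import java.io.IOException;
import java.net.URI;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3a.RemoteFileChangedException;

/** Hypothetical reader used only to illustrate the failure mode. */
public final class GuardedReadExample {

  private GuardedReadExample() {
  }

  public static byte[] readHead(URI bucket, String key, Configuration conf)
      throws IOException {
    FileSystem fs = FileSystem.get(bucket, conf);
    byte[] buffer = new byte[4096];
    try (FSDataInputStream in = fs.open(new Path(key))) {
      int bytesRead = in.read(buffer);
      return bytesRead <= 0 ? new byte[0] : Arrays.copyOf(buffer, bytesRead);
    } catch (RemoteFileChangedException e) {
      // S3 could not serve the ETag/version ID that S3Guard is tracking:
      // either an out-of-band overwrite, eventual consistency, or metadata
      // that is out of sync. Retrying later or repairing the entry (for
      // example with "s3guard import") are the remedies discussed in this
      // document.
      throw e;
    }
  }
}
```

A real application would normally retry later or report the error; the catch block above exists only to show where the exception is raised and what it means.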
+ +#### Rename + +Rename is implemented via copy in S3. With `fs.s3a.change.detection.mode` set +to "client", a fully reliable mechansim for ensuring the copied content is the expected +content is not possible. This is the case since there isn't necessarily a way +to know the expected ETag or version ID to appear on the object resulting from +the copy. + +Furthermore, if `fs.s3a.change.detection.mode` is "server" and a third-party S3 +implementation is used that doesn't honor the provided ETag or version ID, +S3AFileSystem and S3Guard cannot detect it. + +When `fs.s3.change.detection.mode` is "client", a client-side check +will be performed before the copy to ensure the current version of the file +matches S3Guard metadata. If not, `RemoteFileChangedException` is thrown. +Similar to as discussed with regard to S3 Select, this is not sufficient to +guarantee that same version is the version copied. + +When `fs.s3.change.detection.mode` server, the expected version is also specified +in the underlying S3 `CopyObjectRequest`. As long as the server honors it, the +copied object will be correct. + +All this said, with the defaults of `fs.s3.change.detection.mode` of "server" and +`fs.s3.change.detection.source` of "etag", when working with Amazon's S3, copy should in fact +either copy the expected file version or, in the case of an eventual consistency +anomaly, generate `RemoteFileChangedException`. The same should be true when +`fs.s3.change.detection.source` = "versionid". + +#### Out of Sync Metadata + +The S3Guard version tracking metadata (ETag or object version ID) could become +out of sync with the true current object metadata in S3. For example, S3Guard +is still tracking v1 of some file after v2 has been written. This could occur +for reasons such as a writer writing without utilizing S3Guard and/or +S3AFileSystem or simply due to a write with S3AFileSystem and S3Guard that wrote +successfully to S3, but failed in communication with S3Guard's metadata store +(e.g. DynamoDB). + +If this happens, reads of the affected file(s) will result in +`RemoteFileChangedException` until one of: + +* the S3Guard metadata is corrected out-of-band +* the file is overwritten (causing an S3Guard metadata update) +* the S3Guard metadata is pruned + +The S3Guard metadata for a file can be corrected with the `s3guard import` +command as discussed above. The command can take a file URI instead of a +bucket URI to correct the metadata for a single file. For example: + +```bash +hadoop s3guard import [-meta URI] s3a://my-bucket/file-with-bad-metadata +``` ## Troubleshooting @@ -963,20 +1316,20 @@ Consider increasing your provisioning level with the UpdateTable API. (Service: AmazonDynamoDBv2; Status Code: 400; Error Code: ProvisionedThroughputExceededException; ``` -The IO load of clients of the (shared) DynamoDB table was exceeded. +The I/O load of clients of the (shared) DynamoDB table was exceeded. -1. Increase the capacity of the DynamoDB table. -1. Increase the retry count and/or sleep time of S3Guard on throttle events. -1. Enable capacity autoscaling for the table in the AWS console. +1. Switch to On-Demand Dynamo DB tables (AWS console) +1. Increase the capacity of the DynamoDB table (AWS console or `s3guard set-capacity`)/ +1. Increase the retry count and/or sleep time of S3Guard on throttle events (Hadoop configuration). 
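The last item in the list above mentions raising the retry count and sleep time through Hadoop configuration. The sketch below assumes the S3Guard DynamoDB retry options are named `fs.s3a.s3guard.ddb.max.retries` and `fs.s3a.s3guard.ddb.throttle.retry.interval`; both the key names and the values shown are assumptions to be verified against the `core-default.xml` of the release in use:

```java
import org.apache.hadoop.conf.Configuration;

/**
 * Hypothetical tuning helper; the key names below are assumptions and
 * should be checked against the Hadoop release being deployed.
 */
public final class S3GuardThrottleTuning {

  private S3GuardThrottleTuning() {
  }

  public static Configuration withMoreRetries(Configuration base) {
    Configuration conf = new Configuration(base);
    // Assumed key: number of times an S3Guard DynamoDB operation is retried.
    conf.setInt("fs.s3a.s3guard.ddb.max.retries", 20);
    // Assumed key: pause between retries when the table is throttling.
    conf.set("fs.s3a.s3guard.ddb.throttle.retry.interval", "200ms");
    return conf;
  }
}
```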
### Error `Max retries exceeded` The I/O load of clients of the (shared) DynamoDB table was exceeded, and the number of attempts to retry the operation exceeded the configured amount. +1. Switch to On-Demand Dynamo DB tables (AWS console). 1. Increase the capacity of the DynamoDB table. 1. Increase the retry count and/or sleep time of S3Guard on throttle events. -1. Enable capacity autoscaling for the table in the AWS console. ### Error when running `set-capacity`: `org.apache.hadoop.fs.s3a.AWSServiceThrottledException: ProvisionTable` @@ -992,9 +1345,134 @@ Next decrease can be made at Wednesday, July 25, 2018 9:48:14 PM UTC ``` There's are limit on how often you can change the capacity of an DynamoDB table; -if you call set-capacity too often, it fails. Wait until the after the time indicated +if you call `set-capacity` too often, it fails. Wait until the after the time indicated and try again. +### Error `Invalid region specified` + +``` +java.io.IOException: Invalid region specified "iceland-2": + Region can be configured with fs.s3a.s3guard.ddb.region: + us-gov-west-1, us-east-1, us-east-2, us-west-1, us-west-2, + eu-west-1, eu-west-2, eu-west-3, eu-central-1, ap-south-1, + ap-southeast-1, ap-southeast-2, ap-northeast-1, ap-northeast-2, + sa-east-1, cn-north-1, cn-northwest-1, ca-central-1 + at org.apache.hadoop.fs.s3a.s3guard.DynamoDBClientFactory$DefaultDynamoDBClientFactory.getRegion + at org.apache.hadoop.fs.s3a.s3guard.DynamoDBClientFactory$DefaultDynamoDBClientFactory.createDynamoDBClient +``` + +The region specified in `fs.s3a.s3guard.ddb.region` is invalid. + +### "Neither ReadCapacityUnits nor WriteCapacityUnits can be specified when BillingMode is PAY_PER_REQUEST" + +``` +ValidationException; One or more parameter values were invalid: + Neither ReadCapacityUnits nor WriteCapacityUnits can be specified when + BillingMode is PAY_PER_REQUEST + (Service: AmazonDynamoDBv2; Status Code: 400; Error Code: ValidationException) +``` + +On-Demand DynamoDB tables do not have any fixed capacity -it is an error +to try to change it with the `set-capacity` command. + +### `MetadataPersistenceException` + +A filesystem write operation failed to persist metadata to S3Guard. The file was +successfully written to S3 and now the S3Guard metadata is likely to be out of +sync. + +See [Fail on Error](#fail-on-error) for more detail. + +### Error `RemoteFileChangedException` + +An exception like the following could occur for a couple of reasons: + +* the S3Guard metadata is out of sync with the true S3 metadata. For +example, the S3Guard DynamoDB table is tracking a different ETag than the ETag +shown in the exception. This may suggest the object was updated in S3 without +involvement from S3Guard or there was a transient failure when S3Guard tried to +write to DynamoDB. + +* S3 is exhibiting read-after-overwrite temporary inconsistency. The S3Guard +metadata was updated with a new ETag during a recent write, but the current read +is not seeing that ETag due to S3 eventual consistency. This exception prevents +the reader from an inconsistent read where the reader sees an older version of +the file. + +``` +org.apache.hadoop.fs.s3a.RemoteFileChangedException: open 's3a://my-bucket/test/file.txt': + Change reported by S3 while reading at position 0. 
+ ETag 4e886e26c072fef250cfaf8037675405 was unavailable + at org.apache.hadoop.fs.s3a.impl.ChangeTracker.processResponse(ChangeTracker.java:167) + at org.apache.hadoop.fs.s3a.S3AInputStream.reopen(S3AInputStream.java:207) + at org.apache.hadoop.fs.s3a.S3AInputStream.lambda$lazySeek$1(S3AInputStream.java:355) + at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$2(Invoker.java:195) + at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:109) + at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$3(Invoker.java:265) + at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:322) + at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:261) + at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:193) + at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:215) + at org.apache.hadoop.fs.s3a.S3AInputStream.lazySeek(S3AInputStream.java:348) + at org.apache.hadoop.fs.s3a.S3AInputStream.read(S3AInputStream.java:381) + at java.io.FilterInputStream.read(FilterInputStream.java:83) +``` + +### Error `AWSClientIOException: copyFile` caused by `NullPointerException` + +The AWS SDK has an [issue](https://github.com/aws/aws-sdk-java/issues/1644) +where it will throw a relatively generic `AmazonClientException` caused by +`NullPointerException` when copying a file and specifying a precondition +that cannot be met. This can bubble up from `S3AFileSystem.rename()`. It +suggests that the file in S3 is inconsistent with the metadata in S3Guard. + +``` +org.apache.hadoop.fs.s3a.AWSClientIOException: copyFile(test/rename-eventually2.dat, test/dest2.dat) on test/rename-eventually2.dat: com.amazonaws.AmazonClientException: Unable to complete transfer: null: Unable to complete transfer: null + at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:201) + at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:111) + at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$4(Invoker.java:314) + at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:406) + at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:310) + at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:285) + at org.apache.hadoop.fs.s3a.S3AFileSystem.copyFile(S3AFileSystem.java:3034) + at org.apache.hadoop.fs.s3a.S3AFileSystem.innerRename(S3AFileSystem.java:1258) + at org.apache.hadoop.fs.s3a.S3AFileSystem.rename(S3AFileSystem.java:1119) + at org.apache.hadoop.fs.s3a.ITestS3ARemoteFileChanged.lambda$testRenameEventuallyConsistentFile2$6(ITestS3ARemoteFileChanged.java:556) + at org.apache.hadoop.test.LambdaTestUtils.intercept(LambdaTestUtils.java:498) + at org.apache.hadoop.fs.s3a.ITestS3ARemoteFileChanged.testRenameEventuallyConsistentFile2(ITestS3ARemoteFileChanged.java:554) + at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.lang.reflect.Method.invoke(Method.java:498) + at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50) + at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) + at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47) + at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) + at org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26) + at org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27) + at 
org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:55) + at org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:298) + at org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:292) + at java.util.concurrent.FutureTask.run(FutureTask.java:266) + at java.lang.Thread.run(Thread.java:748) +Caused by: com.amazonaws.AmazonClientException: Unable to complete transfer: null + at com.amazonaws.services.s3.transfer.internal.AbstractTransfer.unwrapExecutionException(AbstractTransfer.java:286) + at com.amazonaws.services.s3.transfer.internal.AbstractTransfer.rethrowExecutionException(AbstractTransfer.java:265) + at com.amazonaws.services.s3.transfer.internal.CopyImpl.waitForCopyResult(CopyImpl.java:67) + at org.apache.hadoop.fs.s3a.impl.CopyOutcome.waitForCopy(CopyOutcome.java:72) + at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$copyFile$14(S3AFileSystem.java:3047) + at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:109) + ... 25 more +Caused by: java.lang.NullPointerException + at com.amazonaws.services.s3.transfer.internal.CopyCallable.copyInOneChunk(CopyCallable.java:154) + at com.amazonaws.services.s3.transfer.internal.CopyCallable.call(CopyCallable.java:134) + at com.amazonaws.services.s3.transfer.internal.CopyMonitor.call(CopyMonitor.java:132) + at com.amazonaws.services.s3.transfer.internal.CopyMonitor.call(CopyMonitor.java:43) + at java.util.concurrent.FutureTask.run(FutureTask.java:266) + at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) + at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) + ... 1 more +``` ## Other Topics diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardOutOfBandOperations.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardOutOfBandOperations.java new file mode 100644 index 0000000000000..2af9a0ab73ef2 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardOutOfBandOperations.java @@ -0,0 +1,975 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3a; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.util.Arrays; +import java.util.Collection; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; + +import org.junit.Assume; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy.Source; +import org.apache.hadoop.fs.s3a.s3guard.DirListingMetadata; +import org.apache.hadoop.fs.s3a.s3guard.MetadataStore; +import org.apache.hadoop.fs.s3a.s3guard.PathMetadata; +import org.apache.hadoop.fs.s3a.s3guard.ITtlTimeProvider; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.RemoteIterator; + +import static org.apache.hadoop.fs.contract.ContractTestUtils.touch; +import static org.apache.hadoop.fs.s3a.Constants.DEFAULT_METADATASTORE_METADATA_TTL; +import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_METADATA_TTL; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBaseAndBucketOverrides; +import static org.apache.hadoop.test.LambdaTestUtils.eventually; +import static org.junit.Assume.assumeTrue; +import static org.apache.hadoop.fs.contract.ContractTestUtils.readBytesToString; +import static org.apache.hadoop.fs.contract.ContractTestUtils.writeTextFile; +import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_AUTHORITATIVE; +import static org.apache.hadoop.fs.s3a.Constants.S3_METADATA_STORE_IMPL; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.metadataStorePersistsAuthoritativeBit; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +/** + * + * This integration test is for documenting and defining how S3Guard should + * behave in case of out-of-band (OOB) operations. + *
+ * The behavior in the case of S3AFileSystem.getFileStatus is the following:
+ * Client A: uses S3Guard
+ * Client B: no S3Guard (talks directly to S3)
+ *
+ * * OOB OVERWRITE, authoritative mode:
+ * ** Client A creates file F1
+ * ** Client B overwrites file F1 with F2 (same or different file size)
+ * ** Client A's getFileStatus returns F1 metadata
+ *
+ * * OOB OVERWRITE, NOT authoritative mode:
+ * ** Client A creates file F1
+ * ** Client B overwrites file F1 with F2 (same or different file size)
+ * ** Client A's getFileStatus returns F2 metadata. In non-authoritative
+ * mode we check S3 for the file. If the modification time of the file in S3
+ * is greater than the one in S3Guard, we can safely return the S3 file
+ * metadata and update the cache.
+ *
+ * * OOB DELETE, authoritative mode:
+ * ** Client A creates file F
+ * ** Client B deletes file F
+ * ** Client A's getFileStatus returns that the file is still there
+ *
+ * * OOB DELETE, NOT authoritative mode:
+ * ** Client A creates file F
+ * ** Client B deletes file F
+ * ** Client A's getFileStatus returns that the file is still there
+ *
+ * As you can see, authoritative and NOT authoritative modes behave the
+ * same way in the OOB DELETE case.
+ *
+ * The behavior in the case of S3AFileSystem.listStatus is the following:
+ * * File status in the metadata store gets updated during the listing (in
+ * S3Guard.dirListingUnion) the same way as in getFileStatus.
+ *
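+ * As an illustrative sketch only (the path and file contents below are
+ * hypothetical, and the snippet reuses this test's guardedFs/rawFS fields
+ * and the writeTextFile helper), the OOB OVERWRITE case can be observed
+ * roughly like this:
+ * <pre>{@code
+ *   Path p = path("oob-example");
+ *   writeTextFile(guardedFs, p, "v1", true);      // client A writes F1
+ *   writeTextFile(rawFS, p, "v2-longer", true);   // client B overwrites OOB
+ *   long len = guardedFs.getFileStatus(p).getLen();
+ *   // authoritative mode: len still matches "v1" (stale metadata served)
+ *   // non-authoritative mode: len eventually matches "v2-longer"
+ * }</pre>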
    + */ +@RunWith(Parameterized.class) +public class ITestS3GuardOutOfBandOperations extends AbstractS3ATestBase { + + public static final int TIMESTAMP_SLEEP = 2000; + + public static final int STABILIZATION_TIME = 20_000; + + public static final int PROBE_INTERVAL_MILLIS = 500; + + private S3AFileSystem guardedFs; + private S3AFileSystem rawFS; + + private MetadataStore realMs; + + /** + * Is the "real" FS Authoritative. + */ + private final boolean authoritative; + + /** + * Test array for parameterized test runs. + * @return a list of parameter tuples. + */ + @Parameterized.Parameters(name="auth={0}") + public static Collection params() { + return Arrays.asList(new Object[][]{ + {true}, {false} + }); + } + + public ITestS3GuardOutOfBandOperations(final boolean authoritative) { + this.authoritative = authoritative; + } + + /** + * By changing the method name, the thread name is changed and + * so you can see in the logs which mode is being tested. + * @return a string to use for the thread namer. + */ + @Override + protected String getMethodName() { + return super.getMethodName() + + (authoritative ? "-auth" : "-nonauth"); + } + + @Before + public void setup() throws Exception { + super.setup(); + S3AFileSystem fs = getFileSystem(); + // These test will fail if no ms + assumeTrue("FS needs to have a metadatastore.", + fs.hasMetadataStore()); + assumeTrue("Metadatastore should persist authoritative bit", + metadataStorePersistsAuthoritativeBit(fs.getMetadataStore())); + + // This test setup shares a single metadata store across instances, + // so that test runs with a local FS work. + // but this needs to be addressed in teardown, where the guarded fs + // needs to be detached from the metadata store before it is closed, + realMs = fs.getMetadataStore(); + // now we create a new FS with the auth parameter + guardedFs = createGuardedFS(authoritative); + assertTrue("No S3Guard store for " + guardedFs, + guardedFs.hasMetadataStore()); + assertEquals("Authoritative status in " + guardedFs, + authoritative, guardedFs.hasAuthoritativeMetadataStore()); + + // create raw fs without s3guard + rawFS = createUnguardedFS(); + assertFalse("Raw FS still has S3Guard " + rawFS, + rawFS.hasMetadataStore()); + } + + @Override + public void teardown() throws Exception { + if (guardedFs != null) { + // detach from the (shared) metadata store. + guardedFs.setMetadataStore(new NullMetadataStore()); + // and only then close it. + IOUtils.cleanupWithLogger(LOG, guardedFs); + } + IOUtils.cleanupWithLogger(LOG, rawFS); + super.teardown(); + } + + /** + * Create a new FS which is the same config as the test FS, except + * that it is guarded with the specific authoritative mode. + * @param authoritativeMode mode of the new FS's metastore + * @return the new FS + */ + private S3AFileSystem createGuardedFS(boolean authoritativeMode) + throws Exception { + S3AFileSystem testFS = getFileSystem(); + Configuration config = new Configuration(testFS.getConf()); + URI uri = testFS.getUri(); + + removeBaseAndBucketOverrides(uri.getHost(), config, + METADATASTORE_AUTHORITATIVE, + METADATASTORE_METADATA_TTL); + config.setBoolean(METADATASTORE_AUTHORITATIVE, authoritativeMode); + config.setLong(METADATASTORE_METADATA_TTL, + DEFAULT_METADATASTORE_METADATA_TTL); + final S3AFileSystem gFs = createFS(uri, config); + // set back the same metadata store instance + gFs.setMetadataStore(realMs); + return gFs; + } + + /** + * Create a test filesystem which is always unguarded. + * This filesystem MUST be closed in test teardown. 
+ * @return the new FS + */ + private S3AFileSystem createUnguardedFS() throws Exception { + S3AFileSystem testFS = getFileSystem(); + Configuration config = new Configuration(testFS.getConf()); + URI uri = testFS.getUri(); + + removeBaseAndBucketOverrides(uri.getHost(), config, + S3_METADATA_STORE_IMPL); + removeBaseAndBucketOverrides(uri.getHost(), config, + METADATASTORE_AUTHORITATIVE); + return createFS(uri, config); + } + + /** + * Create and initialize a new filesystem. + * This filesystem MUST be closed in test teardown. + * @param uri FS URI + * @param config config. + * @return new instance + * @throws IOException failure + */ + private S3AFileSystem createFS(final URI uri, final Configuration config) + throws IOException { + S3AFileSystem fs2 = new S3AFileSystem(); + fs2.initialize(uri, config); + return fs2; + } + + @Test + public void testSameLengthOverwrite() throws Exception { + String firstText = "hello, world!"; + String secondText = "HELLO, WORLD!"; + overwriteFile(firstText, secondText); + } + + @Test + public void testLongerLengthOverwrite() throws Exception { + String firstText = "Hello, World!"; + String secondText = firstText + " " + firstText; + overwriteFile(firstText, secondText); + } + + @Test + public void testOutOfBandDeletes() throws Exception { + ChangeDetectionPolicy changeDetectionPolicy = + ((S3AFileSystem) getFileSystem()).getChangeDetectionPolicy(); + Assume.assumeFalse("FNF not expected when using a bucket with" + + " object versioning", + changeDetectionPolicy.getSource() == Source.VersionId); + + Path testFileName = path("OutOfBandDelete-" + UUID.randomUUID()); + outOfBandDeletes(testFileName, authoritative); + } + + @Test + public void testListingSameLengthOverwrite() throws Exception { + overwriteFileInListing("THE TEXT", "the text"); + } + + @Test + public void testListingLongerLengthOverwrite() throws Exception { + overwriteFileInListing("THE TEXT", "THE LONGER TEXT"); + } + + @Test + public void testListingDelete() throws Exception { + deleteFileInListing(); + } + + /** + * Tests that tombstone expiry is implemented, so if a file is created raw + * while the tombstone exist in ms for with the same name then S3Guard will + * check S3 for the file. 
+ * + * Seq: create guarded; delete guarded; create raw (same path); read guarded; + * This will fail if no tombstone expiry is set + * + * @throws Exception + */ + @Test + public void testTombstoneExpiryGuardedDeleteRawCreate() throws Exception { + boolean allowAuthoritative = authoritative; + Path testFilePath = path("TEGDRC-" + UUID.randomUUID() + "/file"); + LOG.info("Allow authoritative param: {}", allowAuthoritative); + String originalText = "some test"; + String newText = "the new originalText for test"; + + final ITtlTimeProvider originalTimeProvider = + guardedFs.getTtlTimeProvider(); + try { + final AtomicLong now = new AtomicLong(1); + final AtomicLong metadataTtl = new AtomicLong(1); + + // SET TTL TIME PROVIDER FOR TESTING + ITtlTimeProvider testTimeProvider = + new ITtlTimeProvider() { + @Override public long getNow() { + return now.get(); + } + + @Override public long getMetadataTtl() { + return metadataTtl.get(); + } + }; + guardedFs.setTtlTimeProvider(testTimeProvider); + + // CREATE GUARDED + createAndAwaitFs(guardedFs, testFilePath, originalText); + + // DELETE GUARDED + deleteGuardedTombstoned(guardedFs, testFilePath, now); + + // CREATE RAW + createAndAwaitFs(rawFS, testFilePath, newText); + + // CHECK LISTING - THE FILE SHOULD NOT BE THERE, EVEN IF IT'S CREATED RAW + checkListingDoesNotContainPath(guardedFs, testFilePath); + + // CHANGE TTL SO ENTRY (& TOMBSTONE METADATA) WILL EXPIRE + long willExpire = now.get() + metadataTtl.get() + 1L; + now.set(willExpire); + LOG.info("willExpire: {}, ttlNow: {}; ttlTTL: {}", willExpire, + testTimeProvider.getNow(), testTimeProvider.getMetadataTtl()); + + // READ GUARDED + String newRead = readBytesToString(guardedFs, testFilePath, + newText.length()); + + // CHECK LISTING - THE FILE SHOULD BE THERE, TOMBSTONE EXPIRED + checkListingContainsPath(guardedFs, testFilePath); + + // we can assert that the originalText is the new one, which created raw + LOG.info("Old: {}, New: {}, Read: {}", originalText, newText, newRead); + assertEquals("The text should be modified with a new.", newText, + newRead); + } finally { + guardedFs.delete(testFilePath, true); + guardedFs.setTtlTimeProvider(originalTimeProvider); + } + } + + private void createAndAwaitFs(S3AFileSystem fs, Path testFilePath, + String text) throws Exception { + writeTextFile(fs, testFilePath, text, true); + final FileStatus newStatus = awaitFileStatus(fs, testFilePath); + assertNotNull("Newly created file status should not be null.", newStatus); + } + + private void deleteGuardedTombstoned(S3AFileSystem guarded, + Path testFilePath, AtomicLong now) throws Exception { + guarded.delete(testFilePath, true); + + final PathMetadata metadata = + guarded.getMetadataStore().get(testFilePath); + assertNotNull("Created file metadata should not be null in ms", + metadata); + assertEquals("Created file metadata last_updated should equal with " + + "mocked now", now.get(), metadata.getLastUpdated()); + + intercept(FileNotFoundException.class, testFilePath.toString(), + "This file should throw FNFE when reading through " + + "the guarded fs, and the metadatastore tombstoned the file.", + () -> guarded.getFileStatus(testFilePath)); + } + + /** + * createNonRecursive must fail if the parent directory has been deleted, + * and succeed if the tombstone has expired and the directory has been + * created out of band. 
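+ *
+ * A rough sketch of the expected behaviour (paths are hypothetical; the
+ * snippet reuses this class's createNonRecursive helper and the statically
+ * imported intercept):
+ * <pre>{@code
+ *   // parent was deleted through the guarded FS, tombstone still live:
+ *   intercept(FileNotFoundException.class,
+ *       () -> createNonRecursive(guardedFs, filePath));
+ *   // once the tombstone has expired and the dir exists in S3 again:
+ *   createNonRecursive(guardedFs, filePath);   // succeeds
+ * }</pre>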
+ */ + @Test + public void testCreateNonRecursiveFailsIfParentDeleted() throws Exception { + LOG.info("Authoritative mode: {}", authoritative); + + String dirToDelete = methodName + UUID.randomUUID().toString(); + String fileToTry = dirToDelete + "/theFileToTry"; + + final Path dirPath = path(dirToDelete); + final Path filePath = path(fileToTry); + + // Create a directory with + ITtlTimeProvider mockTimeProvider = mock(ITtlTimeProvider.class); + ITtlTimeProvider originalTimeProvider = guardedFs.getTtlTimeProvider(); + + try { + guardedFs.setTtlTimeProvider(mockTimeProvider); + when(mockTimeProvider.getNow()).thenReturn(100L); + when(mockTimeProvider.getMetadataTtl()).thenReturn(5L); + + // CREATE DIRECTORY + guardedFs.mkdirs(dirPath); + + // DELETE DIRECTORY + guardedFs.delete(dirPath, true); + + // WRITE TO DELETED DIRECTORY - FAIL + intercept(FileNotFoundException.class, + dirToDelete, + "createNonRecursive must fail if the parent directory has been deleted.", + () -> createNonRecursive(guardedFs, filePath)); + + // CREATE THE DIRECTORY RAW + rawFS.mkdirs(dirPath); + awaitFileStatus(rawFS, dirPath); + + // SET TIME SO METADATA EXPIRES + when(mockTimeProvider.getNow()).thenReturn(110L); + + // WRITE TO DELETED DIRECTORY - SUCCESS + createNonRecursive(guardedFs, filePath); + + } finally { + guardedFs.delete(filePath, true); + guardedFs.delete(dirPath, true); + guardedFs.setTtlTimeProvider(originalTimeProvider); + } + } + + /** + * When lastUpdated = 0 the entry should not expire. This is a special case + * eg. for old metadata entries + */ + @Test + public void testLastUpdatedZeroWontExpire() throws Exception { + LOG.info("Authoritative mode: {}", authoritative); + + String testFile = methodName + UUID.randomUUID().toString() + + "/theFileToTry"; + + long ttl = 10L; + final Path filePath = path(testFile); + + ITtlTimeProvider mockTimeProvider = mock(ITtlTimeProvider.class); + ITtlTimeProvider originalTimeProvider = guardedFs.getTtlTimeProvider(); + + try { + guardedFs.setTtlTimeProvider(mockTimeProvider); + when(mockTimeProvider.getMetadataTtl()).thenReturn(ttl); + + // create a file while the NOW is 0, so it will set 0 as the last_updated + when(mockTimeProvider.getNow()).thenReturn(0L); + touch(guardedFs, filePath); + deleteFile(guardedFs, filePath); + + final PathMetadata pathMetadata = + guardedFs.getMetadataStore().get(filePath); + assertNotNull("pathMetadata should not be null after deleting with " + + "tombstones", pathMetadata); + assertEquals("pathMetadata lastUpdated field should be 0", 0, + pathMetadata.getLastUpdated()); + + // set the time, so the metadata would expire + when(mockTimeProvider.getNow()).thenReturn(2*ttl); + intercept(FileNotFoundException.class, filePath.toString(), + "This file should throw FNFE when reading through " + + "the guarded fs, and the metadatastore tombstoned the file. " + + "The tombstone won't expire if lastUpdated is set to 0.", + () -> guardedFs.getFileStatus(filePath)); + + } finally { + guardedFs.delete(filePath, true); + guardedFs.setTtlTimeProvider(originalTimeProvider); + } + } + + /** + * 1. File is deleted in the guarded fs. + * 2. File is replaced in the raw fs. + * 3. File is deleted in the guarded FS after the expiry time. + * 4. File MUST NOT exist in raw FS. 
+ */ + @Test + public void deleteAfterTombstoneExpiryOobCreate() throws Exception { + LOG.info("Authoritative mode: {}", authoritative); + + String testFile = methodName + UUID.randomUUID().toString() + + "/theFileToTry"; + + long ttl = 10L; + final Path filePath = path(testFile); + + ITtlTimeProvider mockTimeProvider = mock(ITtlTimeProvider.class); + ITtlTimeProvider originalTimeProvider = guardedFs.getTtlTimeProvider(); + + try { + guardedFs.setTtlTimeProvider(mockTimeProvider); + when(mockTimeProvider.getMetadataTtl()).thenReturn(ttl); + + // CREATE AND DELETE WITH GUARDED FS + when(mockTimeProvider.getNow()).thenReturn(100L); + touch(guardedFs, filePath); + deleteFile(guardedFs, filePath); + + final PathMetadata pathMetadata = + guardedFs.getMetadataStore().get(filePath); + assertNotNull("pathMetadata should not be null after deleting with " + + "tombstones", pathMetadata); + + // REPLACE WITH RAW FS + touch(rawFS, filePath); + awaitFileStatus(rawFS, filePath); + + // SET EXPIRY TIME, SO THE TOMBSTONE IS EXPIRED + when(mockTimeProvider.getNow()).thenReturn(100L + 2 * ttl); + + // DELETE IN GUARDED FS + guardedFs.delete(filePath, true); + + // FILE MUST NOT EXIST IN RAW + intercept(FileNotFoundException.class, filePath.toString(), + "This file should throw FNFE when reading through " + + "the raw fs, and the guarded fs deleted the file.", + () -> rawFS.getFileStatus(filePath)); + + } finally { + guardedFs.delete(filePath, true); + guardedFs.setTtlTimeProvider(originalTimeProvider); + } + } + + private void checkListingDoesNotContainPath(S3AFileSystem fs, Path filePath) + throws IOException { + final RemoteIterator listIter = + fs.listFiles(filePath.getParent(), false); + while (listIter.hasNext()) { + final LocatedFileStatus lfs = listIter.next(); + assertNotEquals("The tombstone has not been expired, so must not be" + + " listed.", filePath, lfs.getPath()); + } + LOG.info("{}; file omitted from listFiles listing as expected.", filePath); + + final FileStatus[] fileStatuses = fs.listStatus(filePath.getParent()); + for (FileStatus fileStatus : fileStatuses) { + assertNotEquals("The tombstone has not been expired, so must not be" + + " listed.", filePath, fileStatus.getPath()); + } + LOG.info("{}; file omitted from listStatus as expected.", filePath); + } + + private void checkListingContainsPath(S3AFileSystem fs, Path filePath) + throws IOException { + final RemoteIterator listIter = + fs.listFiles(filePath.getParent(), false); + + while (listIter.hasNext()) { + final LocatedFileStatus lfs = listIter.next(); + assertEquals(filePath, lfs.getPath()); + } + + final FileStatus[] fileStatuses = fs.listStatus(filePath.getParent()); + for (FileStatus fileStatus : fileStatuses) + assertEquals("The file should be listed in fs.listStatus", + filePath, fileStatus.getPath()); + } + + /** + * Perform an out-of-band delete. 
+ * @param testFilePath filename + * @param allowAuthoritative is the store authoritative + * @throws Exception failure + */ + private void outOfBandDeletes( + final Path testFilePath, + final boolean allowAuthoritative) + throws Exception { + try { + // Create initial file + String text = "Hello, World!"; + writeTextFile(guardedFs, testFilePath, text, true); + awaitFileStatus(rawFS, testFilePath); + + // Delete the file without S3Guard (raw) + deleteFile(rawFS, testFilePath); + + // The check is the same if s3guard is authoritative and if it's not + // it should be in the ms + FileStatus status = guardedFs.getFileStatus(testFilePath); + LOG.info("Authoritative: {} status path: {}", + allowAuthoritative, status.getPath()); + expectExceptionWhenReading(testFilePath, text); + expectExceptionWhenReadingOpenFileAPI(testFilePath, text); + } finally { + guardedFs.delete(testFilePath, true); + } + } + + /** + * Overwrite a file out of band. + * @param firstText first text + * @param secondText second text + * @throws Exception failure + */ + private void overwriteFile(String firstText, String secondText) + throws Exception { + boolean allowAuthoritative = authoritative; + Path testFilePath = path("OverwriteFileTest-" + UUID.randomUUID()); + LOG.info("Allow authoritative param: {}", allowAuthoritative); + try { + // Create initial file + writeTextFile( + guardedFs, testFilePath, firstText, true); + // and cache the value for later + final FileStatus origStatus = awaitFileStatus(rawFS, testFilePath); + waitForDifferentTimestamps(); + // Overwrite the file without S3Guard + writeTextFile( + rawFS, testFilePath, secondText, true); + + // Read the file and verify the data + eventually(STABILIZATION_TIME, PROBE_INTERVAL_MILLIS, + () -> { + FileStatus rawFileStatus = rawFS.getFileStatus(testFilePath); + final FileStatus guardedFileStatus = + guardedFs.getFileStatus(testFilePath); + verifyFileStatusAsExpected(firstText, secondText, + allowAuthoritative, + origStatus, + rawFileStatus, + guardedFileStatus); + }); + } finally { + guardedFs.delete(testFilePath, true); + } + } + + /** + * Assert that an array has a given size; in failure the full string values + * of the array will be included, one per line. + * @param message message for errors. + * @param expected expected length. + * @param array the array to probe + */ + private void assertArraySize( + final String message, + final int expected, + final T[] array) { + if (expected != array.length) { + // condition is not met, build an error which includes all the entries + String listing = Arrays.stream(array) + .map(Object::toString) + .collect(Collectors.joining("\n")); + fail(message + ": expected " + expected + " elements but found " + + array.length + + "\n" + listing); + } + } + + /** + * Overwrite a file, verify that the text is different as is the timestamp. + * There are some pauses in the test to ensure that timestamps are different. 
+ * @param firstText first text to write + * @param secondText second text to write + */ + private void overwriteFileInListing(String firstText, String secondText) + throws Exception { + boolean allowAuthoritative = authoritative; + + LOG.info("Authoritative mode enabled: {}", allowAuthoritative); + String rUUID = UUID.randomUUID().toString(); + String testDir = "dir-" + rUUID + "/"; + String testFile = testDir + "file-1-" + rUUID; + Path testDirPath = path(testDir); + Path testFilePath = guardedFs.qualify(path(testFile)); + + try { + // Create initial statusIterator with guarded ms + writeTextFile(guardedFs, testFilePath, firstText, true); + // and cache the value for later + final S3AFileStatus origStatus = awaitFileStatus(rawFS, testFilePath); + assertNotNull("No etag in raw status " + origStatus, + origStatus.getETag()); + + // Do a listing to cache the lists. Should be authoritative if it's set. + final S3AFileStatus[] origList = (S3AFileStatus[]) guardedFs.listStatus( + testDirPath); + assertArraySize("Added one file to the new dir, so the number of " + + "files in the dir should be one.", 1, origList); + S3AFileStatus origGuardedFileStatus = origList[0]; + assertNotNull("No etag in origGuardedFileStatus" + origGuardedFileStatus, + origGuardedFileStatus.getETag()); + final DirListingMetadata dirListingMetadata = + realMs.listChildren(guardedFs.qualify(testDirPath)); + assertListingAuthority(allowAuthoritative, dirListingMetadata); + + // a brief pause to guarantee timestamps are different. + waitForDifferentTimestamps(); + + // Update file with second text without S3Guard (raw) + deleteFile(rawFS, testFilePath); + + // write to the test path with the second text + writeTextFile(rawFS, testFilePath, secondText, true); + // and await it becoming visible again. + final FileStatus rawFileStatus = awaitFileStatus(rawFS, testFilePath); + + // check listing in guarded store. + final S3AFileStatus[] modList = (S3AFileStatus[]) guardedFs.listStatus( + testDirPath); + assertArraySize("Added one file to the new dir then modified it, " + + "so the number of files in the dir should be one.", 1, + modList); + assertEquals("The only file path in the directory listing should be " + + "equal to the testFilePath.", testFilePath, + modList[0].getPath()); + + // Read the file and verify the data + eventually(STABILIZATION_TIME, PROBE_INTERVAL_MILLIS, + () -> { + final FileStatus guardedFileStatus = + guardedFs.getFileStatus(testFilePath); + verifyFileStatusAsExpected(firstText, secondText, + allowAuthoritative, + origStatus, + rawFileStatus, + guardedFileStatus); + }); + } finally { + guardedFs.delete(testDirPath, true); + } + } + + private void deleteFile(final S3AFileSystem fs, final Path testFilePath) + throws Exception { + fs.delete(testFilePath, true); + awaitDeletedFileDisappearance(fs, testFilePath); + } + + + /** + * Verify that the file status of a file which has been overwritten + * is as expected, throwing informative exceptions if not. 
+ * @param firstText text of the first write + * @param secondText text of the second + * @param allowAuthoritative is S3Guard being authoritative + * @param origStatus filestatus of the first written file + * @param rawFileStatus status of the updated file from the raw FS + * @param guardedFileStatus status of the updated file from the guarded FS + */ + private void verifyFileStatusAsExpected(final String firstText, + final String secondText, + final boolean allowAuthoritative, + final FileStatus origStatus, + final FileStatus rawFileStatus, + final FileStatus guardedFileStatus) { + String stats = "\nRaw: " + rawFileStatus.toString() + + "\nGuarded: " + guardedFileStatus.toString(); + if (firstText.length() != secondText.length()) { + // the file lengths are different, so compare that first. + // it's not going to be brittle to timestamps, and easy to understand + // when there is an error. + + // check the file length in the raw FS To verify that status is actually + // stabilized w.r.t the last write. + long expectedLength = secondText.length(); + assertEquals("Length of raw file status did not match the updated text " + + rawFileStatus, + expectedLength, rawFileStatus.getLen()); + // now compare the lengths of the the raw and guarded files + long guardedLength = guardedFileStatus.getLen(); + if (allowAuthoritative) { + // expect the length to be out of sync + assertNotEquals( + "File length in authoritative table with " + stats, + expectedLength, guardedLength); + } else { + assertEquals( + "File length in authoritative table with " + stats, + expectedLength, guardedLength); + } + } + // check etag. This relies on first and second text being different. + final S3AFileStatus rawS3AFileStatus = (S3AFileStatus) rawFileStatus; + final S3AFileStatus guardedS3AFileStatus = (S3AFileStatus) + guardedFileStatus; + final S3AFileStatus origS3AFileStatus = (S3AFileStatus) origStatus; + assertNotEquals( + "raw status still no to date with changes" + stats, + origS3AFileStatus.getETag(), rawS3AFileStatus.getETag()); + if (allowAuthoritative) { + // expect the etag to be out of sync + assertNotEquals( + "etag in authoritative table with " + stats, + rawS3AFileStatus.getETag(), guardedS3AFileStatus.getETag()); + } else { + assertEquals( + "etag in non-authoritative table with " + stats, + rawS3AFileStatus.getETag(), guardedS3AFileStatus.getETag()); + } + // Next: modification time. + long rawModTime = rawFileStatus.getModificationTime(); + long guardedModTime = guardedFileStatus.getModificationTime(); + assertNotEquals( + "Updated file still has original timestamp\n" + + " original " + origStatus + stats, + origStatus.getModificationTime(), rawModTime); + if (allowAuthoritative) { + // If authoritative is allowed metadata is not updated, so mod_time + // won't match + assertNotEquals("Authoritative is enabled, so metadata is not " + + "updated in ms, so mod_time won't match. Expecting " + + "different values for raw and guarded filestatus." + + stats, + rawModTime, + guardedModTime); + } else { + // If authoritative is not enabled metadata is updated, mod_time + // will match + assertEquals("Authoritative is disabled, so metadata is" + + " updated in ms, so mod_time must match. Expecting " + + " same values for raw and guarded filestatus." + + stats, + rawModTime, + guardedModTime); + } + } + + /** + * A brief pause to guarantee timestamps are different. + * This doesn't have to be as long as a stabilization delay. 
+ */ + private void waitForDifferentTimestamps() throws InterruptedException { + Thread.sleep(TIMESTAMP_SLEEP); + } + + /** + * Assert that a listing has the specific authority. + * @param expectAuthoritative expect authority bit of listing + * @param dirListingMetadata listing to check + */ + private void assertListingAuthority(final boolean expectAuthoritative, + final DirListingMetadata dirListingMetadata) { + if (expectAuthoritative) { + assertTrue("DirListingMeta should be authoritative if authoritative " + + "mode is enabled.", + dirListingMetadata.isAuthoritative()); + } else { + assertFalse("DirListingMeta should not be authoritative if " + + "authoritative mode is disabled.", + dirListingMetadata.isAuthoritative()); + } + } + + /** + * Delete a file and use listStatus to build up the S3Guard cache. + */ + private void deleteFileInListing() + throws Exception { + + boolean allowAuthoritative = authoritative; + LOG.info("Authoritative mode enabled: {}", allowAuthoritative); + String rUUID = UUID.randomUUID().toString(); + String testDir = "dir-" + rUUID + "/"; + String testFile = testDir + "file-1-" + rUUID; + Path testDirPath = path(testDir); + Path testFilePath = guardedFs.qualify(path(testFile)); + String text = "Some random text"; + + try { + // Create initial statusIterator with real ms + writeTextFile( + guardedFs, testFilePath, text, true); + awaitFileStatus(rawFS, testFilePath); + + // Do a listing to cache the lists. Should be authoritative if it's set. + final FileStatus[] origList = guardedFs.listStatus(testDirPath); + assertEquals("Added one file to the new dir, so the number of " + + "files in the dir should be one.", 1, origList.length); + final DirListingMetadata dirListingMetadata = + realMs.listChildren(guardedFs.qualify(testDirPath)); + assertListingAuthority(allowAuthoritative, dirListingMetadata); + + // Delete the file without S3Guard (raw) + deleteFile(rawFS, testFilePath); + + // File status will be still readable from s3guard + FileStatus status = guardedFs.getFileStatus(testFilePath); + LOG.info("authoritative: {} status: {}", allowAuthoritative, status); + expectExceptionWhenReading(testFilePath, text); + expectExceptionWhenReadingOpenFileAPI(testFilePath, text); + } finally { + guardedFs.delete(testDirPath, true); + } + } + + /** + * We expect the read to fail with an FNFE: open will be happy. + * @param testFilePath path of the test file + * @param text the context in the file. + * @throws Exception failure other than the FNFE + */ + private void expectExceptionWhenReading(Path testFilePath, String text) + throws Exception { + try (FSDataInputStream in = guardedFs.open(testFilePath)) { + intercept(FileNotFoundException.class, () -> { + byte[] bytes = new byte[text.length()]; + return in.read(bytes, 0, bytes.length); + }); + } + } + + /** + * We expect the read to fail with an FNFE: open will be happy. + * @param testFilePath path of the test file + * @param text the context in the file. + * @throws Exception failure other than the FNFE + */ + private void expectExceptionWhenReadingOpenFileAPI( + Path testFilePath, String text) + throws Exception { + try ( + FSDataInputStream in = guardedFs.openFile(testFilePath).build().get() + ) { + intercept(FileNotFoundException.class, () -> { + byte[] bytes = new byte[text.length()]; + return in.read(bytes, 0, bytes.length); + }); + } + } + + /** + * Wait for a deleted file to no longer be visible. 
+ * @param fs filesystem + * @param testFilePath path to query + * @throws Exception failure + */ + private void awaitDeletedFileDisappearance(final S3AFileSystem fs, + final Path testFilePath) throws Exception { + eventually( + STABILIZATION_TIME, PROBE_INTERVAL_MILLIS, + () -> intercept(FileNotFoundException.class, + () -> fs.getFileStatus(testFilePath))); + } + + /** + * Wait for a file to be visible. + * @param fs filesystem + * @param testFilePath path to query + * @return the file status. + * @throws Exception failure + */ + private S3AFileStatus awaitFileStatus(S3AFileSystem fs, + final Path testFilePath) + throws Exception { + return (S3AFileStatus) eventually( + STABILIZATION_TIME, PROBE_INTERVAL_MILLIS, + () -> fs.getFileStatus(testFilePath)); + } + + private FSDataOutputStream createNonRecursive(FileSystem fs, Path path) + throws Exception { + return fs + .createNonRecursive(path, false, 4096, (short) 3, (short) 4096, null); + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardTtl.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardTtl.java new file mode 100644 index 0000000000000..962232239afb9 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardTtl.java @@ -0,0 +1,289 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a; + +import java.util.Arrays; +import java.util.Collection; +import java.util.UUID; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.s3guard.DirListingMetadata; +import org.apache.hadoop.fs.s3a.s3guard.ITtlTimeProvider; +import org.apache.hadoop.fs.s3a.s3guard.MetadataStore; +import org.apache.hadoop.fs.s3a.s3guard.S3Guard; + +import org.junit.Assume; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import static org.apache.hadoop.fs.contract.ContractTestUtils.touch; +import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_AUTHORITATIVE; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.isMetadataStoreAuthoritative; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.metadataStorePersistsAuthoritativeBit; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +/** + * These tests are testing the S3Guard TTL (time to live) features. + */ +@RunWith(Parameterized.class) +public class ITestS3GuardTtl extends AbstractS3ATestBase { + + private final boolean authoritative; + + /** + * Test array for parameterized test runs. + * @return a list of parameter tuples. 
+ */ + @Parameterized.Parameters + public static Collection params() { + return Arrays.asList(new Object[][]{ + {true}, {false} + }); + } + + /** + * By changing the method name, the thread name is changed and + * so you can see in the logs which mode is being tested. + * @return a string to use for the thread namer. + */ + @Override + protected String getMethodName() { + return super.getMethodName() + + (authoritative ? "-auth" : "-nonauth"); + } + + public ITestS3GuardTtl(boolean authoritative) { + this.authoritative = authoritative; + } + + /** + * Patch the configuration - this test needs disabled filesystem caching. + * These tests modify the fs instance that would cause flaky tests. + * @return a configuration + */ + @Override + protected Configuration createConfiguration() { + Configuration configuration = super.createConfiguration(); + S3ATestUtils.disableFilesystemCaching(configuration); + configuration = + S3ATestUtils.prepareTestConfiguration(configuration); + configuration.setBoolean(METADATASTORE_AUTHORITATIVE, authoritative); + return configuration; + } + + @Test + public void testDirectoryListingAuthoritativeTtl() throws Exception { + LOG.info("Authoritative mode: {}", authoritative); + + final S3AFileSystem fs = getFileSystem(); + Assume.assumeTrue(fs.hasMetadataStore()); + final MetadataStore ms = fs.getMetadataStore(); + + Assume.assumeTrue("MetadataStore should be capable for authoritative " + + "storage of directories to run this test.", + metadataStorePersistsAuthoritativeBit(ms)); + + Assume.assumeTrue("MetadataStore should be authoritative for this test", + isMetadataStoreAuthoritative(getFileSystem().getConf())); + + ITtlTimeProvider mockTimeProvider = + mock(ITtlTimeProvider.class); + ITtlTimeProvider restoreTimeProvider = fs.getTtlTimeProvider(); + fs.setTtlTimeProvider(mockTimeProvider); + when(mockTimeProvider.getNow()).thenReturn(100L); + when(mockTimeProvider.getMetadataTtl()).thenReturn(1L); + + Path dir = path("ttl/"); + Path file = path("ttl/afile"); + + try { + fs.mkdirs(dir); + touch(fs, file); + + // get an authoritative listing in ms + fs.listStatus(dir); + // check if authoritative + DirListingMetadata dirListing = + S3Guard.listChildrenWithTtl(ms, dir, mockTimeProvider); + assertTrue("Listing should be authoritative.", + dirListing.isAuthoritative()); + // change the time, and assume it's not authoritative anymore + when(mockTimeProvider.getNow()).thenReturn(102L); + dirListing = S3Guard.listChildrenWithTtl(ms, dir, mockTimeProvider); + assertFalse("Listing should not be authoritative.", + dirListing.isAuthoritative()); + + // get an authoritative listing in ms again - retain test + fs.listStatus(dir); + // check if authoritative + dirListing = S3Guard.listChildrenWithTtl(ms, dir, mockTimeProvider); + assertTrue("Listing shoud be authoritative after listStatus.", + dirListing.isAuthoritative()); + } finally { + fs.delete(dir, true); + fs.setTtlTimeProvider(restoreTimeProvider); + } + } + + @Test + public void testFileMetadataExpiresTtl() throws Exception { + LOG.info("Authoritative mode: {}", authoritative); + + Path fileExpire1 = path("expirettl-" + UUID.randomUUID()); + Path fileExpire2 = path("expirettl-" + UUID.randomUUID()); + Path fileRetain = path("expirettl-" + UUID.randomUUID()); + + final S3AFileSystem fs = getFileSystem(); + Assume.assumeTrue(fs.hasMetadataStore()); + final MetadataStore ms = fs.getMetadataStore(); + + ITtlTimeProvider mockTimeProvider = mock(ITtlTimeProvider.class); + ITtlTimeProvider originalTimeProvider = 
fs.getTtlTimeProvider(); + + try { + fs.setTtlTimeProvider(mockTimeProvider); + when(mockTimeProvider.getMetadataTtl()).thenReturn(5L); + + // set the time, so the fileExpire1 will expire + when(mockTimeProvider.getNow()).thenReturn(100L); + touch(fs, fileExpire1); + // set the time, so fileExpire2 will expire + when(mockTimeProvider.getNow()).thenReturn(101L); + touch(fs, fileExpire2); + // set the time, so fileRetain won't expire + when(mockTimeProvider.getNow()).thenReturn(109L); + touch(fs, fileRetain); + final FileStatus origFileRetainStatus = fs.getFileStatus(fileRetain); + // change time, so the first two file metadata is expired + when(mockTimeProvider.getNow()).thenReturn(110L); + + // metadata is expired so this should refresh the metadata with + // last_updated to the getNow() + final FileStatus fileExpire1Status = fs.getFileStatus(fileExpire1); + assertNotNull(fileExpire1Status); + assertEquals(110L, ms.get(fileExpire1).getLastUpdated()); + + // metadata is expired so this should refresh the metadata with + // last_updated to the getNow() + final FileStatus fileExpire2Status = fs.getFileStatus(fileExpire2); + assertNotNull(fileExpire2Status); + assertEquals(110L, ms.get(fileExpire2).getLastUpdated()); + + final FileStatus fileRetainStatus = fs.getFileStatus(fileRetain); + assertEquals("Modification time of these files should be equal.", + origFileRetainStatus.getModificationTime(), + fileRetainStatus.getModificationTime()); + assertNotNull(fileRetainStatus); + assertEquals(109L, ms.get(fileRetain).getLastUpdated()); + } finally { + fs.delete(fileExpire1, true); + fs.delete(fileExpire2, true); + fs.delete(fileRetain, true); + fs.setTtlTimeProvider(originalTimeProvider); + } + } + + /** + * create(tombstone file) must succeed irrespective of overwrite flag. + */ + @Test + public void testCreateOnTombstonedFileSucceeds() throws Exception { + LOG.info("Authoritative mode: {}", authoritative); + final S3AFileSystem fs = getFileSystem(); + + String fileToTry = methodName + UUID.randomUUID().toString(); + + final Path filePath = path(fileToTry); + + // Create a directory with + ITtlTimeProvider mockTimeProvider = mock(ITtlTimeProvider.class); + ITtlTimeProvider originalTimeProvider = fs.getTtlTimeProvider(); + + try { + fs.setTtlTimeProvider(mockTimeProvider); + when(mockTimeProvider.getNow()).thenReturn(100L); + when(mockTimeProvider.getMetadataTtl()).thenReturn(5L); + + // CREATE A FILE + touch(fs, filePath); + + // DELETE THE FILE - TOMBSTONE + fs.delete(filePath, true); + + // CREATE THE SAME FILE WITHOUT ERROR DESPITE THE TOMBSTONE + touch(fs, filePath); + + } finally { + fs.delete(filePath, true); + fs.setTtlTimeProvider(originalTimeProvider); + } + } + + /** + * create("parent has tombstone") must always succeed (We dont check the + * parent), but after the file has been written, all entries up the tree + * must be valid. 
That is: the putAncestor code will correct everything + */ + @Test + public void testCreateParentHasTombstone() throws Exception { + LOG.info("Authoritative mode: {}", authoritative); + final S3AFileSystem fs = getFileSystem(); + + String dirToDelete = methodName + UUID.randomUUID().toString(); + String fileToTry = dirToDelete + "/theFileToTry"; + + final Path dirPath = path(dirToDelete); + final Path filePath = path(fileToTry); + + // Create a directory with + ITtlTimeProvider mockTimeProvider = mock(ITtlTimeProvider.class); + ITtlTimeProvider originalTimeProvider = fs.getTtlTimeProvider(); + + try { + fs.setTtlTimeProvider(mockTimeProvider); + when(mockTimeProvider.getNow()).thenReturn(100L); + when(mockTimeProvider.getMetadataTtl()).thenReturn(5L); + + // CREATE DIRECTORY + fs.mkdirs(dirPath); + + // DELETE DIRECTORY + fs.delete(dirPath, true); + + // WRITE TO DELETED DIRECTORY - SUCCESS + touch(fs, filePath); + + // SET TIME SO METADATA EXPIRES + when(mockTimeProvider.getNow()).thenReturn(110L); + + // WRITE TO DELETED DIRECTORY - SUCCESS + touch(fs, filePath); + + } finally { + fs.delete(filePath, true); + fs.delete(dirPath, true); + fs.setTtlTimeProvider(originalTimeProvider); + } + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java index 9c88a62f29ca0..cdef917a43cd3 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java @@ -280,7 +280,7 @@ private void testPruneCommand(Configuration cmdConf, Path parent, "This child should have been kept (prefix restriction).", 1); } finally { getFileSystem().delete(parent, true); - ms.prune(Long.MAX_VALUE); + ms.prune(MetadataStore.PruneMode.ALL_BY_MODTIME, Long.MAX_VALUE); } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStore.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStore.java index 53559107529d1..709aa5a60a676 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStore.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStore.java @@ -187,6 +187,12 @@ public void tearDown() throws Exception { fileSystem.close(); } + @Override protected String getPathStringForPrune(String path) + throws Exception { + String b = getTestBucketName(getContract().getFileSystem().getConf()); + return "/" + b + "/dir2"; + } + /** * Each contract has its own S3AFileSystem and DynamoDBMetadataStore objects. 
*/ @@ -361,7 +367,7 @@ private void doTestBatchWrite(int numDelete, int numPut, } // move the old paths to new paths and verify - ms.move(pathsToDelete, newMetas); + ms.move(pathsToDelete, newMetas, getTtlTimeProvider()); assertEquals(0, ms.listChildren(oldDir).withoutTombstones().numEntries()); if (newMetas != null) { assertTrue(CollectionUtils @@ -559,7 +565,7 @@ public void testMovePopulatesAncestors() throws IOException { 1024, false)) ); - ddbms.move(fullSourcePaths, pathsToCreate); + ddbms.move(fullSourcePaths, pathsToCreate, getTtlTimeProvider()); // assert that all the ancestors should have been populated automatically assertCached(testRoot + "/c"); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStoreScale.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStoreScale.java index 48dbce98a77ee..aa2dda835af79 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStoreScale.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStoreScale.java @@ -240,7 +240,8 @@ public void test_030_BatchedWrite() throws Exception { if (pruneItems == BATCH_SIZE) { describe("pruning files"); - ddbms.prune(Long.MAX_VALUE /* all files */); + ddbms.prune(MetadataStore.PruneMode.ALL_BY_MODTIME, + Long.MAX_VALUE /* all files */); pruneItems = 0; } if (tracker.probe()) { @@ -302,7 +303,7 @@ public void test_050_getVersionMarkerItem() throws Throwable { private void retryingDelete(final Path path) { try { ddbms.getInvoker().retry("Delete ", path.toString(), true, - () -> ddbms.delete(path)); + () -> ddbms.delete(path, new S3Guard.TtlTimeProvider(getConf()))); } catch (IOException e) { LOG.warn("Failed to delete {}: ", path, e); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreTestBase.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreTestBase.java index 45d6051ddb190..754da0db7992a 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreTestBase.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreTestBase.java @@ -23,8 +23,8 @@ import java.util.Arrays; import java.util.Collection; import java.util.HashSet; +import java.util.List; import java.util.Set; -import java.util.Map; import com.google.common.collect.Sets; import org.junit.After; @@ -39,11 +39,14 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.fs.s3a.S3AFileStatus; import org.apache.hadoop.fs.s3a.S3ATestUtils; import org.apache.hadoop.fs.s3a.Tristate; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.test.HadoopTestBase; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.metadataStorePersistsAuthoritativeBit; + /** * Main test class for MetadataStore implementations. * Implementations should each create a test by subclassing this and @@ -59,11 +62,14 @@ public abstract class MetadataStoreTestBase extends HadoopTestBase { /** Some dummy values for sanity-checking FileStatus contents. 
*/ static final long BLOCK_SIZE = 32 * 1024 * 1024; static final int REPLICATION = 1; - static final FsPermission PERMISSION = new FsPermission((short)0755); static final String OWNER = "bob"; - static final String GROUP = "uncles"; - private final long accessTime = System.currentTimeMillis(); - private final long modTime = accessTime - 5000; + private final long modTime = System.currentTimeMillis() - 5000; + + // attributes not supported by S3AFileStatus + static final FsPermission PERMISSION = null; + static final String GROUP = null; + private final long accessTime = 0; + private static ITtlTimeProvider ttlTimeProvider; /** * Each test should override this. Will use a new Configuration instance. @@ -119,6 +125,8 @@ public void setUp() throws Exception { assertNotNull("null MetadataStore", ms); assertNotNull("null FileSystem", contract.getFileSystem()); ms.initialize(contract.getFileSystem()); + ttlTimeProvider = + new S3Guard.TtlTimeProvider(contract.getFileSystem().getConf()); } @After @@ -146,14 +154,14 @@ private void doTestDescendantsIterator( String[] checkNodes) throws Exception { // we set up the example file system tree in metadata store for (String pathStr : createNodes) { - final FileStatus status = pathStr.contains("file") + final S3AFileStatus status = pathStr.contains("file") ? basicFileStatus(strToPath(pathStr), 100, false) : basicFileStatus(strToPath(pathStr), 0, true); ms.put(new PathMetadata(status)); } final PathMetadata rootMeta = new PathMetadata(makeDirStatus("/")); - RemoteIterator iterator; + RemoteIterator iterator; if (implementation == DescendantsIterator.class) { iterator = new DescendantsIterator(ms, rootMeta); } else if (implementation == MetadataStoreListFilesIterator.class) { @@ -306,7 +314,7 @@ public void testRootDirPutNew() throws Exception { public void testDelete() throws Exception { setUpDeleteTest(); - ms.delete(strToPath("/ADirectory1/db1/file2")); + ms.delete(strToPath("/ADirectory1/db1/file2"), ttlTimeProvider); /* Ensure delete happened. */ assertDirectorySize("/ADirectory1/db1", 1); @@ -334,7 +342,7 @@ private void deleteSubtreeHelper(String pathPrefix) throws Exception { if (!allowMissing()) { assertCached(p + "/ADirectory1/db1"); } - ms.deleteSubtree(strToPath(p + "/ADirectory1/db1/")); + ms.deleteSubtree(strToPath(p + "/ADirectory1/db1/"), ttlTimeProvider); assertEmptyDirectory(p + "/ADirectory1"); assertDeleted(p + "/ADirectory1/db1"); @@ -354,7 +362,7 @@ private void deleteSubtreeHelper(String pathPrefix) throws Exception { public void testDeleteRecursiveRoot() throws Exception { setUpDeleteTest(); - ms.deleteSubtree(strToPath("/")); + ms.deleteSubtree(strToPath("/"), ttlTimeProvider); assertDeleted("/ADirectory1"); assertDeleted("/ADirectory2"); assertDeleted("/ADirectory2/db1"); @@ -365,10 +373,10 @@ public void testDeleteRecursiveRoot() throws Exception { @Test public void testDeleteNonExisting() throws Exception { // Path doesn't exist, but should silently succeed - ms.delete(strToPath("/bobs/your/uncle")); + ms.delete(strToPath("/bobs/your/uncle"), ttlTimeProvider); // Ditto. 
- ms.deleteSubtree(strToPath("/internets")); + ms.deleteSubtree(strToPath("/internets"), ttlTimeProvider); } @@ -404,7 +412,7 @@ public void testGet() throws Exception { } if (!(ms instanceof NullMetadataStore)) { - ms.delete(strToPath(filePath)); + ms.delete(strToPath(filePath), ttlTimeProvider); meta = ms.get(strToPath(filePath)); assertTrue("Tombstone not left for deleted file", meta.isDeleted()); } @@ -511,21 +519,13 @@ public void testListChildren() throws Exception { } } - private boolean isMetadataStoreAuthoritative() throws IOException { - Map diags = ms.getDiagnostics(); - String isAuth = - diags.get(MetadataStoreCapabilities.PERSISTS_AUTHORITATIVE_BIT); - if(isAuth == null){ - return false; - } - return Boolean.valueOf(isAuth); - } + @Test public void testListChildrenAuthoritative() throws IOException { Assume.assumeTrue("MetadataStore should be capable for authoritative " + "storage of directories to run this test.", - isMetadataStoreAuthoritative()); + metadataStorePersistsAuthoritativeBit(ms)); setupListStatus(); @@ -590,7 +590,7 @@ public void testMove() throws Exception { destMetas.add(new PathMetadata(makeDirStatus("/b1"))); destMetas.add(new PathMetadata(makeFileStatus("/b1/file1", 100))); destMetas.add(new PathMetadata(makeFileStatus("/b1/file2", 100))); - ms.move(srcPaths, destMetas); + ms.move(srcPaths, destMetas, ttlTimeProvider); // Assert src is no longer there dirMeta = ms.listChildren(strToPath("/a1")); @@ -640,11 +640,11 @@ public void testMultiBucketPaths() throws Exception { // Make sure delete is correct as well if (!allowMissing()) { - ms.delete(new Path(p2)); + ms.delete(new Path(p2), ttlTimeProvider); meta = ms.get(new Path(p1)); assertNotNull("Path should not have been deleted", meta); } - ms.delete(new Path(p1)); + ms.delete(new Path(p1), ttlTimeProvider); } @Test @@ -653,8 +653,7 @@ public void testPruneFiles() throws Exception { createNewDirs("/pruneFiles"); long oldTime = getTime(); - ms.put(new PathMetadata(makeFileStatus("/pruneFiles/old", 1, oldTime, - oldTime))); + ms.put(new PathMetadata(makeFileStatus("/pruneFiles/old", 1, oldTime))); DirListingMetadata ls2 = ms.listChildren(strToPath("/pruneFiles")); if (!allowMissing()) { assertListingsEqual(ls2.getListing(), "/pruneFiles/old"); @@ -665,8 +664,7 @@ public void testPruneFiles() throws Exception { Thread.sleep(1); long cutoff = System.currentTimeMillis(); long newTime = getTime(); - ms.put(new PathMetadata(makeFileStatus("/pruneFiles/new", 1, newTime, - newTime))); + ms.put(new PathMetadata(makeFileStatus("/pruneFiles/new", 1, newTime))); DirListingMetadata ls; ls = ms.listChildren(strToPath("/pruneFiles")); @@ -674,7 +672,7 @@ public void testPruneFiles() throws Exception { assertListingsEqual(ls.getListing(), "/pruneFiles/new", "/pruneFiles/old"); } - ms.prune(cutoff); + ms.prune(MetadataStore.PruneMode.ALL_BY_MODTIME, cutoff); ls = ms.listChildren(strToPath("/pruneFiles")); if (allowMissing()) { assertDeleted("/pruneFiles/old"); @@ -697,14 +695,14 @@ public void testPruneDirs() throws Exception { long oldTime = getTime(); ms.put(new PathMetadata(makeFileStatus("/pruneDirs/dir/file", - 1, oldTime, oldTime))); + 1, oldTime))); // It's possible for the Local implementation to get from the old // modification time to here in under 1ms, causing it to not get pruned Thread.sleep(1); long cutoff = getTime(); - ms.prune(cutoff); + ms.prune(MetadataStore.PruneMode.ALL_BY_MODTIME, cutoff); assertDeleted("/pruneDirs/dir/file"); } @@ -721,10 +719,10 @@ public void testPruneUnsetsAuthoritative() throws 
Exception { createNewDirs(rootDir, grandparentDir, parentDir); long time = System.currentTimeMillis(); ms.put(new PathMetadata( - new FileStatus(0, false, 0, 0, time - 1, strToPath(staleFile)), + basicFileStatus(0, false, 0, time - 1, strToPath(staleFile)), Tristate.FALSE, false)); ms.put(new PathMetadata( - new FileStatus(0, false, 0, 0, time + 1, strToPath(freshFile)), + basicFileStatus(0, false, 0, time + 1, strToPath(freshFile)), Tristate.FALSE, false)); // set parent dir as authoritative @@ -734,7 +732,7 @@ public void testPruneUnsetsAuthoritative() throws Exception { ms.put(parentDirMd); } - ms.prune(time); + ms.prune(MetadataStore.PruneMode.ALL_BY_MODTIME, time); DirListingMetadata listing; for (String directory : directories) { Path path = strToPath(directory); @@ -758,10 +756,10 @@ public void testPrunePreservesAuthoritative() throws Exception { createNewDirs(rootDir, grandparentDir, parentDir); long time = System.currentTimeMillis(); ms.put(new PathMetadata( - new FileStatus(0, false, 0, 0, time + 1, strToPath(staleFile)), + basicFileStatus(0, false, 0, time + 1, strToPath(staleFile)), Tristate.FALSE, false)); ms.put(new PathMetadata( - new FileStatus(0, false, 0, 0, time + 1, strToPath(freshFile)), + basicFileStatus(0, false, 0, time + 1, strToPath(freshFile)), Tristate.FALSE, false)); if (!allowMissing()) { @@ -771,7 +769,7 @@ public void testPrunePreservesAuthoritative() throws Exception { ms.put(parentDirMd); // prune the ms - ms.prune(time); + ms.prune(MetadataStore.PruneMode.ALL_BY_MODTIME, time); // get the directory listings DirListingMetadata rootDirMd = ms.listChildren(strToPath(rootDir)); @@ -814,6 +812,104 @@ public void testPutDirListingMetadataPutsFileMetadata() } } + @Test + public void testPutRetainsIsDeletedInParentListing() throws Exception { + final Path path = strToPath("/a/b"); + final S3AFileStatus fileStatus = basicFileStatus(path, 0, false); + PathMetadata pm = new PathMetadata(fileStatus); + pm.setIsDeleted(true); + ms.put(pm); + if(!allowMissing()) { + final PathMetadata pathMetadata = + ms.listChildren(path.getParent()).get(path); + assertTrue("isDeleted should be true on the parent listing", + pathMetadata.isDeleted()); + } + } + + @Test + public void testPruneExpiredTombstones() throws Exception { + List keepFilenames = new ArrayList<>( + Arrays.asList("/dir1/fileK1", "/dir1/fileK2", "/dir1/fileK3")); + List removeFilenames = new ArrayList<>( + Arrays.asList("/dir1/fileR1", "/dir1/fileR2", "/dir1/fileR3")); + + long cutoff = 9001; + + for(String fN : keepFilenames) { + final PathMetadata pathMetadata = new PathMetadata(makeFileStatus(fN, 1)); + pathMetadata.setLastUpdated(9002L); + ms.put(pathMetadata); + } + + for(String fN : removeFilenames) { + final PathMetadata pathMetadata = new PathMetadata(makeFileStatus(fN, 1)); + pathMetadata.setLastUpdated(9000L); + // tombstones are the deleted files! 
+ pathMetadata.setIsDeleted(true); + ms.put(pathMetadata); + } + + ms.prune(MetadataStore.PruneMode.TOMBSTONES_BY_LASTUPDATED, cutoff); + + if (!allowMissing()) { + for (String fN : keepFilenames) { + final PathMetadata pathMetadata = ms.get(strToPath(fN)); + assertNotNull("Kept files should be in the metastore after prune", + pathMetadata); + } + } + + for(String fN : removeFilenames) { + final PathMetadata pathMetadata = ms.get(strToPath(fN)); + assertNull("Expired tombstones should be removed from metastore after " + + "the prune.", pathMetadata); + } + } + + @Test + public void testPruneExpiredTombstonesSpecifiedPath() throws Exception { + List keepFilenames = new ArrayList<>( + Arrays.asList("/dir1/fileK1", "/dir1/fileK2", "/dir1/fileK3")); + List removeFilenames = new ArrayList<>( + Arrays.asList("/dir2/fileR1", "/dir2/fileR2", "/dir2/fileR3")); + + long cutoff = 9001; + + // Both are expired. Difference is it will only delete the specified one. + for (String fN : keepFilenames) { + final PathMetadata pathMetadata = new PathMetadata(makeFileStatus(fN, 1)); + pathMetadata.setLastUpdated(9002L); + ms.put(pathMetadata); + } + + for (String fN : removeFilenames) { + final PathMetadata pathMetadata = new PathMetadata(makeFileStatus(fN, 1)); + pathMetadata.setLastUpdated(9000L); + // tombstones are the deleted files! + pathMetadata.setIsDeleted(true); + ms.put(pathMetadata); + } + + final String prunePath = getPathStringForPrune("/dir2"); + ms.prune(MetadataStore.PruneMode.TOMBSTONES_BY_LASTUPDATED, cutoff, + prunePath); + + if (!allowMissing()) { + for (String fN : keepFilenames) { + final PathMetadata pathMetadata = ms.get(strToPath(fN)); + assertNotNull("Kept files should be in the metastore after prune", + pathMetadata); + } + } + + for (String fN : removeFilenames) { + final PathMetadata pathMetadata = ms.get(strToPath(fN)); + assertNull("Expired tombstones should be removed from metastore after " + + "the prune.", pathMetadata); + } + } + /* * Helper functions. */ @@ -828,6 +924,16 @@ private String[] buildPathStrings(String parent, String... paths) return paths; } + + /** + * The prune operation needs the path with the bucket name as a string in + * {@link DynamoDBMetadataStore}, but not for {@link LocalMetadataStore}. + * This is an implementation detail of the ms, so this should be + * implemented in the subclasses. 
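+ * For example (bucket name and key layout are illustrative only, not the
+ * exact subclass implementations), a DynamoDB-backed test subclass might
+ * return a bucket-qualified key while the local store returns the path
+ * unchanged:
+ * <pre>{@code
+ *   // DynamoDB-backed test subclass (sketch):
+ *   return "/" + getTestBucketName(getConf()) + path;
+ *   // Local / null metastore test subclasses:
+ *   return path;
+ * }</pre>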
+ */ + protected abstract String getPathStringForPrune(String path) + throws Exception; + private void commonTestPutListStatus(final String parent) throws IOException { putListStatusFiles(parent, true, buildPathStrings(parent, "file1", "file2", "file3")); @@ -939,40 +1045,54 @@ private void assertEmptyDirs(String ...dirs) throws IOException { } } - FileStatus basicFileStatus(Path path, int size, boolean isDir) throws + S3AFileStatus basicFileStatus(Path path, int size, boolean isDir) throws IOException { - return basicFileStatus(path, size, isDir, modTime, accessTime); + return basicFileStatus(path, size, isDir, modTime); } - public static FileStatus basicFileStatus(Path path, int size, boolean isDir, - long newModTime, long newAccessTime) throws IOException { - return new FileStatus(size, isDir, REPLICATION, BLOCK_SIZE, newModTime, - newAccessTime, PERMISSION, OWNER, GROUP, path); + S3AFileStatus basicFileStatus(int size, boolean isDir, + long blockSize, long modificationTime, Path path) { + if (isDir) { + return new S3AFileStatus(Tristate.UNKNOWN, path, null); + } else { + return new S3AFileStatus(size, modificationTime, path, blockSize, null, + null, null); + } } - private FileStatus makeFileStatus(String pathStr, int size) throws + public static S3AFileStatus basicFileStatus(Path path, int size, + boolean isDir, long newModTime) throws IOException { + if (isDir) { + return new S3AFileStatus(Tristate.UNKNOWN, path, OWNER); + } else { + return new S3AFileStatus(size, newModTime, path, BLOCK_SIZE, OWNER, + null, null); + } + } + + private S3AFileStatus makeFileStatus(String pathStr, int size) throws IOException { - return makeFileStatus(pathStr, size, modTime, accessTime); + return makeFileStatus(pathStr, size, modTime); } - private FileStatus makeFileStatus(String pathStr, int size, long newModTime, - long newAccessTime) throws IOException { + private S3AFileStatus makeFileStatus(String pathStr, int size, + long newModTime) throws IOException { return basicFileStatus(strToPath(pathStr), size, false, - newModTime, newAccessTime); + newModTime); } void verifyFileStatus(FileStatus status, long size) { S3ATestUtils.verifyFileStatus(status, size, BLOCK_SIZE, modTime); } - private FileStatus makeDirStatus(String pathStr) throws IOException { - return basicFileStatus(strToPath(pathStr), 0, true, modTime, accessTime); + private S3AFileStatus makeDirStatus(String pathStr) throws IOException { + return basicFileStatus(strToPath(pathStr), 0, true, modTime); } /** * Verify the directory file status. Subclass may verify additional fields. 
*/ - void verifyDirStatus(FileStatus status) { + void verifyDirStatus(S3AFileStatus status) { assertTrue("Is a dir", status.isDirectory()); assertEquals("zero length", 0, status.getLen()); } @@ -989,4 +1109,8 @@ protected static long getTime() { return System.currentTimeMillis(); } + protected static ITtlTimeProvider getTtlTimeProvider() { + return ttlTimeProvider; + } + } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestLocalMetadataStore.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestLocalMetadataStore.java index 2ea20b26b023d..1d231eac96fbb 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestLocalMetadataStore.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestLocalMetadataStore.java @@ -73,6 +73,11 @@ public AbstractMSContract createContract(Configuration conf) throws return new LocalMSContract(conf); } + @Override protected String getPathStringForPrune(String path) + throws Exception{ + return path; + } + @Test public void testClearByAncestor() throws Exception { Cache cache = CacheBuilder.newBuilder().build(); @@ -182,7 +187,7 @@ private static void assertClearResult(Cache cache, String prefixStr, String pathStr, int leftoverSize) throws IOException { populateMap(cache, prefixStr); LocalMetadataStore.deleteEntryByAncestor(new Path(prefixStr + pathStr), - cache, true); + cache, true, getTtlTimeProvider()); assertEquals(String.format("Cache should have %d entries", leftoverSize), leftoverSize, sizeOfMap(cache)); cache.invalidateAll(); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestNullMetadataStore.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestNullMetadataStore.java index c0541ea98ee26..2e0bc4b7e4f0e 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestNullMetadataStore.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestNullMetadataStore.java @@ -46,6 +46,11 @@ public boolean allowMissing() { return true; } + @Override protected String getPathStringForPrune(String path) + throws Exception { + return path; + } + @Override public AbstractMSContract createContract() { return new NullMSContract(); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestS3Guard.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestS3Guard.java index 745e7aad28868..bdb256cba3dea 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestS3Guard.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestS3Guard.java @@ -18,14 +18,28 @@ package org.apache.hadoop.fs.s3a.s3guard; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.List; +import java.util.concurrent.TimeUnit; import org.junit.Assert; import org.junit.Test; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.S3AFileStatus; +import org.apache.hadoop.fs.s3a.Tristate; + +import static org.apache.hadoop.fs.s3a.Constants.DEFAULT_METADATASTORE_METADATA_TTL; +import static org.apache.hadoop.fs.s3a.Constants.METADATASTORE_METADATA_TTL; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import 
static org.mockito.Mockito.when; /** * Tests for the {@link S3Guard} utility class. @@ -49,13 +63,15 @@ public void testDirListingUnion() throws Exception { Arrays.asList(m1, m2), false); // Two other files in s3 - List s3Listing = Arrays.asList( + List s3Listing = Arrays.asList( makeFileStatus("s3a://bucket/dir/s3-file3", false), makeFileStatus("s3a://bucket/dir/s3-file4", false) ); + ITtlTimeProvider timeProvider = new S3Guard.TtlTimeProvider( + DEFAULT_METADATASTORE_METADATA_TTL); FileStatus[] result = S3Guard.dirListingUnion(ms, dirPath, s3Listing, - dirMeta, false); + dirMeta, false, timeProvider); assertEquals("listing length", 4, result.length); assertContainsPath(result, "s3a://bucket/dir/ms-file1"); @@ -64,6 +80,185 @@ public void testDirListingUnion() throws Exception { assertContainsPath(result, "s3a://bucket/dir/s3-file4"); } + @Test + public void testPutWithTtlDirListingMeta() throws Exception { + // arrange + DirListingMetadata dlm = new DirListingMetadata(new Path("/"), null, + false); + MetadataStore ms = spy(MetadataStore.class); + ITtlTimeProvider timeProvider = + mock(ITtlTimeProvider.class); + when(timeProvider.getNow()).thenReturn(100L); + + // act + S3Guard.putWithTtl(ms, dlm, timeProvider); + + // assert + assertEquals("last update in " + dlm, 100L, dlm.getLastUpdated()); + verify(timeProvider, times(1)).getNow(); + verify(ms, times(1)).put(dlm); + } + + @Test + public void testPutWithTtlFileMeta() throws Exception { + // arrange + S3AFileStatus fileStatus = mock(S3AFileStatus.class); + when(fileStatus.getPath()).thenReturn(new Path("/")); + PathMetadata pm = new PathMetadata(fileStatus); + MetadataStore ms = spy(MetadataStore.class); + ITtlTimeProvider timeProvider = + mock(ITtlTimeProvider.class); + when(timeProvider.getNow()).thenReturn(100L); + + // act + S3Guard.putWithTtl(ms, pm, timeProvider); + + // assert + assertEquals("last update in " + pm, 100L, pm.getLastUpdated()); + verify(timeProvider, times(1)).getNow(); + verify(ms, times(1)).put(pm); + } + + @Test + public void testPutWithTtlCollection() throws Exception { + // arrange + S3AFileStatus fileStatus = mock(S3AFileStatus.class); + when(fileStatus.getPath()).thenReturn(new Path("/")); + Collection pmCollection = new ArrayList<>(); + for (int i = 0; i < 10; i++) { + pmCollection.add(new PathMetadata(fileStatus)); + } + MetadataStore ms = spy(MetadataStore.class); + ITtlTimeProvider timeProvider = + mock(ITtlTimeProvider.class); + when(timeProvider.getNow()).thenReturn(100L); + + // act + S3Guard.putWithTtl(ms, pmCollection, timeProvider); + + // assert + pmCollection.forEach( + pm -> assertEquals(100L, pm.getLastUpdated()) + ); + verify(timeProvider, times(1)).getNow(); + verify(ms, times(1)).put(pmCollection); + } + + @Test + public void testGetWithTtlExpired() throws Exception { + // arrange + S3AFileStatus fileStatus = mock(S3AFileStatus.class); + Path path = new Path("/file"); + when(fileStatus.getPath()).thenReturn(path); + PathMetadata pm = new PathMetadata(fileStatus); + pm.setLastUpdated(100L); + + MetadataStore ms = mock(MetadataStore.class); + when(ms.get(path)).thenReturn(pm); + + ITtlTimeProvider timeProvider = + mock(ITtlTimeProvider.class); + when(timeProvider.getNow()).thenReturn(101L); + when(timeProvider.getMetadataTtl()).thenReturn(1L); + + // act + final PathMetadata pmExpired = S3Guard.getWithTtl(ms, path, timeProvider); + + // assert + assertNull(pmExpired); + } + + @Test + public void testGetWithTtlNotExpired() throws Exception { + // arrange + S3AFileStatus fileStatus = 
mock(S3AFileStatus.class); + Path path = new Path("/file"); + when(fileStatus.getPath()).thenReturn(path); + PathMetadata pm = new PathMetadata(fileStatus); + pm.setLastUpdated(100L); + + MetadataStore ms = mock(MetadataStore.class); + when(ms.get(path)).thenReturn(pm); + + ITtlTimeProvider timeProvider = + mock(ITtlTimeProvider.class); + when(timeProvider.getNow()).thenReturn(101L); + when(timeProvider.getMetadataTtl()).thenReturn(2L); + + // act + final PathMetadata pmNotExpired = + S3Guard.getWithTtl(ms, path, timeProvider); + + // assert + assertNotNull(pmNotExpired); + } + + @Test + public void testGetWithZeroLastUpdatedNotExpired() throws Exception { + // arrange + S3AFileStatus fileStatus = mock(S3AFileStatus.class); + Path path = new Path("/file"); + when(fileStatus.getPath()).thenReturn(path); + PathMetadata pm = new PathMetadata(fileStatus); + // we set 0 this time as the last updated: can happen eg. when we use an + // old dynamo table + pm.setLastUpdated(0L); + + MetadataStore ms = mock(MetadataStore.class); + when(ms.get(path)).thenReturn(pm); + + ITtlTimeProvider timeProvider = + mock(ITtlTimeProvider.class); + when(timeProvider.getNow()).thenReturn(101L); + when(timeProvider.getMetadataTtl()).thenReturn(2L); + + // act + final PathMetadata pmExpired = S3Guard.getWithTtl(ms, path, timeProvider); + + // assert + assertNotNull(pmExpired); + } + + + /** + * Makes sure that all uses of TTL timeouts use a consistent time unit. + * @throws Throwable failure + */ + @Test + public void testTTLConstruction() throws Throwable { + // first one + ITtlTimeProvider timeProviderExplicit = new S3Guard.TtlTimeProvider( + DEFAULT_METADATASTORE_METADATA_TTL); + + // mirror the FS construction, + // from a config guaranteed to be empty (i.e. the code defval) + Configuration conf = new Configuration(false); + long millitime = conf.getTimeDuration(METADATASTORE_METADATA_TTL, + DEFAULT_METADATASTORE_METADATA_TTL, TimeUnit.MILLISECONDS); + assertEquals(15 * 60_000, millitime); + S3Guard.TtlTimeProvider fsConstruction = new S3Guard.TtlTimeProvider( + millitime); + assertEquals("explicit vs fs construction", timeProviderExplicit, + fsConstruction); + assertEquals("first and second constructor", timeProviderExplicit, + new S3Guard.TtlTimeProvider(conf)); + // set the conf to a time without unit + conf.setLong(METADATASTORE_METADATA_TTL, + DEFAULT_METADATASTORE_METADATA_TTL); + assertEquals("first and second time set through long", timeProviderExplicit, + new S3Guard.TtlTimeProvider(conf)); + double timeInSeconds = DEFAULT_METADATASTORE_METADATA_TTL / 1000; + double timeInMinutes = timeInSeconds / 60; + String timeStr = String.format("%dm", (int) timeInMinutes); + assertEquals(":wrong time in minutes from " + timeInMinutes, + "15m", timeStr); + conf.set(METADATASTORE_METADATA_TTL, timeStr); + assertEquals("Time in millis as string from " + + conf.get(METADATASTORE_METADATA_TTL), + timeProviderExplicit, + new S3Guard.TtlTimeProvider(conf)); + } + void assertContainsPath(FileStatus[] statuses, String pathStr) { assertTrue("listing doesn't contain " + pathStr, containsPath(statuses, pathStr)); @@ -82,12 +277,15 @@ private PathMetadata makePathMeta(String pathStr, boolean isDir) { return new PathMetadata(makeFileStatus(pathStr, isDir)); } - private FileStatus makeFileStatus(String pathStr, boolean isDir) { + private S3AFileStatus makeFileStatus(String pathStr, boolean isDir) { Path p = new Path(pathStr); + S3AFileStatus fileStatus; if (isDir) { - return new FileStatus(0, true, 1, 1, 
System.currentTimeMillis(), p); + fileStatus = new S3AFileStatus(Tristate.UNKNOWN, p, null); } else { - return new FileStatus(100, false, 1, 1, System.currentTimeMillis(), p); + fileStatus = new S3AFileStatus( + 100, System.currentTimeMillis(), p, 1, null, null, null); } + return fileStatus; } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/AbstractITestS3AMetadataStoreScale.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/AbstractITestS3AMetadataStoreScale.java index 0e6a1d8d09245..0c469f2b8a28e 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/AbstractITestS3AMetadataStoreScale.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/AbstractITestS3AMetadataStoreScale.java @@ -18,11 +18,15 @@ package org.apache.hadoop.fs.s3a.scale; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.s3a.S3AFileStatus; +import org.apache.hadoop.fs.s3a.s3guard.ITtlTimeProvider; import org.apache.hadoop.fs.s3a.s3guard.MetadataStore; import org.apache.hadoop.fs.s3a.s3guard.PathMetadata; +import org.apache.hadoop.fs.s3a.s3guard.S3Guard; +import org.junit.Before; import org.junit.FixMethodOrder; import org.junit.Test; import org.junit.runners.MethodSorters; @@ -54,6 +58,12 @@ public abstract class AbstractITestS3AMetadataStoreScale extends static final long ACCESS_TIME = System.currentTimeMillis(); static final Path BUCKET_ROOT = new Path("s3a://fake-bucket/"); + private ITtlTimeProvider ttlTimeProvider; + + @Before + public void initialize() { + ttlTimeProvider = new S3Guard.TtlTimeProvider(new Configuration()); + } /** * Subclasses should override this to provide the MetadataStore they which @@ -129,7 +139,7 @@ public void test_020_Moves() throws Throwable { toDelete = movedPaths; toCreate = origMetas; } - ms.move(toDelete, toCreate); + ms.move(toDelete, toCreate, ttlTimeProvider); } moveTimer.end(); printTiming(LOG, "move", moveTimer, operations); @@ -193,7 +203,7 @@ protected void clearMetadataStore(MetadataStore ms, long count) throws IOException { describe("Recursive deletion"); NanoTimer deleteTimer = new NanoTimer(); - ms.deleteSubtree(BUCKET_ROOT); + ms.deleteSubtree(BUCKET_ROOT, ttlTimeProvider); deleteTimer.end(); printTiming(LOG, "delete", deleteTimer, count); } From b7ee22735f007fe44b7fd8db43789f2b1606e731 Mon Sep 17 00:00:00 2001 From: Ben Roling Date: Mon, 20 May 2019 02:59:54 +0530 Subject: [PATCH 40/40] HADOOP-16085. S3Guard: use object version or etags to protect against inconsistent read after replace/overwrite. Contributed by Ben Roling. S3Guard will now track the etag of uploaded files and, if an S3 bucket is versioned, the object version. You can then control how to react to a mismatch between the data in the DynamoDB table and that in the store: warn, fail, or, when using versions, return the original value. This adds two new columns to the table: etag and version. This is transparent to older S3A clients -but when such clients add/update data to the S3Guard table, they will not add these values. As a result, the etag/version checks will not work with files uploaded by older clients. For a consistent experience, upgrade all clients to use the latest hadoop version. 
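Editor's note: the behaviour described in the commit message above is selected through the S3A change-detection settings consumed by ChangeDetectionPolicy later in this patch. A hedged illustration only; the exact key names below are assumptions drawn from the fs.s3a.change.detection.* family and should be checked against the shipped S3A documentation:

```java
import org.apache.hadoop.conf.Configuration;

// Sketch, not part of the patch: choose what is tracked (etag or versionId)
// and how a mismatch is handled (none / warn / client / server).
// Key names are assumptions to be verified against the S3A docs.
Configuration conf = new Configuration();
conf.set("fs.s3a.change.detection.source", "versionid"); // or "etag"
conf.set("fs.s3a.change.detection.mode", "server");      // or "client", "warn", "none"
```

With a server-side mode the constraint is attached to the GET/copy request itself; with a client-side mode the stream compares revision IDs after the response and raises RemoteFileChangedException on mismatch, as the ChangeDetectionPolicy changes below show.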
--- hadoop-tools/hadoop-aws/pom.xml | 5 + .../org/apache/hadoop/fs/s3a/Invoker.java | 84 ++ .../fs/s3a/RemoteFileChangedException.java | 20 + .../apache/hadoop/fs/s3a/S3AFileStatus.java | 49 +- .../hadoop/fs/s3a/S3ALocatedFileStatus.java | 63 + .../org/apache/hadoop/fs/s3a/S3AUtils.java | 21 +- .../fs/s3a/S3GuardExistsRetryPolicy.java | 1 + .../hadoop/fs/s3a/S3ObjectAttributes.java | 16 +- .../fs/s3a/impl/ChangeDetectionPolicy.java | 148 ++- .../hadoop/fs/s3a/impl/ChangeTracker.java | 147 ++- .../hadoop/fs/s3a/impl/CopyOutcome.java | 80 ++ .../fs/s3a/s3guard/DescendantsIterator.java | 6 +- .../fs/s3a/s3guard/DirListingMetadata.java | 5 +- .../fs/s3a/s3guard/DynamoDBMetadataStore.java | 61 +- .../fs/s3a/s3guard/LocalMetadataStore.java | 3 +- .../MetadataStoreListFilesIterator.java | 12 +- .../hadoop/fs/s3a/s3guard/PathMetadata.java | 27 +- .../hadoop/fs/s3a/s3guard/S3GuardTool.java | 23 +- .../markdown/tools/hadoop-aws/committers.md | 9 +- .../tools/hadoop-aws/delegation_tokens.md | 5 +- .../site/markdown/tools/hadoop-aws/testing.md | 18 + .../tools/hadoop-aws/troubleshooting_s3a.md | 4 +- .../hadoop/fs/s3a/AbstractS3AMockTest.java | 17 +- .../hadoop/fs/s3a/ITestS3ADelayedFNF.java | 10 + .../hadoop/fs/s3a/ITestS3AInconsistency.java | 10 + .../fs/s3a/ITestS3ARemoteFileChanged.java | 1174 ++++++++++++++++- .../fs/s3a/ITestS3GuardListConsistency.java | 41 +- .../hadoop/fs/s3a/MockS3AFileSystem.java | 2 +- .../apache/hadoop/fs/s3a/S3ATestUtils.java | 25 +- .../org/apache/hadoop/fs/s3a/TestListing.java | 29 +- .../fs/s3a/TestStreamChangeTracker.java | 197 ++- .../s3guard/AbstractS3GuardToolTestBase.java | 2 +- .../s3guard/ITestDynamoDBMetadataStore.java | 5 +- .../ITestDynamoDBMetadataStoreScale.java | 3 +- .../fs/s3a/s3guard/ITestS3GuardToolLocal.java | 37 + .../s3a/s3guard/TestDirListingMetadata.java | 16 +- .../s3a/s3guard/TestLocalMetadataStore.java | 11 +- .../TestObjectChangeDetectionAttributes.java | 380 ++++++ .../AbstractITestS3AMetadataStoreScale.java | 6 +- .../fs/s3a/select/ITestS3SelectCLI.java | 8 + .../fs/s3a/select/ITestS3SelectLandsat.java | 8 + .../fs/s3a/select/ITestS3SelectMRJob.java | 10 + .../filecache/TestS3AResourceScope.java | 2 +- 43 files changed, 2601 insertions(+), 199 deletions(-) create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ALocatedFileStatus.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/CopyOutcome.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestObjectChangeDetectionAttributes.java diff --git a/hadoop-tools/hadoop-aws/pom.xml b/hadoop-tools/hadoop-aws/pom.xml index 8c12e74a6247c..9b44a5c8afa27 100644 --- a/hadoop-tools/hadoop-aws/pom.xml +++ b/hadoop-tools/hadoop-aws/pom.xml @@ -405,6 +405,11 @@ hadoop-common provided
    + + org.apache.httpcomponents + httpcore + provided + org.apache.hadoop hadoop-common diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Invoker.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Invoker.java index 68a69f39321be..a59ffa9c6e088 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Invoker.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Invoker.java @@ -197,6 +197,33 @@ public void retry(String action, }); } + /** + * Execute a void operation with retry processing when doRetry=true, else + * just once. + * @param doRetry true if retries should be performed + * @param action action to execute (used in error messages) + * @param path path of work (used in error messages) + * @param idempotent does the operation have semantics + * which mean that it can be retried even if was already executed? + * @param retrying callback on retries + * @param operation operation to execute + * @throws IOException any IOE raised, or translated exception + */ + @Retries.RetryTranslated + public void maybeRetry(boolean doRetry, + String action, + String path, + boolean idempotent, + Retried retrying, + VoidOperation operation) + throws IOException { + maybeRetry(doRetry, action, path, idempotent, retrying, + () -> { + operation.execute(); + return null; + }); + } + /** * Execute a void operation with the default retry callback invoked. * @param action action to execute (used in error messages) @@ -215,6 +242,28 @@ public void retry(String action, retry(action, path, idempotent, retryCallback, operation); } + /** + * Execute a void operation with the default retry callback invoked when + * doRetry=true, else just once. + * @param doRetry true if retries should be performed + * @param action action to execute (used in error messages) + * @param path path of work (used in error messages) + * @param idempotent does the operation have semantics + * which mean that it can be retried even if was already executed? + * @param operation operation to execute + * @throws IOException any IOE raised, or translated exception + */ + @Retries.RetryTranslated + public void maybeRetry( + boolean doRetry, + String action, + String path, + boolean idempotent, + VoidOperation operation) + throws IOException { + maybeRetry(doRetry, action, path, idempotent, retryCallback, operation); + } + /** * Execute a function with the default retry callback invoked. * @param action action to execute (used in error messages) @@ -265,6 +314,41 @@ public T retry( () -> once(action, path, operation)); } + /** + * Execute a function with retry processing when doRetry=true, else just once. + * Uses {@link #once(String, String, Operation)} as the inner + * invocation mechanism before retry logic is performed. + * @param type of return value + * @param doRetry true if retries should be performed + * @param action action to execute (used in error messages) + * @param path path of work (used in error messages) + * @param idempotent does the operation have semantics + * which mean that it can be retried even if was already executed? 
+ * @param retrying callback on retries + * @param operation operation to execute + * @return the result of the call + * @throws IOException any IOE raised, or translated exception + */ + @Retries.RetryTranslated + public T maybeRetry( + boolean doRetry, + String action, + @Nullable String path, + boolean idempotent, + Retried retrying, + Operation operation) + throws IOException { + if (doRetry) { + return retryUntranslated( + toDescription(action, path), + idempotent, + retrying, + () -> once(action, path, operation)); + } else { + return once(action, path, operation); + } + } + /** * Execute a function with retry processing and no translation. * and the default retry callback. diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/RemoteFileChangedException.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/RemoteFileChangedException.java index cfa5935bbf3e3..1df2d7ee20389 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/RemoteFileChangedException.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/RemoteFileChangedException.java @@ -32,6 +32,9 @@ @InterfaceStability.Unstable public class RemoteFileChangedException extends PathIOException { + public static final String PRECONDITIONS_FAILED = + "Constraints of request were unsatisfiable"; + /** * Constructs a RemoteFileChangedException. * @@ -46,4 +49,21 @@ public RemoteFileChangedException(String path, super(path, message); setOperation(operation); } + + /** + * Constructs a RemoteFileChangedException. + * + * @param path the path accessed when the change was detected + * @param operation the operation (e.g. open, re-open) performed when the + * change was detected + * @param message a message providing more details about the condition + * @param cause inner cause. + */ + public RemoteFileChangedException(String path, + String operation, + String message, + Throwable cause) { + super(path, message, cause); + setOperation(operation); + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileStatus.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileStatus.java index ca6ca908beca6..e8ff846f20fd0 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileStatus.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileStatus.java @@ -32,6 +32,8 @@ @InterfaceStability.Evolving public class S3AFileStatus extends FileStatus { private Tristate isEmptyDirectory; + private String eTag; + private String versionId; /** * Create a directory status. 
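Editor's note: the Invoker.maybeRetry(...) overloads added earlier in this patch gate retry processing on a caller-supplied flag, so the same code path can run either exactly once or under the standard retry policy. A minimal sketch of the calling pattern, assuming it runs in a method that throws IOException; the names changePolicy, invoker, copySource and copyObject() are hypothetical:

```java
// Only retry when change detection makes a re-issue safe, e.g. because a
// server-side versionId constraint pins the source object; otherwise run once.
boolean retryAllowed =
    changePolicy.getMode() == ChangeDetectionPolicy.Mode.Server;
invoker.maybeRetry(
    retryAllowed,           // doRetry
    "copy",                 // action, used in error messages
    copySource,             // path, used in error messages
    true,                   // idempotent
    () -> copyObject());    // the VoidOperation to execute
```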
@@ -69,15 +71,17 @@ public S3AFileStatus(Tristate isemptydir, * @param path path * @param blockSize block size * @param owner owner + * @param eTag eTag of the S3 object if available, else null + * @param versionId versionId of the S3 object if available, else null */ public S3AFileStatus(long length, long modification_time, Path path, - long blockSize, String owner) { - super(length, false, 1, blockSize, modification_time, 0, - null, null, null, null, + long blockSize, String owner, String eTag, String versionId) { + super(length, false, 1, blockSize, modification_time, + 0, null, owner, owner, null, path, false, true, false); isEmptyDirectory = Tristate.FALSE; - setOwner(owner); - setGroup(owner); + this.eTag = eTag; + this.versionId = versionId; } /** @@ -86,16 +90,19 @@ public S3AFileStatus(long length, long modification_time, Path path, * @param source FileStatus to convert to S3AFileStatus * @param isEmptyDirectory TRUE/FALSE if known to be / not be an empty * directory, UNKNOWN if that information was not computed. + * @param eTag eTag of the S3 object if available, else null + * @param versionId versionId of the S3 object if available, else null * @return a new S3AFileStatus */ public static S3AFileStatus fromFileStatus(FileStatus source, - Tristate isEmptyDirectory) { + Tristate isEmptyDirectory, String eTag, String versionId) { if (source.isDirectory()) { return new S3AFileStatus(isEmptyDirectory, source.getPath(), source.getOwner()); } else { return new S3AFileStatus(source.getLen(), source.getModificationTime(), - source.getPath(), source.getBlockSize(), source.getOwner()); + source.getPath(), source.getBlockSize(), source.getOwner(), + eTag, versionId); } } @@ -109,6 +116,28 @@ public Tristate isEmptyDirectory() { return isEmptyDirectory; } + /** + * Update isEmptyDirectory attribute. + * @param isEmptyDirectory new isEmptyDirectory value + */ + public void setIsEmptyDirectory(Tristate isEmptyDirectory) { + this.isEmptyDirectory = isEmptyDirectory; + } + + /** + * @return the S3 object eTag when available, else null. + */ + public String getETag() { + return eTag; + } + + /** + * @return the S3 object versionId when available, else null. + */ + public String getVersionId() { + return versionId; + } + /** Compare if this object is equal to another object. * @param o the object to be compared. * @return true if two file status has the same path name; false if not. @@ -150,8 +179,10 @@ public long getModificationTime(){ @Override public String toString() { - return super.toString() + - String.format(" isEmptyDirectory=%s", isEmptyDirectory().name()); + return super.toString() + + String.format(" isEmptyDirectory=%s", isEmptyDirectory().name() + + String.format(" eTag=%s", eTag) + + String.format(" versionId=%s", versionId)); } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ALocatedFileStatus.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ALocatedFileStatus.java new file mode 100644 index 0000000000000..d3ca2610e225b --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ALocatedFileStatus.java @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + *
    + * http://www.apache.org/licenses/LICENSE-2.0 + *
    + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a; + +import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.LocatedFileStatus; + +import static com.google.common.base.Preconditions.checkNotNull; + +/** + * {@link LocatedFileStatus} extended to also carry ETag and object version ID. + */ +public class S3ALocatedFileStatus extends LocatedFileStatus { + + private static final long serialVersionUID = 3597192103662929338L; + + private final String eTag; + private final String versionId; + + public S3ALocatedFileStatus(S3AFileStatus status, BlockLocation[] locations, + String eTag, String versionId) { + super(checkNotNull(status), locations); + this.eTag = eTag; + this.versionId = versionId; + } + + public String getETag() { + return eTag; + } + + public String getVersionId() { + return versionId; + } + + // equals() and hashCode() overridden to avoid FindBugs warning. + // Base implementation is equality on Path only, which is still appropriate. + + @Override + public boolean equals(Object o) { + return super.equals(o); + } + + @Override + public int hashCode() { + return super.hashCode(); + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java index cc548eca189e6..2afb473309031 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java @@ -531,16 +531,20 @@ public static String stringify(AmazonS3Exception e) { * @param summary summary from AWS * @param blockSize block size to declare. * @param owner owner of the file + * @param eTag S3 object eTag or null if unavailable + * @param versionId S3 object versionId or null if unavailable * @return a status entry */ public static S3AFileStatus createFileStatus(Path keyPath, S3ObjectSummary summary, long blockSize, - String owner) { + String owner, + String eTag, + String versionId) { long size = summary.getSize(); return createFileStatus(keyPath, objectRepresentsDirectory(summary.getKey(), size), - size, summary.getLastModified(), blockSize, owner); + size, summary.getLastModified(), blockSize, owner, eTag, versionId); } /** @@ -553,22 +557,27 @@ public static S3AFileStatus createFileStatus(Path keyPath, * @param size file length * @param blockSize block size for file status * @param owner Hadoop username + * @param eTag S3 object eTag or null if unavailable + * @param versionId S3 object versionId or null if unavailable * @return a status entry */ public static S3AFileStatus createUploadFileStatus(Path keyPath, - boolean isDir, long size, long blockSize, String owner) { + boolean isDir, long size, long blockSize, String owner, + String eTag, String versionId) { Date date = isDir ? null : new Date(); - return createFileStatus(keyPath, isDir, size, date, blockSize, owner); + return createFileStatus(keyPath, isDir, size, date, blockSize, owner, + eTag, versionId); } /* Date 'modified' is ignored when isDir is true. 
*/ private static S3AFileStatus createFileStatus(Path keyPath, boolean isDir, - long size, Date modified, long blockSize, String owner) { + long size, Date modified, long blockSize, String owner, + String eTag, String versionId) { if (isDir) { return new S3AFileStatus(Tristate.UNKNOWN, keyPath, owner); } else { return new S3AFileStatus(size, dateToLong(modified), keyPath, blockSize, - owner); + owner, eTag, versionId); } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3GuardExistsRetryPolicy.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3GuardExistsRetryPolicy.java index 023d0c3cf2c37..1a0135bb9b5d5 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3GuardExistsRetryPolicy.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3GuardExistsRetryPolicy.java @@ -42,6 +42,7 @@ public S3GuardExistsRetryPolicy(Configuration conf) { protected Map, RetryPolicy> createExceptionMap() { Map, RetryPolicy> b = super.createExceptionMap(); b.put(FileNotFoundException.class, retryIdempotentCalls); + b.put(RemoteFileChangedException.class, retryIdempotentCalls); return b; } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java index 19c810683d86d..436b51b6c1979 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java @@ -34,16 +34,22 @@ public class S3ObjectAttributes { private final String key; private final S3AEncryptionMethods serverSideEncryptionAlgorithm; private final String serverSideEncryptionKey; + private final String eTag; + private final String versionId; S3ObjectAttributes( String bucket, String key, S3AEncryptionMethods serverSideEncryptionAlgorithm, - String serverSideEncryptionKey) { + String serverSideEncryptionKey, + String eTag, + String versionId) { this.bucket = bucket; this.key = key; this.serverSideEncryptionAlgorithm = serverSideEncryptionAlgorithm; this.serverSideEncryptionKey = serverSideEncryptionKey; + this.eTag = eTag; + this.versionId = versionId; } String getBucket() { @@ -61,4 +67,12 @@ S3AEncryptionMethods getServerSideEncryptionAlgorithm() { String getServerSideEncryptionKey() { return serverSideEncryptionKey; } + + public String getETag() { + return eTag; + } + + public String getVersionId() { + return versionId; + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChangeDetectionPolicy.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChangeDetectionPolicy.java index f3d8bc20c824b..b0e9d6f454426 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChangeDetectionPolicy.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChangeDetectionPolicy.java @@ -20,8 +20,11 @@ import java.util.Locale; +import com.amazonaws.services.s3.model.CopyObjectRequest; +import com.amazonaws.services.s3.model.GetObjectMetadataRequest; import com.amazonaws.services.s3.model.GetObjectRequest; import com.amazonaws.services.s3.model.ObjectMetadata; +import com.amazonaws.services.s3.transfer.model.CopyResult; import com.google.common.annotations.VisibleForTesting; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -30,6 +33,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import 
org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.s3a.S3ObjectAttributes; import org.apache.hadoop.fs.s3a.RemoteFileChangedException; import static org.apache.hadoop.fs.s3a.Constants.*; @@ -47,7 +51,7 @@ public abstract class ChangeDetectionPolicy { LoggerFactory.getLogger(ChangeDetectionPolicy.class); @VisibleForTesting - public static final String CHANGE_DETECTED = "change detected"; + public static final String CHANGE_DETECTED = "change detected on client"; private final Mode mode; private final boolean requireVersion; @@ -200,6 +204,28 @@ public static ChangeDetectionPolicy createPolicy(final Mode mode, public abstract String getRevisionId(ObjectMetadata objectMetadata, String uri); + /** + * Like {{@link #getRevisionId(ObjectMetadata, String)}}, but retrieves the + * revision identifier from {@link S3ObjectAttributes}. + * + * @param s3Attributes the object attributes + * @return the revisionId string as interpreted by this policy, or potentially + * null if the attribute is unavailable (such as when the policy says to use + * versionId but object versioning is not enabled for the bucket). + */ + public abstract String getRevisionId(S3ObjectAttributes s3Attributes); + + /** + * Like {{@link #getRevisionId(ObjectMetadata, String)}}, but retrieves the + * revision identifier from {@link CopyResult}. + * + * @param copyResult the copy result + * @return the revisionId string as interpreted by this policy, or potentially + * null if the attribute is unavailable (such as when the policy says to use + * versionId but object versioning is not enabled for the bucket). + */ + public abstract String getRevisionId(CopyResult copyResult); + /** * Applies the given {@link #getRevisionId(ObjectMetadata, String) revisionId} * as a server-side qualification on the {@code GetObjectRequest}. @@ -210,6 +236,26 @@ public abstract String getRevisionId(ObjectMetadata objectMetadata, public abstract void applyRevisionConstraint(GetObjectRequest request, String revisionId); + /** + * Applies the given {@link #getRevisionId(ObjectMetadata, String) revisionId} + * as a server-side qualification on the {@code CopyObjectRequest}. + * + * @param request the request + * @param revisionId the revision id + */ + public abstract void applyRevisionConstraint(CopyObjectRequest request, + String revisionId); + + /** + * Applies the given {@link #getRevisionId(ObjectMetadata, String) revisionId} + * as a server-side qualification on the {@code GetObjectMetadataRequest}. + * + * @param request the request + * @param revisionId the revision id + */ + public abstract void applyRevisionConstraint(GetObjectMetadataRequest request, + String revisionId); + /** * Takes appropriate action based on {@link #getMode() mode} when a change has * been detected. @@ -234,6 +280,7 @@ public ImmutablePair onChangeDetected( long position, String operation, long timesAlreadyDetected) { + String positionText = position >= 0 ? (" at " + position) : ""; switch (mode) { case None: // something changed; we don't care. @@ -242,8 +289,9 @@ public ImmutablePair onChangeDetected( if (timesAlreadyDetected == 0) { // only warn on the first detection to avoid a noisy log LOG.warn( - String.format("%s change detected on %s %s at %d. Expected %s got %s", - getSource(), operation, uri, position, revisionId, + String.format( + "%s change detected on %s %s%s. 
Expected %s got %s", + getSource(), operation, uri, positionText, revisionId, newRevisionId)); return new ImmutablePair<>(true, null); } @@ -251,15 +299,16 @@ public ImmutablePair onChangeDetected( case Client: case Server: default: - // mode == Client (or Server, but really won't be called for Server) + // mode == Client or Server; will trigger on version failures + // of getObjectMetadata even on server. return new ImmutablePair<>(true, new RemoteFileChangedException(uri, operation, String.format("%s " + CHANGE_DETECTED - + " while reading at position %s." + + " during %s%s." + " Expected %s got %s", - getSource(), position, revisionId, newRevisionId))); + getSource(), operation, positionText, revisionId, newRevisionId))); } } @@ -277,11 +326,38 @@ public String getRevisionId(ObjectMetadata objectMetadata, String uri) { return objectMetadata.getETag(); } + @Override + public String getRevisionId(S3ObjectAttributes s3Attributes) { + return s3Attributes.getETag(); + } + + @Override + public String getRevisionId(CopyResult copyResult) { + return copyResult.getETag(); + } + @Override public void applyRevisionConstraint(GetObjectRequest request, String revisionId) { - LOG.debug("Restricting request to etag {}", revisionId); - request.withMatchingETagConstraint(revisionId); + if (revisionId != null) { + LOG.debug("Restricting get request to etag {}", revisionId); + request.withMatchingETagConstraint(revisionId); + } + } + + @Override + public void applyRevisionConstraint(CopyObjectRequest request, + String revisionId) { + if (revisionId != null) { + LOG.debug("Restricting copy request to etag {}", revisionId); + request.withMatchingETagConstraint(revisionId); + } + } + + @Override + public void applyRevisionConstraint(GetObjectMetadataRequest request, + String revisionId) { + // GetObjectMetadataRequest doesn't support eTag qualification } @Override @@ -323,11 +399,41 @@ public String getRevisionId(ObjectMetadata objectMetadata, String uri) { return versionId; } + @Override + public String getRevisionId(S3ObjectAttributes s3Attributes) { + return s3Attributes.getVersionId(); + } + + @Override + public String getRevisionId(CopyResult copyResult) { + return copyResult.getVersionId(); + } + @Override public void applyRevisionConstraint(GetObjectRequest request, String revisionId) { - LOG.debug("Restricting request to version {}", revisionId); - request.withVersionId(revisionId); + if (revisionId != null) { + LOG.debug("Restricting get request to version {}", revisionId); + request.withVersionId(revisionId); + } + } + + @Override + public void applyRevisionConstraint(CopyObjectRequest request, + String revisionId) { + if (revisionId != null) { + LOG.debug("Restricting copy request to version {}", revisionId); + request.withSourceVersionId(revisionId); + } + } + + @Override + public void applyRevisionConstraint(GetObjectMetadataRequest request, + String revisionId) { + if (revisionId != null) { + LOG.debug("Restricting metadata request to version {}", revisionId); + request.withVersionId(revisionId); + } } @Override @@ -361,12 +467,34 @@ public String getRevisionId(final ObjectMetadata objectMetadata, return null; } + @Override + public String getRevisionId(final S3ObjectAttributes s3ObjectAttributes) { + return null; + } + + @Override + public String getRevisionId(CopyResult copyResult) { + return null; + } + @Override public void applyRevisionConstraint(final GetObjectRequest request, final String revisionId) { } + @Override + public void applyRevisionConstraint(CopyObjectRequest request, + 
String revisionId) { + + } + + @Override + public void applyRevisionConstraint(GetObjectMetadataRequest request, + String revisionId) { + + } + @Override public String toString() { return "NoChangeDetection"; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChangeTracker.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChangeTracker.java index f76602b953259..75fecd5f14632 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChangeTracker.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/ChangeTracker.java @@ -20,11 +20,15 @@ import java.util.concurrent.atomic.AtomicLong; +import com.amazonaws.AmazonServiceException; +import com.amazonaws.SdkBaseException; +import com.amazonaws.services.s3.model.CopyObjectRequest; +import com.amazonaws.services.s3.model.GetObjectMetadataRequest; import com.amazonaws.services.s3.model.GetObjectRequest; import com.amazonaws.services.s3.model.ObjectMetadata; import com.amazonaws.services.s3.model.S3Object; +import com.amazonaws.services.s3.transfer.model.CopyResult; import com.google.common.annotations.VisibleForTesting; -import org.apache.hadoop.fs.s3a.NoVersionAttributeException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,14 +36,18 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.PathIOException; +import org.apache.hadoop.fs.s3a.NoVersionAttributeException; import org.apache.hadoop.fs.s3a.RemoteFileChangedException; +import org.apache.hadoop.fs.s3a.S3ObjectAttributes; import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.http.HttpStatus.SC_PRECONDITION_FAILED; /** - * Change tracking for input streams: the revision ID/etag - * the previous request is recorded and when the next request comes in, - * it is compared. + * Change tracking for input streams: the version ID or etag of the object is + * tracked and compared on open/re-open. An initial version ID or etag may or + * may not be available, depending on usage (e.g. if S3Guard is utilized). + * * Self-contained for testing and use in different streams. */ @InterfaceAudience.Private @@ -49,7 +57,7 @@ public class ChangeTracker { private static final Logger LOG = LoggerFactory.getLogger(ChangeTracker.class); - public static final String CHANGE_REPORTED_BY_S3 = "reported by S3"; + public static final String CHANGE_REPORTED_BY_S3 = "Change reported by S3"; /** Policy to use. */ private final ChangeDetectionPolicy policy; @@ -76,13 +84,20 @@ public class ChangeTracker { * @param uri URI of object being tracked * @param policy policy to track. 
* @param versionMismatches reference to the version mismatch counter + * @param s3ObjectAttributes attributes of the object, potentially including + * an eTag or versionId to match depending on {@code policy} */ public ChangeTracker(final String uri, final ChangeDetectionPolicy policy, - final AtomicLong versionMismatches) { + final AtomicLong versionMismatches, + final S3ObjectAttributes s3ObjectAttributes) { this.policy = checkNotNull(policy); this.uri = uri; this.versionMismatches = versionMismatches; + this.revisionId = policy.getRevisionId(s3ObjectAttributes); + if (revisionId != null) { + LOG.debug("Revision ID for object at {}: {}", uri, revisionId); + } } public String getRevisionId() { @@ -115,6 +130,33 @@ public boolean maybeApplyConstraint( return false; } + /** + * Apply any revision control set by the policy if it is to be + * enforced on the server. + * @param request request to modify + * @return true iff a constraint was added. + */ + public boolean maybeApplyConstraint( + final CopyObjectRequest request) { + + if (policy.getMode() == ChangeDetectionPolicy.Mode.Server + && revisionId != null) { + policy.applyRevisionConstraint(request, revisionId); + return true; + } + return false; + } + + public boolean maybeApplyConstraint( + final GetObjectMetadataRequest request) { + + if (policy.getMode() == ChangeDetectionPolicy.Mode.Server + && revisionId != null) { + policy.applyRevisionConstraint(request, revisionId); + return true; + } + return false; + } /** * Process the response from the server for validation against the @@ -135,29 +177,106 @@ public void processResponse(final S3Object object, // object was not returned. versionMismatches.incrementAndGet(); throw new RemoteFileChangedException(uri, operation, - String.format("%s change " - + CHANGE_REPORTED_BY_S3 - + " while reading" + String.format(CHANGE_REPORTED_BY_S3 + + " during %s" + " at position %s." - + " Version %s was unavailable", - getSource(), + + " %s %s was unavailable", + operation, pos, + getSource(), getRevisionId())); } else { throw new PathIOException(uri, "No data returned from GET request"); } } - final ObjectMetadata metadata = object.getObjectMetadata(); + processMetadata(object.getObjectMetadata(), operation); + } + + /** + * Process the response from the server for validation against the + * change policy. + * @param copyResult result of a copy operation + * @throws PathIOException raised on failure + * @throws RemoteFileChangedException if the remote file has changed. + */ + public void processResponse(final CopyResult copyResult) + throws PathIOException { + // ETag (sometimes, depending on encryption and/or multipart) is not the + // same on the copied object as the original. Version Id seems to never + // be the same on the copy. As such, there isn't really anything that + // can be verified on the response, except that a revision ID is present + // if required. + String newRevisionId = policy.getRevisionId(copyResult); + LOG.debug("Copy result {}: {}", policy.getSource(), newRevisionId); + if (newRevisionId == null && policy.isRequireVersion()) { + throw new NoVersionAttributeException(uri, String.format( + "Change detection policy requires %s", + policy.getSource())); + } + } + + /** + * Process an exception generated against the change policy. + * If the exception indicates the file has changed, this method throws + * {@code RemoteFileChangedException} with the original exception as the + * cause. 
+ * @param e the exception + * @param operation the operation performed when the exception was + * generated (e.g. "copy", "read", "select"). + * @throws RemoteFileChangedException if the remote file has changed. + */ + public void processException(SdkBaseException e, String operation) throws + RemoteFileChangedException { + if (e instanceof AmazonServiceException) { + AmazonServiceException serviceException = (AmazonServiceException) e; + // This isn't really going to be hit due to + // https://github.com/aws/aws-sdk-java/issues/1644 + if (serviceException.getStatusCode() == SC_PRECONDITION_FAILED) { + versionMismatches.incrementAndGet(); + throw new RemoteFileChangedException(uri, operation, String.format( + RemoteFileChangedException.PRECONDITIONS_FAILED + + " on %s." + + " Version %s was unavailable", + getSource(), + getRevisionId()), + serviceException); + } + } + } + + /** + * Process metadata response from server for validation against the change + * policy. + * @param metadata metadata returned from server + * @param operation operation in progress + * @throws PathIOException raised on failure + * @throws RemoteFileChangedException if the remote file has changed. + */ + public void processMetadata(final ObjectMetadata metadata, + final String operation) throws PathIOException { final String newRevisionId = policy.getRevisionId(metadata, uri); + processNewRevision(newRevisionId, operation, -1); + } + + /** + * Validate a revision from the server against our expectations. + * @param newRevisionId new revision. + * @param operation operation in progress + * @param pos offset in the file; -1 for "none" + * @throws PathIOException raised on failure + * @throws RemoteFileChangedException if the remote file has changed. + */ + private void processNewRevision(final String newRevisionId, + final String operation, final long pos) throws PathIOException { if (newRevisionId == null && policy.isRequireVersion()) { throw new NoVersionAttributeException(uri, String.format( "Change detection policy requires %s", policy.getSource())); } if (revisionId == null) { - // revisionId is null on first (re)open. Pin it so change can be detected - // if object has been updated + // revisionId may be null on first (re)open. Pin it so change can be + // detected if object has been updated LOG.debug("Setting revision ID for object at {}: {}", uri, newRevisionId); revisionId = newRevisionId; diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/CopyOutcome.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/CopyOutcome.java new file mode 100644 index 0000000000000..16459ac45b850 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/CopyOutcome.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.impl; + +import com.amazonaws.SdkBaseException; +import com.amazonaws.services.s3.transfer.Copy; +import com.amazonaws.services.s3.transfer.model.CopyResult; + +/** + * Extracts the outcome of a TransferManager-executed copy operation. + */ +public final class CopyOutcome { + + /** + * Result of a successful copy. + */ + private final CopyResult copyResult; + + /** the copy was interrupted. */ + private final InterruptedException interruptedException; + + /** + * The copy raised an AWS Exception of some form. + */ + private final SdkBaseException awsException; + + public CopyOutcome(CopyResult copyResult, + InterruptedException interruptedException, + SdkBaseException awsException) { + this.copyResult = copyResult; + this.interruptedException = interruptedException; + this.awsException = awsException; + } + + public CopyResult getCopyResult() { + return copyResult; + } + + public InterruptedException getInterruptedException() { + return interruptedException; + } + + public SdkBaseException getAwsException() { + return awsException; + } + + /** + * Calls {@code Copy.waitForCopyResult()} to await the result, converts + * it to a copy outcome. + * Exceptions caught and + * @param copy the copy operation. + * @return the outcome. + */ + public static CopyOutcome waitForCopy(Copy copy) { + try { + CopyResult result = copy.waitForCopyResult(); + return new CopyOutcome(result, null, null); + } catch (SdkBaseException e) { + return new CopyOutcome(null, null, e); + } catch (InterruptedException e) { + return new CopyOutcome(null, e, null); + } + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DescendantsIterator.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DescendantsIterator.java index dcee35824ed0f..88a46745b11bf 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DescendantsIterator.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DescendantsIterator.java @@ -28,9 +28,9 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.fs.s3a.S3AFileStatus; /** * {@code DescendantsIterator} is a {@link RemoteIterator} that implements @@ -83,7 +83,7 @@ */ @InterfaceAudience.Private @InterfaceStability.Evolving -public class DescendantsIterator implements RemoteIterator { +public class DescendantsIterator implements RemoteIterator { private final MetadataStore metadataStore; private final Queue queue = new LinkedList<>(); @@ -121,7 +121,7 @@ public boolean hasNext() throws IOException { } @Override - public FileStatus next() throws IOException { + public S3AFileStatus next() throws IOException { if (!hasNext()) { throw new NoSuchElementException("No more descendants."); } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DirListingMetadata.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DirListingMetadata.java index b5e596150ecde..bebd6fd74a03f 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DirListingMetadata.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DirListingMetadata.java @@ 
-34,6 +34,7 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.S3AFileStatus; import org.apache.hadoop.fs.s3a.Tristate; /** @@ -61,7 +62,7 @@ public class DirListingMetadata { * Create a directory listing metadata container. * * @param path Path of the directory. If this path has a host component, then - * all paths added later via {@link #put(FileStatus)} must also have + * all paths added later via {@link #put(S3AFileStatus)} must also have * the same host. * @param listing Entries in the directory. * @param isAuthoritative true iff listing is the full contents of the @@ -216,7 +217,7 @@ public void remove(Path childPath) { * @return true if the status was added or replaced with a new value. False * if the same FileStatus value was already present. */ - public boolean put(FileStatus childFileStatus) { + public boolean put(S3AFileStatus childFileStatus) { Preconditions.checkNotNull(childFileStatus, "childFileStatus must be non-null"); Path childPath = childStatusToPathKey(childFileStatus); diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java index 0a298b4edac4d..d70b2ac7d5f2d 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/DynamoDBMetadataStore.java @@ -88,6 +88,7 @@ import org.apache.hadoop.fs.s3a.Constants; import org.apache.hadoop.fs.s3a.Invoker; import org.apache.hadoop.fs.s3a.Retries; +import org.apache.hadoop.fs.s3a.S3AFileStatus; import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.S3AInstrumentation; import org.apache.hadoop.fs.s3a.S3AUtils; @@ -129,6 +130,14 @@ * This attribute is meaningful only to file items. *
 * <li> optional long attribute revealing block size of the file.
 *     This attribute is meaningful only to file items.
+ * <li> optional string attribute tracking the s3 eTag of the file.
+ *     May be absent if the metadata was entered with a version of S3Guard
+ *     before this was tracked.
+ *     This attribute is meaningful only to file items.
+ * <li> optional string attribute tracking the s3 versionId of the file.
+ *     May be absent if the metadata was entered with a version of S3Guard
+ *     before this was tracked.
+ *     This attribute is meaningful only to file items.
 * </ul>
 *
 * The DynamoDB partition key is the parent, and the range key is the child.
@@ -155,20 +164,20 @@
 * This is persisted to a single DynamoDB table as:
 *
 * <pre>
    - * =========================================================================
    - * | parent                 | child | is_dir | mod_time | len |     ...    |
    - * =========================================================================
    - * | /bucket                | dir1  | true   |          |     |            |
    - * | /bucket/dir1           | dir2  | true   |          |     |            |
    - * | /bucket/dir1           | dir3  | true   |          |     |            |
    - * | /bucket/dir1/dir2      | file1 |        |   100    | 111 |            |
    - * | /bucket/dir1/dir2      | file2 |        |   200    | 222 |            |
    - * | /bucket/dir1/dir3      | dir4  | true   |          |     |            |
    - * | /bucket/dir1/dir3      | dir5  | true   |          |     |            |
    - * | /bucket/dir1/dir3/dir4 | file3 |        |   300    | 333 |            |
    - * | /bucket/dir1/dir3/dir5 | file4 |        |   400    | 444 |            |
    - * | /bucket/dir1/dir3      | dir6  | true   |          |     |            |
    - * =========================================================================
    + * ====================================================================================
    + * | parent                 | child | is_dir | mod_time | len | etag | ver_id |  ...  |
    + * ====================================================================================
    + * | /bucket                | dir1  | true   |          |     |      |        |       |
    + * | /bucket/dir1           | dir2  | true   |          |     |      |        |       |
    + * | /bucket/dir1           | dir3  | true   |          |     |      |        |       |
    + * | /bucket/dir1/dir2      | file1 |        |   100    | 111 | abc  |  mno   |       |
    + * | /bucket/dir1/dir2      | file2 |        |   200    | 222 | def  |  pqr   |       |
    + * | /bucket/dir1/dir3      | dir4  | true   |          |     |      |        |       |
    + * | /bucket/dir1/dir3      | dir5  | true   |          |     |      |        |       |
    + * | /bucket/dir1/dir3/dir4 | file3 |        |   300    | 333 | ghi  |  stu   |       |
    + * | /bucket/dir1/dir3/dir5 | file4 |        |   400    | 444 | jkl  |  vwx   |       |
    + * | /bucket/dir1/dir3      | dir6  | true   |          |     |      |        |       |
    + * ====================================================================================
 * </pre>
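
For illustration only, the `file1` row in the table above corresponds to a DynamoDB item roughly like the sketch below, written with the AWS SDK Document API. The attribute names (`mod_time`, `len`, `etag`, `ver_id`) and the table name simply mirror the column headings; the store's own translation code (not shown here) builds the real items from `DDBPathMetadata` entries and may use different attribute names.

```java
// Illustrative sketch only: attribute names mirror the column headings in
// the table above; the store's real attribute names and table name may differ.
import com.amazonaws.services.dynamodbv2.document.DynamoDB;
import com.amazonaws.services.dynamodbv2.document.Item;
import com.amazonaws.services.dynamodbv2.document.Table;

public final class S3GuardRowSketch {

  private S3GuardRowSketch() {
  }

  /** Build an item corresponding to the "file1" row in the schema above. */
  static Item fileRow() {
    return new Item()
        // partition key = parent path, range key = child name
        .withPrimaryKey("parent", "/bucket/dir1/dir2", "child", "file1")
        .withLong("mod_time", 100)
        .withLong("len", 111)
        .withString("etag", "abc")
        .withString("ver_id", "mno");
  }

  /** Write the item; "s3guard-table" is a placeholder table name. */
  static void put(DynamoDB dynamoDB) {
    Table table = dynamoDB.getTable("s3guard-table");
    table.putItem(fileRow());
  }
}
```
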
    * * This choice of schema is efficient for read access patterns. @@ -611,16 +620,15 @@ private DDBPathMetadata innerGet(Path path, boolean wantEmptyDirectoryFlag) } /** - * Make a FileStatus object for a directory at given path. The FileStatus - * only contains what S3A needs, and omits mod time since S3A uses its own - * implementation which returns current system time. - * @param owner username of owner + * Make a S3AFileStatus object for a directory at given path. + * The FileStatus only contains what S3A needs, and omits mod time + * since S3A uses its own implementation which returns current system time. + * @param dirOwner username of owner * @param path path to dir - * @return new FileStatus + * @return new S3AFileStatus */ - private FileStatus makeDirStatus(String owner, Path path) { - return new FileStatus(0, true, 1, 0, 0, 0, null, - owner, null, path); + private S3AFileStatus makeDirStatus(String dirOwner, Path path) { + return new S3AFileStatus(Tristate.UNKNOWN, path, dirOwner); } @Override @@ -682,7 +690,7 @@ Collection completeAncestry( while (!parent.isRoot() && !ancestry.containsKey(parent)) { LOG.debug("auto-create ancestor path {} for child path {}", parent, path); - final FileStatus status = makeDirStatus(parent, username); + final S3AFileStatus status = makeDirStatus(parent, username); ancestry.put(parent, new DDBPathMetadata(status, Tristate.FALSE, false)); parent = parent.getParent(); @@ -892,7 +900,7 @@ Collection fullPathsToPut(DDBPathMetadata meta) while (path != null && !path.isRoot()) { final Item item = getConsistentItem(path); if (!itemExists(item)) { - final FileStatus status = makeDirStatus(path, username); + final S3AFileStatus status = makeDirStatus(path, username); metasToPut.add(new DDBPathMetadata(status, Tristate.FALSE, false, meta.isAuthoritativeDir())); path = path.getParent(); @@ -915,9 +923,8 @@ private boolean itemExists(Item item) { } /** Create a directory FileStatus using current system time as mod time. 
*/ - static FileStatus makeDirStatus(Path f, String owner) { - return new FileStatus(0, true, 1, 0, System.currentTimeMillis(), 0, - null, owner, owner, f); + static S3AFileStatus makeDirStatus(Path f, String owner) { + return new S3AFileStatus(Tristate.UNKNOWN, f, owner); } /** diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/LocalMetadataStore.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/LocalMetadataStore.java index 2f7fec6cbb731..bdd9e78321c27 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/LocalMetadataStore.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/LocalMetadataStore.java @@ -28,6 +28,7 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.S3AFileStatus; import org.apache.hadoop.fs.s3a.Tristate; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -234,7 +235,7 @@ public void move(Collection pathsToDelete, public void put(PathMetadata meta) throws IOException { Preconditions.checkNotNull(meta); - FileStatus status = meta.getFileStatus(); + S3AFileStatus status = meta.getFileStatus(); Path path = standardize(status.getPath()); synchronized (this) { diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreListFilesIterator.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreListFilesIterator.java index 378d10980c835..e4e76c50d6ce5 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreListFilesIterator.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/MetadataStoreListFilesIterator.java @@ -33,9 +33,9 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.fs.s3a.S3AFileStatus; /** * {@code MetadataStoreListFilesIterator} is a {@link RemoteIterator} that @@ -85,14 +85,14 @@ @InterfaceAudience.Private @InterfaceStability.Evolving public class MetadataStoreListFilesIterator implements - RemoteIterator { + RemoteIterator { public static final Logger LOG = LoggerFactory.getLogger( MetadataStoreListFilesIterator.class); private final boolean allowAuthoritative; private final MetadataStore metadataStore; private final Set tombstones = new HashSet<>(); - private Iterator leafNodesIterator = null; + private Iterator leafNodesIterator = null; public MetadataStoreListFilesIterator(MetadataStore ms, PathMetadata meta, boolean allowAuthoritative) throws IOException { @@ -104,7 +104,7 @@ public MetadataStoreListFilesIterator(MetadataStore ms, PathMetadata meta, private void prefetch(PathMetadata meta) throws IOException { final Queue queue = new LinkedList<>(); - final Collection leafNodes = new ArrayList<>(); + final Collection leafNodes = new ArrayList<>(); if (meta != null) { final Path path = meta.getFileStatus().getPath(); @@ -121,7 +121,7 @@ private void prefetch(PathMetadata meta) throws IOException { while(!queue.isEmpty()) { PathMetadata nextMetadata = queue.poll(); - FileStatus nextStatus = nextMetadata.getFileStatus(); + S3AFileStatus nextStatus = nextMetadata.getFileStatus(); if (nextStatus.isFile()) { // All files are leaf nodes by definition leafNodes.add(nextStatus); @@ -159,7 +159,7 
@@ public boolean hasNext() { } @Override - public FileStatus next() { + public S3AFileStatus next() { return leafNodesIterator.next(); } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/PathMetadata.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/PathMetadata.java index 2a0219e4cef07..1c00bf82cba9d 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/PathMetadata.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/PathMetadata.java @@ -21,8 +21,8 @@ import com.google.common.base.Preconditions; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.S3AFileStatus; import org.apache.hadoop.fs.s3a.Tristate; /** @@ -33,7 +33,7 @@ @InterfaceStability.Evolving public class PathMetadata { - private final FileStatus fileStatus; + private S3AFileStatus fileStatus; private Tristate isEmptyDirectory; private boolean isDeleted; @@ -43,24 +43,25 @@ public class PathMetadata { * @return the entry. */ public static PathMetadata tombstone(Path path) { - long now = System.currentTimeMillis(); - FileStatus status = new FileStatus(0, false, 0, 0, now, path); - return new PathMetadata(status, Tristate.UNKNOWN, true); + S3AFileStatus s3aStatus = new S3AFileStatus(0, + System.currentTimeMillis(), path, 0, null, + null, null); + return new PathMetadata(s3aStatus, Tristate.UNKNOWN, true); } /** * Creates a new {@code PathMetadata} containing given {@code FileStatus}. * @param fileStatus file status containing an absolute path. */ - public PathMetadata(FileStatus fileStatus) { - this(fileStatus, Tristate.UNKNOWN); + public PathMetadata(S3AFileStatus fileStatus) { + this(fileStatus, Tristate.UNKNOWN, false); } - public PathMetadata(FileStatus fileStatus, Tristate isEmptyDir) { + public PathMetadata(S3AFileStatus fileStatus, Tristate isEmptyDir) { this(fileStatus, isEmptyDir, false); } - public PathMetadata(FileStatus fileStatus, Tristate isEmptyDir, boolean + public PathMetadata(S3AFileStatus fileStatus, Tristate isEmptyDir, boolean isDeleted) { Preconditions.checkNotNull(fileStatus, "fileStatus must be non-null"); Preconditions.checkNotNull(fileStatus.getPath(), "fileStatus path must be" + @@ -75,7 +76,7 @@ public PathMetadata(FileStatus fileStatus, Tristate isEmptyDir, boolean /** * @return {@code FileStatus} contained in this {@code PathMetadata}. */ - public final FileStatus getFileStatus() { + public final S3AFileStatus getFileStatus() { return fileStatus; } @@ -91,6 +92,7 @@ public Tristate isEmptyDirectory() { void setIsEmptyDirectory(Tristate isEmptyDirectory) { this.isEmptyDirectory = isEmptyDirectory; + fileStatus.setIsEmptyDirectory(isEmptyDirectory); } public boolean isDeleted() { @@ -128,10 +130,11 @@ public String toString() { * @param sb target StringBuilder */ public void prettyPrint(StringBuilder sb) { - sb.append(String.format("%-5s %-20s %-7d %-8s %-6s", + sb.append(String.format("%-5s %-20s %-7d %-8s %-6s %-20s %-20s", fileStatus.isDirectory() ? 
"dir" : "file", fileStatus.getPath().toString(), fileStatus.getLen(), - isEmptyDirectory.name(), isDeleted)); + isEmptyDirectory.name(), isDeleted, + fileStatus.getETag(), fileStatus.getVersionId())); sb.append(fileStatus); } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java index 318094adb3935..22c14fd4f497a 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java @@ -44,12 +44,12 @@ import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.fs.s3a.MultipartUtils; import org.apache.hadoop.fs.s3a.S3AFileStatus; import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.S3ALocatedFileStatus; import org.apache.hadoop.fs.s3a.S3AUtils; import org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens; import org.apache.hadoop.fs.s3a.commit.CommitConstants; @@ -703,7 +703,7 @@ private void putParentsIfNotPresent(FileStatus f) throws IOException { if (dirCache.contains(parent)) { return; } - FileStatus dir = DynamoDBMetadataStore.makeDirStatus(parent, + S3AFileStatus dir = DynamoDBMetadataStore.makeDirStatus(parent, f.getOwner()); S3Guard.putWithTtl(getStore(), new PathMetadata(dir), getFilesystem().getTtlTimeProvider()); @@ -719,13 +719,13 @@ private void putParentsIfNotPresent(FileStatus f) throws IOException { */ private long importDir(FileStatus status) throws IOException { Preconditions.checkArgument(status.isDirectory()); - RemoteIterator it = getFilesystem() + RemoteIterator it = getFilesystem() .listFilesAndEmptyDirectories(status.getPath(), true); long items = 0; while (it.hasNext()) { - LocatedFileStatus located = it.next(); - FileStatus child; + S3ALocatedFileStatus located = it.next(); + S3AFileStatus child; if (located.isDirectory()) { child = DynamoDBMetadataStore.makeDirStatus(located.getPath(), located.getOwner()); @@ -735,7 +735,9 @@ private long importDir(FileStatus status) throws IOException { located.getModificationTime(), located.getPath(), located.getBlockSize(), - located.getOwner()); + located.getOwner(), + located.getETag(), + located.getVersionId()); } putParentsIfNotPresent(child); S3Guard.putWithTtl(getStore(), new PathMetadata(child), @@ -763,7 +765,8 @@ public int run(String[] args, PrintStream out) throws Exception { filePath = "/"; } Path path = new Path(filePath); - FileStatus status = getFilesystem().getFileStatus(path); + S3AFileStatus status = (S3AFileStatus) getFilesystem() + .getFileStatus(path); try { initMetadataStore(false); @@ -1167,7 +1170,7 @@ public int run(String[] args, PrintStream out) magic ? 
"is" : "is not"); println(out, "%nS3A Client"); - + printOption(out, "\tSigning Algorithm", SIGNING_ALGORITHM, "(unset)"); String endpoint = conf.getTrimmed(ENDPOINT, ""); println(out, "\tEndpoint: %s=%s", ENDPOINT, @@ -1176,6 +1179,10 @@ public int run(String[] args, PrintStream out) printOption(out, "\tEncryption", SERVER_SIDE_ENCRYPTION_ALGORITHM, "none"); printOption(out, "\tInput seek policy", INPUT_FADVISE, INPUT_FADV_NORMAL); + printOption(out, "\tChange Detection Source", CHANGE_DETECT_SOURCE, + CHANGE_DETECT_SOURCE_DEFAULT); + printOption(out, "\tChange Detection Mode", CHANGE_DETECT_MODE, + CHANGE_DETECT_MODE_DEFAULT); // look at delegation token support if (fs.getDelegationTokens().isPresent()) { diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md index 09e123d6eda10..ef9c999359b0c 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/committers.md @@ -690,10 +690,15 @@ Filesystem s3a://landsat-pds is not using S3Guard The "magic" committer is not supported S3A Client - Endpoint: fs.s3a.endpoint=(unset) + Signing Algorithm: fs.s3a.signing-algorithm=(unset) + Endpoint: fs.s3a.endpoint=s3.amazonaws.com Encryption: fs.s3a.server-side-encryption-algorithm=none Input seek policy: fs.s3a.experimental.input.fadvise=normal -2017-09-27 19:18:57,917 INFO util.ExitUtil: Exiting with status 46: 46: The magic committer is not enabled for s3a://landsat-pds + Change Detection Source: fs.s3a.change.detection.source=etag + Change Detection Mode: fs.s3a.change.detection.mode=server +Delegation token support is disabled +2019-05-17 13:53:38,245 [main] INFO util.ExitUtil (ExitUtil.java:terminate(210)) - + Exiting with status 46: 46: The magic committer is not enabled for s3a://landsat-pds ``` ## Error message: "File being created has a magic path, but the filesystem has magic file support disabled: diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_tokens.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_tokens.md index 30226f85eb9b7..aad3f355b2de6 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_tokens.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/delegation_tokens.md @@ -522,12 +522,15 @@ $ hadoop s3guard bucket-info s3a://landsat-pds/ Filesystem s3a://landsat-pds Location: us-west-2 Filesystem s3a://landsat-pds is not using S3Guard -The "magic" committer is supported +The "magic" committer is not supported S3A Client + Signing Algorithm: fs.s3a.signing-algorithm=(unset) Endpoint: fs.s3a.endpoint=s3.amazonaws.com Encryption: fs.s3a.server-side-encryption-algorithm=none Input seek policy: fs.s3a.experimental.input.fadvise=normal + Change Detection Source: fs.s3a.change.detection.source=etag + Change Detection Mode: fs.s3a.change.detection.mode=server Delegation Support enabled: token kind = S3ADelegationToken/Session Hadoop security mode: SIMPLE ``` diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md index e3f227de220dc..d67105f7d2ea3 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md @@ -299,6 +299,24 @@ plugin: ```bash mvn surefire-report:failsafe-report-only ``` +## Testing 
Versioned Stores + +Some tests (specifically some in `ITestS3ARemoteFileChanged`) require +a versioned bucket for full test coverage as well as S3Guard being enabled. + +To enable versioning in a bucket. + +1. In the AWS S3 Management console find and select the bucket. +1. In the Properties "tab", set it as versioned. +1. Important Create a lifecycle rule to automatically clean up old versions +after 24h. This avoids running up bills for objects which tests runs create and +then delete. +1. Run the tests again. + +Once a bucket is converted to being versioned, it cannot be converted back +to being unversioned. + + ## Scale Tests There are a set of tests designed to measure the scalability and performance diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md index 3123221bd8293..8cdac9e35263f 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/troubleshooting_s3a.md @@ -970,8 +970,8 @@ and the like. The standard strategy here is to save to HDFS and then copy to S3. ``` org.apache.hadoop.fs.s3a.RemoteFileChangedException: re-open `s3a://my-bucket/test/file.txt': - ETag change reported by S3 while reading at position 1949. - Version f9c186d787d4de9657e99f280ba26555 was unavailable + Change reported by S3 while reading at position 1949. + ETag f9c186d787d4de9657e99f280ba26555 was unavailable at org.apache.hadoop.fs.s3a.impl.ChangeTracker.processResponse(ChangeTracker.java:137) at org.apache.hadoop.fs.s3a.S3AInputStream.reopen(S3AInputStream.java:200) at org.apache.hadoop.fs.s3a.S3AInputStream.lambda$lazySeek$1(S3AInputStream.java:346) diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractS3AMockTest.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractS3AMockTest.java index 03c91e62cedce..886795a9d90fc 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractS3AMockTest.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractS3AMockTest.java @@ -56,19 +56,26 @@ public abstract class AbstractS3AMockTest { @Before public void setup() throws Exception { + Configuration conf = createConfiguration(); + fs = new S3AFileSystem(); + URI uri = URI.create(FS_S3A + "://" + BUCKET); + fs.initialize(uri, conf); + s3 = fs.getAmazonS3ClientForTesting("mocking"); + } + + public Configuration createConfiguration() { Configuration conf = new Configuration(); conf.setClass(S3_CLIENT_FACTORY_IMPL, MockS3ClientFactory.class, S3ClientFactory.class); - // We explicitly disable MetadataStore even if it's configured. For unit + // We explicitly disable MetadataStore. For unit // test we don't issue request to AWS DynamoDB service. 
conf.setClass(S3_METADATA_STORE_IMPL, NullMetadataStore.class, MetadataStore.class); // FS is always magic conf.setBoolean(CommitConstants.MAGIC_COMMITTER_ENABLED, true); - fs = new S3AFileSystem(); - URI uri = URI.create(FS_S3A + "://" + BUCKET); - fs.initialize(uri, conf); - s3 = fs.getAmazonS3ClientForTesting("mocking"); + // use minimum multipart size for faster triggering + conf.setLong(Constants.MULTIPART_SIZE, MULTIPART_MIN_SIZE); + return conf; } @After diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ADelayedFNF.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ADelayedFNF.java index 7abd47497646e..870172ec3e12e 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ADelayedFNF.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ADelayedFNF.java @@ -22,7 +22,11 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.contract.ContractTestUtils; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy.Source; import org.apache.hadoop.test.LambdaTestUtils; + +import org.junit.Assume; import org.junit.Test; import java.io.FileNotFoundException; @@ -43,6 +47,12 @@ public class ITestS3ADelayedFNF extends AbstractS3ATestBase { @Test public void testNotFoundFirstRead() throws Exception { FileSystem fs = getFileSystem(); + ChangeDetectionPolicy changeDetectionPolicy = + ((S3AFileSystem) fs).getChangeDetectionPolicy(); + Assume.assumeFalse("FNF not expected when using a bucket with" + + " object versioning", + changeDetectionPolicy.getSource() == Source.VersionId); + Path p = path("some-file"); ContractTestUtils.createFile(fs, p, false, new byte[] {20, 21, 22}); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AInconsistency.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AInconsistency.java index 6ac803e3085eb..da671030c2f54 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AInconsistency.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AInconsistency.java @@ -24,9 +24,13 @@ import org.apache.hadoop.fs.contract.AbstractFSContract; import org.apache.hadoop.fs.contract.ContractTestUtils; import org.apache.hadoop.fs.contract.s3a.S3AContract; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy.Source; import org.apache.hadoop.fs.s3a.s3guard.MetadataStore; import org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore; import org.apache.hadoop.test.LambdaTestUtils; + +import org.junit.Assume; import org.junit.Test; import java.io.FileNotFoundException; @@ -106,6 +110,12 @@ public void testGetFileStatus() throws Exception { @Test public void testOpenDeleteRead() throws Exception { S3AFileSystem fs = getFileSystem(); + ChangeDetectionPolicy changeDetectionPolicy = + ((S3AFileSystem) fs).getChangeDetectionPolicy(); + Assume.assumeFalse("FNF not expected when using a bucket with" + + " object versioning", + changeDetectionPolicy.getSource() == Source.VersionId); + Path p = path("testOpenDeleteRead.txt"); writeTextFile(fs, p, "1337c0d3z", true); try (InputStream s = fs.open(p)) { diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ARemoteFileChanged.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ARemoteFileChanged.java index 
98dd2026f5f0d..c345a1f9da745 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ARemoteFileChanged.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3ARemoteFileChanged.java @@ -19,77 +19,258 @@ package org.apache.hadoop.fs.s3a; import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.charset.Charset; import java.util.Arrays; import java.util.Collection; +import java.util.Optional; +import com.amazonaws.AmazonClientException; +import com.amazonaws.services.s3.AmazonS3; +import com.amazonaws.services.s3.model.CopyObjectRequest; +import com.amazonaws.services.s3.model.CopyObjectResult; +import com.amazonaws.services.s3.model.GetObjectMetadataRequest; +import com.amazonaws.services.s3.model.GetObjectRequest; +import com.amazonaws.services.s3.model.ObjectMetadata; +import com.amazonaws.services.s3.model.S3Object; +import com.google.common.base.Charsets; import org.junit.Assume; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import org.mockito.ArgumentMatchers; +import org.mockito.Mockito; +import org.mockito.invocation.InvocationOnMock; +import org.mockito.stubbing.Answer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy.Mode; import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy.Source; +import org.apache.hadoop.fs.s3a.s3guard.LocalMetadataStore; +import org.apache.hadoop.fs.s3a.s3guard.MetadataStore; +import org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore; +import org.apache.hadoop.fs.s3a.s3guard.PathMetadata; import static org.apache.hadoop.fs.contract.ContractTestUtils.dataset; +import static org.apache.hadoop.fs.contract.ContractTestUtils.readUTF8; import static org.apache.hadoop.fs.contract.ContractTestUtils.writeDataset; import static org.apache.hadoop.fs.s3a.Constants.*; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestBucketName; import static org.apache.hadoop.fs.s3a.S3ATestUtils.removeBucketOverrides; +import static org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy.CHANGE_DETECTED; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.S3_SELECT_CAPABILITY; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.SELECT_SQL; import static org.apache.hadoop.test.LambdaTestUtils.eventually; import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.when; /** * Test S3A remote file change detection. + * This is a very parameterized test; the first three parameters + * define configuration options for the tests, while the final one + * declares the expected outcomes given those options. + * + * This test uses mocking to insert transient failures into the S3 client, + * underneath the S3A Filesystem instance. + * + * This is used to simulate eventual consistency, so force the change policy + * failure modes to be encountered. + * + * If changes are made to the filesystem such that the number of calls to + * operations such as {@link S3AFileSystem#getObjectMetadata(Path)} are + * changed, the number of failures which the mock layer must generate may + * change. 
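
To make that failure-injection approach concrete, the hypothetical helper below shows the general Mockito pattern the test relies on: spy on the real S3 client and answer the first N `getObject()` calls with `null` (treated by the caller as "requested version not available") before falling through to the real method. The class and method names here are illustrative only; the actual stubbing lives in the `stubTemporary*` methods later in this patch.

```java
// Hypothetical helper illustrating the spy/doAnswer pattern used by the
// stubTemporary* methods; names here are not part of the real test.
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.S3Object;

import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;

import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.doAnswer;

public final class TransientS3FailureSketch {

  private TransientS3FailureSketch() {
  }

  /**
   * Wrap a real client in a spy whose first {@code failures} getObject()
   * calls return null, after which the real method is invoked.
   */
  static AmazonS3 withTransientGetFailures(AmazonS3 realClient,
      final int failures) {
    AmazonS3 spy = Mockito.spy(realClient);
    Answer<S3Object> answer = new Answer<S3Object>() {
      private int calls;

      @Override
      public S3Object answer(InvocationOnMock invocation) throws Throwable {
        calls++;
        return calls <= failures
            ? null
            : (S3Object) invocation.callRealMethod();
      }
    };
    doAnswer(answer).when(spy).getObject(any(GetObjectRequest.class));
    return spy;
  }
}
```
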
+ * + * As the S3Guard auth mode flag does control whether or not a HEAD is issued + * in a call to {@code getFileStatus()}; the test parameter {@link #authMode} + * is used to help predict this count. + * + * Important: if you are seeing failures in this test after changing + * one of the rename/copy/open operations, it may be that an increase, + * decrease or change in the number of low-level S3 HEAD/GET operations is + * triggering the failures. + * Please review the changes to see that you haven't unintentionally done this. + * If it is intentional, please update the parameters here. + * + * If you are seeing failures without such a change, and nobody else is, + * it is likely that you have a different bucket configuration option which + * is somehow triggering a regression. If you can work out which option + * this is, then extend {@link #createConfiguration()} to reset that parameter + * too. + * + * Note: to help debug these issues, set the log for this to DEBUG: + *
    + *   log4j.logger.org.apache.hadoop.fs.s3a.ITestS3ARemoteFileChanged=DEBUG
    + * 
    + * The debug information printed will include a trace of where operations + * are being called from, to help understand why the test is failing. */ @RunWith(Parameterized.class) public class ITestS3ARemoteFileChanged extends AbstractS3ATestBase { + private static final Logger LOG = LoggerFactory.getLogger(ITestS3ARemoteFileChanged.class); + private static final String TEST_DATA = "Some test data"; + + private static final byte[] TEST_DATA_BYTES = TEST_DATA.getBytes( + Charsets.UTF_8); + private static final int TEST_MAX_RETRIES = 5; + private static final String TEST_RETRY_INTERVAL = "10ms"; + private static final String QUOTED_TEST_DATA = + "\"" + TEST_DATA + "\""; + + private Optional originalS3Client = Optional.empty(); + + private enum InteractionType { + READ, + READ_AFTER_DELETE, + EVENTUALLY_CONSISTENT_READ, + COPY, + EVENTUALLY_CONSISTENT_COPY, + EVENTUALLY_CONSISTENT_METADATA, + SELECT, + EVENTUALLY_CONSISTENT_SELECT + } + private final String changeDetectionSource; private final String changeDetectionMode; - private final boolean expectChangeException; - private final boolean expectFileNotFoundException; + private final boolean authMode; + private final Collection expectedExceptionInteractions; + private S3AFileSystem fs; - @Parameterized.Parameters + /** + * Test parameters. + *
+ * <ol>
+ *   <li>Change detection source: etag or version.
+ *   <li>Change detection policy: server, client, client+warn, none
+ *   <li>Whether to enable auth mode on the filesystem.
+ *   <li>Expected outcomes.
+ * </ol>
    + * @return the test configuration. + */ + @Parameterized.Parameters(name = "{0}-{1}-auth-{2}") public static Collection params() { return Arrays.asList(new Object[][]{ // make sure it works with invalid config - {"bogus", "bogus", true, true}, + {"bogus", "bogus", + true, + Arrays.asList( + InteractionType.READ, + InteractionType.READ_AFTER_DELETE, + InteractionType.EVENTUALLY_CONSISTENT_READ, + InteractionType.COPY, + InteractionType.EVENTUALLY_CONSISTENT_COPY, + InteractionType.EVENTUALLY_CONSISTENT_METADATA, + InteractionType.SELECT, + InteractionType.EVENTUALLY_CONSISTENT_SELECT)}, // test with etag - {CHANGE_DETECT_SOURCE_ETAG, CHANGE_DETECT_MODE_SERVER, true, true}, - {CHANGE_DETECT_SOURCE_ETAG, CHANGE_DETECT_MODE_CLIENT, true, true}, - {CHANGE_DETECT_SOURCE_ETAG, CHANGE_DETECT_MODE_WARN, false, true}, - {CHANGE_DETECT_SOURCE_ETAG, CHANGE_DETECT_MODE_NONE, false, true}, + {CHANGE_DETECT_SOURCE_ETAG, CHANGE_DETECT_MODE_SERVER, + true, + Arrays.asList( + InteractionType.READ, + InteractionType.READ_AFTER_DELETE, + InteractionType.EVENTUALLY_CONSISTENT_READ, + InteractionType.COPY, + InteractionType.EVENTUALLY_CONSISTENT_COPY, + InteractionType.EVENTUALLY_CONSISTENT_METADATA, + InteractionType.SELECT, + InteractionType.EVENTUALLY_CONSISTENT_SELECT)}, + {CHANGE_DETECT_SOURCE_ETAG, CHANGE_DETECT_MODE_CLIENT, + false, + Arrays.asList( + InteractionType.READ, + InteractionType.EVENTUALLY_CONSISTENT_READ, + InteractionType.READ_AFTER_DELETE, + InteractionType.COPY, + // not InteractionType.EVENTUALLY_CONSISTENT_COPY as copy change + // detection can't really occur client-side. The eTag of + // the new object can't be expected to match. + InteractionType.EVENTUALLY_CONSISTENT_METADATA, + InteractionType.SELECT, + InteractionType.EVENTUALLY_CONSISTENT_SELECT)}, + {CHANGE_DETECT_SOURCE_ETAG, CHANGE_DETECT_MODE_WARN, + false, + Arrays.asList( + InteractionType.READ_AFTER_DELETE)}, + {CHANGE_DETECT_SOURCE_ETAG, CHANGE_DETECT_MODE_NONE, + false, + Arrays.asList( + InteractionType.READ_AFTER_DELETE)}, // test with versionId - // when using server-side versionId, the exceptions shouldn't happen - // since the previous version will still be available - {CHANGE_DETECT_SOURCE_VERSION_ID, CHANGE_DETECT_MODE_SERVER, false, - false}, + // when using server-side versionId, the exceptions + // shouldn't happen since the previous version will still be available + {CHANGE_DETECT_SOURCE_VERSION_ID, CHANGE_DETECT_MODE_SERVER, + true, + Arrays.asList( + InteractionType.EVENTUALLY_CONSISTENT_READ, + InteractionType.EVENTUALLY_CONSISTENT_COPY, + InteractionType.EVENTUALLY_CONSISTENT_METADATA, + InteractionType.EVENTUALLY_CONSISTENT_SELECT)}, // with client-side versionId it will behave similar to client-side eTag - {CHANGE_DETECT_SOURCE_VERSION_ID, CHANGE_DETECT_MODE_CLIENT, true, - true}, + {CHANGE_DETECT_SOURCE_VERSION_ID, CHANGE_DETECT_MODE_CLIENT, + false, + Arrays.asList( + InteractionType.READ, + InteractionType.READ_AFTER_DELETE, + InteractionType.EVENTUALLY_CONSISTENT_READ, + InteractionType.COPY, + // not InteractionType.EVENTUALLY_CONSISTENT_COPY as copy change + // detection can't really occur client-side. The versionId of + // the new object can't be expected to match. 
+ InteractionType.EVENTUALLY_CONSISTENT_METADATA, + InteractionType.SELECT, + InteractionType.EVENTUALLY_CONSISTENT_SELECT)}, - {CHANGE_DETECT_SOURCE_VERSION_ID, CHANGE_DETECT_MODE_WARN, false, true}, - {CHANGE_DETECT_SOURCE_VERSION_ID, CHANGE_DETECT_MODE_NONE, false, true} + {CHANGE_DETECT_SOURCE_VERSION_ID, CHANGE_DETECT_MODE_WARN, + true, + Arrays.asList( + InteractionType.READ_AFTER_DELETE)}, + {CHANGE_DETECT_SOURCE_VERSION_ID, CHANGE_DETECT_MODE_NONE, + false, + Arrays.asList( + InteractionType.READ_AFTER_DELETE)} }); } public ITestS3ARemoteFileChanged(String changeDetectionSource, String changeDetectionMode, - boolean expectException, - boolean expectFileNotFoundException) { + boolean authMode, + Collection expectedExceptionInteractions) { this.changeDetectionSource = changeDetectionSource; this.changeDetectionMode = changeDetectionMode; - this.expectChangeException = expectException; - this.expectFileNotFoundException = expectFileNotFoundException; + this.authMode = authMode; + this.expectedExceptionInteractions = expectedExceptionInteractions; + } + + @Override + public void setup() throws Exception { + super.setup(); + // skip all versioned checks if the remote FS doesn't do + // versions. + fs = getFileSystem(); + skipIfVersionPolicyAndNoVersionId(); + // cache the original S3 client for teardown. + originalS3Client = Optional.of( + fs.getAmazonS3ClientForTesting("caching")); + } + + @Override + public void teardown() throws Exception { + // restore the s3 client so there's no mocking interfering with the teardown + originalS3Client.ifPresent(fs::setAmazonS3Client); + super.teardown(); } @Override @@ -98,33 +279,65 @@ protected Configuration createConfiguration() { String bucketName = getTestBucketName(conf); removeBucketOverrides(bucketName, conf, CHANGE_DETECT_SOURCE, - CHANGE_DETECT_MODE); + CHANGE_DETECT_MODE, + RETRY_LIMIT, + RETRY_INTERVAL, + METADATASTORE_AUTHORITATIVE); conf.set(CHANGE_DETECT_SOURCE, changeDetectionSource); conf.set(CHANGE_DETECT_MODE, changeDetectionMode); + conf.setBoolean(METADATASTORE_AUTHORITATIVE, authMode); + + // reduce retry limit so FileNotFoundException cases timeout faster, + // speeding up the tests + conf.setInt(RETRY_LIMIT, TEST_MAX_RETRIES); + conf.set(RETRY_INTERVAL, TEST_RETRY_INTERVAL); + + if (conf.getClass(S3_METADATA_STORE_IMPL, MetadataStore.class) == + NullMetadataStore.class) { + LOG.debug("Enabling local S3Guard metadata store"); + // favor LocalMetadataStore over NullMetadataStore + conf.setClass(S3_METADATA_STORE_IMPL, + LocalMetadataStore.class, MetadataStore.class); + } S3ATestUtils.disableFilesystemCaching(conf); return conf; } + /** + * Get the path of this method, including parameterized values. + * @return a path unique to this method and parameters + * @throws IOException failure. + */ + protected Path path() throws IOException { + return super.path(getMethodName()); + } + + /** + * How many HEAD requests are made in a call to + * {@link S3AFileSystem#getFileStatus(Path)}? + * @return a number >= 0. + */ + private int getFileStatusHeadCount() { + return authMode ? 0 : 1; + } + + /** + * Tests reading a file that is changed while the reader's InputStream is + * open. 
+ */ @Test - public void testReadFileChanged() throws Throwable { + public void testReadFileChangedStreamOpen() throws Throwable { + describe("Tests reading a file that is changed while the reader's " + + "InputStream is open."); final int originalLength = 8192; final byte[] originalDataset = dataset(originalLength, 'a', 32); final int newLength = originalLength + 1; final byte[] newDataset = dataset(newLength, 'A', 32); - final S3AFileSystem fs = getFileSystem(); final Path testpath = path("readFileToChange.txt"); // initial write writeDataset(fs, testpath, originalDataset, originalDataset.length, 1024, false); - if (fs.getChangeDetectionPolicy().getSource() == Source.VersionId) { - // skip versionId tests if the bucket doesn't have object versioning - // enabled - Assume.assumeTrue( - "Target filesystem does not support versioning", - fs.getObjectMetadata(fs.pathToKey(testpath)).getVersionId() != null); - } - try(FSDataInputStream instream = fs.open(testpath)) { // seek forward and read successfully instream.seek(1024); @@ -152,9 +365,8 @@ public void testReadFileChanged() throws Throwable { // now check seek backward instream.seek(instream.getPos() - 100); - if (expectChangeException) { - intercept(RemoteFileChangedException.class, "", "read", - () -> instream.read()); + if (expectedExceptionInteractions.contains(InteractionType.READ)) { + expectReadFailure(instream); } else { instream.read(); } @@ -164,9 +376,8 @@ public void testReadFileChanged() throws Throwable { // seek backward instream.seek(0); - if (expectChangeException) { - intercept(RemoteFileChangedException.class, "", "read", - () -> instream.read(buf)); + if (expectedExceptionInteractions.contains(InteractionType.READ)) { + expectReadFailure(instream); intercept(RemoteFileChangedException.class, "", "read", () -> instream.read(0, buf, 0, buf.length)); intercept(RemoteFileChangedException.class, "", "readfully", @@ -183,7 +394,8 @@ public void testReadFileChanged() throws Throwable { // seek backward instream.seek(0); - if (expectFileNotFoundException) { + if (expectedExceptionInteractions.contains( + InteractionType.READ_AFTER_DELETE)) { intercept(FileNotFoundException.class, "", "read()", () -> instream.read()); intercept(FileNotFoundException.class, "", "readfully", @@ -194,4 +406,890 @@ public void testReadFileChanged() throws Throwable { } } } + + /** + * Tests reading a file where the version visible in S3 does not match the + * version tracked in the metadata store. + */ + @Test + public void testReadFileChangedOutOfSyncMetadata() throws Throwable { + final Path testpath = writeOutOfSyncFileVersion("fileChangedOutOfSync.dat"); + + try (FSDataInputStream instream = fs.open(testpath)) { + if (expectedExceptionInteractions.contains(InteractionType.READ)) { + expectReadFailure(instream); + } else { + instream.read(); + } + } + } + + /** + * Ensures a file can be read when there is no version metadata + * (ETag, versionId). + */ + @Test + public void testReadWithNoVersionMetadata() throws Throwable { + final Path testpath = writeFileWithNoVersionMetadata("readnoversion.dat"); + + assertEquals("Contents of " + testpath, + TEST_DATA, + readUTF8(fs, testpath, -1)); + } + + /** + * Tests using S3 Select on a file where the version visible in S3 does not + * match the version tracked in the metadata store. 
+ */ + @Test + public void testSelectChangedFile() throws Throwable { + requireS3Select(); + final Path testpath = writeOutOfSyncFileVersion("select.dat"); + + if (expectedExceptionInteractions.contains(InteractionType.SELECT)) { + interceptFuture(RemoteFileChangedException.class, "select", + fs.openFile(testpath) + .must(SELECT_SQL, "SELECT * FROM S3OBJECT").build()); + } else { + fs.openFile(testpath) + .must(SELECT_SQL, "SELECT * FROM S3OBJECT") + .build() + .get() + .close(); + } + } + + /** + * Tests using S3 Select on a file where the version visible in S3 does not + * initially match the version tracked in the metadata store, but eventually + * (after retries) does. + */ + @Test + public void testSelectEventuallyConsistentFile() throws Throwable { + describe("Eventually Consistent S3 Select"); + requireS3Guard(); + requireS3Select(); + AmazonS3 s3ClientSpy = spyOnFilesystem(); + + final Path testpath1 = writeEventuallyConsistentFileVersion( + "select1.dat", s3ClientSpy, 0, TEST_MAX_RETRIES, 0); + + // should succeed since the inconsistency doesn't last longer than the + // configured retry limit + fs.openFile(testpath1) + .must(SELECT_SQL, "SELECT * FROM S3OBJECT") + .build() + .get() + .close(); + + // select() makes a getFileStatus() call before the consistency checking + // that will match the stub. As such, we need an extra inconsistency here + // to cross the threshold + int getMetadataInconsistencyCount = TEST_MAX_RETRIES + 2; + final Path testpath2 = writeEventuallyConsistentFileVersion( + "select2.dat", s3ClientSpy, 0, getMetadataInconsistencyCount, 0); + + if (expectedExceptionInteractions.contains( + InteractionType.EVENTUALLY_CONSISTENT_SELECT)) { + // should fail since the inconsistency lasts longer than the configured + // retry limit + interceptFuture(RemoteFileChangedException.class, "select", + fs.openFile(testpath2) + .must(SELECT_SQL, "SELECT * FROM S3OBJECT").build()); + } else { + fs.openFile(testpath2) + .must(SELECT_SQL, "SELECT * FROM S3OBJECT") + .build() + .get() + .close(); + } + } + + /** + * Ensures a file can be read via S3 Select when there is no version metadata + * (ETag, versionId). + */ + @Test + public void testSelectWithNoVersionMetadata() throws Throwable { + requireS3Select(); + final Path testpath = + writeFileWithNoVersionMetadata("selectnoversion.dat"); + + try (FSDataInputStream instream = fs.openFile(testpath) + .must(SELECT_SQL, "SELECT * FROM S3OBJECT").build().get()) { + assertEquals(QUOTED_TEST_DATA, + IOUtils.toString(instream, Charset.forName("UTF-8")).trim()); + } + } + + /** + * Tests doing a rename() on a file where the version visible in S3 does not + * match the version tracked in the metadata store. + * @throws Throwable failure + */ + @Test + public void testRenameChangedFile() throws Throwable { + final Path testpath = writeOutOfSyncFileVersion("rename.dat"); + + final Path dest = path("dest.dat"); + if (expectedExceptionInteractions.contains(InteractionType.COPY)) { + intercept(RemoteFileChangedException.class, "", + "expected copy() failure", + () -> fs.rename(testpath, dest)); + } else { + fs.rename(testpath, dest); + } + } + + /** + * Inconsistent response counts for getObjectMetadata() and + * copyObject() for a rename. 
+ * @param metadataCallsExpectedBeforeRetryLoop number of getObjectMetadata + * calls expected before the consistency checking retry loop + * @return the inconsistencies for (metadata, copy) + */ + private Pair renameInconsistencyCounts( + int metadataCallsExpectedBeforeRetryLoop) { + int metadataInconsistencyCount = TEST_MAX_RETRIES + + metadataCallsExpectedBeforeRetryLoop; + int copyInconsistencyCount = + versionCheckingIsOnServer() ? TEST_MAX_RETRIES : 0; + + return Pair.of(metadataInconsistencyCount, copyInconsistencyCount); + } + + /** + * Tests doing a rename() on a file where the version visible in S3 does not + * match the version in the metadata store until a certain number of retries + * has been met. + */ + @Test + public void testRenameEventuallyConsistentFile() throws Throwable { + requireS3Guard(); + AmazonS3 s3ClientSpy = spyOnFilesystem(); + + // Total inconsistent response count across getObjectMetadata() and + // copyObject(). + // The split of inconsistent responses between getObjectMetadata() and + // copyObject() is arbitrary. + Pair counts = renameInconsistencyCounts( + getFileStatusHeadCount()); + int metadataInconsistencyCount = counts.getLeft(); + int copyInconsistencyCount = counts.getRight(); + final Path testpath1 = + writeEventuallyConsistentFileVersion("rename-eventually1.dat", + s3ClientSpy, + 0, + metadataInconsistencyCount, + copyInconsistencyCount); + + final Path dest1 = path("dest1.dat"); + // shouldn't fail since the inconsistency doesn't last through the + // configured retry limit + fs.rename(testpath1, dest1); + } + + /** + * Tests doing a rename() on a file where the version visible in S3 does not + * match the version in the metadata store until a certain number of retries + * has been met. + * The test expects failure by AWSClientIOException caused by NPE due to + * https://github.com/aws/aws-sdk-java/issues/1644 + */ + @Test + public void testRenameEventuallyConsistentFileNPE() throws Throwable { + requireS3Guard(); + skipIfVersionPolicyAndNoVersionId(); + AmazonS3 s3ClientSpy = spyOnFilesystem(); + + Pair counts = renameInconsistencyCounts( + getFileStatusHeadCount()); + int metadataInconsistencyCount = counts.getLeft(); + int copyInconsistencyCount = counts.getRight(); + // giving copyInconsistencyCount + 1 here should trigger the failure, + // exceeding the retry limit + final Path testpath2 = + writeEventuallyConsistentFileVersion("rename-eventuallyNPE.dat", + s3ClientSpy, + 0, + metadataInconsistencyCount, + copyInconsistencyCount + 1); + final Path dest2 = path("destNPE.dat"); + if (expectedExceptionInteractions.contains( + InteractionType.EVENTUALLY_CONSISTENT_COPY)) { + // should fail since the inconsistency is set up to persist longer than + // the configured retry limit + // the expected exception is not RemoteFileChangedException due to + // https://github.com/aws/aws-sdk-java/issues/1644 + // If this test is failing after an AWS SDK update, + // then it means the SDK bug is fixed. + // Please update this test to match the new behavior. + AWSClientIOException exception = + intercept(AWSClientIOException.class, + "Unable to complete transfer: null", + "expected copy() failure", + () -> fs.rename(testpath2, dest2)); + AmazonClientException cause = exception.getCause(); + if (cause == null) { + // no cause; something else went wrong: throw. 
+ throw new AssertionError("No inner cause", + exception); + } + Throwable causeCause = cause.getCause(); + if (!(causeCause instanceof NullPointerException)) { + // null causeCause or it is the wrong type: throw + throw new AssertionError("Innermost cause is not NPE", + exception); + } + } else { + fs.rename(testpath2, dest2); + } + } + + /** + * Tests doing a rename() on a file where the version visible in S3 does not + * match the version in the metadata store until a certain number of retries + * has been met. + * The test expects failure by RemoteFileChangedException. + */ + @Test + public void testRenameEventuallyConsistentFileRFCE() throws Throwable { + requireS3Guard(); + skipIfVersionPolicyAndNoVersionId(); + AmazonS3 s3ClientSpy = spyOnFilesystem(); + + Pair counts = renameInconsistencyCounts( + getFileStatusHeadCount()); + int metadataInconsistencyCount = counts.getLeft(); + int copyInconsistencyCount = counts.getRight(); + // giving metadataInconsistencyCount + 1 here should trigger the failure, + // exceeding the retry limit + final Path testpath2 = + writeEventuallyConsistentFileVersion("rename-eventuallyRFCE.dat", + s3ClientSpy, + 0, + metadataInconsistencyCount + 1, + copyInconsistencyCount); + final Path dest2 = path("destRFCE.dat"); + if (expectedExceptionInteractions.contains( + InteractionType.EVENTUALLY_CONSISTENT_METADATA)) { + // should fail since the inconsistency is set up to persist longer than + // the configured retry limit + intercept(RemoteFileChangedException.class, + CHANGE_DETECTED, + "expected copy() failure", + () -> fs.rename(testpath2, dest2)); + } else { + fs.rename(testpath2, dest2); + } + } + + /** + * Tests doing a rename() on a directory containing + * an file which is eventually consistent. + * There is no call to getFileStatus on the source file whose + * inconsistency is simulated; the state of S3Guard auth mode is not + * relevant. + */ + @Test + public void testRenameEventuallyConsistentDirectory() throws Throwable { + requireS3Guard(); + AmazonS3 s3ClientSpy = spyOnFilesystem(); + Path basedir = path(); + Path sourcedir = new Path(basedir, "sourcedir"); + fs.mkdirs(sourcedir); + Path destdir = new Path(basedir, "destdir"); + String inconsistent = "inconsistent"; + String consistent = "consistent"; + Path inconsistentFile = new Path(sourcedir, inconsistent); + Path consistentFile = new Path(sourcedir, consistent); + + // write the consistent data + writeDataset(fs, consistentFile, TEST_DATA_BYTES, TEST_DATA_BYTES.length, + 1024, true, true); + + Pair counts = renameInconsistencyCounts(0); + int metadataInconsistencyCount = counts.getLeft(); + int copyInconsistencyCount = counts.getRight(); + + writeEventuallyConsistentData( + s3ClientSpy, + inconsistentFile, + TEST_DATA_BYTES, + 0, + metadataInconsistencyCount, + copyInconsistencyCount); + + // must not fail since the inconsistency doesn't last through the + // configured retry limit + fs.rename(sourcedir, destdir); + } + + /** + * Ensures a file can be renamed when there is no version metadata + * (ETag, versionId). + */ + @Test + public void testRenameWithNoVersionMetadata() throws Throwable { + final Path testpath = + writeFileWithNoVersionMetadata("renamenoversion.dat"); + + final Path dest = path("noversiondest.dat"); + fs.rename(testpath, dest); + assertEquals("Contents of " + dest, + TEST_DATA, + readUTF8(fs, dest, -1)); + } + + /** + * Ensures S3Guard and retries allow an eventually consistent read. 
+ */ + @Test + public void testReadAfterEventuallyConsistentWrite() throws Throwable { + requireS3Guard(); + AmazonS3 s3ClientSpy = spyOnFilesystem(); + final Path testpath1 = + writeEventuallyConsistentFileVersion("eventually1.dat", + s3ClientSpy, TEST_MAX_RETRIES, 0 , 0); + + try (FSDataInputStream instream1 = fs.open(testpath1)) { + // succeeds on the last retry + instream1.read(); + } + } + + /** + * Ensures S3Guard and retries allow an eventually consistent read. + */ + @Test + public void testReadAfterEventuallyConsistentWrite2() throws Throwable { + requireS3Guard(); + AmazonS3 s3ClientSpy = spyOnFilesystem(); + final Path testpath2 = + writeEventuallyConsistentFileVersion("eventually2.dat", + s3ClientSpy, TEST_MAX_RETRIES + 1, 0, 0); + + try (FSDataInputStream instream2 = fs.open(testpath2)) { + if (expectedExceptionInteractions.contains( + InteractionType.EVENTUALLY_CONSISTENT_READ)) { + // keeps retrying and eventually gives up with RemoteFileChangedException + expectReadFailure(instream2); + } else { + instream2.read(); + } + } + } + + /** + * Ensures read on re-open (after seek backwards) when S3 does not return the + * version of the file tracked in the metadata store fails immediately. No + * retries should happen since a retry is not expected to recover. + */ + @Test + public void testEventuallyConsistentReadOnReopen() throws Throwable { + requireS3Guard(); + AmazonS3 s3ClientSpy = spyOnFilesystem(); + String filename = "eventually-reopen.dat"; + final Path testpath = + writeEventuallyConsistentFileVersion(filename, + s3ClientSpy, 0, 0, 0); + + try (FSDataInputStream instream = fs.open(testpath)) { + instream.read(); + // overwrite the file, returning inconsistent version for + // (effectively) infinite retries + writeEventuallyConsistentFileVersion(filename, s3ClientSpy, + Integer.MAX_VALUE, 0, 0); + instream.seek(0); + if (expectedExceptionInteractions.contains(InteractionType.READ)) { + // if it retries at all, it will retry forever, which should fail + // the test. The expected behavior is immediate + // RemoteFileChangedException. + expectReadFailure(instream); + } else { + instream.read(); + } + } + } + + /** + * Writes a file with old ETag and versionId in the metadata store such + * that the metadata is out of sync with S3. Attempts to read such a file + * should result in {@link RemoteFileChangedException}. + */ + private Path writeOutOfSyncFileVersion(String filename) throws IOException { + final Path testpath = path(filename); + final byte[] dataset = TEST_DATA_BYTES; + writeDataset(fs, testpath, dataset, dataset.length, + 1024, false); + S3AFileStatus originalStatus = (S3AFileStatus) fs.getFileStatus(testpath); + + // overwrite with half the content + writeDataset(fs, testpath, dataset, dataset.length / 2, + 1024, true); + + S3AFileStatus newStatus = (S3AFileStatus) fs.getFileStatus(testpath); + + // put back the original etag, versionId + S3AFileStatus forgedStatus = + S3AFileStatus.fromFileStatus(newStatus, Tristate.FALSE, + originalStatus.getETag(), originalStatus.getVersionId()); + fs.getMetadataStore().put( + new PathMetadata(forgedStatus, Tristate.FALSE, false)); + + return testpath; + } + + /** + * Writes {@link #TEST_DATA} to a file where the file will be inconsistent + * in S3 for a set of operations. + * The duration of the inconsistency is controlled by the + * getObjectInconsistencyCount, getMetadataInconsistencyCount, and + * copyInconsistentCallCount parameters. 
+ * The inconsistency manifests in AmazonS3#getObject, + * AmazonS3#getObjectMetadata, and AmazonS3#copyObject. + * This method sets up the provided s3ClientSpy to return a response to each + * of these methods indicating an inconsistency where the requested object + * version (eTag or versionId) is not available until a certain retry + * threshold is met. + * Providing inconsistent call count values above or + * below the overall retry limit allows a test to simulate a condition that + * either should or should not result in an overall failure from retry + * exhaustion. + * @param filename name of file (will be under test path) + * @param s3ClientSpy s3 client to patch + * @param getObjectInconsistencyCount number of GET inconsistencies + * @param getMetadataInconsistencyCount number of HEAD inconsistencies + * @param copyInconsistencyCount number of COPY inconsistencies. + * @return the path written + * @throws IOException failure to write the test data. + */ + private Path writeEventuallyConsistentFileVersion(String filename, + AmazonS3 s3ClientSpy, + int getObjectInconsistencyCount, + int getMetadataInconsistencyCount, + int copyInconsistencyCount) + throws IOException { + return writeEventuallyConsistentData(s3ClientSpy, + path(filename), + TEST_DATA_BYTES, + getObjectInconsistencyCount, + getMetadataInconsistencyCount, + copyInconsistencyCount); + } + + /** + * Writes data to a path and configures the S3 client for inconsistent + * HEAD, GET or COPY operations. + * @param testpath absolute path of file + * @param s3ClientSpy s3 client to patch + * @param dataset bytes to write. + * @param getObjectInconsistencyCount number of GET inconsistencies + * @param getMetadataInconsistencyCount number of HEAD inconsistencies + * @param copyInconsistencyCount number of COPY inconsistencies. + * @return the path written + * @throws IOException failure to write the test data. + */ + private Path writeEventuallyConsistentData(final AmazonS3 s3ClientSpy, + final Path testpath, + final byte[] dataset, + final int getObjectInconsistencyCount, + final int getMetadataInconsistencyCount, + final int copyInconsistencyCount) + throws IOException { + writeDataset(fs, testpath, dataset, dataset.length, + 1024, true); + S3AFileStatus originalStatus = (S3AFileStatus) fs.getFileStatus(testpath); + + // overwrite with half the content + writeDataset(fs, testpath, dataset, dataset.length / 2, + 1024, true); + + LOG.debug("Original file info: {}: version={}, etag={}", testpath, + originalStatus.getVersionId(), originalStatus.getETag()); + + S3AFileStatus newStatus = (S3AFileStatus) fs.getFileStatus(testpath); + LOG.debug("Updated file info: {}: version={}, etag={}", testpath, + newStatus.getVersionId(), newStatus.getETag()); + + stubTemporaryUnavailable(s3ClientSpy, getObjectInconsistencyCount, + testpath, newStatus); + + stubTemporaryWrongVersion(s3ClientSpy, getObjectInconsistencyCount, + testpath, originalStatus); + + if (versionCheckingIsOnServer()) { + // only stub inconsistency when mode is server since no constraints that + // should trigger inconsistency are passed in any other mode + stubTemporaryCopyInconsistency(s3ClientSpy, testpath, newStatus, + copyInconsistencyCount); + } + + stubTemporaryMetadataInconsistency(s3ClientSpy, testpath, originalStatus, + newStatus, getMetadataInconsistencyCount); + + return testpath; + } + + /** + * Log the call hierarchy at debug level, helps track down + * where calls to operations are coming from. 
+ */ + private void logLocationAtDebug() { + if (LOG.isDebugEnabled()) { + LOG.debug("Call hierarchy", new Exception("here")); + } + } + + /** + * Stubs {@link AmazonS3#getObject(GetObjectRequest)} + * within s3ClientSpy to return null until inconsistentCallCount calls have + * been made. The null response simulates what occurs when an object + * matching the specified ETag or versionId is not available. + * @param s3ClientSpy the spy to stub + * @param inconsistentCallCount the number of calls that should return the + * null response + * @param testpath the path of the object the stub should apply to + */ + private void stubTemporaryUnavailable(AmazonS3 s3ClientSpy, + int inconsistentCallCount, Path testpath, + S3AFileStatus newStatus) { + Answer temporarilyUnavailableAnswer = new Answer() { + private int callCount = 0; + + @Override + public S3Object answer(InvocationOnMock invocation) throws Throwable { + // simulates ETag or versionId constraint not met until + // inconsistentCallCount surpassed + callCount++; + if (callCount <= inconsistentCallCount) { + LOG.info("Temporarily unavailable {} count {} of {}", + testpath, callCount, inconsistentCallCount); + logLocationAtDebug(); + return null; + } + return (S3Object) invocation.callRealMethod(); + } + }; + + // match the requests that would be made in either server-side change + // detection mode + doAnswer(temporarilyUnavailableAnswer).when(s3ClientSpy) + .getObject( + matchingGetObjectRequest( + testpath, newStatus.getETag(), null)); + doAnswer(temporarilyUnavailableAnswer).when(s3ClientSpy) + .getObject( + matchingGetObjectRequest( + testpath, null, newStatus.getVersionId())); + } + + /** + * Stubs {@link AmazonS3#getObject(GetObjectRequest)} + * within s3ClientSpy to return an object modified to contain metadata + * from originalStatus until inconsistentCallCount calls have been made. 
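stubTemporaryUnavailable above, and the other stub* helpers that follow, all use the same Mockito idiom: a stateful Answer that misbehaves for the first N invocations and then delegates to the real spied client. A stripped-down, self-contained sketch of that idiom with stock Mockito, independent of the AWS types; the class name and the List example are illustrative only.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import org.mockito.Mockito;
import org.mockito.stubbing.Answer;

import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.Mockito.doAnswer;

public final class FlakyAnswerSketch {
  public static void main(String[] args) {
    List<String> spy = Mockito.spy(new ArrayList<>(Arrays.asList("a", "b")));
    AtomicInteger calls = new AtomicInteger();
    int failures = 2;

    // Misbehave (return null, as if the requested object version is not yet
    // visible) for the first `failures` calls, then delegate to the real method.
    Answer<String> flaky = invocation ->
        calls.incrementAndGet() <= failures
            ? null
            : (String) invocation.callRealMethod();

    doAnswer(flaky).when(spy).get(anyInt());

    System.out.println(spy.get(0)); // null
    System.out.println(spy.get(0)); // null
    System.out.println(spy.get(0)); // "a" from the real list
  }
}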
+ * @param s3ClientSpy the spy to stub + * @param testpath the path of the object the stub should apply to + * @param inconsistentCallCount the number of calls that should return the + * null response + * @param originalStatus the status metadata to inject into the + * inconsistentCallCount responses + */ + private void stubTemporaryWrongVersion(AmazonS3 s3ClientSpy, + int inconsistentCallCount, Path testpath, + S3AFileStatus originalStatus) { + Answer temporarilyWrongVersionAnswer = new Answer() { + private int callCount = 0; + + @Override + public S3Object answer(InvocationOnMock invocation) throws Throwable { + // simulates old ETag or versionId until inconsistentCallCount surpassed + callCount++; + S3Object s3Object = (S3Object) invocation.callRealMethod(); + if (callCount <= inconsistentCallCount) { + LOG.info("Temporary Wrong Version {} count {} of {}", + testpath, callCount, inconsistentCallCount); + logLocationAtDebug(); + S3Object objectSpy = Mockito.spy(s3Object); + ObjectMetadata metadataSpy = + Mockito.spy(s3Object.getObjectMetadata()); + when(objectSpy.getObjectMetadata()).thenReturn(metadataSpy); + when(metadataSpy.getETag()).thenReturn(originalStatus.getETag()); + when(metadataSpy.getVersionId()) + .thenReturn(originalStatus.getVersionId()); + return objectSpy; + } + return s3Object; + } + }; + + // match requests that would be made in client-side change detection + doAnswer(temporarilyWrongVersionAnswer).when(s3ClientSpy).getObject( + matchingGetObjectRequest(testpath, null, null)); + } + + /** + * Stubs {@link AmazonS3#copyObject(CopyObjectRequest)} + * within s3ClientSpy to return null (indicating preconditions not met) until + * copyInconsistentCallCount calls have been made. + * @param s3ClientSpy the spy to stub + * @param testpath the path of the object the stub should apply to + * @param newStatus the status metadata containing the ETag and versionId + * that should be matched in order for the stub to apply + * @param copyInconsistentCallCount how many times to return the + * precondition failed error + */ + private void stubTemporaryCopyInconsistency(AmazonS3 s3ClientSpy, + Path testpath, S3AFileStatus newStatus, + int copyInconsistentCallCount) { + Answer temporarilyPreconditionsNotMetAnswer = + new Answer() { + private int callCount = 0; + + @Override + public CopyObjectResult answer(InvocationOnMock invocation) + throws Throwable { + callCount++; + if (callCount <= copyInconsistentCallCount) { + String message = "preconditions not met on call " + callCount + + " of " + copyInconsistentCallCount; + LOG.info("Copying {}: {}", testpath, message); + logLocationAtDebug(); + return null; + } + return (CopyObjectResult) invocation.callRealMethod(); + } + }; + + // match requests made during copy + doAnswer(temporarilyPreconditionsNotMetAnswer).when(s3ClientSpy).copyObject( + matchingCopyObjectRequest(testpath, newStatus.getETag(), null)); + doAnswer(temporarilyPreconditionsNotMetAnswer).when(s3ClientSpy).copyObject( + matchingCopyObjectRequest(testpath, null, newStatus.getVersionId())); + } + + /** + * Stubs {@link AmazonS3#getObjectMetadata(GetObjectMetadataRequest)} + * within s3ClientSpy to return metadata from originalStatus until + * metadataInconsistentCallCount calls have been made. 
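stubTemporaryWrongVersion and stubTemporaryMetadataInconsistency share a second idiom: wrap the real response in a Mockito spy and override only the ETag/versionId accessors, so the content stream and remaining headers still come from S3. A minimal sketch of that partial-spy pattern on the SDK's ObjectMetadata; the helper name is illustrative and would live in the test class.

import com.amazonaws.services.s3.model.ObjectMetadata;
import org.mockito.Mockito;

import static org.mockito.Mockito.when;

/** Sketch: given real metadata, return a spy that lies only about version attributes. */
static ObjectMetadata withStaleVersion(ObjectMetadata real,
    String staleETag, String staleVersionId) {
  ObjectMetadata spy = Mockito.spy(real);
  when(spy.getETag()).thenReturn(staleETag);           // override just this accessor
  when(spy.getVersionId()).thenReturn(staleVersionId); // everything else hits the real object
  return spy;
}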
+ * @param s3ClientSpy the spy to stub + * @param testpath the path of the object the stub should apply to + * @param originalStatus the inconsistent status metadata to return + * @param newStatus the status metadata to return after + * metadataInconsistentCallCount is met + * @param metadataInconsistentCallCount how many times to return the + * inconsistent metadata + */ + private void stubTemporaryMetadataInconsistency(AmazonS3 s3ClientSpy, + Path testpath, S3AFileStatus originalStatus, + S3AFileStatus newStatus, int metadataInconsistentCallCount) { + Answer temporarilyOldMetadataAnswer = + new Answer() { + private int callCount = 0; + + @Override + public ObjectMetadata answer(InvocationOnMock invocation) + throws Throwable { + ObjectMetadata objectMetadata = + (ObjectMetadata) invocation.callRealMethod(); + callCount++; + if (callCount <= metadataInconsistentCallCount) { + LOG.info("Inconsistent metadata {} count {} of {}", + testpath, callCount, metadataInconsistentCallCount); + logLocationAtDebug(); + ObjectMetadata metadataSpy = + Mockito.spy(objectMetadata); + when(metadataSpy.getETag()).thenReturn(originalStatus.getETag()); + when(metadataSpy.getVersionId()) + .thenReturn(originalStatus.getVersionId()); + return metadataSpy; + } + return objectMetadata; + } + }; + + // match requests made during select + doAnswer(temporarilyOldMetadataAnswer).when(s3ClientSpy).getObjectMetadata( + matchingMetadataRequest(testpath, null)); + doAnswer(temporarilyOldMetadataAnswer).when(s3ClientSpy).getObjectMetadata( + matchingMetadataRequest(testpath, newStatus.getVersionId())); + } + + /** + * Writes a file with null ETag and versionId in the metadata store. + */ + private Path writeFileWithNoVersionMetadata(String filename) + throws IOException { + final Path testpath = path(filename); + writeDataset(fs, testpath, TEST_DATA_BYTES, TEST_DATA_BYTES.length, + 1024, false); + S3AFileStatus originalStatus = (S3AFileStatus) fs.getFileStatus(testpath); + + // remove ETag and versionId + S3AFileStatus newStatus = S3AFileStatus.fromFileStatus(originalStatus, + Tristate.FALSE, null, null); + fs.getMetadataStore().put(new PathMetadata(newStatus, Tristate.FALSE, + false)); + + return testpath; + } + + /** + * The test is invalid if the policy uses versionId but the bucket doesn't + * have versioning enabled. + * + * Tests the given file for a versionId to detect whether bucket versioning + * is enabled. + */ + private void skipIfVersionPolicyAndNoVersionId(Path testpath) + throws IOException { + if (fs.getChangeDetectionPolicy().getSource() == Source.VersionId) { + // skip versionId tests if the bucket doesn't have object versioning + // enabled + Assume.assumeTrue( + "Target filesystem does not support versioning", + fs.getObjectMetadata(fs.pathToKey(testpath)).getVersionId() != null); + } + } + + /** + * Like {@link #skipIfVersionPolicyAndNoVersionId(Path)} but generates a new + * file to test versionId against. 
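The skipIfVersionPolicyAndNoVersionId guards keep versionId-based parameterizations from failing on buckets without object versioning: JUnit's Assume marks the test skipped instead. A minimal standalone sketch of that guard; fetchVersionId() is a hypothetical probe standing in for the getObjectMetadata call the real helper makes.

import org.junit.Assume;

// Inside a @Before or @Test method: skip, rather than fail, when the bucket
// cannot supply what the test needs.
String versionId = fetchVersionId(); // hypothetical probe for bucket versioning support
Assume.assumeTrue("Target filesystem does not support versioning",
    versionId != null);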
+ */ + private void skipIfVersionPolicyAndNoVersionId() throws IOException { + if (fs.getChangeDetectionPolicy().getSource() == Source.VersionId) { + Path versionIdFeatureTestFile = path("versionIdTest"); + writeDataset(fs, versionIdFeatureTestFile, TEST_DATA_BYTES, + TEST_DATA_BYTES.length, 1024, true, true); + skipIfVersionPolicyAndNoVersionId(versionIdFeatureTestFile); + } + } + + private GetObjectRequest matchingGetObjectRequest(Path path, String eTag, + String versionId) { + return ArgumentMatchers.argThat(request -> { + if (request.getBucketName().equals(fs.getBucket()) + && request.getKey().equals(fs.pathToKey(path))) { + if (eTag == null && !request.getMatchingETagConstraints().isEmpty()) { + return false; + } + if (eTag != null && + !request.getMatchingETagConstraints().contains(eTag)) { + return false; + } + if (versionId == null && request.getVersionId() != null) { + return false; + } + if (versionId != null && !versionId.equals(request.getVersionId())) { + return false; + } + return true; + } + return false; + }); + } + + private CopyObjectRequest matchingCopyObjectRequest(Path path, String eTag, + String versionId) { + return ArgumentMatchers.argThat(request -> { + if (request.getSourceBucketName().equals(fs.getBucket()) + && request.getSourceKey().equals(fs.pathToKey(path))) { + if (eTag == null && !request.getMatchingETagConstraints().isEmpty()) { + return false; + } + if (eTag != null && + !request.getMatchingETagConstraints().contains(eTag)) { + return false; + } + if (versionId == null && request.getSourceVersionId() != null) { + return false; + } + if (versionId != null && + !versionId.equals(request.getSourceVersionId())) { + return false; + } + return true; + } + return false; + }); + } + + private GetObjectMetadataRequest matchingMetadataRequest(Path path, + String versionId) { + return ArgumentMatchers.argThat(request -> { + if (request.getBucketName().equals(fs.getBucket()) + && request.getKey().equals(fs.pathToKey(path))) { + if (versionId == null && request.getVersionId() != null) { + return false; + } + if (versionId != null && + !versionId.equals(request.getVersionId())) { + return false; + } + return true; + } + return false; + }); + } + + /** + * Skip a test case if it needs S3Guard and the filesystem does + * not have it. + */ + private void requireS3Guard() { + Assume.assumeTrue("S3Guard must be enabled", fs.hasMetadataStore()); + } + + /** + * Skip a test case if S3 Select is not supported on this store. + */ + private void requireS3Select() { + Assume.assumeTrue("S3 Select is not enabled", + getFileSystem().hasCapability(S3_SELECT_CAPABILITY)); + } + + /** + * Spy on the filesystem at the S3 client level. + * @return a mocked S3 client to which the test FS is bonded. + */ + private AmazonS3 spyOnFilesystem() { + AmazonS3 s3ClientSpy = Mockito.spy( + fs.getAmazonS3ClientForTesting("mocking")); + fs.setAmazonS3Client(s3ClientSpy); + return s3ClientSpy; + } + + /** + * Expect reading this stream to fail. + * @param instream input stream. + * @return the caught exception. + * @throws Exception an other exception + */ + + private RemoteFileChangedException expectReadFailure( + final FSDataInputStream instream) + throws Exception { + return intercept(RemoteFileChangedException.class, "", + "read() returned", + () -> readToText(instream.read())); + } + + /** + * Convert the result of a read to a text string for errors. + * @param r result of the read() call. + * @return a string for exception text. + */ + private String readToText(int r) { + return r < 32 + ? 
(String.format("%02d", r)) + : (String.format("%c", (char) r)); + } + + /** + * Is the version checking on the server? + * @return true if the server returns 412 errors. + */ + private boolean versionCheckingIsOnServer() { + return fs.getChangeDetectionPolicy().getMode() == Mode.Server; + } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardListConsistency.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardListConsistency.java index a1df1a5fb527d..739d08807bdee 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardListConsistency.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3GuardListConsistency.java @@ -30,6 +30,7 @@ import org.apache.hadoop.fs.contract.AbstractFSContract; import org.apache.hadoop.fs.contract.s3a.S3AContract; +import com.google.common.collect.Lists; import org.junit.Assume; import org.junit.Test; @@ -249,7 +250,7 @@ public void testConsistentRenameAfterDelete() throws Exception { DEFAULT_DELAY_KEY_SUBSTRING))); try { - RemoteIterator old = fs.listFilesAndEmptyDirectories( + RemoteIterator old = fs.listFilesAndEmptyDirectories( path("a"), true); fail("Recently renamed dir should not be visible"); } catch(FileNotFoundException e) { @@ -553,6 +554,44 @@ public void testInconsistentS3ClientDeletes() throws Throwable { ); } + /** + * Tests that the file's eTag and versionId are preserved in recursive + * listings. + */ + @Test + public void testListingReturnsVersionMetadata() throws Throwable { + S3AFileSystem fs = getFileSystem(); + Assume.assumeTrue(fs.hasMetadataStore()); + + // write simple file + Path file = path("file1"); + try (FSDataOutputStream outputStream = fs.create(file)) { + outputStream.writeChars("hello"); + } + + // get individual file status + FileStatus[] fileStatuses = fs.listStatus(file); + assertEquals(1, fileStatuses.length); + S3AFileStatus status = (S3AFileStatus) fileStatuses[0]; + String eTag = status.getETag(); + String versionId = status.getVersionId(); + + // get status through recursive directory listing + RemoteIterator filesIterator = fs.listFiles( + file.getParent(), true); + List files = Lists.newArrayList(); + while (filesIterator.hasNext()) { + files.add(filesIterator.next()); + } + assertEquals(1, files.size()); + + // ensure eTag and versionId are preserved in directory listing + S3ALocatedFileStatus locatedFileStatus = + (S3ALocatedFileStatus) files.get(0); + assertEquals(eTag, locatedFileStatus.getETag()); + assertEquals(versionId, locatedFileStatus.getVersionId()); + } + /** * Assert that the two list sizes match; failure message includes the lists. 
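testListingReturnsVersionMetadata drains the RemoteIterator returned by listFiles by hand, since RemoteIterator is not an Iterable and both of its methods can throw IOException. The same loop is reusable as a small generic helper; the name toList is illustrative.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.RemoteIterator;

/** Sketch: drain a RemoteIterator into a list. */
static <T> List<T> toList(RemoteIterator<T> it) throws IOException {
  List<T> result = new ArrayList<>();
  while (it.hasNext()) {
    result.add(it.next());
  }
  return result;
}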
* @param message text for the assertion diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3AFileSystem.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3AFileSystem.java index 51ff299e7be08..0e091a9e9cf0a 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/MockS3AFileSystem.java @@ -177,7 +177,7 @@ public boolean exists(Path f) throws IOException { } @Override - void finishedWrite(String key, long length) { + void finishedWrite(String key, long length, String eTag, String versionId) { } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java index 484f079e3e6d6..3f1eafaa16339 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java @@ -979,30 +979,25 @@ public static void verifyFileStatus(FileStatus status, * Verify the status entry of a directory matches that expected. * @param status status entry to check * @param replication replication factor - * @param modTime modified time - * @param accessTime access time * @param owner owner - * @param group user group - * @param permission permission. */ - public static void verifyDirStatus(FileStatus status, + public static void verifyDirStatus(S3AFileStatus status, int replication, - long modTime, - long accessTime, - String owner, - String group, - FsPermission permission) { + String owner) { String details = status.toString(); assertTrue("Is a dir: " + details, status.isDirectory()); assertEquals("zero length: " + details, 0, status.getLen()); - - assertEquals("Mod time: " + details, modTime, status.getModificationTime()); + // S3AFileStatus always assigns modTime = System.currentTimeMillis() + assertTrue("Mod time: " + details, status.getModificationTime() > 0); assertEquals("Replication value: " + details, replication, status.getReplication()); - assertEquals("Access time: " + details, accessTime, status.getAccessTime()); + assertEquals("Access time: " + details, 0, status.getAccessTime()); assertEquals("Owner: " + details, owner, status.getOwner()); - assertEquals("Group: " + details, group, status.getGroup()); - assertEquals("Permission: " + details, permission, status.getPermission()); + // S3AFileStatus always assigns group=owner + assertEquals("Group: " + details, owner, status.getGroup()); + // S3AFileStatus always assigns permission = default + assertEquals("Permission: " + details, + FsPermission.getDefault(), status.getPermission()); } /** diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestListing.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestListing.java index 39a5e3bd87d25..1a533bfe64609 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestListing.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestListing.java @@ -19,7 +19,6 @@ package org.apache.hadoop.fs.s3a; import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.RemoteIterator; import org.junit.Assert; @@ -40,11 +39,11 @@ */ public class TestListing extends AbstractS3AMockTest { - private static class MockRemoteIterator implements - RemoteIterator { - private 
Iterator iterator; + private static class MockRemoteIterator implements + RemoteIterator { + private Iterator iterator; - MockRemoteIterator(Collection source) { + MockRemoteIterator(Collection source) { iterator = source.iterator(); } @@ -52,13 +51,13 @@ public boolean hasNext() { return iterator.hasNext(); } - public FileStatus next() { + public S3AFileStatus next() { return iterator.next(); } } - private FileStatus blankFileStatus(Path path) { - return new FileStatus(0, true, 0, 0, 0, path); + private S3AFileStatus blankFileStatus(Path path) { + return new S3AFileStatus(Tristate.UNKNOWN, path, null); } @Test @@ -78,11 +77,11 @@ public void testTombstoneReconcilingIterator() throws Exception { Set tombstones = new HashSet<>(); tombstones.add(deletedChild); - RemoteIterator sourceIterator = new MockRemoteIterator( + RemoteIterator sourceIterator = new MockRemoteIterator( statuses); - RemoteIterator locatedIterator = + RemoteIterator locatedIterator = listing.createLocatedFileStatusIterator(sourceIterator); - RemoteIterator reconcilingIterator = + RemoteIterator reconcilingIterator = listing.createTombstoneReconcilingIterator(locatedIterator, tombstones); Set expectedPaths = new HashSet<>(); @@ -98,8 +97,12 @@ public void testTombstoneReconcilingIterator() throws Exception { @Test public void testProvidedFileStatusIteratorEnd() throws Exception { - FileStatus[] statuses = { - new FileStatus(100, false, 1, 8192, 0, new Path("s3a://blah/blah")) + S3AFileStatus s3aStatus = new S3AFileStatus( + 100, 0, new Path("s3a://blah/blah"), + 8192, null, null, null); + + S3AFileStatus[] statuses = { + s3aStatus }; ProvidedFileStatusIterator it = new ProvidedFileStatusIterator(statuses, ACCEPT_ALL, new Listing.AcceptAllButS3nDirs()); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestStreamChangeTracker.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestStreamChangeTracker.java index f073c4c486d2f..c645ac5ad807d 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestStreamChangeTracker.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestStreamChangeTracker.java @@ -20,18 +20,23 @@ import java.util.concurrent.atomic.AtomicLong; +import com.amazonaws.AmazonServiceException; +import com.amazonaws.SdkBaseException; import com.amazonaws.services.s3.Headers; +import com.amazonaws.services.s3.model.CopyObjectRequest; import com.amazonaws.services.s3.model.GetObjectRequest; import com.amazonaws.services.s3.model.ObjectMetadata; import com.amazonaws.services.s3.model.S3Object; -import org.apache.hadoop.fs.PathIOException; +import com.amazonaws.services.s3.transfer.model.CopyResult; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.fs.PathIOException; import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy; import org.apache.hadoop.fs.s3a.impl.ChangeTracker; import org.apache.hadoop.test.HadoopTestBase; +import org.apache.http.HttpStatus; import static org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy.CHANGE_DETECTED; import static org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy.createPolicy; @@ -50,6 +55,8 @@ public class TestStreamChangeTracker extends HadoopTestBase { public static final String OBJECT = "object"; + public static final String DEST_OBJECT = "new_object"; + public static final String URI = "s3a://" + BUCKET + "/" + OBJECT; @Test @@ -161,12 +168,108 @@ public void testVersionCheckingOnServer() throws Throwable { CHANGE_DETECTED); } + 
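The MockRemoteIterator change in TestListing above is the minimal adapter needed to feed S3AFileStatus entries into Listing: a RemoteIterator view over an in-memory collection. For reference, a fully typed standalone version of such an adapter looks roughly like this (illustrative, not the patch's exact class).

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;

import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.s3a.S3AFileStatus;

/** Sketch: a RemoteIterator backed by an in-memory collection. */
class CollectionRemoteIterator implements RemoteIterator<S3AFileStatus> {
  private final Iterator<S3AFileStatus> iterator;

  CollectionRemoteIterator(Collection<S3AFileStatus> source) {
    this.iterator = source.iterator();
  }

  @Override
  public boolean hasNext() throws IOException {
    return iterator.hasNext();
  }

  @Override
  public S3AFileStatus next() throws IOException {
    return iterator.next();
  }
}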
@Test + public void testVersionCheckingUpfrontETag() throws Throwable { + ChangeTracker tracker = newTracker( + ChangeDetectionPolicy.Mode.Server, + ChangeDetectionPolicy.Source.ETag, + false, + objectAttributes("etag1", "versionid1")); + + assertEquals("etag1", tracker.getRevisionId()); + } + + @Test + public void testVersionCheckingUpfrontVersionId() throws Throwable { + ChangeTracker tracker = newTracker( + ChangeDetectionPolicy.Mode.Server, + ChangeDetectionPolicy.Source.VersionId, + false, + objectAttributes("etag1", "versionid1")); + + assertEquals("versionid1", tracker.getRevisionId()); + } + + @Test + public void testVersionCheckingETagCopyServer() throws Throwable { + ChangeTracker tracker = newTracker( + ChangeDetectionPolicy.Mode.Server, + ChangeDetectionPolicy.Source.VersionId, + false, + objectAttributes("etag1", "versionid1")); + assertConstraintApplied(tracker, newCopyObjectRequest()); + } + + @Test + public void testVersionCheckingETagCopyClient() throws Throwable { + ChangeTracker tracker = newTracker( + ChangeDetectionPolicy.Mode.Client, + ChangeDetectionPolicy.Source.VersionId, + false, + objectAttributes("etag1", "versionid1")); + assertFalse("Tracker should not have applied contraints " + tracker, + tracker.maybeApplyConstraint(newCopyObjectRequest())); + } + + @Test + public void testCopyVersionIdRequired() throws Throwable { + ChangeTracker tracker = newTracker( + ChangeDetectionPolicy.Mode.Client, + ChangeDetectionPolicy.Source.VersionId, + true, + objectAttributes("etag1", "versionId")); + + expectNoVersionAttributeException(tracker, newCopyResult("etag1", + null), + "policy requires VersionId"); + } + + @Test + public void testCopyETagRequired() throws Throwable { + ChangeTracker tracker = newTracker( + ChangeDetectionPolicy.Mode.Client, + ChangeDetectionPolicy.Source.ETag, + true, + objectAttributes("etag1", "versionId")); + + expectNoVersionAttributeException(tracker, newCopyResult(null, + "versionId"), + "policy requires ETag"); + } + + @Test + public void testCopyVersionMismatch() throws Throwable { + ChangeTracker tracker = newTracker( + ChangeDetectionPolicy.Mode.Server, + ChangeDetectionPolicy.Source.ETag, + true, + objectAttributes("etag", "versionId")); + + // 412 is translated to RemoteFileChangedException + // note: this scenario is never currently hit due to + // https://github.com/aws/aws-sdk-java/issues/1644 + AmazonServiceException awsException = + new AmazonServiceException("aws exception"); + awsException.setStatusCode(HttpStatus.SC_PRECONDITION_FAILED); + expectChangeException(tracker, awsException, "copy", + RemoteFileChangedException.PRECONDITIONS_FAILED); + + // processing another type of exception does nothing + tracker.processException(new SdkBaseException("foo"), "copy"); + } + protected void assertConstraintApplied(final ChangeTracker tracker, final GetObjectRequest request) { assertTrue("Tracker should have applied contraints " + tracker, tracker.maybeApplyConstraint(request)); } + protected void assertConstraintApplied(final ChangeTracker tracker, + final CopyObjectRequest request) throws PathIOException { + assertTrue("Tracker should have applied contraints " + tracker, + tracker.maybeApplyConstraint(request)); + } + protected RemoteFileChangedException expectChangeException( final ChangeTracker tracker, final S3Object response, @@ -175,6 +278,15 @@ protected RemoteFileChangedException expectChangeException( RemoteFileChangedException.class); } + protected RemoteFileChangedException expectChangeException( + final ChangeTracker tracker, 
+ final SdkBaseException exception, + final String operation, + final String message) throws Exception { + return expectException(tracker, exception, operation, message, + RemoteFileChangedException.class); + } + protected PathIOException expectNoVersionAttributeException( final ChangeTracker tracker, final S3Object response, @@ -183,6 +295,14 @@ protected PathIOException expectNoVersionAttributeException( NoVersionAttributeException.class); } + protected PathIOException expectNoVersionAttributeException( + final ChangeTracker tracker, + final CopyResult response, + final String message) throws Exception { + return expectException(tracker, response, message, + NoVersionAttributeException.class); + } + protected T expectException( final ChangeTracker tracker, final S3Object response, @@ -197,6 +317,35 @@ protected T expectException( }); } + protected T expectException( + final ChangeTracker tracker, + final CopyResult response, + final String message, + final Class clazz) throws Exception { + return intercept( + clazz, + message, + () -> { + tracker.processResponse(response); + return tracker; + }); + } + + protected T expectException( + final ChangeTracker tracker, + final SdkBaseException exception, + final String operation, + final String message, + final Class clazz) throws Exception { + return intercept( + clazz, + message, + () -> { + tracker.processException(exception, operation); + return tracker; + }); + } + protected void assertRevisionId(final ChangeTracker tracker, final String revId) { assertEquals("Wrong revision ID in " + tracker, @@ -218,14 +367,29 @@ protected void assertTrackerMismatchCount( */ protected ChangeTracker newTracker(final ChangeDetectionPolicy.Mode mode, final ChangeDetectionPolicy.Source source, boolean requireVersion) { + return newTracker(mode, source, requireVersion, + objectAttributes(null, null)); + } + + /** + * Create tracker. + * Contains standard assertions(s). + * @return the tracker. 
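The expectException and expectChangeException overloads above are thin wrappers around LambdaTestUtils.intercept, which runs a callable and asserts it throws the given exception type with the given text, returning the caught exception for further checks. A minimal sketch of the underlying three-argument call; getFileSystem() and the path are assumed from the surrounding test base class and purely illustrative.

import java.io.FileNotFoundException;

import org.apache.hadoop.fs.Path;

import static org.apache.hadoop.test.LambdaTestUtils.intercept;

// Expect the probe to fail with FileNotFoundException mentioning the path;
// the caught exception is returned so extra assertions can be made on it.
FileNotFoundException ex = intercept(FileNotFoundException.class,
    "/no/such/file",
    () -> getFileSystem().getFileStatus(new Path("/no/such/file")));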
+ */ + protected ChangeTracker newTracker(final ChangeDetectionPolicy.Mode mode, + final ChangeDetectionPolicy.Source source, boolean requireVersion, + S3ObjectAttributes objectAttributes) { ChangeDetectionPolicy policy = createPolicy( mode, source, requireVersion); ChangeTracker tracker = new ChangeTracker(URI, policy, - new AtomicLong(0)); - assertFalse("Tracker should not have applied constraints " + tracker, - tracker.maybeApplyConstraint(newGetObjectRequest())); + new AtomicLong(0), objectAttributes); + if (objectAttributes.getVersionId() == null + && objectAttributes.getETag() == null) { + assertFalse("Tracker should not have applied constraints " + tracker, + tracker.maybeApplyConstraint(newGetObjectRequest())); + } return tracker; } @@ -233,6 +397,21 @@ private GetObjectRequest newGetObjectRequest() { return new GetObjectRequest(BUCKET, OBJECT); } + private CopyObjectRequest newCopyObjectRequest() { + return new CopyObjectRequest(BUCKET, OBJECT, BUCKET, DEST_OBJECT); + } + + private CopyResult newCopyResult(String eTag, String versionId) { + CopyResult copyResult = new CopyResult(); + copyResult.setSourceBucketName(BUCKET); + copyResult.setSourceKey(OBJECT); + copyResult.setDestinationBucketName(BUCKET); + copyResult.setDestinationKey(DEST_OBJECT); + copyResult.setETag(eTag); + copyResult.setVersionId(versionId); + return copyResult; + } + private S3Object newResponse(String etag, String versionId) { ObjectMetadata md = new ObjectMetadata(); if (etag != null) { @@ -252,4 +431,14 @@ private S3Object emptyResponse() { response.setKey(OBJECT); return response; } + + private S3ObjectAttributes objectAttributes( + String etag, String versionId) { + return new S3ObjectAttributes(BUCKET, + OBJECT, + null, + null, + etag, + versionId); + } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java index cdef917a43cd3..9b95ef3bb8093 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java @@ -220,7 +220,7 @@ protected void createFile(Path path, boolean onS3, boolean onMetadataStore) ContractTestUtils.touch(fs, path); } else if (onMetadataStore) { S3AFileStatus status = new S3AFileStatus(100L, System.currentTimeMillis(), - fs.qualify(path), 512L, "hdfs"); + fs.qualify(path), 512L, "hdfs", null, null); putFile(ms, status); } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStore.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStore.java index 709aa5a60a676..3c8867afad777 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStore.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStore.java @@ -227,12 +227,13 @@ public DynamoDBMSContract createContract(Configuration conf) { } @Override - FileStatus basicFileStatus(Path path, int size, boolean isDir) + S3AFileStatus basicFileStatus(Path path, int size, boolean isDir) throws IOException { String owner = UserGroupInformation.getCurrentUser().getShortUserName(); return isDir ? 
new S3AFileStatus(true, path, owner) - : new S3AFileStatus(size, getModTime(), path, BLOCK_SIZE, owner); + : new S3AFileStatus(size, getModTime(), path, BLOCK_SIZE, owner, + null, null); } private DynamoDBMetadataStore getDynamoMetadataStore() throws IOException { diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStoreScale.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStoreScale.java index aa2dda835af79..66541469efb45 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStoreScale.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestDynamoDBMetadataStoreScale.java @@ -41,7 +41,6 @@ import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.StorageStatistics; import org.apache.hadoop.fs.contract.ContractTestUtils; @@ -227,7 +226,7 @@ public void test_030_BatchedWrite() throws Exception { long pruneItems = 0; for (long i = 0; i < iterations; i++) { Path longPath = pathOfDepth(BATCH_SIZE, String.valueOf(i)); - FileStatus status = basicFileStatus(longPath, 0, false, 12345, + S3AFileStatus status = basicFileStatus(longPath, 0, false, 12345); PathMetadata pm = new PathMetadata(status); synchronized (toCleanup) { diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolLocal.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolLocal.java index 6a4d45e9ea170..f81f0e2bc13b7 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolLocal.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolLocal.java @@ -37,7 +37,9 @@ import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.S3AFileStatus; import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.Tristate; import static org.apache.hadoop.fs.s3a.MultipartTestUtils.*; import static org.apache.hadoop.fs.s3a.S3ATestUtils.getLandsatCSVFile; @@ -95,6 +97,41 @@ public void testImportCommand() throws Exception { // assertTrue(children.isAuthoritative()); } + @Test + public void testImportCommandRepairsETagAndVersionId() throws Exception { + S3AFileSystem fs = getFileSystem(); + MetadataStore ms = getMetadataStore(); + Path path = path("test-version-metadata"); + try (FSDataOutputStream out = fs.create(path)) { + out.write(1); + } + S3AFileStatus originalStatus = (S3AFileStatus) fs.getFileStatus(path); + + // put in bogus ETag and versionId + S3AFileStatus bogusStatus = S3AFileStatus.fromFileStatus(originalStatus, + Tristate.FALSE, "bogusETag", "bogusVersionId"); + ms.put(new PathMetadata(bogusStatus)); + + // sanity check that bogus status is actually persisted + S3AFileStatus retrievedBogusStatus = (S3AFileStatus) fs.getFileStatus(path); + assertEquals("bogus ETag was not persisted", + "bogusETag", retrievedBogusStatus.getETag()); + assertEquals("bogus versionId was not persisted", + "bogusVersionId", retrievedBogusStatus.getVersionId()); + + // execute the import + S3GuardTool.Import cmd = new S3GuardTool.Import(fs.getConf()); + cmd.setStore(ms); + exec(cmd, "import", path.toString()); + + // make sure ETag and versionId were corrected + S3AFileStatus updatedStatus = (S3AFileStatus) 
fs.getFileStatus(path); + assertEquals("ETag was not corrected", + originalStatus.getETag(), updatedStatus.getETag()); + assertEquals("VersionId was not corrected", + originalStatus.getVersionId(), updatedStatus.getVersionId()); + } + @Test public void testDestroyBucketExistsButNoTable() throws Throwable { run(Destroy.NAME, diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestDirListingMetadata.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestDirListingMetadata.java index 8458252af763c..cb183a2954820 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestDirListingMetadata.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestDirListingMetadata.java @@ -38,6 +38,8 @@ public class TestDirListingMetadata { private static final String TEST_OWNER = "hadoop"; + public static final String TEST_ETAG = "abc"; + public static final String TEST_VERSION_ID = "def"; @Rule public ExpectedException exception = ExpectedException.none(); @@ -79,7 +81,8 @@ public void testListing() { PathMetadata pathMeta2 = new PathMetadata( new S3AFileStatus(true, new Path(path, "dir2"), TEST_OWNER)); PathMetadata pathMeta3 = new PathMetadata( - new S3AFileStatus(123, 456, new Path(path, "file1"), 8192, TEST_OWNER)); + new S3AFileStatus(123, 456, new Path(path, "file1"), 8192, TEST_OWNER, + TEST_ETAG, TEST_VERSION_ID)); List listing = Arrays.asList(pathMeta1, pathMeta2, pathMeta3); DirListingMetadata meta = new DirListingMetadata(path, listing, false); assertEquals(path, meta.getPath()); @@ -130,7 +133,8 @@ public void testGet() { PathMetadata pathMeta2 = new PathMetadata( new S3AFileStatus(true, new Path(path, "dir2"), TEST_OWNER)); PathMetadata pathMeta3 = new PathMetadata( - new S3AFileStatus(123, 456, new Path(path, "file1"), 8192, TEST_OWNER)); + new S3AFileStatus(123, 456, new Path(path, "file1"), 8192, TEST_OWNER, + TEST_ETAG, TEST_VERSION_ID)); List listing = Arrays.asList(pathMeta1, pathMeta2, pathMeta3); DirListingMetadata meta = new DirListingMetadata(path, listing, false); assertEquals(path, meta.getPath()); @@ -181,7 +185,8 @@ public void testPut() { PathMetadata pathMeta2 = new PathMetadata( new S3AFileStatus(true, new Path(path, "dir2"), TEST_OWNER)); PathMetadata pathMeta3 = new PathMetadata( - new S3AFileStatus(123, 456, new Path(path, "file1"), 8192, TEST_OWNER)); + new S3AFileStatus(123, 456, new Path(path, "file1"), 8192, TEST_OWNER, + TEST_ETAG, TEST_VERSION_ID)); List listing = Arrays.asList(pathMeta1, pathMeta2, pathMeta3); DirListingMetadata meta = new DirListingMetadata(path, listing, false); assertEquals(path, meta.getPath()); @@ -243,7 +248,8 @@ public void testRemove() { PathMetadata pathMeta2 = new PathMetadata( new S3AFileStatus(true, new Path(path, "dir2"), TEST_OWNER)); PathMetadata pathMeta3 = new PathMetadata( - new S3AFileStatus(123, 456, new Path(path, "file1"), 8192, TEST_OWNER)); + new S3AFileStatus(123, 456, new Path(path, "file1"), 8192, TEST_OWNER, + TEST_ETAG, TEST_VERSION_ID)); List listing = Arrays.asList(pathMeta1, pathMeta2, pathMeta3); DirListingMetadata meta = new DirListingMetadata(path, listing, false); assertEquals(path, meta.getPath()); @@ -296,7 +302,7 @@ private static DirListingMetadata makeTwoDirsOneFile(Path parent) { new S3AFileStatus(true, new Path(parent, "dir2"), TEST_OWNER)); PathMetadata pathMeta3 = new PathMetadata( new S3AFileStatus(123, 456, new Path(parent, "file1"), 8192, - TEST_OWNER)); + TEST_OWNER, TEST_ETAG, 
TEST_VERSION_ID)); List listing = Arrays.asList(pathMeta1, pathMeta2, pathMeta3); return new DirListingMetadata(parent, listing, false); } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestLocalMetadataStore.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestLocalMetadataStore.java index 1d231eac96fbb..ee7b584ca18d2 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestLocalMetadataStore.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestLocalMetadataStore.java @@ -30,7 +30,9 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.S3AFileStatus; import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.Tristate; /** * MetadataStore unit test for {@link LocalMetadataStore}. @@ -173,8 +175,8 @@ private static void populateMap(Cache cache, private static void populateEntry(Cache cache, Path path) { - FileStatus fileStatus = new FileStatus(0, true, 0, 0, 0, path); - cache.put(path, new LocalMetadataEntry(new PathMetadata(fileStatus))); + S3AFileStatus s3aStatus = new S3AFileStatus(Tristate.UNKNOWN, path, null); + cache.put(path, new LocalMetadataEntry(new PathMetadata(s3aStatus))); } private static long sizeOfMap(Cache cache) { @@ -201,9 +203,8 @@ protected void verifyFileStatus(FileStatus status, long size) { } @Override - protected void verifyDirStatus(FileStatus status) { - S3ATestUtils.verifyDirStatus(status, REPLICATION, getModTime(), - getAccessTime(), OWNER, GROUP, PERMISSION); + protected void verifyDirStatus(S3AFileStatus status) { + S3ATestUtils.verifyDirStatus(status, REPLICATION, OWNER); } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestObjectChangeDetectionAttributes.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestObjectChangeDetectionAttributes.java new file mode 100644 index 0000000000000..f001262b36a04 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/TestObjectChangeDetectionAttributes.java @@ -0,0 +1,380 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3a.s3guard; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; + +import com.amazonaws.services.s3.Headers; +import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest; +import com.amazonaws.services.s3.model.CompleteMultipartUploadResult; +import com.amazonaws.services.s3.model.GetObjectMetadataRequest; +import com.amazonaws.services.s3.model.GetObjectRequest; +import com.amazonaws.services.s3.model.InitiateMultipartUploadRequest; +import com.amazonaws.services.s3.model.InitiateMultipartUploadResult; +import com.amazonaws.services.s3.model.ListObjectsV2Request; +import com.amazonaws.services.s3.model.ListObjectsV2Result; +import com.amazonaws.services.s3.model.ObjectMetadata; +import com.amazonaws.services.s3.model.PutObjectRequest; +import com.amazonaws.services.s3.model.PutObjectResult; +import com.amazonaws.services.s3.model.S3Object; +import com.amazonaws.services.s3.model.UploadPartRequest; +import com.amazonaws.services.s3.model.UploadPartResult; +import org.hamcrest.BaseMatcher; +import org.hamcrest.Description; +import org.hamcrest.Matcher; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.AbstractS3AMockTest; +import org.apache.hadoop.fs.s3a.Constants; +import org.apache.hadoop.fs.s3a.S3AFileStatus; + +import static org.apache.hadoop.fs.s3a.Constants.CHANGE_DETECT_MODE; +import static org.apache.hadoop.fs.s3a.Constants.CHANGE_DETECT_MODE_SERVER; +import static org.apache.hadoop.fs.s3a.Constants.CHANGE_DETECT_SOURCE; +import static org.apache.hadoop.fs.s3a.Constants.CHANGE_DETECT_SOURCE_ETAG; +import static org.apache.hadoop.fs.s3a.Constants.CHANGE_DETECT_SOURCE_VERSION_ID; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.when; +import static org.mockito.hamcrest.MockitoHamcrest.argThat; + +/** + * Unit tests to ensure object eTag and versionId are captured on S3 PUT and + * used on GET. + * Further (integration) testing is performed in + * {@link org.apache.hadoop.fs.s3a.ITestS3ARemoteFileChanged}. + */ +@RunWith(Parameterized.class) +public class TestObjectChangeDetectionAttributes extends AbstractS3AMockTest { + private final String changeDetectionSource; + + public TestObjectChangeDetectionAttributes(String changeDetectionSource) { + this.changeDetectionSource = changeDetectionSource; + } + + @Parameterized.Parameters(name = "change={0}") + public static Collection params() { + return Arrays.asList(new Object[][]{ + {CHANGE_DETECT_SOURCE_ETAG}, + {CHANGE_DETECT_SOURCE_VERSION_ID} + }); + } + + @Override + public Configuration createConfiguration() { + Configuration conf = super.createConfiguration(); + conf.setClass(Constants.S3_METADATA_STORE_IMPL, + LocalMetadataStore.class, MetadataStore.class); + conf.set(CHANGE_DETECT_SOURCE, changeDetectionSource); + conf.set(CHANGE_DETECT_MODE, CHANGE_DETECT_MODE_SERVER); + return conf; + } + + /** + * Tests a file uploaded with a single PUT to ensure eTag is captured and used + * on file read. 
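The createConfiguration() override above drives the same switches an operator would set for a real bucket: the change-detection source (ETag versus versionId) and mode (server-side constraints versus client-side checking). A small sketch using the constants this test already imports; the values shown are examples, not recommendations.

import org.apache.hadoop.conf.Configuration;

import static org.apache.hadoop.fs.s3a.Constants.CHANGE_DETECT_MODE;
import static org.apache.hadoop.fs.s3a.Constants.CHANGE_DETECT_MODE_SERVER;
import static org.apache.hadoop.fs.s3a.Constants.CHANGE_DETECT_SOURCE;
import static org.apache.hadoop.fs.s3a.Constants.CHANGE_DETECT_SOURCE_VERSION_ID;

/** Sketch: request versionId-based, server-side change detection. */
static Configuration serverSideVersionChecking(Configuration conf) {
  conf.set(CHANGE_DETECT_SOURCE, CHANGE_DETECT_SOURCE_VERSION_ID);
  conf.set(CHANGE_DETECT_MODE, CHANGE_DETECT_MODE_SERVER);
  return conf;
}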
+ */ + @Test + public void testCreateAndReadFileSinglePart() throws Exception { + String bucket = "s3a://mock-bucket/"; + String file = "single-part-file"; + Path path = new Path(bucket, file); + byte[] content = "content".getBytes(); + String eTag = "abc"; + String versionId = "def"; + + putObject(file, path, content, eTag, versionId); + + // make sure the eTag and versionId were put into the metadataStore + assertVersionAttributes(path, eTag, versionId); + + // Ensure underlying S3 getObject call uses the stored eTag or versionId + // when reading data back. If it doesn't, the read won't work and the + // assert will fail. + assertContent(file, path, content, eTag, versionId); + + // test overwrite + byte[] newConent = "newcontent".getBytes(); + String newETag = "newETag"; + String newVersionId = "newVersionId"; + + putObject(file, path, newConent, newETag, newVersionId); + assertVersionAttributes(path, newETag, newVersionId); + assertContent(file, path, newConent, newETag, newVersionId); + } + + /** + * Tests a file uploaded with multi-part upload to ensure eTag is captured + * and used on file read. + */ + @Test + public void testCreateAndReadFileMultiPart() throws Exception { + String bucket = "s3a://mock-bucket/"; + String file = "multi-part-file"; + Path path = new Path(bucket, file); + byte[] content = new byte[Constants.MULTIPART_MIN_SIZE + 1]; + String eTag = "abc"; + String versionId = "def"; + + multipartUpload(file, path, content, eTag, versionId); + + // make sure the eTag and versionId were put into the metadataStore + assertVersionAttributes(path, eTag, versionId); + + // Ensure underlying S3 getObject call uses the stored eTag or versionId + // when reading data back. If it doesn't, the read won't work and the + // assert will fail. 
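The "stored eTag or versionId must reach the GET" guarantee is enforced by the Hamcrest matchers further down (correctGetObjectRequest and friends): the stub only answers when the request actually carries the expected constraint. A reduced sketch of that matcher shape, handed to MockitoHamcrest.argThat(...) when stubbing s3.getObject(...); the factory name is illustrative.

import com.amazonaws.services.s3.model.GetObjectRequest;
import org.hamcrest.BaseMatcher;
import org.hamcrest.Description;
import org.hamcrest.Matcher;

/** Sketch: accept only GET requests for the key that carry the expected versionId. */
static Matcher<GetObjectRequest> getWithVersion(final String key, final String versionId) {
  return new BaseMatcher<GetObjectRequest>() {
    @Override
    public boolean matches(Object item) {
      return item instanceof GetObjectRequest
          && ((GetObjectRequest) item).getKey().equals(key)
          && versionId.equals(((GetObjectRequest) item).getVersionId());
    }

    @Override
    public void describeTo(Description description) {
      description.appendText("GetObjectRequest for ").appendValue(key)
          .appendText(" with versionId ").appendValue(versionId);
    }
  };
}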
+ assertContent(file, path, content, eTag, versionId); + + // test overwrite + byte[] newContent = new byte[Constants.MULTIPART_MIN_SIZE + 1]; + Arrays.fill(newContent, (byte) 1); + String newETag = "newETag"; + String newVersionId = "newVersionId"; + + multipartUpload(file, path, newContent, newETag, newVersionId); + assertVersionAttributes(path, newETag, newVersionId); + assertContent(file, path, newContent, newETag, newVersionId); + } + + private void putObject(String file, Path path, byte[] content, + String eTag, String versionId) throws IOException { + PutObjectResult putObjectResult = new PutObjectResult(); + ObjectMetadata objectMetadata = new ObjectMetadata(); + objectMetadata.setContentLength(content.length); + putObjectResult.setMetadata(objectMetadata); + putObjectResult.setETag(eTag); + putObjectResult.setVersionId(versionId); + + when(s3.getObjectMetadata(any(GetObjectMetadataRequest.class))) + .thenThrow(NOT_FOUND); + when(s3.putObject(argThat(correctPutObjectRequest(file)))) + .thenReturn(putObjectResult); + ListObjectsV2Result emptyListing = new ListObjectsV2Result(); + when(s3.listObjectsV2(argThat(correctListObjectsRequest(file + "/")))) + .thenReturn(emptyListing); + + FSDataOutputStream outputStream = fs.create(path); + outputStream.write(content); + outputStream.close(); + } + + private void multipartUpload(String file, Path path, byte[] content, + String eTag, String versionId) throws IOException { + CompleteMultipartUploadResult uploadResult = + new CompleteMultipartUploadResult(); + uploadResult.setVersionId(versionId); + + when(s3.getObjectMetadata(any(GetObjectMetadataRequest.class))) + .thenThrow(NOT_FOUND); + + InitiateMultipartUploadResult initiateMultipartUploadResult = + new InitiateMultipartUploadResult(); + initiateMultipartUploadResult.setUploadId("uploadId"); + when(s3.initiateMultipartUpload( + argThat(correctInitiateMultipartUploadRequest(file)))) + .thenReturn(initiateMultipartUploadResult); + + UploadPartResult uploadPartResult = new UploadPartResult(); + uploadPartResult.setETag("partETag"); + when(s3.uploadPart(argThat(correctUploadPartRequest(file)))) + .thenReturn(uploadPartResult); + + CompleteMultipartUploadResult multipartUploadResult = + new CompleteMultipartUploadResult(); + multipartUploadResult.setETag(eTag); + multipartUploadResult.setVersionId(versionId); + when(s3.completeMultipartUpload( + argThat(correctMultipartUploadRequest(file)))) + .thenReturn(multipartUploadResult); + + ListObjectsV2Result emptyListing = new ListObjectsV2Result(); + when(s3.listObjectsV2(argThat(correctListObjectsRequest(file + "/")))) + .thenReturn(emptyListing); + + FSDataOutputStream outputStream = fs.create(path); + outputStream.write(content); + outputStream.close(); + } + + private void assertContent(String file, Path path, byte[] content, + String eTag, String versionId) throws IOException { + S3Object s3Object = new S3Object(); + ObjectMetadata metadata = new ObjectMetadata(); + metadata.setHeader(Headers.S3_VERSION_ID, versionId); + metadata.setHeader(Headers.ETAG, eTag); + s3Object.setObjectMetadata(metadata); + s3Object.setObjectContent(new ByteArrayInputStream(content)); + when(s3.getObject(argThat(correctGetObjectRequest(file, eTag, versionId)))) + .thenReturn(s3Object); + FSDataInputStream inputStream = fs.open(path); + byte[] readContent = IOUtils.toByteArray(inputStream); + assertArrayEquals(content, readContent); + } + + private void assertVersionAttributes(Path path, String eTag, String versionId) + throws IOException { + MetadataStore 
metadataStore = fs.getMetadataStore(); + PathMetadata pathMetadata = metadataStore.get(path); + assertNotNull(pathMetadata); + S3AFileStatus fileStatus = pathMetadata.getFileStatus(); + assertEquals(eTag, fileStatus.getETag()); + assertEquals(versionId, fileStatus.getVersionId()); + } + + private Matcher correctGetObjectRequest(final String key, + final String eTag, final String versionId) { + return new BaseMatcher() { + @Override + public boolean matches(Object item) { + if (item instanceof GetObjectRequest) { + GetObjectRequest getObjectRequest = (GetObjectRequest) item; + if (getObjectRequest.getKey().equals(key)) { + if (changeDetectionSource.equals( + CHANGE_DETECT_SOURCE_ETAG)) { + return getObjectRequest.getMatchingETagConstraints() + .contains(eTag); + } else if (changeDetectionSource.equals( + CHANGE_DETECT_SOURCE_VERSION_ID)) { + return getObjectRequest.getVersionId().equals(versionId); + } + } + } + return false; + } + + @Override + public void describeTo(Description description) { + description.appendText("key and " + + changeDetectionSource + + " matches"); + } + }; + } + + private Matcher correctUploadPartRequest( + final String key) { + return new BaseMatcher() { + @Override + public boolean matches(Object item) { + if (item instanceof UploadPartRequest) { + UploadPartRequest request = (UploadPartRequest) item; + return request.getKey().equals(key); + } + return false; + } + + @Override + public void describeTo(Description description) { + description.appendText("key matches"); + } + }; + } + + private Matcher + correctInitiateMultipartUploadRequest(final String key) { + return new BaseMatcher() { + @Override + public void describeTo(Description description) { + description.appendText("key matches"); + } + + @Override + public boolean matches(Object item) { + if (item instanceof InitiateMultipartUploadRequest) { + InitiateMultipartUploadRequest request = + (InitiateMultipartUploadRequest) item; + return request.getKey().equals(key); + } + return false; + } + }; + } + + private Matcher + correctMultipartUploadRequest(final String key) { + return new BaseMatcher() { + @Override + public boolean matches(Object item) { + if (item instanceof CompleteMultipartUploadRequest) { + CompleteMultipartUploadRequest request = + (CompleteMultipartUploadRequest) item; + return request.getKey().equals(key); + } + return false; + } + + @Override + public void describeTo(Description description) { + description.appendText("key matches"); + } + }; + } + + private Matcher correctListObjectsRequest( + final String key) { + return new BaseMatcher() { + @Override + public boolean matches(Object item) { + if (item instanceof ListObjectsV2Request) { + ListObjectsV2Request listObjectsRequest = + (ListObjectsV2Request) item; + return listObjectsRequest.getPrefix().equals(key); + } + return false; + } + + @Override + public void describeTo(Description description) { + description.appendText("key matches"); + } + }; + } + + private Matcher correctPutObjectRequest( + final String key) { + return new BaseMatcher() { + @Override + public boolean matches(Object item) { + if (item instanceof PutObjectRequest) { + PutObjectRequest putObjectRequest = (PutObjectRequest) item; + return putObjectRequest.getKey().equals(key); + } + return false; + } + + @Override + public void describeTo(Description description) { + description.appendText("key matches"); + } + }; + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/AbstractITestS3AMetadataStoreScale.java 
b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/AbstractITestS3AMetadataStoreScale.java index 0c469f2b8a28e..1bffc3b1b72fc 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/AbstractITestS3AMetadataStoreScale.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/AbstractITestS3AMetadataStoreScale.java @@ -180,7 +180,8 @@ protected S3AFileStatus copyStatus(S3AFileStatus status) { status.getOwner()); } else { return new S3AFileStatus(status.getLen(), status.getModificationTime(), - status.getPath(), status.getBlockSize(), status.getOwner()); + status.getPath(), status.getBlockSize(), status.getOwner(), + status.getETag(), status.getVersionId()); } } @@ -217,7 +218,8 @@ private static void printTiming(Logger log, String op, NanoTimer timer, } protected static S3AFileStatus makeFileStatus(Path path) throws IOException { - return new S3AFileStatus(SIZE, ACCESS_TIME, path, BLOCK_SIZE, OWNER); + return new S3AFileStatus(SIZE, ACCESS_TIME, path, BLOCK_SIZE, OWNER, + null, null); } protected static S3AFileStatus makeDirStatus(Path p) throws IOException { diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java index fccf708fef4e8..e31b48e5b5862 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java @@ -25,6 +25,7 @@ import java.nio.charset.Charset; import java.util.List; +import org.junit.Assume; import org.junit.Test; import org.apache.commons.io.IOUtils; @@ -34,6 +35,8 @@ import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.S3ATestUtils; import org.apache.hadoop.fs.s3a.Statistic; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy.Source; import org.apache.hadoop.fs.s3a.s3guard.S3GuardTool; import org.apache.hadoop.util.ExitUtil; import org.apache.hadoop.util.OperationDuration; @@ -80,6 +83,11 @@ public void setup() throws Exception { selectConf = new Configuration(getConfiguration()); localFile = getTempFilename(); landsatSrc = getLandsatGZ().toString(); + ChangeDetectionPolicy changeDetectionPolicy = + getLandsatFS().getChangeDetectionPolicy(); + Assume.assumeFalse("the standard landsat bucket doesn't have versioning", + changeDetectionPolicy.getSource() == Source.VersionId + && changeDetectionPolicy.isRequireVersion()); } @Override diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java index 78f3a6d1fe558..2099edd248b63 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.List; +import org.junit.Assume; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,6 +33,8 @@ import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathIOException; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy.Source; import 
org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.S3AInstrumentation; import org.apache.hadoop.fs.s3a.S3ATestUtils; @@ -190,6 +193,11 @@ public void setup() throws Exception { // disable the gzip codec, so that the record readers do not // get confused enablePassthroughCodec(selectConf, ".gz"); + ChangeDetectionPolicy changeDetectionPolicy = + getLandsatFS().getChangeDetectionPolicy(); + Assume.assumeFalse("the standard landsat bucket doesn't have versioning", + changeDetectionPolicy.getSource() == Source.VersionId + && changeDetectionPolicy.isRequireVersion()); } protected int getMaxLines() { diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java index ee7de8c7ac2f2..181d797767397 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java @@ -21,6 +21,9 @@ import java.io.IOException; import java.util.concurrent.atomic.AtomicLong; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy; +import org.apache.hadoop.fs.s3a.impl.ChangeDetectionPolicy.Source; +import org.junit.Assume; import org.junit.Test; import org.apache.hadoop.conf.Configuration; @@ -90,6 +93,13 @@ public class ITestS3SelectMRJob extends AbstractS3SelectTest { public void setup() throws Exception { super.setup(); fs = S3ATestUtils.createTestFileSystem(conf); + + ChangeDetectionPolicy changeDetectionPolicy = + getLandsatFS().getChangeDetectionPolicy(); + Assume.assumeFalse("the standard landsat bucket doesn't have versioning", + changeDetectionPolicy.getSource() == Source.VersionId + && changeDetectionPolicy.isRequireVersion()); + rootPath = path("ITestS3SelectMRJob"); Path workingDir = path("working"); fs.setWorkingDirectory(workingDir); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/mapreduce/filecache/TestS3AResourceScope.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/mapreduce/filecache/TestS3AResourceScope.java index c9b1ddc97ee18..172f79e09aea7 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/mapreduce/filecache/TestS3AResourceScope.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/mapreduce/filecache/TestS3AResourceScope.java @@ -48,7 +48,7 @@ public void testS3AFilesArePrivate() throws Throwable { @Test public void testS3AFilesArePrivateOtherContstructor() throws Throwable { - S3AFileStatus status = new S3AFileStatus(0, 0, PATH, 1, "self"); + S3AFileStatus status = new S3AFileStatus(0, 0, PATH, 1, "self", null, null); assertTrue("Not encrypted: " + status, status.isEncrypted()); assertNotExecutable(status); }
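Several hunks above switch test code to the widened S3AFileStatus constructor, which now carries the ETag and versionId alongside the usual length, mtime, path, block size and owner. A small sketch of constructing and inspecting such a status, following the argument order used in these tests; the literal values are illustrative.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3a.S3AFileStatus;

import static org.junit.Assert.assertEquals;

// Argument order as used above: length, modification time, path, block size,
// owner, eTag, versionId; null eTag/versionId means "not known / not versioned".
S3AFileStatus status = new S3AFileStatus(100L, System.currentTimeMillis(),
    new Path("s3a://bucket/file"), 512L, "hdfs", "etag-123", "version-456");

assertEquals("etag-123", status.getETag());
assertEquals("version-456", status.getVersionId());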