From af8056281a33919ab7d161a05d4d0d134ad9d159 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Fri, 8 Mar 2019 11:53:49 +0000 Subject: [PATCH 1/3] HADOOP-13327 Output Stream Specification Change-Id: I1b6bc258a40a8bd57879d9edc3e5bb1303f0fff2 --- .../apache/hadoop/fs/CanSetDropBehind.java | 2 +- .../java/org/apache/hadoop/fs/Syncable.java | 6 +- .../fs/impl/StoreImplementationUtils.java | 81 ++ .../hadoop/fs/impl/StreamStateModel.java | 205 +++++ .../site/markdown/filesystem/filesystem.md | 16 +- .../site/markdown/filesystem/outputstream.md | 857 ++++++++++++++++++ .../contract/AbstractContractCreateTest.java | 88 +- .../contract/AbstractFSContractTestBase.java | 11 +- .../hadoop/fs/contract/ContractOptions.java | 9 + .../src/test/resources/contract/hdfs.xml | 10 + .../hadoop/fs/s3a/S3ABlockOutputStream.java | 101 ++- .../fs/s3a/TestS3ABlockOutputStream.java | 66 -- .../hadoop/fs/adl/AdlFsOutputStream.java | 10 +- .../src/test/resources/adls.xml | 10 + 14 files changed, 1362 insertions(+), 110 deletions(-) create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StoreImplementationUtils.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StreamStateModel.java create mode 100644 hadoop-common-project/hadoop-common/src/site/markdown/filesystem/outputstream.md delete mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3ABlockOutputStream.java diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CanSetDropBehind.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CanSetDropBehind.java index 2e2d98b9c5462..0077838920a9e 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CanSetDropBehind.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/CanSetDropBehind.java @@ -36,6 +36,6 @@ public interface CanSetDropBehind { * UnsupportedOperationException If this stream doesn't support * setting the drop-behind. */ - public void setDropBehind(Boolean dropCache) + void setDropBehind(Boolean dropCache) throws IOException, UnsupportedOperationException; } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Syncable.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Syncable.java index 7ec3509ce1df6..1f1295e4e7deb 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Syncable.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Syncable.java @@ -27,16 +27,16 @@ @InterfaceAudience.Public @InterfaceStability.Evolving public interface Syncable { - + /** Flush out the data in client's user buffer. After the return of * this call, new readers will see the data. * @throws IOException if any error occurs */ - public void hflush() throws IOException; + void hflush() throws IOException; /** Similar to posix fsync, flush out the data in client's user buffer * all the way to the disk device (but the disk may have it in its cache). 
 * @throws IOException if error occurs
  */
-  public void hsync() throws IOException;
+  void hsync() throws IOException;
 }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StoreImplementationUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StoreImplementationUtils.java
new file mode 100644
index 0000000000000..5d0a00735fb87
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StoreImplementationUtils.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.impl;
+
+import java.io.OutputStream;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.StreamCapabilities;
+
+import static org.apache.hadoop.fs.StreamCapabilities.HFLUSH;
+import static org.apache.hadoop.fs.StreamCapabilities.HSYNC;
+
+/**
+ * Utility methods to help implement filesystems and streams.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public final class StoreImplementationUtils {
+
+  private StoreImplementationUtils() {
+  }
+
+  /**
+   * Check the supplied capability for being one of those required for full
+   * {@code Syncable.hsync()} and {@code Syncable.hflush()} functionality.
+   * @param capability capability string.
+   * @return true if it refers to one of the Syncable operations.
+   */
+  public static boolean supportsSyncable(String capability) {
+    return capability.equalsIgnoreCase(HSYNC) ||
+        capability.equalsIgnoreCase(HFLUSH);
+  }
+
+  /**
+   * Probe for an object having a capability; returns true
+   * iff the object implements {@link StreamCapabilities} and its
+   * {@code hasCapabilities()} method returns true for the capability.
+   * This is a private method intended to provide a common implementation
+   * for input and output streams; only the strongly typed public
+   * methods are exposed.
+   * @param object object to probe.
+   * @param capability capability to probe for.
+   * @return true iff the object implements stream capabilities and
+   * declares that it supports the capability.
+   */
+  private static boolean objectHasCapability(Object object, String capability) {
+    if (object instanceof StreamCapabilities) {
+      return ((StreamCapabilities) object).hasCapability(capability);
+    }
+    return false;
+  }
+
+  /**
+   * Probe for an output stream having a capability; returns true
+   * iff the stream implements {@link StreamCapabilities} and its
+   * {@code hasCapabilities()} method returns true for the capability.
+   * @param out output stream
+   * @param capability capability to probe for
+   * @return true iff the stream declares that it supports the capability.
+ */ + public static boolean hasCapability(OutputStream out, String capability) { + return objectHasCapability(out, capability); + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StreamStateModel.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StreamStateModel.java new file mode 100644 index 0000000000000..3db52929727f0 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StreamStateModel.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.impl; + +import java.io.IOException; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import com.google.common.base.Preconditions; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathIOException; + +import static org.apache.hadoop.fs.FSExceptionMessages.STREAM_IS_CLOSED; + +/** + * Models a stream's state and can be used for checking this state before + * any operation. + * + * The model has three states: Open, Error, and Closed, + * + *
+ *   Open: caller can interact with the stream.
+ *   Error: all operations will raise the previously recorded exception.
+ *   Closed: operations will be rejected.
+ * 
+ */ +public class StreamStateModel { + + /** + * States of the stream. + */ + public enum State { + + /** + * Stream is open. + */ + Open, + + /** + * Stream is in an error state. + * It is not expected to recover from this. + */ + Error, + + /** + * Stream is now closed. Operations will fail. + */ + Closed + } + + /** + * Path; if not empty then a {@link PathIOException} will be raised + * containing this path. + */ + private final String path; + + /** Lock. Not considering an InstrumentedWriteLock, but it is an option. */ + private final Lock lock = new ReentrantLock(); + + /** + * Initial state: open. + * This is volatile: it can be queried without encountering any locks. + * However, to guarantee the state is constant through the life of an + * operation, updates must be through the synchronized methods. + */ + private volatile State state = State.Open; + + /** Any exception to raise on the next checkOpen call. */ + private IOException exception; + + public StreamStateModel(final Path path) { + this.path = path.toString(); + } + + public StreamStateModel(final String path) { + this.path = path; + } + + /** + * Get the current state. + * Not synchronized; lock if you want consistency across calls. + * @return the current state. + */ + public State getState() { + return state; + } + + /** + * Change state to closed. No-op if the state was in closed or error + * @return true if the state changed. + */ + public synchronized boolean enterClosedState() { + if (state == State.Open) { + state = State.Closed; + return true; + } else { + return false; + } + } + + /** + * Change state to error and stores first error so it can be re-thrown. + * If already in error: return previous exception. + * @param ex the exception to record + * @return the exception set when the error state was entered. + */ + public synchronized IOException enterErrorState(final IOException ex) { + Preconditions.checkArgument(ex != null, "Null exception"); + switch (state) { + // a stream can go into the error state when open or closed + case Open: + case Closed: + exception = ex; + state = State.Error; + break; + case Error: + // already in this state; retain the previous exception. + break; + } + return exception; + } + + /** + * Check a stream is open. + * If in an error state: rethrow that exception. If closed, + * throw an exception about that. + * @throws IOException if the stream is not open. + */ + public synchronized void checkOpen() throws IOException { + switch (state) { + case Open: + return; + + case Error: + throw exception; + + case Closed: + if (StringUtils.isNotEmpty(path)) { + throw new PathIOException(path, STREAM_IS_CLOSED); + } else { + throw new IOException(STREAM_IS_CLOSED); + } + } + } + + /** + * Acquire an exclusive lock. + * @param checkOpen must the stream be open? + * @throws IOException if the stream is in error state or checkOpen==true + * and the stream is closed. + */ + public void acquireLock(boolean checkOpen) throws IOException { + // fail fast if the stream is required to be open and it is not + if (checkOpen) { + checkOpen(); + } + + // acquire the lock; this may suspend the thread + lock.lock(); + + // now verify that the stream is still open. + if (checkOpen) { + checkOpen(); + } + } + + /** + * Release the lock. + */ + public void releaseLock() { + lock.unlock(); + } + + /** + * Check for a stream being in a specific state. + * The check is synchronized, but not locked; if the caller does + * not hold a lock then the state may change before any subsequent + * operation. 
+   * @param expected expected state
+   * @return true iff the stream was in the expected state at the time
+   * of checking.
+   */
+  public synchronized boolean isInState(State expected) {
+    return state == expected;
+  }
+
+}
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md
index a2458ee891448..a60490e832181 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md
@@ -594,6 +594,8 @@ and MAY be a `RuntimeException` or subclass. For instance, HDFS may raise a `Inv
     result = FSDataOutputStream
 
+A zero byte file must exist at the end of the specified path, visible to all.
+
 The updated (valid) FileSystem must contain all the parent directories of the path, as created by `mkdirs(parent(p))`.
 
 The result is `FSDataOutputStream`, which through its operations may generate new filesystem states with updated values of
@@ -609,8 +611,16 @@ The result is `FSDataOutputStream`, which through its operations may generate ne
 * S3A, Swift and potentially other Object Stores do not currently change the FS state
 until the output stream `close()` operation is completed.
-This MAY be a bug, as it allows >1 client to create a file with `overwrite==false`,
- and potentially confuse file/directory logic
+This is a significant difference between the behavior of object stores
+and that of filesystems, as it allows >1 client to create a file with `overwrite==false`,
+and potentially confuse file/directory logic. In particular, using `create()` to acquire
+an exclusive lock on a file (whoever creates the file without an error is considered
+the holder of the lock) is not a valid algorithm when working with object stores.
+
+* Object stores may create an empty file as a marker when a file is created.
+However, object stores with `overwrite=true` semantics may not implement this atomically,
+so creating files with `overwrite==false` cannot be used as an implicit exclusion
+mechanism between processes.
 
 * The Local FileSystem raises a `FileNotFoundException` when trying to create a file over
 a directory, hence it is listed as an exception that MAY be raised when
@@ -647,7 +657,7 @@ Implementations without a compliant call SHOULD throw `UnsupportedOperationExcep
 #### Postconditions
 
-    FS
+    FS' = FS
     result = FSDataOutputStream
 
 Return: `FSDataOutputStream`, which can update the entry `FS.Files[p]`
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/outputstream.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/outputstream.md
new file mode 100644
index 0000000000000..6f4969f2742e8
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/outputstream.md
@@ -0,0 +1,857 @@
+
+
+
+
+# Output: `OutputStream`, `Syncable` and `StreamCapabilities`
+
+
+With the exception of `FileSystem.copyFromLocalFile()`,
+all API operations which write data to a filesystem in Hadoop do so
+through the Java "OutputStreams" API. More specifically, they do so
+through `OutputStream` subclasses obtained through calls to
+`FileSystem.create()`, `FileSystem.append()`,
+or `FSDataOutputStreamBuilder.build()`.
+
+These all return instances of `FSDataOutputStream`, through which data
+can be written through various `write()` methods.
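+
+As a minimal, non-normative sketch of the canonical sequence (it assumes a
+configured `FileSystem` instance `fs`, a destination `path`, and a `data`
+byte array; none of these names are part of the specification):
+
+```java
+// create the stream, write to it, then close it
+try (FSDataOutputStream out = fs.create(path, true)) {
+  out.write(data);    // appends to the stream's buffer
+}
+// once close() has returned, the data must be readable via fs.open(path)
+```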
+After a stream's `close()` method is called, all data written to the
+stream MUST BE persisted to the filesystem and visible to all other
+clients attempting to read data from that path via `FileSystem.open()`.
+
+As well as operations to write the data, Hadoop's Output Streams
+provide methods to flush buffered data back to the filesystem,
+so as to ensure that the data is reliably persisted and/or visible
+to other callers. This is done via the `Syncable` interface. It was
+originally intended that the presence of this interface could be interpreted
+as a guarantee that the stream supported its methods, but this has proven
+impossible to guarantee, as the static nature of the interface is incompatible
+with filesystems whose syncability semantics may vary on a store/path basis.
+As an example, erasure coded files in HDFS do not support the `Syncable`
+operations, even though their stream class extends one which does.
+A new interface, `StreamCapabilities`, has been implemented to allow callers
+to probe the exact capabilities of a stream, even transitively
+through a chain of streams.
+
+* HDFS's primary stream implementation is
+`org.apache.hadoop.hdfs.DFSOutputStream`.
+* The subclass `org.apache.hadoop.hdfs.DFSStripedOutputStream` supports erasure
+coding: it removes the `Syncable` behaviors from the base class.
+* The output streams `org.apache.hadoop.fs.FSOutputSummer` and
+`org.apache.hadoop.fs.ChecksumFileSystem.ChecksumFSOutputSummer`
+contain the underlying checksummed output stream used by
+both HDFS and the "file" filesystems.
+
+
+## Output Stream Model
+
+For this specification, an output stream can be viewed as a list of bytes
+stored in the client:
+
+```python
+buffer: List[byte]
+```
+
+A flag, `open`, tracks whether the stream is open: after the stream
+is closed no more data may be written to it:
+
+```python
+open: bool
+buffer: List[byte]
+```
+
+The destination path of the stream, `path`, can be tracked to form a triple
+`(path, open, buffer)`:
+
+
+```python
+Stream = (path, open, buffer)
+```
+
+(Immediately) after `Syncable` operations which flush data to the filesystem,
+the data at the stream's destination path must match that of
+`buffer`. That is, the following condition holds:
+
+```python
+FS'.Files(path) == buffer
+```
+
+Any client reading the data at the path will see the new data.
+The two sync operations, `hflush()` and `hsync()`, differ in their durability
+guarantees, not visibility of data.
+
+
+### State of Stream and filesystem after `Filesystem.create()`
+
+
+The output stream returned by a `FileSystem.create(path)` call contains no
+data:
+
+```python
+Stream' = (path, true, [])
+```
+
+The filesystem `FS'` must contain a 0-byte file at the path:
+
+```python
+data(FS', path) == []
+```
+
+Accordingly, the initial state of `Stream'.buffer` is implicitly
+consistent with the data in the filesystem.
+
+
+*Object Stores*: the 0-byte empty file may not exist in the filesystem.
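+
+A non-normative sketch of this divergence (again assuming `fs` and `path`):
+a probe for the file immediately after `create()` may succeed against HDFS
+yet fail against an object store, so portable code must not rely on it:
+
+```java
+try (FSDataOutputStream out = fs.create(path, true)) {
+  try {
+    // on HDFS and most filesystems this returns a 0-byte file status
+    FileStatus status = fs.getFileStatus(path);
+  } catch (FileNotFoundException e) {
+    // permitted for object stores: nothing is visible until close()
+  }
+}
+```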
+
+### State of Stream and filesystem after `Filesystem.append()`
+
+The output stream returned from a call of
+ `FileSystem.append(path, buffersize, progress)`
+can be modelled as a stream whose `buffer` is initialized to that of
+the original file:
+
+```python
+Stream' = (path, true, data(FS, path))
+```
+
+#### Persisting data
+
+When the stream writes data back to its store, be it in any
+supported flush operation, in the `close()` operation, or at any other
+time the stream chooses to do so, the contents of the file
+are replaced with the current buffer:
+
+```python
+Stream' = (path, true, buffer)
+FS' = FS where data(FS', path) == buffer
+```
+
+After a call to `close()`, the stream is closed for all operations other
+than `close()`; they MAY fail with `IOException` or `RuntimeException`.
+
+```python
+Stream' = (path, false, [])
+```
+
+The `close()` operation MUST be idempotent, with the sole attempt to write the
+data made in the first invocation.
+
+1. If `close()` succeeds, subsequent calls are no-ops.
+1. If `close()` fails, again, subsequent calls are no-ops. They MAY rethrow
+the previous exception, but they MUST NOT retry the write.
+
+
+
+
+
+## Class `FSDataOutputStream`
+
+
+```java
+public class FSDataOutputStream
+  extends DataOutputStream
+  implements Syncable, CanSetDropBehind, StreamCapabilities {
+ // ...
+}
+```
+
+The `FileSystem.create()`, `FileSystem.append()` and
+`FSDataOutputStreamBuilder.build()` calls return an instance
+of a class `FSDataOutputStream`, a subclass of `java.io.OutputStream`.
+
+The base class wraps an `OutputStream` instance, one which may implement `Syncable`,
+`CanSetDropBehind` and `StreamCapabilities`.
+
+This document covers the requirements of such implementations.
+
+HDFS's `FileSystem` implementation, `DistributedFileSystem`, returns an instance
+of `HdfsDataOutputStream`. This implementation has at least two behaviors
+which are not explicitly declared by the base Java implementation:
+
+1. Writes are synchronized: more than one thread can write to the same
+output stream. This is a use pattern which HBase relies on.
+
+1. `OutputStream.flush()` is a no-op when the file is closed. Apache Druid
+has been known to call `flush()` on a closed stream in the past
+([HADOOP-14346](https://issues.apache.org/jira/browse/HADOOP-14346)).
+
+
+As the HDFS implementation is considered the de-facto specification of
+the FileSystem APIs, the fact that `write()` is thread-safe is significant.
+
+For compatibility, not only must other FS clients be thread-safe,
+but new HDFS features, such as encryption and erasure coding, must also
+implement consistent behavior with the core HDFS output stream.
+
+Put differently:
+
+*It isn't enough for Output Streams to implement the core semantics
+of `java.io.OutputStream`: they need to implement the extra semantics
+of `HdfsDataOutputStream`. Failure to do so must be considered a regression.*
+
+The concurrent `write()` call is the most significant tightening of
+the Java specification.
+
+
+## Class `java.io.OutputStream`
+
+
+A Java `OutputStream` allows applications to write a sequence of bytes to a destination.
+In a Hadoop filesystem, that destination is the data under a path in the filesystem.
+
+
+```java
+public abstract class OutputStream implements Closeable, Flushable {
+  public abstract void write(int b) throws IOException;
+  public void write(byte b[]) throws IOException;
+  public void write(byte b[], int off, int len) throws IOException;
+  public void flush() throws IOException;
+  public void close() throws IOException;
+}
+```
+
+### `write(Stream, data)`
+
+Writes a byte of data to the stream.
+
+#### Preconditions
+
+```python
+Stream.open else raise ClosedChannelException, PathIOException, IOException
+```
+
+The exception `java.nio.channels.ClosedChannelException` is
+raised in the HDFS output streams when trying to write to a closed file.
+This exception does not include the destination path, and
+`Exception.getMessage()` is `null`. It is therefore of limited value in stack
+traces. Implementors may wish to raise exceptions with more detail, such
+as a `PathIOException`.
+
+
+#### Postconditions
+
+The buffer has the lower 8 bits of the data argument appended to it.
+
+```python
+Stream'.buffer = Stream.buffer + [data & 0xff]
+```
+
+There may be an explicit limit on the size of cached data, or an implicit
+limit based on the available capacity of the destination filesystem.
+When a limit is reached, `write()` SHOULD fail with an `IOException`.
+
+### `write(Stream, byte[] data, int offset, int len)`
+
+
+#### Preconditions
+
+The preconditions are all defined in `OutputStream.write()`
+
+```python
+Stream.open else raise ClosedChannelException, PathIOException, IOException
+data != null else raise NullPointerException
+offset >= 0 else raise IndexOutOfBoundsException
+len >= 0 else raise IndexOutOfBoundsException
+offset <= data.length else raise IndexOutOfBoundsException
+offset + len <= data.length else raise IndexOutOfBoundsException
+```
+
+There may be an explicit limit on the size of cached data, or an implicit
+limit based on the available capacity of the destination filesystem.
+When a limit is reached, `write()` SHOULD fail with an `IOException`.
+
+After the operation has returned, the buffer may be re-used. The outcome
+of updates to the buffer while the `write()` operation is in progress is undefined.
+
+#### Postconditions
+
+```python
+Stream'.buffer = Stream.buffer + data[offset...(offset + len)]
+```
+
+### `write(byte[] data)`
+
+This is defined as the equivalent of:
+
+```python
+write(data, 0, data.length)
+```
+
+### `flush()`
+
+Requests that the data is flushed. The specification of `OutputStream.flush()`
+declares that this SHOULD write data to the "intended destination".
+
+It explicitly precludes any guarantees about durability.
+
+For that reason, this document doesn't provide any normative
+specifications of behaviour.
+
+
+#### Preconditions
+
+
+```python
+Stream.open else raise IOException
+```
+
+#### Postconditions
+
+None.
+
+If the implementation chooses to implement a stream-flushing operation,
+the data may be saved to the file system such that it becomes visible to
+others:
+
+```python
+FS' = FS where data(FS, path) == buffer
+```
+
+Some applications have been known to call `flush()` on a closed stream
+on the assumption that it is harmless. Implementations MAY choose to
+support this behaviour.
+
+### `close()`
+
+The `close()` operation saves all data to the filesystem and
+releases any resources used for writing data.
+
+The `close()` call is expected to block
+until the write has completed (as with `Syncable.hflush()`), possibly
+until it has been written to durable storage (as HDFS does).
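+
+A minimal, non-normative sketch of defensive use (assuming only the public
+`FileSystem` API and the `fs`/`path`/`data` names from earlier examples):
+because `close()` may both block and fail, let its `IOException` propagate
+rather than swallowing it:
+
+```java
+try (FSDataOutputStream out = fs.create(path, true)) {
+  out.write(data);
+  // the implicit close() here may upload the whole file on an object
+  // store; any failure surfaces as an IOException from this block
+}
+```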
+
+After `close()` completes, the data in a file MUST be visible and consistent
+with the data most recently written. The metadata of the file MUST be consistent
+with the data and the write history itself (i.e. any modification time fields
+updated).
+
+After `close()` is invoked, all subsequent `write()` calls on the stream
+MUST fail with an `IOException`.
+
+
+Any locking/leaseholding mechanism is also required to release its lock/lease.
+
+```python
+Stream'.open = false
+FS' = FS where data(FS, path) == buffer
+```
+
+The `close()` call MAY fail during its operation.
+
+1. Callers of the API MUST expect some calls to fail and SHOULD code appropriately.
+Catching and swallowing exceptions, while common, is not always the ideal solution.
+1. Even after a failure, `close()` MUST place the stream into a closed state.
+Follow-on calls to `close()` are ignored, and calls to other methods
+rejected. That is: callers cannot be expected to call `close()` repeatedly
+until it succeeds.
+1. The duration of the `close()` operation is undefined. Operations which rely
+on acknowledgements from remote systems to meet the persistence guarantees
+implicitly have to await these acknowledgements. Some Object Store output streams
+upload the entire data file in the `close()` operation. This can take a large amount
+of time. The fact that many user applications assume that `close()` is both fast
+and does not fail means that this behavior is dangerous.
+
+Recommendations for safe use by callers:
+
+* Do plan for exceptions being raised, either by catching and logging them or
+by throwing them further up. Catching and silently swallowing exceptions
+may hide serious problems.
+* Heartbeat operations SHOULD take place on a separate thread, so that a long
+delay in `close()` does not block the thread so long that the heartbeat times
+out.
+
+
+## Interface `StreamCapabilities`
+
+```java
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+```
+
+The `StreamCapabilities` interface exists to allow callers to dynamically
+determine the behavior of a stream.
+
+
+The reference implementation of this interface is
+ `org.apache.hadoop.hdfs.DFSOutputStream`:
+
+```java
+public boolean hasCapability(String capability) {
+  if (capability.equalsIgnoreCase(HSYNC.getValue()) ||
+      capability.equalsIgnoreCase(HFLUSH.getValue())) {
+    return true;
+  }
+  return false;
+}
+```
+
+Where `HSYNC` and `HFLUSH` are items in the enumeration
+`org.apache.hadoop.fs.StreamCapabilities.StreamCapability`.
+
+
+## `org.apache.hadoop.fs.Syncable`
+
+```java
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public interface Syncable {
+  /**
+   * @deprecated As of HADOOP 0.21.0, replaced by hflush
+   * @see #hflush()
+   */
+  @Deprecated void sync() throws IOException;
+
+  /** Flush out the data in client's user buffer. After the return of
+   * this call, new readers will see the data.
+   * @throws IOException if any error occurs
+   */
+  void hflush() throws IOException;
+
+  /** Similar to posix fsync, flush out the data in client's user buffer
+   * all the way to the disk device (but the disk may have it in its cache).
+   * @throws IOException if error occurs
+   */
+  void hsync() throws IOException;
+}
+```
+
+The purpose of the `Syncable` interface is to provide guarantees that data is written
+to a filesystem for both visibility and durability.
+
+*SYNC-1*: An `OutputStream` which implements `Syncable` is
+making an explicit declaration that it can meet those guarantees.
+
+*SYNC-2*: The interface MUST NOT be declared as implemented by an `OutputStream` unless
+those guarantees can be met.
+
+The `Syncable` interface has been implemented by other classes than
+subclasses of `OutputStream`, such as `org.apache.hadoop.io.SequenceFile.Writer`.
+
+*SYNC-3*: The fact that a class implements `Syncable` does not guarantee
+that `extends OutputStream` holds.
+
+That is, for any class `C`: `(C instanceof Syncable)` does not imply
+`(C instanceof OutputStream)`.
+
+This specification only covers the required behavior of `OutputStream` subclasses
+which implement `Syncable`.
+
+
+*SYNC-4:* The return value of `FileSystem.create(Path)` is an instance
+of `FSDataOutputStream`.
+
+*SYNC-5:* `FSDataOutputStream implements Syncable`
+
+
+SYNC-5 and SYNC-1 imply that all output streams which can be created
+with `FileSystem.create(Path)` must support the semantics of `Syncable`.
+Any inspection of the Hadoop codebase will make clear that this is
+demonstrably not true; therefore the declarations SYNC-1 and SYNC-2 do not
+hold in practice.
+
+Put differently: *callers cannot rely on the presence of the interface
+as evidence that the semantics of `Syncable` are supported*. Instead,
+the capabilities should be probed for dynamically, using the
+`StreamCapabilities` interface where available.
+
+
+### `Syncable.hflush()`
+
+Flush out the data in client's user buffer. After the return of
+this call, new readers will see the data. The `hflush()` operation
+does not contain guarantees as to the durability of the data, only
+its visibility.
+Thus implementations may cache the written data in memory
+—visible to all, but not yet persisted.
+
+#### Preconditions
+
+```python
+hasCapability(Stream, "hflush")
+Stream.open else raise IOException
+```
+
+
+#### Postconditions
+
+```python
+FS' = FS where data(FS', path) == buffer
+```
+
+
+After the call returns, the data MUST be visible to all new callers
+of `FileSystem.open`.
+
+There is no requirement or guarantee that clients with an existing
+`DataInputStream` created by a call to `open(FS, path)` will see the updated
+data, nor is there a guarantee that they *will not* in a current or subsequent
+read.
+
+Implementation note: as a correct `hsync()` implementation must also
+offer all the semantics of an `hflush()` call, implementations of `hflush()`
+may just invoke `hsync()`:
+
+```java
+public void hflush() throws IOException {
+  hsync();
+}
+```
+
+### `Syncable.hsync()`
+
+Similar to POSIX `fsync`, this call saves the data in client's user buffer
+all the way to the disk device (but the disk may have it in its cache).
+
+That is: it is a requirement for the underlying FS to save all the data to
+the disk hardware itself, where it is expected to be durable.
+
+#### Preconditions
+
+```python
+hasCapability(Stream, "hsync")
+Stream.open else raise IOException
+```
+
+#### Postconditions
+
+```python
+FS' = FS where data(FS', path) == buffer
+```
+
+The reference implementation, `DFSOutputStream`, will block
+until an acknowledgement is received from the datanodes: that is, all hosts
+in the replica write chain have successfully written the file.
+
+That means that the expectation callers may have is that the return of
+the method call contains visibility and durability guarantees which other
+implementations must maintain.
+
+Note, however, that the reference `DFSOutputStream.hsync()` call only actually
+syncs *the current block*. If there have been a series of writes since the
+last sync such that a block boundary has been crossed, then
+the `hsync()` call only claims to persist the most recently written block.
+
+
+From the javadocs of `DFSOutputStream.hsync(EnumSet<SyncFlag> syncFlags)`:
+
+> Note that only the current block is flushed to the disk device.
+> To guarantee durable sync across block boundaries the stream should
+> be created with {@link CreateFlag#SYNC_BLOCK}.
+
+
+In virtual machines, the notion of "disk hardware" is really that of
+another software abstraction: there are no guarantees.
+
+
+## Interface `CanSetDropBehind`
+
+```java
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
+public interface CanSetDropBehind {
+  /**
+   * Configure whether the stream should drop the cache.
+   *
+   * @param dropCache Whether to drop the cache. null means to use the
+   * default value.
+   * @throws IOException If there was an error changing the dropBehind
+   * setting.
+   * UnsupportedOperationException If this stream doesn't support
+   * setting the drop-behind.
+   */
+  void setDropBehind(Boolean dropCache)
+      throws IOException, UnsupportedOperationException;
+}
+```
+
+This interface allows callers to change policies used inside HDFS.
+Its methods are currently implemented only by the HDFS streams.
+
+
+
+## Durability, Concurrency, Consistency and Visibility of stream output
+
+These are the aspects of the system behaviour which are not directly
+covered in this (very simplistic) filesystem model, but which are visible
+in production.
+
+
+### Durability
+
+1. `OutputStream.write()` MAY persist the data, synchronously or asynchronously
+1. `OutputStream.flush()` flushes data to the destination. There
+are no strict persistence requirements.
+1. `Syncable.hflush()` synchronously sends all local data to the destination
+filesystem. After returning to the caller, the data MUST be visible to other readers;
+it MAY be durable. That is: it does not have to be persisted, merely guaranteed
+to be consistently visible to all clients attempting to open a new stream reading
+data at the path.
+1. `Syncable.hsync()` MUST flush the data and persist data to the underlying durable
+storage.
+1. The first call to `close()` MUST flush out all remaining data in
+the buffers, and persist it.
+
+
+### Concurrency
+
+1. The outcome of more than one process writing to the same file is undefined.
+
+1. An input stream opened to read a file *before the file was opened for writing*
+MAY fetch data updated by writes to an OutputStream.
+Because of buffering and caching, this is not a requirement
+—and if an input stream does pick up updated data, the point at
+which the updated data is read is undefined. This surfaces in object stores
+where a `seek()` call which closes and re-opens the connection may pick up
+updated data, while forward stream reads do not. Similarly, in block-oriented
+filesystems, the data may be cached a block at a time —and changes only picked
+up when a different block is read.
+
+1. A Filesystem MAY allow the destination path to be manipulated while a stream
+is writing to it —for example, `rename()` of the path or a parent; `delete()` of
+a path or parent. In such a case, the outcome of future write operations on
+the output stream is undefined. Some filesystems MAY implement locking to
+prevent conflict. However, this tends to be rare on distributed filesystems,
+for reasons well known in the literature.
+
+1. The Java API specification of `java.io.OutputStream` does not require
+an instance of the class to be thread safe.
+However, `org.apache.hadoop.hdfs.DFSOutputStream`
+has a stronger thread safety model (possibly unintentionally). This fact is
+relied upon in Apache HBase, as discovered in HADOOP-11708. Implementations
+SHOULD be thread safe. *Note*: even the `DFSOutputStream` synchronization
+model permits the output stream to be `close()`'d while awaiting an acknowledgement
+from datanode or namenode writes in an `hsync()` operation.
+
+
+### Consistency and Visibility
+
+There is no requirement for the data to be immediately visible to other applications
+—not until a specific call to flush buffers or persist it to the underlying storage
+medium is made.
+
+If an output stream is created with `FileSystem.create(path, overwrite==true)`
+and there is an existing file at the path, that is `exists(FS, path)` holds,
+then the existing data is immediately unavailable; the data at the end of the
+path MUST consist of an empty byte sequence `[]`, with consistent metadata.
+
+
+```python
+exists(FS, path)
+(Stream', FS') = create(FS, path)
+exists(FS', path)
+getFileStatus(FS', path).getLen() = 0
+```
+
+The metadata of a file (`length(FS, path)` in particular) SHOULD be consistent
+with the contents of the file after `hflush()` and `hsync()`.
+
+
+```python
+(Stream', FS') = create(FS, path)
+(Stream'', FS'') = write(Stream', data)
+(Stream''', FS''') = hsync(Stream'')
+exists(FS''', path)
+getFileStatus(FS''', path).getLen() = len(data)
+```
+
+
+HDFS does not do this except when the write crosses a block boundary; to do
+otherwise would overload the Namenode. As a result, while a file is being written
+`length(Filesystem, Path)` MAY be less than the length of `data(Filesystem, Path)`.
+
+The metadata MUST be consistent with the contents of a file after the `close()`
+operation.
+
+After the contents of an output stream have been persisted (`hflush()/hsync()`)
+all new `open(FS, Path)` operations MUST return the updated data.
+
+After `close()` has been invoked on an output stream,
+a call to `getFileStatus(path)` MUST return the final metadata of the written file,
+including length and modification time.
+The metadata of the file returned in any of the FileSystem `list` operations
+MUST be consistent with this metadata.
+
+
+The value of `getFileStatus(path).getModificationTime()` is not defined
+while a stream is being written to.
+The timestamp MAY be updated while a file is being written,
+especially after a `Syncable.hsync()` call.
+The timestamp MUST be updated after the file is closed
+to that of a clock value observed by the server during the `close()` call.
+It is *likely* to be in the time and time zone of the filesystem, rather
+than that of the client.
+
+Formally, if a `close()` operation triggers an interaction with a server
+which starts at server-side time `t1` and completes at time `t2` with a successfully
+written file, then the last modification time SHOULD be a time `t` where
+`t1 <= t <= t2`.
+
+## Issues with the Hadoop Output Stream model
+
+
+There are some known issues with the output stream model as offered by Hadoop,
+specifically about the guarantees about when data is written and persisted
+—and when the metadata is synchronized.
+These are where implementation aspects of HDFS and the "Local" filesystem
+do not follow the simple model of the filesystem used in this specification.
+
+
+
+### HDFS
+
+That HDFS file metadata often lags the content of a file being written
+to is not something everyone expects, nor is it convenient for any program trying to
Most visible is the length +of a file returned in the various `list` commands and `getFileStatus` —this +is often out of data. + +As HDFS only supports file growth in its output operations, this means +that the size of the file as listed in the metadata may be less than or equal +to the number of available bytes —but never larger. This is a guarantee which +is also held + +One Algorithm to determine whether a file in HDFS is updated is: + +1. Remember the last read position `pos` in the file, using `0` if this is the initial +read. +1. Use `getFileStatus(FS, Path)` to query the updated length of the file as +recorded in the metadata. +1. If `Status.length > pos`, the file has grown. +1. If the number has not changed, then + 1. reopen the file. + 1. `seek(pos)` to that location + 1. If `read() != -1`, there is new data. + +This algorithm works for filesystems which are consistent with metadata and +data, as well as HDFS. What is important to know is that, in HDFS +`getFileStatus(FS, path).getLen()==0` does not imply that `data(FS, path)` is +empty. + + +### Local Filesystem, `file:` + +`LocalFileSystem`, `file:`, (or any other `FileSystem` implementation based on +`ChecksumFileSystem`) has a different issue. If an output stream +is obtained from `create()` and `FileSystem.setWriteChecksum(false)` has +*not* been called on the filesystem, then the FS only flushes as much +local data as can be written to full checksummed blocks of data. + +That is, the flush operations are not guaranteed to write all the pending +data until the file is finally closed. + +That is, `sync()`, `hsync()` and `hflush` may not persist all data written +to the stream. + +For anyone thinking "this is a violation of this specification" —they are correct. +The local filesystem is intended for testing, rather than production use. + +### Checksummed output streams + +Because `org.apache.hadoop.fs.FSOutputSummer` and +`org.apache.hadoop.fs.ChecksumFileSystem.ChecksumFSOutputSummer` +implement the underlying checksummed output stream used by HDFS and +other filesystems, it provides some of the core semantics of the output +stream behavior. + + +1. The `close()` call is unsynchronized, re-entrant and may attempt +to close the stream more than once. +1. It is possible to call `write(int)` on a closed stream (but not +`write(byte[], int, int)`). +1. It is possible to call `flush()` on a closed stream. + +Behaviors 1 & 2 really have to be considered bugs to fix, albeit with care. + + + +### Object Stores + +Object store streams tend to buffer the entire stream's output +until the final `close()` operation triggers a single `PUT` of the data +and materialization of the final output. + +This significantly change's their behaviour compared to that of +POSIX filesystems and that specified in this document. + +#### Visibility of newly created files + +There is no guarantee that any file will be visible at the path of an output +stream after the output stream is created . + +That is: while `create(FS, path, boolean)` returns a new stream + +```python +Stream' = (path, true, []) +``` + +The other postcondition of the operation, `data(FS', path) == []` may not +hold, + +1. `exists(FS, p)` MAY return false. +1. If a file was created with `overwrite = True`, the existing data my still +be visible: `data(FS', path) = data(FS, path)`. + +1. The check for existing data in a `create()` call with `overwrite=False`, may +take place in the `create()` call itself, in the `close()` call prior to/during +the write, or at some point in between. 
Expect in the special case that the +object store supports an atomic PUT operation, the check for existence of +existing data and the subsequent creation of data at the path contains a race +condition: other clients may create data at the path between the existence check +and the subsequent qrite. + +1. Calls to `create(FS, Path, overwrite=false)` may succeed, returning a new +`OutputStream`, even while another stream is open and writing to the destination +path. + +This allows for the following sequence of operations, which would +raise an exception in the second `open()` call if invoked against HDFS: + +```python +Stream1 = open(FS, path, false) +sleep(200) +Stream2 = open(FS, path, false) +Stream.write('a') +Stream1.close() +Stream2.close() +``` + +For anyone wondering why the clients create a 0-byte file in the create call, +it would cause problems after `close()` —the marker file could get +returned in `open()` calls instead of the final data. + + +#### Visibility of the output of a stream after `close()` + +One guarantee which Object Stores SHOULD make is the same as those of POSIX +filesystems: After a stream `close()` call returns, the data MUST be persisted +durably and visible to all callers. Unfortunately, even that guarantee is +not always met: + +1. Existing data on a path MAY be visible for an indeterminate period of time. + +1. If the store has any form of create inconsistency or buffering of negative +existence probes, then even after the stream's `close()` operation has returned, +`getFileStatus(FS, path)` and `open(FS, path)` may fail with a `FileNotFoundException`. + +In their favour, the atomicity of the store's PUT operations do offer their +own guarantee: an newly created object is either absent or all of its data +is present: the act of instantiatng the object, while potentially exhibiting +create inconsistency, is atomic. Applications may be able to use that fact +to their advantage. + + +#### Other issues + +The `Syncable` interfaces and methods are rarely implemented. Use +`StreamCapabilities` to determine their availability (Azure's +`PageBlobOutputStream` is the sole "syncable" object store output stream). diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractCreateTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractCreateTest.java index 07c99e0b6a528..f137fc3665775 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractCreateTest.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractCreateTest.java @@ -18,13 +18,18 @@ package org.apache.hadoop.fs.contract; +import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.StreamCapabilities; + import org.junit.Test; import org.junit.internal.AssumptionViolatedException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.FileNotFoundException; import java.io.IOException; @@ -42,6 +47,9 @@ public abstract class AbstractContractCreateTest extends AbstractFSContractTestBase { + private static final Logger LOG = + LoggerFactory.getLogger(AbstractContractCreateTest.class); + /** * How long to wait for a path to become visible. 
   */
@@ -189,12 +197,8 @@ private void testOverwriteNonEmptyDirectory(boolean useBuilder)
     } catch (FileAlreadyExistsException expected) {
       //expected
       handleExpectedException(expected);
-    } catch (FileNotFoundException e) {
-      handleRelaxedException("overwriting a dir with a file ",
-          "FileAlreadyExistsException",
-          e);
     } catch (IOException e) {
-      handleRelaxedException("overwriting a dir with a file ",
+      handleRelaxedException("overwriting a dir with a file",
          "FileAlreadyExistsException",
          e);
     }
@@ -332,4 +336,78 @@ public void testCreateMakesParentDirs() throws Throwable {
     assertTrue("Grandparent directory does not appear to be a directory",
         fs.getFileStatus(grandparent).isDirectory());
   }
+
+  @Test
+  public void testHSync() throws Throwable {
+    describe("test declared and actual Syncable behaviors");
+    Path path = methodPath();
+    FileSystem fs = getFileSystem();
+    boolean supportsFlush = isSupported(SUPPORTS_HFLUSH);
+    boolean supportsSync = isSupported(SUPPORTS_HSYNC);
+
+    try (FSDataOutputStream out = fs.create(path, true)) {
+
+      boolean doesHFlush = out.hasCapability(
+          StreamCapabilities.StreamCapability.HFLUSH.getValue());
+      if (doesHFlush) {
+        out.hflush();
+      }
+      assertEquals("hflush support", supportsFlush, doesHFlush);
+      boolean doesHSync = out.hasCapability(
+          StreamCapabilities.StreamCapability.HSYNC.getValue());
+      assertEquals("hsync support", supportsSync, doesHSync);
+
+      try {
+        out.hflush();
+        if (!supportsFlush) {
+          // hflush was neither supported nor rejected
+          LOG.warn("FS doesn't support Syncable.hflush()," +
+              " but doesn't reject it either.");
+        }
+      } catch (UnsupportedOperationException e) {
+        if (supportsFlush) {
+          throw new AssertionError("hflush not supported", e);
+        }
+      }
+      out.write('a');
+      try {
+        out.hsync();
+      } catch (UnsupportedOperationException e) {
+        if (supportsSync) {
+          throw new AssertionError("hsync not supported", e);
+        }
+      }
+
+      if (supportsSync) {
+        // if sync really worked, data is visible here
+        try (FSDataInputStream in = fs.open(path)) {
+          assertEquals('a', in.read());
+          assertEquals(-1, in.read());
+        }
+      } else {
+        out.flush();
+        // no sync. let's see what's there
+        try (FSDataInputStream in = fs.open(path)) {
+
+          int c = in.read();
+          if (c == -1) {
+            // nothing was synced; sync and flush really aren't there.
+ } else { + LOG.info("sync and flush are declared unsupported" + + " - but the stream does offer some sync/flush semantics"); + } + } catch (FileNotFoundException e) { + // that's OK if it's an object store, but not if its a real + // FS + if (!isSupported(IS_BLOBSTORE)) { + throw e; + } + } + + } + + } + + } } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractFSContractTestBase.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractFSContractTestBase.java index 1cd2164fad300..146569e8bf0d9 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractFSContractTestBase.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractFSContractTestBase.java @@ -272,7 +272,7 @@ protected void handleRelaxedException(String action, if (getContract().isSupported(SUPPORTS_STRICT_EXCEPTIONS, false)) { throw e; } - LOG.warn("The expected exception {} was not the exception class" + + LOG.warn("The expected exception {} was not the exception class" + " raised on {}: {}", action , e.getClass(), expectedException, e); } @@ -375,4 +375,13 @@ protected String generateAndLogErrorListing(Path src, Path dst) throws } return destDirLS; } + + /** + * Get a path from the name of the current test case. + * @return a path based on the test method being executed. + * @throws IOException IO problem + */ + protected Path methodPath() throws IOException { + return path(methodName.getMethodName()); + } } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractOptions.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractOptions.java index 91a112141e987..5d258b4144e3b 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractOptions.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractOptions.java @@ -232,4 +232,13 @@ public interface ContractOptions { */ String TEST_RANDOM_SEEK_COUNT = "test.random-seek-count"; + /** + * Is hflush supported in API and StreamCapabilities? + */ + String SUPPORTS_HFLUSH = "supports-hflush"; + + /** + * Is hsync supported in API and StreamCapabilities? 
+ */ + String SUPPORTS_HSYNC = "supports-hsync"; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/contract/hdfs.xml b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/contract/hdfs.xml index 3c9fcccc73846..d7e039647d5fc 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/contract/hdfs.xml +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/resources/contract/hdfs.xml @@ -116,4 +116,14 @@ true + + fs.contract.supports-hflush + true + + + + fs.contract.supports-hsync + true + + diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java index a60f9af2a938b..29e77ba68ac36 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java @@ -25,7 +25,6 @@ import java.util.Locale; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; -import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import com.amazonaws.AmazonClientException; @@ -47,6 +46,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.StreamCapabilities; +import org.apache.hadoop.fs.impl.StreamStateModel; import org.apache.hadoop.fs.s3a.commit.CommitConstants; import org.apache.hadoop.fs.s3a.commit.PutTracker; import org.apache.hadoop.util.Progressable; @@ -98,8 +98,8 @@ class S3ABlockOutputStream extends OutputStream implements /** Multipart upload details; null means none started. */ private MultiPartUpload multiPartUpload; - /** Closed flag. */ - private final AtomicBoolean closed = new AtomicBoolean(false); + /** Stream state. */ + private final StreamStateModel stateModel; /** Current data block. Null means none currently active */ private S3ADataBlocks.DataBlock activeBlock; @@ -163,6 +163,7 @@ class S3ABlockOutputStream extends OutputStream implements this.progressListener = (progress instanceof ProgressListener) ? (ProgressListener) progress : new ProgressableListener(progress); + this.stateModel = new StreamStateModel(fs.keyToQualifiedPath(key)); // create that first block. This guarantees that an open + close sequence // writes a 0-byte entry. createBlockIfNeeded(); @@ -220,33 +221,27 @@ private void clearActiveBlock() { } } - /** - * Check for the filesystem being open. - * @throws IOException if the filesystem is closed. - */ - void checkOpen() throws IOException { - if (closed.get()) { - throw new IOException("Filesystem " + writeOperationHelper + " closed"); - } - } - /** * The flush operation does not trigger an upload; that awaits * the next block being full. What it does do is call {@code flush() } * on the current block, leaving it to choose how to react. + * + * Downgrades to a no-op if called on a closed stream. * @throws IOException Any IO problem. 
   */
  @Override
-  public synchronized void flush() throws IOException {
-    try {
-      checkOpen();
-    } catch (IOException e) {
-      LOG.warn("Stream closed: " + e.getMessage());
-      return;
-    }
-    S3ADataBlocks.DataBlock dataBlock = getActiveBlock();
-    if (dataBlock != null) {
-      dataBlock.flush();
+  public void flush() throws IOException {
+    if (StreamStateModel.State.Open.equals(acquireLock(false))) {
+      try {
+        S3ADataBlocks.DataBlock dataBlock = getActiveBlock();
+        if (dataBlock != null) {
+          dataBlock.flush();
+        }
+      } catch (IOException ex) {
+        throw stateModel.enterErrorState(ex);
+      } finally {
+        stateModel.releaseLock();
+      }
     }
   }
@@ -273,14 +268,47 @@ public synchronized void write(int b) throws IOException {
    * @throws IOException on any problem
    */
   @Override
-  public synchronized void write(byte[] source, int offset, int len)
+  public void write(byte[] source, int offset, int len)
      throws IOException {
    S3ADataBlocks.validateWriteArgs(source, offset, len);
-    checkOpen();
    if (len == 0) {
      return;
    }
+    acquireLock(true);
+    try {
+      innerWrite(source, offset, len);
+    } catch (IOException ex) {
+      throw stateModel.enterErrorState(ex);
+    } finally {
+      stateModel.releaseLock();
+    }
+  }
+
+  /**
+   * Acquire the lock.
+   * @param checkOpen must the stream be open when acquiring the lock?
+   * @return the stream state
+   * @throws IOException if the checkOpen probe raises an exception.
+   */
+  private synchronized StreamStateModel.State acquireLock(final boolean checkOpen)
+      throws IOException {
+    stateModel.acquireLock(checkOpen);
+    return stateModel.getState();
+  }
+
+  /**
+   * The inner write.
+   * This is called recursively until all the source data is written.
+   * It requires that the stream state has acquired the lock already, so does
+   * not attempt to reacquire or re-release it.
+   * @param source source buffer.
+   * @param offset offset
+   * @param len length
+   * @throws IOException on a failure
+   */
+  private void innerWrite(final byte[] source, final int offset, final int len)
+      throws IOException {
    S3ADataBlocks.DataBlock block = createBlockIfNeeded();
    int written = block.write(source, offset, len);
    int remainingCapacity = block.remainingCapacity();
@@ -292,7 +320,7 @@ public synchronized void write(byte[] source, int offset, int len)
      uploadCurrentBlock();
      // tail recursion is mildly expensive, but given buffer sizes must be MB
      // in size, it's unlikely to recurse very deeply.
-      this.write(source, offset + written, len - written);
+      innerWrite(source, offset + written, len - written);
    } else {
      if (remainingCapacity == 0) {
        // the whole buffer is done, trigger an upload
@@ -344,10 +372,22 @@ private void initMultipartUpload() throws IOException {
    */
  @Override
  public void close() throws IOException {
-    if (closed.getAndSet(true)) {
-      // already closed
-      LOG.debug("Ignoring close() as stream is already closed");
-      return;
+    acquireLock(false);
+    try {
+      synchronized (this) {
+        // this is synchronized to order its execution w.r.t. any methods
+        // which are marked as synchronized.
+        // because only this block, not the whole close() method, is
+        // synchronized, calling close() on a stream which has just been
+        // closed will not block for the duration
+        // of the entire upload.
+ if (!stateModel.enterClosedState()) { + // already closed + LOG.debug("Ignoring close() as stream is not open"); + return; + } + } + } finally { + stateModel.releaseLock(); } S3ADataBlocks.DataBlock block = getActiveBlock(); boolean hasBlock = hasActiveBlock(); @@ -393,6 +433,7 @@ public void close() throws IOException { } LOG.debug("Upload complete to {} by {}", key, writeOperationHelper); } catch (IOException ioe) { + stateModel.enterErrorState(ioe); writeOperationHelper.writeFailed(ioe); throw ioe; } finally { diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3ABlockOutputStream.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3ABlockOutputStream.java deleted file mode 100644 index ff176f58da67d..0000000000000 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3ABlockOutputStream.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.fs.s3a; - -import org.apache.hadoop.fs.s3a.commit.PutTracker; -import org.apache.hadoop.util.Progressable; -import org.junit.Before; -import org.junit.Test; - -import java.io.IOException; -import java.util.concurrent.ExecutorService; - -import static org.junit.Assert.*; -import static org.mockito.Mockito.doThrow; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.spy; - -/** - * Unit tests for {@link S3ABlockOutputStream}. 
- */
-public class TestS3ABlockOutputStream extends AbstractS3AMockTest {
-
-  private S3ABlockOutputStream stream;
-
-  @Before
-  public void setUp() throws Exception {
-    ExecutorService executorService = mock(ExecutorService.class);
-    Progressable progressable = mock(Progressable.class);
-    S3ADataBlocks.BlockFactory blockFactory =
-        mock(S3ADataBlocks.BlockFactory.class);
-    long blockSize = Constants.DEFAULT_MULTIPART_SIZE;
-    S3AInstrumentation.OutputStreamStatistics statistics = null;
-    WriteOperationHelper oHelper = mock(WriteOperationHelper.class);
-    PutTracker putTracker = mock(PutTracker.class);
-    stream = spy(new S3ABlockOutputStream(fs, "", executorService,
-        progressable, blockSize, blockFactory, statistics, oHelper,
-        putTracker));
-  }
-
-  @Test
-  public void testFlushNoOpWhenStreamClosed() throws Exception {
-    doThrow(new IOException()).when(stream).checkOpen();
-
-    try {
-      stream.flush();
-    } catch (Exception e){
-      fail("Should not have any exception.");
-    }
-  }
-}
diff --git a/hadoop-tools/hadoop-azure-datalake/src/main/java/org/apache/hadoop/fs/adl/AdlFsOutputStream.java b/hadoop-tools/hadoop-azure-datalake/src/main/java/org/apache/hadoop/fs/adl/AdlFsOutputStream.java
index 2b89fb0a73242..f8bb8e33b1403 100644
--- a/hadoop-tools/hadoop-azure-datalake/src/main/java/org/apache/hadoop/fs/adl/AdlFsOutputStream.java
+++ b/hadoop-tools/hadoop-azure-datalake/src/main/java/org/apache/hadoop/fs/adl/AdlFsOutputStream.java
@@ -22,6 +22,8 @@
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.impl.StoreImplementationUtils;
+import org.apache.hadoop.fs.StreamCapabilities;
 import org.apache.hadoop.fs.Syncable;
 
 import java.io.IOException;
@@ -42,7 +44,8 @@
 */
 @InterfaceAudience.Private
 @InterfaceStability.Evolving
-public final class AdlFsOutputStream extends OutputStream implements Syncable {
+public final class AdlFsOutputStream extends OutputStream
+    implements Syncable, StreamCapabilities {
   private final ADLFileOutputStream out;
 
   public AdlFsOutputStream(ADLFileOutputStream out, Configuration configuration)
@@ -79,4 +82,9 @@ public synchronized void hflush() throws IOException {
   public synchronized void hsync() throws IOException {
     out.flush();
   }
+
+  @Override
+  public boolean hasCapability(String capability) {
+    return StoreImplementationUtils.supportsSyncable(capability);
+  }
 }
diff --git a/hadoop-tools/hadoop-azure-datalake/src/test/resources/adls.xml b/hadoop-tools/hadoop-azure-datalake/src/test/resources/adls.xml
index 5bbdd6fbb8645..e45a662229377 100644
--- a/hadoop-tools/hadoop-azure-datalake/src/test/resources/adls.xml
+++ b/hadoop-tools/hadoop-azure-datalake/src/test/resources/adls.xml
@@ -148,4 +148,14 @@
     <value>true</value>
   </property>
 
+  <property>
+    <name>fs.contract.supports-hflush</name>
+    <value>true</value>
+  </property>
+
+  <property>
+    <name>fs.contract.supports-hsync</name>
+    <value>true</value>
+  </property>
+
 </configuration>

From 8a2154da19876c86f4f82cee347887fbd06271d4 Mon Sep 17 00:00:00 2001
From: Steve Loughran 
Date: Tue, 26 Mar 2019 13:12:38 +0000
Subject: [PATCH 2/3] HADOOP-13327 update on the OutputStream specification

* Address code-side comments from reviewers; markdown is still WiP.
* move all inner stream propagation of hasCapability() tests to the
  StoreImplementationUtils operation.
Change-Id: Ic5be4ac09ddd36b8804eacfee5ad0587a819eaf0 --- .../hadoop/crypto/CryptoOutputStream.java | 6 +- .../apache/hadoop/fs/FSDataInputStream.java | 6 +- .../apache/hadoop/fs/FSDataOutputStream.java | 6 +- .../org/apache/hadoop/fs/FSOutputSummer.java | 8 +- .../fs/impl/StoreImplementationUtils.java | 20 ++++- .../hadoop/fs/impl/StreamStateModel.java | 64 +++++++------ .../site/markdown/filesystem/filesystem.md | 4 +- .../src/site/markdown/filesystem/index.md | 1 + .../site/markdown/filesystem/outputstream.md | 89 +++++++------------ .../hadoop/fs/s3a/S3ABlockOutputStream.java | 32 +++++-- .../fs/azure/NativeAzureFileSystem.java | 6 +- .../fs/azure/SyncableDataOutputStream.java | 6 +- 12 files changed, 127 insertions(+), 121 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoOutputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoOutputStream.java index 8d11043937612..555ca0816cdff 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoOutputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/crypto/CryptoOutputStream.java @@ -28,6 +28,7 @@ import org.apache.hadoop.fs.CanSetDropBehind; import org.apache.hadoop.fs.StreamCapabilities; import org.apache.hadoop.fs.Syncable; +import org.apache.hadoop.fs.impl.StoreImplementationUtils; import com.google.common.base.Preconditions; @@ -308,9 +309,6 @@ private void freeBuffers() { @Override public boolean hasCapability(String capability) { - if (out instanceof StreamCapabilities) { - return ((StreamCapabilities) out).hasCapability(capability); - } - return false; + return StoreImplementationUtils.hasCapability(out, capability); } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java index 31f82975899e1..861070158b029 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataInputStream.java @@ -29,6 +29,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.impl.StoreImplementationUtils; import org.apache.hadoop.io.ByteBufferPool; import org.apache.hadoop.util.IdentityHashStore; @@ -234,10 +235,7 @@ public void unbuffer() { @Override public boolean hasCapability(String capability) { - if (in instanceof StreamCapabilities) { - return ((StreamCapabilities) in).hasCapability(capability); - } - return false; + return StoreImplementationUtils.hasCapability(in, capability); } /** diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStream.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStream.java index 5b604e58e2360..08dd3262da8db 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStream.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStream.java @@ -24,6 +24,7 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.impl.StoreImplementationUtils; /** Utility that wraps a {@link OutputStream} in a {@link DataOutputStream}. 
 */
@@ -122,10 +123,7 @@ public OutputStream getWrappedStream() {
 
   @Override
   public boolean hasCapability(String capability) {
-    if (wrappedStream instanceof StreamCapabilities) {
-      return ((StreamCapabilities) wrappedStream).hasCapability(capability);
-    }
-    return false;
+    return StoreImplementationUtils.hasCapability(wrappedStream, capability);
   }
 
   @Override  // Syncable
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSOutputSummer.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSOutputSummer.java
index 2458b2f40d8d7..aaa19adf8c6a4 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSOutputSummer.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSOutputSummer.java
@@ -33,7 +33,8 @@
 */
 @InterfaceAudience.LimitedPrivate({"HDFS"})
 @InterfaceStability.Unstable
-abstract public class FSOutputSummer extends OutputStream {
+abstract public class FSOutputSummer extends OutputStream implements
+    StreamCapabilities {
   // data checksum
   private final DataChecksum sum;
   // internal buffer for storing data before it is checksumed
@@ -254,4 +255,9 @@ protected synchronized void setChecksumBufSize(int size) {
   protected synchronized void resetChecksumBufSize() {
     setChecksumBufSize(sum.getBytesPerChecksum() * BUFFER_NUM_CHUNKS);
   }
+
+  @Override
+  public boolean hasCapability(String capability) {
+    return false;
+  }
 }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StoreImplementationUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StoreImplementationUtils.java
index 5d0a00735fb87..655c4e35298f7 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StoreImplementationUtils.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StoreImplementationUtils.java
@@ -18,6 +18,7 @@
 package org.apache.hadoop.fs.impl;
 
+import java.io.InputStream;
 import java.io.OutputStream;
 
 import org.apache.hadoop.classification.InterfaceAudience;
@@ -52,14 +53,15 @@ public static boolean supportsSyncable(String capability) {
   * Probe for an object having a capability; returns true
   * iff the stream implements {@link StreamCapabilities} and its
   * {@code hasCapabilities()} method returns true for the capability.
-   * This is a private method intended to provided a common implementation
-   * for input, output streams; only the stronger typed
+   * This is a package private method intended to provide a common
+   * implementation for input and output streams; the strongly typed
+   * {@link #hasCapability()} calls are for public use.
   * @param object object to probe.
   * @param capability capability to probe for
   * @return true iff the object implements stream capabilities and
   * declares that it supports the capability.
   */
-  private static boolean objectHasCapability(Object object, String capability) {
+  static boolean objectHasCapability(Object object, String capability) {
    if (object instanceof StreamCapabilities) {
      return ((StreamCapabilities) object).hasCapability(capability);
    }
@@ -78,4 +80,16 @@ public static boolean hasCapability(OutputStream out, String capability) {
     return objectHasCapability(out, capability);
   }
 
+  /**
+   * Probe for an input stream having a capability; returns true
+   * iff the stream implements {@link StreamCapabilities} and its
+   * {@code hasCapability()} method returns true for the capability.
+ * @param in input stream
+   * @param capability capability to probe for
+   * @return true iff the stream declares that it supports the capability.
+   */
+  public static boolean hasCapability(InputStream in, String capability) {
+    return objectHasCapability(in, capability);
+  }
+
 }
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StreamStateModel.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StreamStateModel.java
index 3db52929727f0..1504fbb4a0a3d 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StreamStateModel.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StreamStateModel.java
@@ -19,9 +19,12 @@
 package org.apache.hadoop.fs.impl;
 
 import java.io.IOException;
+import java.util.concurrent.atomic.AtomicReference;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReentrantLock;
 
+import javax.annotation.Nullable;
+
 import com.google.common.base.Preconditions;
 
 import org.apache.commons.lang3.StringUtils;
@@ -33,6 +36,10 @@
 /**
 * Models a stream's state and can be used for checking this state before
 * any operation.
+ *
+ * It's designed to ensure that a stream knows when it is closed,
+ * and, once it has entered a failed state, that the failure
+ * is not forgotten.
 *
 * The model has three states: Open, Error, and Closed,
 
@@ -77,20 +84,26 @@ public enum State {
 
  /**
   * Initial state: open.
-   * This is volatile: it can be queried without encountering any locks.
-   * However, to guarantee the state is constant through the life of an
-   * operation, updates must be through the synchronized methods.
   */
-  private volatile State state = State.Open;
+  private final AtomicReference<State> state =
+      new AtomicReference<>(State.Open);
 
  /** Any exception to raise on the next checkOpen call. */
  private IOException exception;
 
-  public StreamStateModel(final Path path) {
-    this.path = path.toString();
+  /**
+   * Create for a path.
+   * @param path optional path for exception messages.
+   */
+  public StreamStateModel(@Nullable Path path) {
+    this.path = path != null ? path.toString() : "";
  }
 
-  public StreamStateModel(final String path) {
+  /**
+   * Create for a path.
+   * @param path optional path for exception messages.
+   */
+  public StreamStateModel(@Nullable final String path) {
    this.path = path;
  }
 
@@ -100,20 +113,15 @@ public StreamStateModel(final String path) {
   * @return the current state.
   */
  public State getState() {
-    return state;
+    return state.get();
  }
 
  /**
-   * Change state to closed. No-op if the state was in closed or error
+   * Change state to closed. No-op if the state was in closed or error.
   * @return true if the state changed.
   */
  public synchronized boolean enterClosedState() {
-    if (state == State.Open) {
-      state = State.Closed;
-      return true;
-    } else {
-      return false;
-    }
+    return state.compareAndSet(State.Open, State.Closed);
  }
 
  /**
@@ -124,12 +132,12 @@ public synchronized boolean enterClosedState() {
   */
  public synchronized IOException enterErrorState(final IOException ex) {
    Preconditions.checkArgument(ex != null, "Null exception");
-    switch (state) {
+    switch (state.get()) {
    // a stream can go into the error state when open or closed
    case Open:
    case Closed:
      exception = ex;
-      state = State.Error;
+      state.set(State.Error);
      break;
    case Error:
      // already in this state; retain the previous exception.
      break;
    default:
      break;
    }
    return exception;
  }

  /**
-   * Check a stream is open. 
+ * Verify that a stream is open, throwing an IOException if
+   * not.
   * If in an error state: rethrow that exception. If closed,
   * throw an exception about that.
-   * @throws IOException if the stream is not open.
+   * @throws IOException if the stream is not in {@link State#Open}.
+   * @throws PathIOException if the stream was not open and a path was given
+   * in the constructor.
   */
  public synchronized void checkOpen() throws IOException {
-    switch (state) {
+    switch (state.get()) {
    case Open:
      return;
 
@@ -189,17 +200,4 @@ public void releaseLock() {
     lock.unlock();
   }
 
-  /**
-   * Check for a stream being in a specific state.
-   * The check is synchronized, but not locked; if the caller does
-   * not hold a lock then the state may change before any subsequent
-   * operation.
-   * @param expected expected state
-   * @return return true iff the steam was in the state at the time
-   * of checking.
-   */
-  public synchronized boolean isInState(State expected) {
-    return state == expected;
-  }
-
 }
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md
index a60490e832181..9e9aa03c9eb74 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md
@@ -609,13 +609,13 @@ The result is `FSDataOutputStream`, which through its operations may generate ne
 clients creating files with `overwrite==true` to fail if the file is created by another client
 between the two tests.
 
-* S3A, Swift and potentially other Object Stores do not currently change the FS state
+* S3A, Swift and potentially other Object Stores do not currently change the `FS` state
 until the output stream `close()` operation is completed. This is a significant
 difference between the behavior of object stores and that of filesystems, as it allows >1
 client to create a file with `overwrite==false`, and potentially confuse file/directory
 logic. In particular, using `create()` to acquire
 an exclusive lock on a file (whoever creates the file without an error is considered
-the holder of the lock) is not a valid algorithm when working with object stores.
+the holder of the lock) may not be a safe algorithm to use when working with object stores.
 
 * Object stores may create an empty file as a marker when a file is created.
However, object stores whith overwrite=true semantics may not implement this atomically,
so creating files with `overwrite==false` cannot be used
as an implicit exclusion mechanism between processes.

diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/index.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/index.md
index df538ee6cf96b..0cc3423b08982 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/index.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/index.md
@@ -32,6 +32,7 @@ HDFS as these are commonly expected by Hadoop client applications.
 1. [Notation](notation.html)
 1. [Model](model.html)
 1. [FileSystem class](filesystem.html)
+1. [OutputStream, Syncable and `StreamCapabilities`](outputstream.html)
 1. [FSDataInputStream class](fsdatainputstream.html)
 1. [PathCapabilities interface](pathcapabilities.html)
 1. 
[FSDataOutputStreamBuilder class](fsdataoutputstreambuilder.html)
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/outputstream.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/outputstream.md
index 6f4969f2742e8..6ef03c491ee12 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/outputstream.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/outputstream.md
@@ -16,11 +16,11 @@
 
 # Output: `OutputStream`, `Syncable` and `StreamCapabilities`
 
+This document covers output streams within the context of the
+[Hadoop File System Specification](index.html).
 
-With the exception of `FileSystem.copyFromLocalFile()`,
-all API operations which write data to a filesystem in Hadoop do so
-through the Java "OutputStreams" API. More specifically, they do
-so through `OutputStream` subclasses obtained through calls to
+The core mechanism to write data to files through the Hadoop FileSystem APIs
+is through `OutputStream` subclasses obtained through calls to
 `FileSystem.create()`, `FileSystem.append()`,
 or `FSDataOutputStreamBuilder.build()`.
 
@@ -30,26 +30,20 @@ After a stream's `close()` method is called, all data written to the
 stream MUST BE persisted to the filesystem and visible to all other clients
 attempting to read data from that path via `FileSystem.open()`.
 
-As well as operations to write the data, Hadoop's Output Streams
+As well as operations to write the data, Hadoop's OutputStream
 implementations provide methods to flush buffered data back to the filesystem,
 so as to ensure that the data is reliably persisted and/or visible
 to other callers. This is done via the `Syncable` interface. It was
 originally intended that the presence of this interface could be interpreted
-as a guarantee that the stream supported it's methods, but this has proven
+as a guarantee that the stream supported its methods. However, this has proven
 impossible to guarantee as the static nature of the interface is incompatible
 with filesystems whose syncability semantics may vary on a store/path basis.
-As an example, erasure coded files in HDFS have
-A new interface, `StreamCapabilities` has been implemented to allow callers to probe the exact capabilities of a stream, even transitively
+As an example, erasure coded files in HDFS do not support the `Syncable` operations,
+even though they are implemented as a subclass of an output stream which is `Syncable`.
+A new interface, `StreamCapabilities`, has been implemented to allow callers
+to probe the exact capabilities of a stream, even transitively
 through a chain of streams.
 
-* HDFS's primary stream implementation is
-`org.apache.hadoop.hdfs.DFSOutputStream`.
-* The subclass `org.apache.hadoop.hdfs.DFSStripedOutputStream` supports erasure
-coding: it removes the `Syncable` behaviors from the base class.
-* The output streams `org.apache.hadoop.fs.FSOutputSummer` and
-`org.apache.hadoop.fs.ChecksumFileSystem.ChecksumFSOutputSummer`
-contain the underlying checksummed output stream used by
-both HDFS and the "file" filesystems.
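+
+A minimal sketch of such a probe (hypothetical client code: `fs` and `path`
+are assumed to be in scope; the capability name is one of the constants
+declared in `StreamCapabilities`):
+
+```java
+FSDataOutputStream out = fs.create(path);
+if (out.hasCapability(StreamCapabilities.HFLUSH)) {
+  out.hflush();  // the Syncable visibility guarantees hold
+} else {
+  out.flush();   // best-effort fallback with no such guarantees
+}
+```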
 ## Output Stream Model
 
 For this specification, an output stream can be viewed as a list of bytes
 stored in the client
 
 ```python
-buffer: List[byte]`
+buffer: List[byte]
 ```
 
 A flag, `open` tracks whether the stream is open: after the stream
@@ -69,17 +63,18 @@
 open: bool
 buffer: List[byte]
 ```
 
-The destination path of the stream, `path` can be tracked to form a triple
-`Path, open, buffer`
-
+The destination path of the stream, `path`, can be tracked to form a triple
+`path, open, buffer`
 
 ```python
-Stream = (path, open, buffer)
+Stream = (path: Path, open: Boolean, buffer: byte[])
 ```
 
+#### Visibility of flushed Data
+
 (Immediately) after `Syncable` operations which flush data to the filesystem,
-the data at the stream's destination path must match that of
-`buffer`. That is, the following condition holds:
+the data at the stream's destination path MUST match that of
+`buffer`. That is, the following condition MUST hold:
 
 ```python
 FS'.Files(path) == buffer
 ```
 
 Any client reading the data at the path will see the new data.
 
 The two sync operations, `hflush()` and `hsync()` differ in their
 durability guarantees, not visibility of data.
 
-
 ### State of Stream and filesystem after `Filesystem.create()`
 
-
-The output stream returned by a `FileSystem.create(path)` call contains no
-data:
+The output stream returned by a `FileSystem.create(path)` or
+`FileSystem.createFile(path).build()`
+can be modeled as a triple with an empty buffer:
 
 ```python
 Stream' = (path, true, [])
 ```
 
-The filesystem `FS'` must contain a 0-byte file at the path:
+The filesystem `FS'` MUST contain a 0-byte file at the path:
 
 ```python
 data(FS', path) == []
 ```
 
-Accordingly, the the initial state of `Stream'.buffer` is implicitly
+Thus, the initial state of `Stream'.buffer` is implicitly
 consistent with the data at the filesystem.
 
-*Object Stores*: the 0-byte empty file may not exist in the filesystem.
+*Object Stores*: see caveats in the "Object Stores" section below.
 
 ### State of Stream and filesystem after `Filesystem.append()`
 
 The output stream returned from a call of
 `FileSystem.append(path, buffersize, progress)`,
@@ -155,7 +149,6 @@
 the previous exception, but they MUST NOT retry the write.
 
 # Class `FSDataOutputStream`
 
-
 ```java
 public class FSDataOutputStream
   extends DataOutputStream
@@ -196,19 +189,16 @@
 Put differently:
 
 *It isn't enough for Output Streams to implement the core semantics
 of `java.io.OutputStream`: they need to implement the extra semantics
-of `HdfsDataOutputStream`. Failure to do so must be considered regressions*
+of `HdfsDataOutputStream`, especially for HBase to work correctly.*
 
 The concurrent `write()` call is the most significant tightening of
 the Java specification.
 
-
 ## Class `java.io.OutputStream`
 
-
 A Java `OutputStream` allows applications to write a sequence of bytes to a destination.
 In a Hadoop filesystem, that destination is the data under a path in the filesystem.
 
-
 ```java
 public abstract class OutputStream implements Closeable, Flushable {
   public abstract void write(int b) throws IOException;
   public void write(byte[] b) throws IOException;
   public void write(byte[] b, int off, int len) throws IOException;
   public void flush() throws IOException;
   public void close() throws IOException;
 }
 ```
-
 ### `write(Stream, data)`
 
 Writes a byte of data to the stream.
 
+It explicitly precludes any guarantees about durability. For that
 reason, this document doesn't provide any normative
 specifications of behaviour.
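+
+A sketch of the resulting caller-side pattern (hypothetical client code;
+whether this `flush()` makes the data durable or visible is entirely
+implementation-specific):
+
+```java
+OutputStream out = fs.create(path);
+out.write(data);
+out.flush();  // buffered data is pushed towards the filesystem...
+// ...but nothing is guaranteed until close(), or a successful
+// Syncable.hflush()/hsync() on a stream which supports them.
+```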
-
 #### Preconditions
 
-
 ```python
 Stream.open else raise IOException
 ```
 
@@ -313,7 +300,7 @@
 the data may be saved to the file system such that it becomes visible to
 others"
 
 ```python
-FS' = FS where data(FS, path) == buffer
+FS' = FS where data(FS', path) == buffer
 ```
 
 Some applications have been known to call `flush()` on a closed stream
@@ -342,7 +329,7 @@
 Any locking/leaseholding mechanism is also required to release its
 lock/lease.
 
 ```python
 Stream'.open = false
-FS' = FS where data(FS, path) == buffer
+FS' = FS where data(FS', path) == buffer
 ```
 
 The `close()` call MAY fail during its operation.
@@ -490,7 +477,7 @@
 FS' = FS where data(path) == cache
 
 
 After the call returns, the data MUST be visible to all new callers
-of `FileSystem.open`.
+of `FileSystem.open(path)` and `FileSystem.openFile(path).build()`.
 
 There is no requirement or guarantee that clients with an existing
 `DataInputStream` created by a call to `open(FS, path)` will see the updated
@@ -629,9 +616,8 @@
 However, `org.apache.hadoop.hdfs.DFSOutputStream` has a stronger thread safety
 model (possibly unintentionally). This fact is relied upon in Apache HBase, as
 discovered in HADOOP-11708. Implementations SHOULD be thread safe.
 *Note*: even the `DFSOutputStream` synchronization
-model permits the output stream to `close()`'d while awaiting an acknowledgement
-from datanode or namenode writes in an `hsync()` operation.
-
+model permits the output stream to have `close()` invoked while awaiting an
+acknowledgement from datanode or namenode writes in an `hsync()` operation.
 
 ## Consistency and Visibility
 
@@ -655,7 +641,6 @@
 getFileStatus(FS', path).getLen() = 0
 
 The metadata of a file (`length(FS, path)` in particular) SHOULD be
 consistent with the contents of the file after `flush()` and `sync()`.
-
 ```python
 (Stream', FS') = create(FS, path)
 (Stream'', FS'') = write(Stream', data)
@@ -664,7 +649,6 @@
 exists(FS''', path)
 getFileStatus(FS''', path).getLen() = len(data)
 ```
 
-
 HDFS does not do this except when the write crosses a block boundary; to do
 otherwise would overload the Namenode. As a result, while a file is being written
 `length(Filesystem, Path)` MAY be less than the length of `data(Filesystem, Path)`.
@@ -681,10 +665,9 @@
 including length and modification time.
 
 The metadata of the file returned in any of the FileSystem `list` operations
 MUST be consistent with this metadata.
 
 The value of `getFileStatus(path).getModificationTime()` is not defined
 while a stream is being written to.
- The timestamp MAY be updated while a file is being written,
+The timestamp MAY be updated while a file is being written,
 especially after a `Syncable.hsync()` call.
 The timestamps MUST be updated after the file is closed
 to that of a clock value observed by the server during the `close()` call.
@@ -698,15 +681,12 @@
 written file, then the last modification time SHOULD be a time `t` where
 
 ## Issues with the Hadoop Output Stream model.
 
-
 There are some known issues with the output stream model as offered by Hadoop,
 specifically about the guarantees about when data is written and persisted
 —and when the metadata is synchronized.
 These are where implementation aspects of HDFS and the "Local" filesystem
 do not follow the simple model of the filesystem used in this specification.
-
-
 ### HDFS
 
 That HDFS file metadata often lags the content of a file being written
@@ -737,7 +717,6 @@
 data, as well as HDFS.
 
 What is important to know is that, in HDFS
 `getFileStatus(FS, path).getLen()==0` does not imply that `data(FS, path)` is
 empty.
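+
+A sketch of this visibility gap (hypothetical client code; the length
+observed depends on whether the written data has crossed a block boundary):
+
+```java
+FSDataOutputStream out = fs.create(path);
+out.write(data);
+out.hflush();
+// New readers via open(path) will now see all of `data`, yet on HDFS
+// getFileStatus(path).getLen() may still report 0, or an earlier block
+// boundary, until the stream is closed.
+long reportedLen = fs.getFileStatus(path).getLen();
+```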
- ### Local Filesystem, `file:` `LocalFileSystem`, `file:`, (or any other `FileSystem` implementation based on @@ -773,7 +752,6 @@ to close the stream more than once. Behaviors 1 & 2 really have to be considered bugs to fix, albeit with care. - ### Object Stores Object store streams tend to buffer the entire stream's output @@ -829,7 +807,6 @@ For anyone wondering why the clients create a 0-byte file in the create call, it would cause problems after `close()` —the marker file could get returned in `open()` calls instead of the final data. - #### Visibility of the output of a stream after `close()` One guarantee which Object Stores SHOULD make is the same as those of POSIX @@ -853,5 +830,5 @@ to their advantage. #### Other issues The `Syncable` interfaces and methods are rarely implemented. Use -`StreamCapabilities` to determine their availability (Azure's -`PageBlobOutputStream` is the sole "syncable" object store output stream). +`StreamCapabilities` to determine their availability. + diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java index 29e77ba68ac36..2d42aade767b4 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java @@ -238,9 +238,9 @@ public void flush() throws IOException { dataBlock.flush(); } } catch (IOException ex) { - throw stateModel.enterErrorState(ex); + throw enterErrorState(ex); } finally { - stateModel.releaseLock(); + releaseLock(); } } } @@ -279,9 +279,9 @@ public void write(byte[] source, int offset, int len) try { innerWrite(source, offset, len); } catch (IOException ex) { - throw stateModel.enterErrorState(ex); + throw enterErrorState(ex); } finally { - stateModel.releaseLock(); + releaseLock(); } } @@ -297,6 +297,26 @@ private synchronized StreamStateModel.State acquireLock(final boolean checkOpen) return stateModel.getState(); } + /** + * Release the state lock through {@link StreamStateModel#releaseLock()}. + * Must only be called during the {@code finally} clause following + * an {@link #acquireLock(boolean)} call. + */ + private void releaseLock() { + stateModel.releaseLock(); + } + + /** + * Change the stream state to error, using the given exception if the stream + * was not already in an error state, via + * {@link StreamStateModel#enterErrorState(IOException)}. + * @param ex exception to record as the cause of the failure. + * @return the exception to throw. + */ + private IOException enterErrorState(IOException ex) { + return stateModel.enterErrorState(ex); + } + /** * The inner write. * This is called recursively until all the source data is written. 
@@ -387,7 +407,7 @@ public void close() throws IOException { } } } finally { - stateModel.releaseLock(); + releaseLock(); } S3ADataBlocks.DataBlock block = getActiveBlock(); boolean hasBlock = hasActiveBlock(); @@ -433,7 +453,7 @@ public void close() throws IOException { } LOG.debug("Upload complete to {} by {}", key, writeOperationHelper); } catch (IOException ioe) { - stateModel.enterErrorState(ioe); + enterErrorState(ioe); writeOperationHelper.writeFailed(ioe); throw ioe; } finally { diff --git a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/NativeAzureFileSystem.java b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/NativeAzureFileSystem.java index 93c54d386f716..f4f0e91da73ef 100644 --- a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/NativeAzureFileSystem.java +++ b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/NativeAzureFileSystem.java @@ -69,6 +69,7 @@ import org.apache.hadoop.fs.azure.security.Constants; import org.apache.hadoop.fs.azure.security.RemoteWasbDelegationTokenManager; import org.apache.hadoop.fs.azure.security.WasbDelegationTokenManager; +import org.apache.hadoop.fs.impl.StoreImplementationUtils; import org.apache.hadoop.fs.permission.FsAction; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.fs.permission.PermissionStatus; @@ -1051,10 +1052,7 @@ public void hsync() throws IOException { */ @Override // StreamCapability public boolean hasCapability(String capability) { - if (out instanceof StreamCapabilities) { - return ((StreamCapabilities) out).hasCapability(capability); - } - return false; + return StoreImplementationUtils.hasCapability(out, capability); } @Override diff --git a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/SyncableDataOutputStream.java b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/SyncableDataOutputStream.java index dcfff2fbe3784..0721576bb0687 100644 --- a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/SyncableDataOutputStream.java +++ b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azure/SyncableDataOutputStream.java @@ -25,6 +25,7 @@ import org.apache.hadoop.fs.StreamCapabilities; import org.apache.hadoop.fs.Syncable; import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.fs.impl.StoreImplementationUtils; /** * Support the Syncable interface on top of a DataOutputStream. 
@@ -51,10 +52,7 @@ public OutputStream getOutStream() {
 
   @Override
   public boolean hasCapability(String capability) {
-    if (out instanceof StreamCapabilities) {
-      return ((StreamCapabilities) out).hasCapability(capability);
-    }
-    return false;
+    return StoreImplementationUtils.hasCapability(out, capability);
   }
 
   @Override
 
From 75b2f01e29435d7fb0e2e9b4fc7ec8285dd60e75 Mon Sep 17 00:00:00 2001
From: Steve Loughran 
Date: Fri, 1 Nov 2019 14:49:21 +0000
Subject: [PATCH 3/3] HADOOP-13327 review of past comments

Change-Id: I92788063536a4bf15ee28f17187c7e09fa09940f
---
 .../hadoop/fs/impl/StreamStateModel.java      | 88 ++++++++++++++++---
 .../site/markdown/filesystem/filesystem.md    |  2 +-
 .../site/markdown/filesystem/outputstream.md  | 28 ++++--
 .../hadoop/fs/s3a/S3ABlockOutputStream.java   | 38 +++++---
 4 files changed, 124 insertions(+), 32 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StreamStateModel.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StreamStateModel.java
index 1504fbb4a0a3d..e20e07537faee 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StreamStateModel.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/StreamStateModel.java
@@ -28,6 +28,9 @@
 import com.google.common.base.Preconditions;
 
 import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FSExceptionMessages;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathIOException;
 
@@ -35,20 +38,52 @@
 /**
 * Models a stream's state and can be used for checking this state before
- * any operation.
- *
- * It's designed to ensure that a stream knows when it is closed,
+ * any operation, and for using a lock to manage exclusive access to operations
+ * within the stream.
+ *
+ * The model is designed to ensure that a stream knows when it is closed,
 * and, once it has entered a failed state, that the failure
 * is not forgotten.
 *
- * The model has three states: Open, Error, and Closed,
+ * This state model is based on the design of
+ * {@code org.apache.hadoop.fs.azurebfs.services.AbfsOutputStream} and
+ * {@code org.apache.hadoop.fs.azure.PageBlobOutputStream} which both
+ * store and rethrow any previously raised error.
+ *
+ *
+ * The model has three states: Open, Error, and Closed:
+ *
+ * <pre>
+ *   {@link State#Open}: caller can interact with the stream.
+ *   {@link State#Error}: all operations will raise the previously recorded exception.
+ *   {@link State#Closed}: operations will be rejected.
+ * </pre>
+ *
+ * When an instance of the model is created, it is in {@link State#Open};
+ * a call to {@link #enterClosedState()} will move it from Open to Closed.
+ *
+ * <p>
+ * The lock/unlock operation relays to {@code java.util.concurrent.locks.Lock}
+ * and has a similar usage pattern. A key difference is that a check for the
+ * stream being open can be integrated with the lock acquisition.
+ * If {@link #acquireLock(boolean)} is called with {@code checkOpen == true},
+ * then the caller will know that after the lock is granted, the stream
+ * is open.
 * <pre>
-   Open: caller can interact with the stream.
-   Error: all operations will raise the previously recorded exception.
-   Closed: operations will be rejected.
+   model.acquireLock(true);
+   try {
+     (do some work)
+   } catch (IOException e) {
+     model.enterErrorState(e);
+   } finally {
+     model.releaseLock();
+   }
  * </pre>
+ * */ +@InterfaceAudience.LimitedPrivate("Filesystems") +@InterfaceStability.Unstable public class StreamStateModel { /** @@ -126,7 +161,7 @@ public synchronized boolean enterClosedState() { /** * Change state to error and stores first error so it can be re-thrown. - * If already in error: return previous exception. + * If already in {@link State#Error}: return the previous exception. * @param ex the exception to record * @return the exception set when the error state was entered. */ @@ -149,8 +184,9 @@ public synchronized IOException enterErrorState(final IOException ex) { /** * Verify that a stream is open, throwing an IOException if * not. - * If in an error state: rethrow that exception. If closed, - * throw an exception about that. + * If in an error state: rethrow that exception. + * If closed, throw an IOException with + * {@link FSExceptionMessages#STREAM_IS_CLOSED} in the message. * @throws IOException if the stream is not in {@link State#Open}. * @throws PathIOException if the stream was not open and a path was given * in the constructor. @@ -172,11 +208,39 @@ public synchronized void checkOpen() throws IOException { } } + /** + * Probe for the model being in a specific state. + * @param probe state to probe for + * @return true if at the time of the check, the service + * was in the given state. + */ + public boolean isInState(State probe) { + return state.get().equals(probe); + } + + /** + * If the model is in {@link State#Error} throw the exception. + * @throws IOException if one was caught earlier. + */ + public synchronized void throwAnyExceptionRaised() throws IOException { + if (exception != null) { + throw exception; + } + } + + /** + * Get any exception. Non-null iff the model is in the error state. + * @return any exception set on a transition to the error state. + */ + public synchronized IOException getException() { + return exception; + } + /** * Acquire an exclusive lock. * @param checkOpen must the stream be open? - * @throws IOException if the stream is in error state or checkOpen==true - * and the stream is closed. + * @throws IOException if the stream is in error state or + * {@code checkOpen == true} and the stream is closed. */ public void acquireLock(boolean checkOpen) throws IOException { // fail fast if the stream is required to be open and it is not diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md index 9e9aa03c9eb74..50f227813435b 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md @@ -618,7 +618,7 @@ an exclusive lock on a file (whoever creates the file without an error is consid the holder of the lock) may not not a safe algorithm to use when working with object stores. * Object stores may create an empty file as a marker when a file is created. -However, object stores whith overwrite=true semantics may not implement this atomically, +However, object stores with overwrite=true semantics may not implement this atomically, so creating files with `overwrite==false` cannot be used as an implicit exclusion mechanism between processes. 
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/outputstream.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/outputstream.md index 6ef03c491ee12..d44baef8bb151 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/outputstream.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/outputstream.md @@ -16,9 +16,24 @@ # Output: `OutputStream`, `Syncable` and `StreamCapabilities` +## Introduction + This document covers the Output Streams within the context of the [Hadoop File System Specification](index.html). +It uses the filesystem model defined in [A Model of a Hadoop Filesystem](model.html) +with the notation defined in [notation](Notation.md). + +The target audiences are +1. Users of the APIs. While `java.io.OutputStream` is a standard interfaces, +this document clarifies how it is implemented in HDFS and elsewhere. +The Hadoop-specific interfaces `Syncable` and `StreamCapabilities` are new; +`Syncable` is notable in offering durability and visibility guarantees which +exceed that of `OutputStream`. +1. Implementors of File Systems and clients. + +## How data is written to a filesystem + The core mechanism to write data to files through the Hadoop FileSystem APIs is through `OutputStream` subclasses obtained through calls to `FileSystem.create()`, `FileSystem.append()`, @@ -30,6 +45,7 @@ After a stream's `close()` method is called, all data written to the stream MUST BE persisted to the fileysystem and visible to oll other clients attempting to read data from that path via `FileSystem.open()`. + As well as operations to write the data, Hadoop's OutputStream implementations provide methods to flush buffered data back to the filesystem, so as to ensure that the data is reliably persisted and/or visible @@ -70,7 +86,7 @@ The destination path of the stream, `path`, can be tracked to form a triple Stream = (path: Path, open: Boolean, buffer: byte[]) ``` -#### Visibility of flushed Data +#### Visibility of Flushed Data (Immediately) after `Syncable` operations which flush data to the filesystem, the data at the stream's destination path MUST match that of @@ -84,7 +100,7 @@ Any client reading the data at the path will see the new data. The two sync operations, `hflush()` and `hsync()` differ in their durability guarantees, not visibility of data. -### State of Stream and filesystem after `Filesystem.create()` +### State of Stream and Filesystem after `Filesystem.create()` The output stream returned by a `FileSystem.create(path)` or `FileSystem.createFile(path).build` @@ -106,7 +122,7 @@ consistent with the data at the filesystem. *Object Stores*: see caveats in the "Object Stores" section below. -### State of Stream and filesystem after `Filesystem.append()` +### State of Stream and Filesystem after `Filesystem.append()` The output stream returned from a call of `FileSystem.append(path, buffersize, progress)`, @@ -785,7 +801,7 @@ the write, or at some point in between. Expect in the special case that the object store supports an atomic PUT operation, the check for existence of existing data and the subsequent creation of data at the path contains a race condition: other clients may create data at the path between the existence check -and the subsequent qrite. +and the subsequent write. 1. 
Calls to `create(FS, Path, overwrite=false)` may succeed, returning a new
 `OutputStream`, even while another stream is open and writing to the destination
@@ -821,8 +837,8 @@ existence probes, then even after the stream's `close()` operation has
 returned, `getFileStatus(FS, path)` and `open(FS, path)` may fail with a
 `FileNotFoundException`.
 
 In their favour, the atomicity of the store's PUT operations do offer their
-own guarantee: an newly created object is either absent or all of its data
-is present: the act of instantiatng the object, while potentially exhibiting
+own guarantee: a newly created object is either absent or all of its data
+is present: the act of instantiating the object, while potentially exhibiting
 create inconsistency, is atomic. Applications may be able to use that fact
 to their advantage.
 
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java
index 2d42aade767b4..0ebb7f9e9c91d 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ABlockOutputStream.java
@@ -27,7 +27,6 @@
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.atomic.AtomicInteger;
 
-import com.amazonaws.AmazonClientException;
 import com.amazonaws.event.ProgressEvent;
 import com.amazonaws.event.ProgressEventType;
 import com.amazonaws.event.ProgressListener;
@@ -392,6 +391,7 @@ private void initMultipartUpload() throws IOException {
   */
  @Override
  public void close() throws IOException {
+    boolean alreadyClosed;
    acquireLock(false);
    try {
      synchronized (this) {
@@ -400,16 +400,30 @@ public void close() throws IOException {
        // because the whole close() method is not synchronized, calling it
        // on a stream which has just been closed isn't going to block it
        // for the duration of the entire upload.
-        if (!stateModel.enterClosedState()) {
-          // already closed
-          LOG.debug("Ignoring close() as stream is not open");
-          return;
-        }
+        alreadyClosed = !stateModel.enterClosedState();
      }
    } finally {
      releaseLock();
    }
+    if (alreadyClosed) {
+      // already closed or in error state.
+      if (stateModel.isInState(StreamStateModel.State.Error)) {
+        // error state.
+        if (multiPartUpload != null) {
+          // attempt to abort any ongoing MPU.
+          multiPartUpload.abort();
+        }
+        // Log the operation as failing and rethrow.
+        IOException ioe = stateModel.getException();
+        writeOperationHelper.writeFailed(ioe);
+        throw ioe;
+      } else {
+        LOG.debug("Ignoring close() as stream is not open");
+        return;
+      }
+    }
+
+    S3ADataBlocks.DataBlock block = getActiveBlock();
    boolean hasBlock = hasActiveBlock();
    LOG.debug("{}: Closing block #{}: current block= {}",
        this,
@@ -453,7 +467,7 @@ public void close() throws IOException {
    }
      LOG.debug("Upload complete to {} by {}", key, writeOperationHelper);
    } catch (IOException ioe) {
-      enterErrorState(ioe);
-      writeOperationHelper.writeFailed(ioe);
-      throw ioe;
+      final IOException ex = enterErrorState(ioe);
+      writeOperationHelper.writeFailed(ex);
+      throw ex;
    } finally {
      closeAll(LOG, block, blockFactory);
      LOG.debug("Statistics: {}", statistics);
@@ -734,9 +748,7 @@ private void complete(List<PartETag> partETags)
   * Abort a multi-part upload. Retries are attempted on failures.
   * IOExceptions are caught; this is expected to be run as a cleanup process.
*/ - public void abort() { - int retryCount = 0; - AmazonClientException lastException; + private void abort() { fs.incrementStatistic(OBJECT_MULTIPART_UPLOAD_ABORTED); try { writeOperationHelper.abortMultipartUpload(key, uploadId,