
Commit 9f2d877

Address Burak's comments
1 parent 7d62de4 commit 9f2d877


5 files changed: 24 additions & 13 deletions


docs/structured-streaming-programming-guide.md

Lines changed: 1 addition & 1 deletion
@@ -374,7 +374,7 @@ The "Output" is defined as what gets written out to the external storage. The ou
 
   - *Append Mode* - Only the new rows appended in the Result Table since the last trigger will be written to the external storage. This is applicable only on the queries where existing rows in the Result Table are not expected to change.
 
-  - *Update Mode* - Only the rows that were updated in the Result Table since the last trigger will be written to the external storage (available since Spark 2.1.1). Note that this is different from the Complete Mode in that this mode only outputs the rows that have changed since the last trigger. If the query doesn't contain aggregations, it will be same as the Append mode.
+  - *Update Mode* - Only the rows that were updated in the Result Table since the last trigger will be written to the external storage (available since Spark 2.1.1). Note that this is different from the Complete Mode in that this mode only outputs the rows that have changed since the last trigger. If the query doesn't contain aggregations, it will be equivalent to Append mode.
 
 Note that each mode is applicable on certain types of queries. This is discussed in detail [later](#output-modes).
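
To make the modes above concrete, here is a minimal PySpark sketch of a streaming aggregation run in update mode. The socket source, host, and port are illustrative assumptions, not part of this commit; any streaming source works.

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("output-modes-sketch").getOrCreate()

    # Read a stream of lines; the socket source keeps the sketch small.
    lines = (spark.readStream.format("socket")
             .option("host", "localhost").option("port", 9999).load())

    # A streaming aggregation: running count per distinct input line.
    counts = lines.groupBy("value").count()

    # "update" writes only the rows whose counts changed since the last
    # trigger; with no aggregation in the query it would behave like "append".
    query = counts.writeStream.outputMode("update").format("console").start()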

python/pyspark/sql/streaming.py

Lines changed: 19 additions & 8 deletions
@@ -665,6 +665,9 @@ def outputMode(self, outputMode):
             the sink
         * `complete`: All the rows in the streaming DataFrame/Dataset will be written to the sink
             every time there are some updates
+        * `update`: only the rows that were updated in the streaming DataFrame/Dataset will be
+            written to the sink every time there are some updates. If the query doesn't contain
+            aggregations, it will be equivalent to the `append` mode.
 
         .. note:: Experimental.
@@ -768,7 +771,8 @@ def trigger(self, processingTime=None):
 
     @ignore_unicode_prefix
     @since(2.0)
-    def start(self, path=None, format=None, partitionBy=None, queryName=None, **options):
+    def start(self, path=None, format=None, outputMode=None, partitionBy=None, queryName=None,
+              **options):
         """Streams the contents of the :class:`DataFrame` to a data source.
 
         The data source is specified by the ``format`` and a set of ``options``.
@@ -779,15 +783,20 @@ def start(self, path=None, format=None, partitionBy=None, queryName=None, **opti
         :param path: the path in a Hadoop supported file system
         :param format: the format used to save
-
-        * ``append``: Append contents of this :class:`DataFrame` to existing data.
-        * ``overwrite``: Overwrite existing data.
-        * ``ignore``: Silently ignore this operation if data already exists.
-        * ``error`` (default case): Throw an exception if data already exists.
+        :param outputMode: specifies how data of a streaming DataFrame/Dataset is written to a
+            streaming sink. Options include:
+
+            * `append`: Only the new rows in the streaming DataFrame/Dataset will be written to
+                the sink
+            * `complete`: All the rows in the streaming DataFrame/Dataset will be written to the
+                sink every time there are some updates
+            * `update`: only the rows that were updated in the streaming DataFrame/Dataset will be
+                written to the sink every time there are some updates. If the query doesn't contain
+                aggregations, it will be equivalent to the `append` mode.
         :param partitionBy: names of partitioning columns
         :param queryName: unique name for the query
         :param options: All other string options. You may want to provide a `checkpointLocation`
-        for most streams, however it is not required for a `memory` stream.
+            for most streams, however it is not required for a `memory` stream.
 
         >>> sq = sdf.writeStream.format('memory').queryName('this_query').start()
         >>> sq.isActive
@@ -798,14 +807,16 @@ def start(self, path=None, format=None, partitionBy=None, queryName=None, **opti
         >>> sq.isActive
         False
         >>> sq = sdf.writeStream.trigger(processingTime='5 seconds').start(
-        ...     queryName='that_query', format='memory')
+        ...     queryName='that_query', outputMode="append", format='memory')
         >>> sq.name
         u'that_query'
         >>> sq.isActive
         True
         >>> sq.stop()
         """
         self.options(**options)
+        if outputMode is not None:
+            self.outputMode(outputMode)
         if partitionBy is not None:
             self.partitionBy(partitionBy)
         if format is not None:
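
The net effect of the Python change is that the output mode can now be passed straight to start(). A short usage sketch, assuming sdf is a streaming DataFrame as in the doctest above:

    sq = sdf.writeStream.trigger(processingTime='5 seconds').start(
        queryName='that_query', outputMode='append', format='memory')
    # Equivalent to calling .outputMode('append') on the writer before .start().
    sq.stop()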

sql/catalyst/src/main/java/org/apache/spark/sql/streaming/OutputMode.java

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ public static OutputMode Complete() {
   /**
    * OutputMode in which only the rows that were updated in the streaming DataFrame/Dataset will
    * be written to the sink every time there are some updates. If the query doesn't contain
-   * aggregations, it will be same as the `Append` mode.
+   * aggregations, it will be equivalent to the `Append` mode.
    *
    * @since 2.1.1
    */

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/streaming/InternalOutputModes.scala

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ private[sql] object InternalOutputModes {
   /**
    * OutputMode in which only the rows in the streaming DataFrame/Dataset that were updated will be
    * written to the sink every time there are some updates. If the query doesn't contain
-   * aggregations, it will be same as the `Append` mode.
+   * aggregations, it will be equivalent to the `Append` mode.
    */
   case object Update extends OutputMode
 }

sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) {
    *                          to the sink every time there are some updates
    * - `OutputMode.Update()`: only the rows that were updated in the streaming DataFrame/Dataset
    *                          will be written to the sink every time there are some updates. If
-   *                          the query doesn't contain aggregations, it will be same as the
+   *                          the query doesn't contain aggregations, it will be equivalent to the
    *                          `OutputMode.Append()` mode.
    *
    * @since 2.0.0
@@ -64,7 +64,7 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) {
    *               every time there are some updates
    * - `update`: only the rows that were updated in the streaming DataFrame/Dataset will
    *             be written to the sink every time there are some updates. If the query doesn't
-   *             contain aggregations, it will be same as the `append` mode.
+   *             contain aggregations, it will be equivalent to the `append` mode.
    * @since 2.0.0
    */
   def outputMode(outputMode: String): DataStreamWriter[T] = {
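
The sentence reworded throughout this commit makes a behavioral claim worth seeing once in code: with no aggregation, every trigger produces only brand-new result rows, so update mode has nothing to emit beyond what append would. A hedged PySpark sketch, reusing the illustrative lines DataFrame from the earlier sketch:

    # No aggregation in this query, so "update" and "append" are equivalent.
    projected = lines.select("value")
    query = projected.writeStream.outputMode("update").format("console").start()
    # Swapping outputMode("update") for outputMode("append") here would
    # produce identical console output, row for row.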
