Commit 02ce522

Use test utils and clean up the examples in doctests in table and toTable
1 parent 4b19f49 commit 02ce522
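
The "test utils" referred to in the commit message is a table context manager on Spark's SQL test mixin: it drops the named tables on exit, so the tests no longer need randint-suffixed table names to dodge collisions between runs. A minimal sketch of such a helper, assuming a self.spark session on the test class (the real implementation lives in Spark's SQLTestUtils and may differ in detail):

    from contextlib import contextmanager

    @contextmanager
    def table(self, *tables):
        """Run the enclosed block, then drop every named table."""
        try:
            yield
        finally:
            for t in tables:
                # DROP TABLE IF EXISTS keeps cleanup safe even when the
                # test failed before the table was ever created.
                self.spark.sql("DROP TABLE IF EXISTS %s" % t)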

File tree

2 files changed: +28 -40 lines changed

python/pyspark/sql/streaming.py

Lines changed: 9 additions & 19 deletions
@@ -974,9 +974,7 @@ def table(self, tableName):
 
         Examples
         --------
-        >>> csv_sdf = spark.readStream.table('input_table') # doctest: +SKIP
-        >>> csv_sdf.isStreaming # doctest: +SKIP
-        True
+        >>> spark.readStream.table('input_table') # doctest: +SKIP
         """
         if isinstance(tableName, str):
             return self._df(self._jreader.table(tableName))
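
The simplified doctest keeps the +SKIP directive because it presumes a live session and a pre-existing input_table. A runnable equivalent, with hypothetical setup added so the read has something to stream from, would look roughly like:

    # Hypothetical setup so that readStream.table has a source to read.
    spark.sql("CREATE TABLE input_table (value string) USING parquet")
    sdf = spark.readStream.table("input_table")
    assert sdf.isStreaming  # the property the removed doctest lines asserted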
@@ -1535,23 +1533,15 @@ def toTable(self, tableName, format=None, outputMode=None, partitionBy=None, que
 
         Examples
         --------
-        >>> sq = sdf.writeStream.format('parquet').queryName('this_query').option(
-        ...     'checkpointLocation', '/tmp/checkpoint').toTable('output_table') # doctest: +SKIP
-        >>> sq.isActive # doctest: +SKIP
-        True
-        >>> sq.name # doctest: +SKIP
-        'this_query'
-        >>> sq.stop() # doctest: +SKIP
-        >>> sq.isActive # doctest: +SKIP
-        False
-        >>> sq = sdf.writeStream.trigger(processingTime='5 seconds').toTable(
-        ...     'output_table', queryName='that_query', outputMode="append", format='parquet',
+        >>> sdf.writeStream.format('parquet').queryName('query').toTable('output_table')
+        ... # doctest: +SKIP
+
+        >>> sdf.writeStream.trigger(processingTime='5 seconds').toTable(
+        ...     'output_table',
+        ...     queryName='that_query',
+        ...     outputMode="append",
+        ...     format='parquet',
         ...     checkpointLocation='/tmp/checkpoint') # doctest: +SKIP
-        >>> sq.name # doctest: +SKIP
-        'that_query'
-        >>> sq.isActive # doctest: +SKIP
-        True
-        >>> sq.stop() # doctest: +SKIP
         """
         # TODO(SPARK-33659): document the current behavior for DataStreamWriter.toTable API
         self.options(**options)
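
Note that the trimmed doctest no longer walks through the StreamingQuery lifecycle. For reference, the round trip the removed lines exercised looks roughly like this (assuming sdf is a streaming DataFrame and /tmp/checkpoint is writable):

    sq = sdf.writeStream.format('parquet').queryName('this_query').option(
        'checkpointLocation', '/tmp/checkpoint').toTable('output_table')
    assert sq.isActive               # toTable() starts the query immediately
    assert sq.name == 'this_query'
    sq.stop()                        # stop the continuous execution
    assert not sq.isActive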

python/pyspark/sql/tests/test_streaming.py

Lines changed: 19 additions & 21 deletions
@@ -19,7 +19,6 @@
1919
import shutil
2020
import tempfile
2121
import time
22-
from random import randint
2322

2423
from pyspark.sql import Row
2524
from pyspark.sql.functions import lit
@@ -572,28 +571,27 @@ def collectBatch(df, id):
         q.stop()
 
     def test_streaming_read_from_table(self):
-        input_table_name = "sample_input_table_%d" % randint(0, 100000000)
-        self.spark.sql("CREATE TABLE %s (value string) USING parquet" % input_table_name)
-        self.spark.sql("INSERT INTO %s VALUES ('aaa'), ('bbb'), ('ccc')" % input_table_name)
-        df = self.spark.readStream.table(input_table_name)
-        self.assertTrue(df.isStreaming)
-        q = df.writeStream.format('memory').queryName('this_query').start()
-        q.processAllAvailable()
-        q.stop()
-        result = self.spark.sql("SELECT * FROM this_query ORDER BY value").collect()
-        self.assertEqual([Row(value='aaa'), Row(value='bbb'), Row(value='ccc')], result)
+        with self.table("input_table", "this_query"):
+            self.spark.sql("CREATE TABLE input_table (value string) USING parquet")
+            self.spark.sql("INSERT INTO input_table VALUES ('aaa'), ('bbb'), ('ccc')")
+            df = self.spark.readStream.table("input_table")
+            self.assertTrue(df.isStreaming)
+            q = df.writeStream.format('memory').queryName('this_query').start()
+            q.processAllAvailable()
+            q.stop()
+            result = self.spark.sql("SELECT * FROM this_query ORDER BY value").collect()
+            self.assertEqual(
+                set([Row(value='aaa'), Row(value='bbb'), Row(value='ccc')]), set(result))
 
     def test_streaming_write_to_table(self):
-        output_table_name = "sample_output_table_%d" % randint(0, 100000000)
-        tmpPath = tempfile.mkdtemp()
-        shutil.rmtree(tmpPath)
-        df = self.spark.readStream.format("rate").option("rowsPerSecond", 10).load()
-        q = df.writeStream.toTable(output_table_name, format='parquet', checkpointLocation=tmpPath)
-        self.assertTrue(q.isActive)
-        time.sleep(3)
-        q.stop()
-        result = self.spark.sql("SELECT value FROM %s" % output_table_name).collect()
-        self.assertTrue(len(result) > 0)
+        with self.table("output_table"), tempfile.TemporaryDirectory() as tmpdir:
+            df = self.spark.readStream.format("rate").option("rowsPerSecond", 10).load()
+            q = df.writeStream.toTable("output_table", format='parquet', checkpointLocation=tmpdir)
+            self.assertTrue(q.isActive)
+            time.sleep(3)
+            q.stop()
+            result = self.spark.sql("SELECT value FROM output_table").collect()
+            self.assertTrue(len(result) > 0)
 
 
 if __name__ == "__main__":
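
Beyond deterministic table names, the write test also swaps tempfile.mkdtemp() plus shutil.rmtree() for tempfile.TemporaryDirectory(). A sketch of the difference (in the old pattern the directory was deleted up front so the checkpoint path started out empty, and nothing ever cleaned up the checkpoint data the query wrote there):

    import shutil
    import tempfile

    # Old pattern: create, delete immediately, never clean up afterwards.
    tmpPath = tempfile.mkdtemp()
    shutil.rmtree(tmpPath)

    # New pattern: the directory exists for the whole block and is removed
    # automatically, even when an assertion fails inside it.
    with tempfile.TemporaryDirectory() as tmpdir:
        pass  # pass tmpdir as the checkpointLocation here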
