Closed
72 changes: 67 additions & 5 deletions python/pyspark/sql/readwriter.py
@@ -20,6 +20,7 @@
if sys.version >= '3':
basestring = unicode = str

import logging
from py4j.java_gateway import JavaClass

from pyspark import RDD, since, keyword_only
@@ -370,7 +371,7 @@ def orc(self, path):

>>> df = spark.read.orc('python/test_support/sql/orc_partitioned')
>>> df.dtypes
[('a', 'bigint'), ('b', 'int'), ('c', 'int')]
[('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
"""
return self._df(self._jreader.orc(path))

@@ -501,6 +502,46 @@ def partitionBy(self, *cols):
self._jwrite = self._jwrite.partitionBy(_to_seq(self._spark._sc, cols))
return self

@since(2.1)
def bucketBy(self, numBuckets, *cols):
"""Buckets the output by the given columns on the file system.

:param numBuckets: the number of buckets to save
:param cols: name of columns

>>> (df.write.format('parquet')
... .bucketBy(100, 'year', 'month')
... .saveAsTable('bucketed_data'))
"""
if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
cols = cols[0]

col = cols[0]
cols = cols[1:]

self._jwrite = self._jwrite.bucketBy(numBuckets, col, _to_seq(self._spark._sc, cols))
return self

@since(2.1)
def sortBy(self, *cols):
"""Sorts the output in each bucket by the given columns on the file system.

:param cols: name of columns

>>> (df.write.format('parquet')
... .bucketBy(100, 'year', 'month')
... .sortBy('day')
... .saveAsTable('sorted_data'))
"""
if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
cols = cols[0]

col = cols[0]
cols = cols[1:]

self._jwrite = self._jwrite.sortBy(col, _to_seq(self._spark._sc, cols))
return self
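
A hedged usage sketch (not part of this diff) of how the new bucketBy and sortBy writers chain together; the table and column names are illustrative, following the doctests above, and the bucketed output is written with saveAsTable as in those doctests:

    # Sketch only: assumes a DataFrame `df` with 'year', 'month' and 'day' columns,
    # as in the doctests above; 'bucketed_sorted_data' is a hypothetical table name.
    (df.write
        .format('parquet')
        .bucketBy(100, 'year', 'month')  # hash rows into 100 buckets by (year, month)
        .sortBy('day')                   # sort rows within each bucket by 'day'
        .mode('overwrite')
        .saveAsTable('bucketed_sorted_data'))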

@since(1.4)
def save(self, path=None, format=None, mode=None, partitionBy=None, **options):
"""Saves the contents of the :class:`DataFrame` to a data source.
@@ -562,6 +603,8 @@ def saveAsTable(self, name, format=None, mode=None, partitionBy=None, **options):
:param mode: one of `append`, `overwrite`, `error`, `ignore` (default: error)
:param partitionBy: names of partitioning columns
:param options: all other string options

>>> df.write.saveAsTable('my_table')
"""
self.mode(mode).options(**options)
if partitionBy is not None:
@@ -693,8 +736,7 @@ def orc(self, path, mode=None, partitionBy=None, compression=None):
This will override ``orc.compress``. If None is set, it uses the
default value, ``snappy``.

>>> orc_df = spark.read.orc('python/test_support/sql/orc_partitioned')
>>> orc_df.write.orc(os.path.join(tempfile.mkdtemp(), 'data'))
>>> df.write.orc(os.path.join(tempfile.mkdtemp(), 'data'))
"""
self.mode(mode)
if partitionBy is not None:
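
A hedged sketch (not in the diff) of calling the ORC writer documented above with an explicit codec; the output directory and the 'zlib' choice are illustrative assumptions:

    # Sketch only: write the doctest DataFrame as ORC into a throwaway temp directory,
    # partitioned by 'year' and compressed with zlib instead of the default snappy.
    # Assumes os and tempfile are available, as in the doctest environment.
    out_dir = os.path.join(tempfile.mkdtemp(), 'orc_out')
    df.write.orc(out_dir, mode='overwrite', partitionBy='year', compression='zlib')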
@@ -734,11 +776,22 @@ def _test():
import os
import tempfile
import py4j
import shutil
from random import Random
from time import time
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, Row
import pyspark.sql.readwriter

os.chdir(os.environ["SPARK_HOME"])
spark_home = os.path.realpath(os.environ["SPARK_HOME"])

test_dir = tempfile.mkdtemp()
os.chdir(test_dir)

path = lambda x, y, z: os.path.join(x, y, z)

shutil.copytree(path(spark_home, 'python', 'test_support'),
path(test_dir, 'python', 'test_support'))

globs = pyspark.sql.readwriter.__dict__.copy()
sc = SparkContext('local[4]', 'PythonTest')
@@ -747,16 +800,25 @@ def _test():
except py4j.protocol.Py4JError:
spark = SparkSession(sc)

seed = int(time() * 1000)
Review thread on this line:

Contributor: It's better to have a deterministic test; testing with parquet should be enough.

Author: I have been really busy with work of late, but I will try to sort this out today.

Member: @GregBowyer Any progress on this? :)

Member: @GregBowyer ping. Let me propose to close this after a week.

HyukjinKwon (Member), Feb 27, 2017: cc @zero323, would you maybe be interested in taking this over? I was thinking of taking it over if no one goes for it, assuming it looks quite close to being merged.

Member: @HyukjinKwon By all means. I prepared a bunch of tests (7d911c647f21ada7fb429fd7c1c5f15934ff8847) and extended a bit the code provided by @GregBowyer (72c04a3f196da5223ebb44725aa88cffa81036e4). I think we can skip the low-level tests (direct access to the files), which are already present in the Scala test base.

Member: @zero323, Good to know. Then, please go ahead if you are ready :).

rng = Random(seed)

base_df_format = rng.choice(('orc', 'parquet'))
loader = getattr(spark.read, base_df_format)
path = os.path.join(test_dir, 'python/test_support/sql/%s_partitioned' % base_df_format)
df = loader(path)

globs['tempfile'] = tempfile
globs['os'] = os
globs['sc'] = sc
globs['spark'] = spark
globs['df'] = spark.read.parquet('python/test_support/sql/parquet_partitioned')
globs['df'] = df
(failure_count, test_count) = doctest.testmod(
pyspark.sql.readwriter, globs=globs,
optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
sc.stop()
if failure_count:
logging.error('Random seed for test: %d', seed)
exit(-1)
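
A hedged sketch of the deterministic alternative the first review comment asks for (always load the parquet fixture instead of picking a format at random); the variable names mirror the diff, but the snippet itself is illustrative:

    # Sketch only: fix the doctest DataFrame to the parquet fixture so runs are reproducible.
    df = spark.read.parquet(
        os.path.join(test_dir, 'python', 'test_support', 'sql', 'parquet_partitioned'))
    globs['df'] = df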


[Numerous binary test-support files changed; contents not shown.]
Empty file modified: python/test_support/sql/orc_partitioned/_SUCCESS (100755 → 100644)
Binary file removed: python/test_support/sql/parquet_partitioned/_metadata