Skip to content

Commit 47d9ef7

Browse files
committed
[SPARK-16931][PYTHON] PySpark APIS for bucketBy and sortBy
1 parent 39a2b2e commit 47d9ef7

File tree

1 file changed

+35
-0
lines changed

1 file changed

+35
-0
lines changed

python/pyspark/sql/readwriter.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,41 @@ def partitionBy(self, *cols):
500500
self._jwrite = self._jwrite.partitionBy(_to_seq(self._spark._sc, cols))
501501
return self
502502

503+
@since(2.0)
def bucketBy(self, numBuckets, *cols):
    """Buckets the output by the given columns on the file system.

    :param numBuckets: the number of buckets to save the output into
    :param cols: name of columns; either as varargs or as a single
        list/tuple of column names

    :raises TypeError: if ``numBuckets`` is not an int
    :raises ValueError: if no bucketing column is given

    >>> df.write.format('parquet').bucketBy(100, 'year', 'month').saveAsTable('bucketed_table')
    """
    # Fail fast with a clear message instead of forwarding a bad value
    # to the JVM side, where the error would be far less readable.
    if not isinstance(numBuckets, int):
        raise TypeError(
            "numBuckets should be an int, got %s" % type(numBuckets).__name__)

    # Accept a single list/tuple of names in place of varargs.
    if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
        cols = cols[0]

    # Without this guard, cols[0] below raises a bare IndexError.
    if not cols:
        raise ValueError("at least one bucketing column must be specified")

    # The underlying JVM API takes the first column separately from the rest.
    col = cols[0]
    cols = cols[1:]

    self._jwrite = self._jwrite.bucketBy(numBuckets, col, _to_seq(self._spark._sc, cols))
    return self
520+
521+
@since(2.0)
def sortBy(self, *cols):
    """Sorts the output in each bucket by the given columns on the file system.

    :param cols: name of columns; either as varargs or as a single
        list/tuple of column names

    :raises ValueError: if no sorting column is given

    >>> df.write.format('parquet').bucketBy(100, 'year', 'month').sortBy('day').saveAsTable('sorted_bucketed_table')
    """
    # Accept a single list/tuple of names in place of varargs.
    if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
        cols = cols[0]

    # Without this guard, cols[0] below raises a bare IndexError.
    if not cols:
        raise ValueError("at least one sorting column must be specified")

    # The underlying JVM API takes the first column separately from the rest.
    col = cols[0]
    cols = cols[1:]

    self._jwrite = self._jwrite.sortBy(col, _to_seq(self._spark._sc, cols))
    return self
537+
503538
@since(1.4)
504539
def save(self, path=None, format=None, mode=None, partitionBy=None, **options):
505540
"""Saves the contents of the :class:`DataFrame` to a data source.

0 commit comments

Comments
 (0)