Skip to content

Commit 47d9ef7

Browse files
committed
[SPARK-16931][PYTHON] PySpark APIS for bucketBy and sortBy
1 parent 39a2b2e commit 47d9ef7

File tree

1 file changed

+35
-0
lines changed

1 file changed

+35
-0
lines changed

python/pyspark/sql/readwriter.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,41 @@ def partitionBy(self, *cols):
500500
self._jwrite = self._jwrite.partitionBy(_to_seq(self._spark._sc, cols))
501501
return self
502502

503+
@since(2.0)
def bucketBy(self, numBuckets, *cols):
    """Buckets the output by the given columns on the file system.

    :param numBuckets: the number of buckets to save the output into
    :param cols: name of columns; either as varargs or as a single
        list/tuple of column names

    :raises TypeError: if ``numBuckets`` is not an int
    :raises ValueError: if no bucketing column is given

    >>> df.write.format('parquet').bucketBy(100, 'year', 'month').saveAsTable('bucketed_table')
    """
    # Fail fast with a clear message instead of forwarding a bad value
    # to the JVM side, where the error would be far less readable.
    if not isinstance(numBuckets, int):
        raise TypeError(
            "numBuckets should be an int, got %s" % type(numBuckets).__name__)

    # Accept a single list/tuple of names in place of varargs.
    if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
        cols = cols[0]

    # Without this guard, cols[0] below raises a bare IndexError.
    if not cols:
        raise ValueError("at least one bucketing column must be specified")

    # The underlying JVM API takes the first column separately from the rest.
    col = cols[0]
    cols = cols[1:]

    self._jwrite = self._jwrite.bucketBy(numBuckets, col, _to_seq(self._spark._sc, cols))
    return self
520+
521+
@since(2.0)
def sortBy(self, *cols):
    """Sorts the output in each bucket by the given columns on the file system.

    :param cols: name of columns; either as varargs or as a single
        list/tuple of column names

    :raises ValueError: if no sorting column is given

    >>> df.write.format('parquet').bucketBy(100, 'year', 'month').sortBy('day').saveAsTable('sorted_bucketed_table')
    """
    # Accept a single list/tuple of names in place of varargs.
    if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
        cols = cols[0]

    # Without this guard, cols[0] below raises a bare IndexError.
    if not cols:
        raise ValueError("at least one sorting column must be specified")

    # The underlying JVM API takes the first column separately from the rest.
    col = cols[0]
    cols = cols[1:]

    self._jwrite = self._jwrite.sortBy(col, _to_seq(self._spark._sc, cols))
    return self
537+
503538
@since(1.4)
504539
def save(self, path=None, format=None, mode=None, partitionBy=None, **options):
505540
"""Saves the contents of the :class:`DataFrame` to a data source.

0 commit comments

Comments
 (0)