@@ -500,6 +500,41 @@ def partitionBy(self, *cols):
500500 self ._jwrite = self ._jwrite .partitionBy (_to_seq (self ._spark ._sc , cols ))
501501 return self
502502
503+ @since (2.0 )
504+ def bucketBy (self , numBuckets , * cols ):
505+ """Buckets the output by the given columns on the file system.
506+
507+ :param numBuckets: the number of buckets to save
508+ :param cols: name of columns
509+
510+ >>> df.write.format('parquet').bucketBy('year', 'month').saveAsTable('bucketed_table')
511+ """
512+ if len (cols ) == 1 and isinstance (cols [0 ], (list , tuple )):
513+ cols = cols [0 ]
514+
515+ col = cols [0 ]
516+ cols = cols [1 :]
517+
518+ self ._jwrite = self ._jwrite .bucketBy (numBuckets , col , _to_seq (self ._spark ._sc , cols ))
519+ return self
520+
521+ @since (2.0 )
522+ def sortBy (self , * cols ):
523+ """Sorts the output in each bucket by the given columns on the file system.
524+
525+ :param cols: name of columns
526+
527+ >>> df.write.format('parquet').bucketBy('year', 'month').sortBy('day').saveAsTable('sorted_bucketed_table')
528+ """
529+ if len (cols ) == 1 and isinstance (cols [0 ], (list , tuple )):
530+ cols = cols [0 ]
531+
532+ col = cols [0 ]
533+ cols = cols [1 :]
534+
535+ self ._jwrite = self ._jwrite .sortBy (col , _to_seq (self ._spark ._sc , cols ))
536+ return self
537+
503538 @since (1.4 )
504539 def save (self , path = None , format = None , mode = None , partitionBy = None , ** options ):
505540 """Saves the contents of the :class:`DataFrame` to a data source.
0 commit comments