Skip to content

Commit 59cdcc0

Browse files
Davies Liu authored and rxin committed
[SPARK-9978] [PYSPARK] [SQL] fix Window.orderBy and doc of ntile()
Author: Davies Liu <[email protected]> Closes #8213 from davies/fix_window. (cherry picked from commit 11ed2b1) Signed-off-by: Reynold Xin <[email protected]>
1 parent 130e06e commit 59cdcc0

File tree

3 files changed

+28
-4
lines changed

3 files changed

+28
-4
lines changed

python/pyspark/sql/functions.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -530,9 +530,10 @@ def lead(col, count=1, default=None):
530530
@since(1.4)
531531
def ntile(n):
532532
"""
533-
Window function: returns a group id from 1 to `n` (inclusive) in a round-robin fashion in
534-
a window partition. For example, if `n` is 3, the first row will get 1, the second row will
535-
get 2, the third row will get 3, and the fourth row will get 1...
533+
Window function: returns the ntile group id (from 1 to `n` inclusive)
534+
in an ordered window partition. For example, if `n` is 4, the first
535+
quarter of the rows will get value 1, the second quarter will get 2,
536+
the third quarter will get 3, and the last quarter will get 4.
536537
537538
This is equivalent to the NTILE function in SQL.
538539

python/pyspark/sql/tests.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1124,5 +1124,28 @@ def test_window_functions(self):
11241124
for r, ex in zip(rs, expected):
11251125
self.assertEqual(tuple(r), ex[:len(r)])
11261126

1127+
def test_window_functions_without_partitionBy(self):
1128+
df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
1129+
w = Window.orderBy("key", df.value)
1130+
from pyspark.sql import functions as F
1131+
sel = df.select(df.value, df.key,
1132+
F.max("key").over(w.rowsBetween(0, 1)),
1133+
F.min("key").over(w.rowsBetween(0, 1)),
1134+
F.count("key").over(w.rowsBetween(float('-inf'), float('inf'))),
1135+
F.rowNumber().over(w),
1136+
F.rank().over(w),
1137+
F.denseRank().over(w),
1138+
F.ntile(2).over(w))
1139+
rs = sorted(sel.collect())
1140+
expected = [
1141+
("1", 1, 1, 1, 4, 1, 1, 1, 1),
1142+
("2", 1, 1, 1, 4, 2, 2, 2, 1),
1143+
("2", 1, 2, 1, 4, 3, 2, 2, 2),
1144+
("2", 2, 2, 2, 4, 4, 4, 3, 2)
1145+
]
1146+
for r, ex in zip(rs, expected):
1147+
self.assertEqual(tuple(r), ex[:len(r)])
1148+
1149+
11271150
if __name__ == "__main__":
11281151
unittest.main()

python/pyspark/sql/window.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def orderBy(*cols):
6464
Creates a :class:`WindowSpec` with the ordering defined.
6565
"""
6666
sc = SparkContext._active_spark_context
67-
jspec = sc._jvm.org.apache.spark.sql.expressions.Window.partitionBy(_to_java_cols(cols))
67+
jspec = sc._jvm.org.apache.spark.sql.expressions.Window.orderBy(_to_java_cols(cols))
6868
return WindowSpec(jspec)
6969

7070

0 commit comments

Comments
 (0)