Skip to content

Commit 2e84e2d

Browse files
author
Sun Rui
committed
[SPARK-11210][SPARKR] Add window functions into SparkR [step 2].
1 parent 3cac661 commit 2e84e2d

File tree

4 files changed

+106
-0
lines changed

4 files changed

+106
-0
lines changed

R/pkg/NAMESPACE

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ exportMethods("%in%",
126126
"datediff",
127127
"dayofmonth",
128128
"dayofyear",
129+
"denseRank",
129130
"desc",
130131
"endsWith",
131132
"exp",
@@ -182,16 +183,19 @@ exportMethods("%in%",
182183
"next_day",
183184
"ntile",
184185
"otherwise",
186+
"percentRank",
185187
"pmod",
186188
"quarter",
187189
"rand",
188190
"randn",
191+
"rank",
189192
"regexp_extract",
190193
"regexp_replace",
191194
"reverse",
192195
"rint",
193196
"rlike",
194197
"round",
198+
"rowNumber",
195199
"rpad",
196200
"rtrim",
197201
"second",

R/pkg/R/functions.R

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2038,6 +2038,28 @@ setMethod("cumeDist",
20382038
column(jc)
20392039
})
20402040

2041+
#' denseRank
2042+
#'
2043+
#' Window function: returns the rank of rows within a window partition, without any gaps.
2044+
#' The difference between rank and denseRank is that denseRank leaves no gaps in the ranking
2045+
#' sequence when there are ties. That is, if you were ranking a competition using denseRank
2046+
#' and had three people tie for second place, you would say that all three were in second
2047+
#' place and that the next person came in third.
2048+
#'
2049+
#' This is equivalent to the DENSE_RANK function in SQL.
2050+
#'
2051+
#' @rdname denseRank
2052+
#' @name denseRank
2053+
#' @family window_funcs
2054+
#' @export
2055+
#' @examples \dontrun{denseRank()}
2056+
setMethod("denseRank",
2057+
signature(x = "missing"),
2058+
function() {
2059+
jc <- callJStatic("org.apache.spark.sql.functions", "denseRank")
2060+
column(jc)
2061+
})
2062+
20412063
#' lag
20422064
#'
20432065
#' Window function: returns the value that is `offset` rows before the current row, and
@@ -2111,3 +2133,66 @@ setMethod("ntile",
21112133
jc <- callJStatic("org.apache.spark.sql.functions", "ntile", as.integer(x))
21122134
column(jc)
21132135
})
2136+
2137+
#' percentRank
2138+
#'
2139+
#' Window function: returns the relative rank (i.e. percentile) of rows within a window partition.
2140+
#'
2141+
#' This is computed by:
2142+
#'
2143+
#' (rank of row in its partition - 1) / (number of rows in the partition - 1)
2144+
#'
2145+
#' This is equivalent to the PERCENT_RANK function in SQL.
2146+
#'
2147+
#' @rdname percentRank
2148+
#' @name percentRank
2149+
#' @family window_funcs
2150+
#' @export
2151+
#' @examples \dontrun{percentRank()}
2152+
setMethod("percentRank",
2153+
signature(x = "missing"),
2154+
function() {
2155+
jc <- callJStatic("org.apache.spark.sql.functions", "percentRank")
2156+
column(jc)
2157+
})
2158+
2159+
#' rank
2160+
#'
2161+
#' Window function: returns the rank of rows within a window partition.
2162+
#'
2163+
#' The difference between rank and denseRank is that denseRank leaves no gaps in the ranking
2164+
#' sequence when there are ties. That is, if you were ranking a competition using denseRank
2165+
#' and had three people tie for second place, you would say that all three were in second
2166+
#' place and that the next person came in third; with rank, that person would come in fifth.
2167+
#'
2168+
#' This is equivalent to the RANK function in SQL.
2169+
#'
2170+
#' @rdname rank
2171+
#' @name rank
2172+
#' @family window_funcs
2173+
#' @export
2174+
#' @examples \dontrun{rank()}
2175+
setMethod("rank",
2176+
signature(x = "missing"),
2177+
function() {
2178+
jc <- callJStatic("org.apache.spark.sql.functions", "rank")
2179+
column(jc)
2180+
})
2181+
2182+
#' rowNumber
2183+
#'
2184+
#' Window function: returns a sequential number starting at 1 within a window partition.
2185+
#'
2186+
#' This is equivalent to the ROW_NUMBER function in SQL.
2187+
#'
2188+
#' @rdname rowNumber
2189+
#' @name rowNumber
2190+
#' @family window_funcs
2191+
#' @export
2192+
#' @examples \dontrun{rowNumber()}
2193+
setMethod("rowNumber",
2194+
signature(x = "missing"),
2195+
function() {
2196+
jc <- callJStatic("org.apache.spark.sql.functions", "rowNumber")
2197+
column(jc)
2198+
})

R/pkg/R/generics.R

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,10 @@ setGeneric("dayofmonth", function(x) { standardGeneric("dayofmonth") })
742742
#' @export
743743
setGeneric("dayofyear", function(x) { standardGeneric("dayofyear") })
744744

745+
#' @rdname denseRank
746+
#' @export
747+
setGeneric("denseRank", function(x) { standardGeneric("denseRank") })
748+
745749
#' @rdname explode
746750
#' @export
747751
setGeneric("explode", function(x) { standardGeneric("explode") })
@@ -878,6 +882,10 @@ setGeneric("ntile", function(x) { standardGeneric("ntile") })
878882
#' @export
879883
setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") })
880884

885+
#' @rdname percentRank
886+
#' @export
887+
setGeneric("percentRank", function(x) { standardGeneric("percentRank") })
888+
881889
#' @rdname pmod
882890
#' @export
883891
setGeneric("pmod", function(y, x) { standardGeneric("pmod") })
@@ -894,6 +902,10 @@ setGeneric("rand", function(seed) { standardGeneric("rand") })
894902
#' @export
895903
setGeneric("randn", function(seed) { standardGeneric("randn") })
896904

905+
#' @rdname rank
906+
#' @export
907+
setGeneric("rank", function(x) { standardGeneric("rank") })
908+
897909
#' @rdname regexp_extract
898910
#' @export
899911
setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp_extract") })
@@ -911,6 +923,10 @@ setGeneric("reverse", function(x) { standardGeneric("reverse") })
911923
#' @export
912924
setGeneric("rint", function(x, ...) { standardGeneric("rint") })
913925

926+
#' @rdname rowNumber
927+
#' @export
928+
setGeneric("rowNumber", function(x) { standardGeneric("rowNumber") })
929+
914930
#' @rdname rpad
915931
#' @export
916932
setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") })

R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -831,6 +831,7 @@ test_that("column functions", {
831831
c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
832832
c12 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
833833
c13 <- cumeDist() + ntile(1)
834+
c14 <- denseRank() + percentRank() + rank() + rowNumber()
834835

835836
df <- jsonFile(sqlContext, jsonPath)
836837
df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20)))

0 commit comments

Comments
 (0)