From 2e84e2d7e04de1a86c3eb1a24944a3b5b12a97e1 Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Wed, 21 Oct 2015 17:05:17 +0800 Subject: [PATCH 1/2] [SPARK-11210][SPARKR] Add window functions into SparkR [step 2]. --- R/pkg/NAMESPACE | 4 ++ R/pkg/R/functions.R | 85 ++++++++++++++++++++++++++++++++ R/pkg/R/generics.R | 16 ++++++ R/pkg/inst/tests/test_sparkSQL.R | 1 + 4 files changed, 106 insertions(+) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index b73bed312824..cd9537a2655f 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -126,6 +126,7 @@ exportMethods("%in%", "datediff", "dayofmonth", "dayofyear", + "denseRank", "desc", "endsWith", "exp", @@ -182,16 +183,19 @@ exportMethods("%in%", "next_day", "ntile", "otherwise", + "percentRank", "pmod", "quarter", "rand", "randn", + "rank", "regexp_extract", "regexp_replace", "reverse", "rint", "rlike", "round", + "rowNumber", "rpad", "rtrim", "second", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 366290fe6627..035da4e1da16 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2038,6 +2038,28 @@ setMethod("cumeDist", column(jc) }) +#' denseRank +#' +#' Window function: returns the rank of rows within a window partition, without any gaps. +#' The difference between rank and denseRank is that denseRank leaves no gaps in ranking +#' sequence when there are ties. That is, if you were ranking a competition using denseRank +#' and had three people tie for second place, you would say that all three were in second +#' place and that the next person came in third. +#' +#' This is equivalent to the DENSE_RANK function in SQL. +#' +#' @rdname denseRank +#' @name denseRank +#' @family window_funcs +#' @export +#' @examples \dontrun{denseRank()} +setMethod("denseRank", + signature(x = "missing"), + function() { + jc <- callJStatic("org.apache.spark.sql.functions", "denseRank") + column(jc) + }) + #' lag #' #' Window function: returns the value that is `offset` rows before the current row, and @@ -2111,3 +2133,66 @@ setMethod("ntile", jc <- callJStatic("org.apache.spark.sql.functions", "ntile", as.integer(x)) column(jc) }) + +#' percentRank +#' +#' Window function: returns the relative rank (i.e. percentile) of rows within a window partition. +#' +#' This is computed by: +#' +#' (rank of row in its partition - 1) / (number of rows in the partition - 1) +#' +#' This is equivalent to the PERCENT_RANK function in SQL. +#' +#' @rdname percentRank +#' @name percentRank +#' @family window_funcs +#' @export +#' @examples \dontrun{percentRank()} +setMethod("percentRank", + signature(x = "missing"), + function() { + jc <- callJStatic("org.apache.spark.sql.functions", "percentRank") + column(jc) + }) + +#' rank +#' +#' Window function: returns the rank of rows within a window partition. +#' +#' The difference between rank and denseRank is that denseRank leaves no gaps in ranking +#' sequence when there are ties. That is, if you were ranking a competition using denseRank +#' and had three people tie for second place, you would say that all three were in second +#' place and that the next person came in third. +#' +#' This is equivalent to the RANK function in SQL. +#' +#' @rdname rank +#' @name rank +#' @family window_funcs +#' @export +#' @examples \dontrun{rank()} +setMethod("rank", + signature(x = "missing"), + function() { + jc <- callJStatic("org.apache.spark.sql.functions", "rank") + column(jc) + }) + +#' rowNumber +#' +#' Window function: returns a sequential number starting at 1 within a window partition. +#' +#' This is equivalent to the ROW_NUMBER function in SQL. +#' +#' @rdname rowNumber +#' @name rowNumber +#' @family window_funcs +#' @export +#' @examples \dontrun{rowNumber()} +setMethod("rowNumber", + signature(x = "missing"), + function() { + jc <- callJStatic("org.apache.spark.sql.functions", "rowNumber") + column(jc) + }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index c11c3c8d3e15..42b9769cda6f 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -742,6 +742,10 @@ setGeneric("dayofmonth", function(x) { standardGeneric("dayofmonth") }) #' @export setGeneric("dayofyear", function(x) { standardGeneric("dayofyear") }) +#' @rdname denseRank +#' @export +setGeneric("denseRank", function(x) { standardGeneric("denseRank") }) + #' @rdname explode #' @export setGeneric("explode", function(x) { standardGeneric("explode") }) @@ -878,6 +882,10 @@ setGeneric("ntile", function(x) { standardGeneric("ntile") }) #' @export setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") }) +#' @rdname percentRank +#' @export +setGeneric("percentRank", function(x) { standardGeneric("percentRank") }) + #' @rdname pmod #' @export setGeneric("pmod", function(y, x) { standardGeneric("pmod") }) @@ -894,6 +902,10 @@ setGeneric("rand", function(seed) { standardGeneric("rand") }) #' @export setGeneric("randn", function(seed) { standardGeneric("randn") }) +#' @rdname rank +#' @export +setGeneric("rank", function(x) { standardGeneric("rank") }) + #' @rdname regexp_extract #' @export setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp_extract") }) @@ -911,6 +923,10 @@ setGeneric("reverse", function(x) { standardGeneric("reverse") }) #' @export setGeneric("rint", function(x, ...) { standardGeneric("rint") }) +#' @rdname rowNumber +#' @export +setGeneric("rowNumber", function(x) { standardGeneric("rowNumber") }) + #' @rdname rpad #' @export setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") }) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index e1d4499925fe..92f9aba5d80a 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -831,6 +831,7 @@ test_that("column functions", { c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c) c12 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1) c13 <- cumeDist() + ntile(1) + c14 <- denseRank() + percentRank() + rank() + rowNumber() df <- jsonFile(sqlContext, jsonPath) df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20))) From b52a98d75b340e0f8d290deae528057bb5d28738 Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Wed, 28 Oct 2015 19:32:06 +0800 Subject: [PATCH 2/2] Expose base::rank(). --- R/pkg/R/functions.R | 7 +++++++ R/pkg/R/generics.R | 2 +- R/pkg/inst/tests/test_sparkSQL.R | 4 ++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 035da4e1da16..d7fd27927913 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -2179,6 +2179,13 @@ setMethod("rank", column(jc) }) +# Expose rank() in the R base package +setMethod("rank", + signature(x = "ANY"), + function(x, ...) { + base::rank(x, ...) + }) + #' rowNumber #' #' Window function: returns a sequential number starting at 1 within a window partition. diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 42b9769cda6f..0b35340e48e4 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -904,7 +904,7 @@ setGeneric("randn", function(seed) { standardGeneric("randn") }) #' @rdname rank #' @export -setGeneric("rank", function(x) { standardGeneric("rank") }) +setGeneric("rank", function(x, ...) { standardGeneric("rank") }) #' @rdname regexp_extract #' @export diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 92f9aba5d80a..b4a4d03b2643 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -833,6 +833,10 @@ test_that("column functions", { c13 <- cumeDist() + ntile(1) c14 <- denseRank() + percentRank() + rank() + rowNumber() + # Test if base::rank() is exposed + expect_equal(class(rank())[[1]], "Column") + expect_equal(rank(1:3), as.numeric(c(1:3))) + df <- jsonFile(sqlContext, jsonPath) df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20))) expect_equal(collect(df2)[[2, 1]], TRUE)