From 2e84e2d7e04de1a86c3eb1a24944a3b5b12a97e1 Mon Sep 17 00:00:00 2001
From: Sun Rui <rui.sun@intel.com>
Date: Wed, 21 Oct 2015 17:05:17 +0800
Subject: [PATCH 1/2] [SPARK-11210][SPARKR] Add window functions into SparkR
 [step 2].

---
 R/pkg/NAMESPACE                  |  4 ++
 R/pkg/R/functions.R              | 85 ++++++++++++++++++++++++++++++++
 R/pkg/R/generics.R               | 16 ++++++
 R/pkg/inst/tests/test_sparkSQL.R |  1 +
 4 files changed, 106 insertions(+)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index b73bed312824..cd9537a2655f 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -126,6 +126,7 @@ exportMethods("%in%",
               "datediff",
               "dayofmonth",
               "dayofyear",
+              "denseRank",
               "desc",
               "endsWith",
               "exp",
@@ -182,16 +183,19 @@ exportMethods("%in%",
               "next_day",
               "ntile",
               "otherwise",
+              "percentRank",
               "pmod",
               "quarter",
               "rand",
               "randn",
+              "rank",
               "regexp_extract",
               "regexp_replace",
               "reverse",
               "rint",
               "rlike",
               "round",
+              "rowNumber",
               "rpad",
               "rtrim",
               "second",
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 366290fe6627..035da4e1da16 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2038,6 +2038,28 @@ setMethod("cumeDist",
             column(jc)
           })
 
+#' denseRank
+#' 
+#' Window function: returns the rank of rows within a window partition, without any gaps.
+#' The difference between rank and denseRank is that denseRank leaves no gaps in ranking
+#' sequence when there are ties. That is, if you were ranking a competition using denseRank
+#' and had three people tie for second place, you would say that all three were in second
+#' place and that the next person came in third.
+#' 
+#' This is equivalent to the DENSE_RANK function in SQL.
+#'
+#' @rdname denseRank
+#' @name denseRank
+#' @family window_funcs
+#' @export
+#' @examples \dontrun{denseRank()}
+setMethod("denseRank",
+          signature(x = "missing"),
+          function() {
+            jc <- callJStatic("org.apache.spark.sql.functions", "denseRank")
+            column(jc)
+          })
+
 #' lag
 #'
 #' Window function: returns the value that is `offset` rows before the current row, and
@@ -2111,3 +2133,66 @@ setMethod("ntile",
             jc <- callJStatic("org.apache.spark.sql.functions", "ntile", as.integer(x))
             column(jc)
           })
+
+#' percentRank
+#'
+#' Window function: returns the relative rank (i.e. percentile) of rows within a window partition.
+#' 
+#' This is computed by:
+#' 
+#'   (rank of row in its partition - 1) / (number of rows in the partition - 1)
+#'
+#' This is equivalent to the PERCENT_RANK function in SQL.
+#'
+#' @rdname percentRank
+#' @name percentRank
+#' @family window_funcs
+#' @export
+#' @examples \dontrun{percentRank()}
+setMethod("percentRank",
+          signature(x = "missing"),
+          function() {
+            jc <- callJStatic("org.apache.spark.sql.functions", "percentRank")
+            column(jc)
+          })
+
+#' rank
+#'
+#' Window function: returns the rank of rows within a window partition.
+#' 
+#' The difference between rank and denseRank is that denseRank leaves no gaps in ranking
+#' sequence when there are ties. That is, if you were ranking a competition using denseRank
+#' and had three people tie for second place, you would say that all three were in second
+#' place and that the next person came in third.
+#' 
+#' This is equivalent to the RANK function in SQL.
+#'
+#' @rdname rank
+#' @name rank
+#' @family window_funcs
+#' @export
+#' @examples \dontrun{rank()}
+setMethod("rank",
+          signature(x = "missing"),
+          function() {
+            jc <- callJStatic("org.apache.spark.sql.functions", "rank")
+            column(jc)
+          })
+
+#' rowNumber
+#'
+#' Window function: returns a sequential number starting at 1 within a window partition.
+#' 
+#' This is equivalent to the ROW_NUMBER function in SQL.
+#'
+#' @rdname rowNumber
+#' @name rowNumber
+#' @family window_funcs
+#' @export
+#' @examples \dontrun{rowNumber()}
+setMethod("rowNumber",
+          signature(x = "missing"),
+          function() {
+            jc <- callJStatic("org.apache.spark.sql.functions", "rowNumber")
+            column(jc)
+          })
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index c11c3c8d3e15..42b9769cda6f 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -742,6 +742,10 @@ setGeneric("dayofmonth", function(x) { standardGeneric("dayofmonth") })
 #' @export
 setGeneric("dayofyear", function(x) { standardGeneric("dayofyear") })
 
+#' @rdname denseRank
+#' @export
+setGeneric("denseRank", function(x) { standardGeneric("denseRank") })
+
 #' @rdname explode
 #' @export
 setGeneric("explode", function(x) { standardGeneric("explode") })
@@ -878,6 +882,10 @@ setGeneric("ntile", function(x) { standardGeneric("ntile") })
 #' @export
 setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") })
 
+#' @rdname percentRank
+#' @export
+setGeneric("percentRank", function(x) { standardGeneric("percentRank") })
+
 #' @rdname pmod
 #' @export
 setGeneric("pmod", function(y, x) { standardGeneric("pmod") })
@@ -894,6 +902,10 @@ setGeneric("rand", function(seed) { standardGeneric("rand") })
 #' @export
 setGeneric("randn", function(seed) { standardGeneric("randn") })
 
+#' @rdname rank
+#' @export
+setGeneric("rank", function(x) { standardGeneric("rank") })
+
 #' @rdname regexp_extract
 #' @export
 setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp_extract") })
@@ -911,6 +923,10 @@ setGeneric("reverse", function(x) { standardGeneric("reverse") })
 #' @export
 setGeneric("rint", function(x, ...) { standardGeneric("rint") })
 
+#' @rdname rowNumber
+#' @export
+setGeneric("rowNumber", function(x) { standardGeneric("rowNumber") })
+
 #' @rdname rpad
 #' @export
 setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") })
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index e1d4499925fe..92f9aba5d80a 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -831,6 +831,7 @@ test_that("column functions", {
   c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
   c12 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
   c13 <- cumeDist() + ntile(1)
+  c14 <- denseRank() + percentRank() + rank() + rowNumber()
 
   df <- jsonFile(sqlContext, jsonPath)
   df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20)))

From b52a98d75b340e0f8d290deae528057bb5d28738 Mon Sep 17 00:00:00 2001
From: Sun Rui <rui.sun@intel.com>
Date: Wed, 28 Oct 2015 19:32:06 +0800
Subject: [PATCH 2/2] Expose base::rank().

---
 R/pkg/R/functions.R              | 7 +++++++
 R/pkg/R/generics.R               | 2 +-
 R/pkg/inst/tests/test_sparkSQL.R | 4 ++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 035da4e1da16..d7fd27927913 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2179,6 +2179,13 @@ setMethod("rank",
             column(jc)
           })
 
+# Expose rank() in the R base package
+setMethod("rank",
+          signature(x = "ANY"),
+          function(x, ...) {
+            base::rank(x, ...)
+          })
+
 #' rowNumber
 #'
 #' Window function: returns a sequential number starting at 1 within a window partition.
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 42b9769cda6f..0b35340e48e4 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -904,7 +904,7 @@ setGeneric("randn", function(seed) { standardGeneric("randn") })
 
 #' @rdname rank
 #' @export
-setGeneric("rank", function(x) { standardGeneric("rank") })
+setGeneric("rank", function(x, ...) { standardGeneric("rank") })
 
 #' @rdname regexp_extract
 #' @export
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 92f9aba5d80a..b4a4d03b2643 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -833,6 +833,10 @@ test_that("column functions", {
   c13 <- cumeDist() + ntile(1)
   c14 <- denseRank() + percentRank() + rank() + rowNumber()
 
+  # Test if base::rank() is exposed
+  expect_equal(class(rank())[[1]], "Column")
+  expect_equal(rank(1:3), as.numeric(c(1:3)))
+
   df <- jsonFile(sqlContext, jsonPath)
   df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20)))
   expect_equal(collect(df2)[[2, 1]], TRUE)