From 876f2503fb95084374f624928c04c315785a0a1a Mon Sep 17 00:00:00 2001
From: Rerngvit Yanggratoke <rerngvit@kth.se>
Date: Thu, 8 Oct 2015 19:06:16 +0200
Subject: [PATCH 1/4] [SPARK-10905] Rebase to master

---
 R/pkg/NAMESPACE                  |  1 +
 R/pkg/R/DataFrame.R              |  5 ++++-
 R/pkg/R/generics.R               |  4 ++++
 R/pkg/R/stats.R                  | 27 +++++++++++++++++++++++++++
 R/pkg/inst/tests/test_sparkSQL.R | 23 +++++++++++++++++++++++
 5 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 9aad35469bbb..255be2e76ff4 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -40,6 +40,7 @@ exportMethods("arrange",
               "fillna",
               "filter",
               "first",
+              "freqItems",
               "group_by",
               "groupBy",
               "head",
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 85db3a5ed370..5c576fd1c761 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1849,4 +1849,7 @@ setMethod("as.data.frame",
               stop(paste("Unused argument(s): ", paste(list(...), collapse=", ")))
             }
             collect(x)
-          })
+          }
+)
+
+
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index e9086fdbd18c..0b153e9e27e5 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -63,6 +63,10 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") })
 # @export
 setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
 
+# @rdname freqItems
+# @export
+setGeneric("freqItems", function(x, ..., support = 0.01) { standardGeneric("freqItems") })
+
 # @rdname distinct
 # @export
 setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })
diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R
index 06382d55d086..ba7010c171b2 100644
--- a/R/pkg/R/stats.R
+++ b/R/pkg/R/stats.R
@@ -100,3 +100,30 @@ setMethod("corr",
             statFunctions <- callJMethod(x@sdf, "stat")
             callJMethod(statFunctions, "corr", col1, col2, method)
           })
+
+#' freqItems
+#'
+#' Finds frequent items for columns, possibly with false positives.
+#' Uses the frequent element count algorithm described in
+#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou.
+#'
+#' @param x A SparkSQL DataFrame.
+#' @param cols A vector of column names in which to search for frequent items.
+#' @param support (Optional) The minimum frequency for an item to be considered `frequent`.
+#'                Should be greater than 1e-4. Default support = 0.01.
+#' @return a local R data.frame with the frequent items in each column
+#'
+#' @rdname freqItems
+#' @name freqItems
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- jsonFile(sqlCtx, "/path/to/file.json")
+#' fi = freqItems(df, c("title", "gender"))
+#' }
+setMethod("freqItems", signature(x = "DataFrame", cols = "character"),
+          function(x, cols, support) {
+            statFunctions <- callJMethod(x@sdf, "stat")
+            sct <- callJMethod(statFunctions, "freqItems", as.list(cols), support)
+            collect(dataFrame(sct))
+          })
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index bcf52b8fa788..5cb6417602cf 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -1340,6 +1340,29 @@ test_that("cov() and corr() on a DataFrame", {
   result <- corr(df, "singles", "doubles", "pearson")
   expect_true(abs(result - 1.0) < 1e-12)
 })
+  
+test_that("freqItems() on a DataFrame", {
+  input <- 1:1000
+  rdf <- data.frame(numbers = input, letters = as.character(input),
+                    negDoubles = input * -1.0, stringsAsFactors = F)
+  rdf[ input %% 3 == 0, ] <- c(1, "1", -1)
+  df <- createDataFrame(sqlContext, rdf)
+  multiColResults <- freqItems(df, c("numbers", "letters"), support=0.1)
+  expect_true(1 %in% multiColResults$numbers[[1]])
+  expect_true("1" %in% multiColResults$letters[[1]])
+  singleColResult <- freqItems(df, "negDoubles", support=0.1)
+  expect_true(-1 %in% head(singleColResult$negDoubles)[[1]])
+})
+
+test_that("freqItems2() on a DataFrame", {
+  l <- lapply(c(0:99), function(i) {
+    if (i %% 2 == 0) { list(1L, -1.0) }
+    else { list(i, i * -1.0) }})
+  df <- createDataFrame(sqlContext, l, c("a", "b"))
+  result <- freqItems(df, c("a", "b"), 0.4)
+  expect_identical(result[[1]], list(list(1L, 99L)))
+  expect_identical(result[[2]], list(list(-1, -99)))
+})
 
 test_that("SQL error message is returned from JVM", {
   retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e)

From 781a8b93d44d575b0025365912db35cb751605c8 Mon Sep 17 00:00:00 2001
From: Rerngvit Yanggratoke <rerngvit@kth.se>
Date: Sat, 3 Oct 2015 11:30:05 +0200
Subject: [PATCH 2/4] [SPARK-10905] Fix R code style issues

---
 R/pkg/R/generics.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 0b153e9e27e5..f52b56e05c1f 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -65,7 +65,7 @@ setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
 
 # @rdname freqItems
 # @export
-setGeneric("freqItems", function(x, ..., support = 0.01) { standardGeneric("freqItems") })
+setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") })
 
 # @rdname distinct
 # @export

From 1339e24e45eae6c09964d4e0808445e176c6feee Mon Sep 17 00:00:00 2001
From: Rerngvit Yanggratoke <rerngvit@kth.se>
Date: Thu, 8 Oct 2015 19:42:56 +0200
Subject: [PATCH 3/4] [SPARK-10905] Revised according to comments     - Move
 code to stats.R     - Revised @rdname document     - Document the x dataframe
     - Add additional testcase     - Convert input columns to a vector instead
 of R ellipsis

---
 R/pkg/R/DataFrame.R              | 5 +----
 R/pkg/R/generics.R               | 2 +-
 R/pkg/R/stats.R                  | 2 +-
 R/pkg/inst/tests/test_sparkSQL.R | 2 +-
 4 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 5c576fd1c761..85db3a5ed370 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1849,7 +1849,4 @@ setMethod("as.data.frame",
               stop(paste("Unused argument(s): ", paste(list(...), collapse=", ")))
             }
             collect(x)
-          }
-)
-
-
+          })
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index f52b56e05c1f..c4474131804b 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -63,7 +63,7 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") })
 # @export
 setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
 
-# @rdname freqItems
+# @rdname statfunctions
 # @export
 setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") })
 
diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R
index ba7010c171b2..756934bef27f 100644
--- a/R/pkg/R/stats.R
+++ b/R/pkg/R/stats.R
@@ -113,7 +113,7 @@ setMethod("corr",
 #'                Should be greater than 1e-4. Default support = 0.01.
 #' @return a local R data.frame with the frequent items in each column
 #'
-#' @rdname freqItems
+#' @rdname statfunctions
 #' @name freqItems
 #' @export
 #' @examples
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 5cb6417602cf..e59d3d7e8b0a 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -1340,7 +1340,7 @@ test_that("cov() and corr() on a DataFrame", {
   result <- corr(df, "singles", "doubles", "pearson")
   expect_true(abs(result - 1.0) < 1e-12)
 })
-  
+
 test_that("freqItems() on a DataFrame", {
   input <- 1:1000
   rdf <- data.frame(numbers = input, letters = as.character(input),

From 404c8b2e99397bd62c92cc817cdad4651aec8b42 Mon Sep 17 00:00:00 2001
From: Rerngvit Yanggratoke <rerngvit@kth.se>
Date: Fri, 9 Oct 2015 09:45:33 +0200
Subject: [PATCH 4/4] [SPARK-10905] revised according to comments

---
 R/pkg/R/stats.R                  | 4 ++--
 R/pkg/inst/tests/test_sparkSQL.R | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R
index 756934bef27f..4928cf4d4367 100644
--- a/R/pkg/R/stats.R
+++ b/R/pkg/R/stats.R
@@ -118,11 +118,11 @@ setMethod("corr",
 #' @export
 #' @examples
 #' \dontrun{
-#' df <- jsonFile(sqlCtx, "/path/to/file.json")
+#' df <- jsonFile(sqlContext, "/path/to/file.json")
 #' fi = freqItems(df, c("title", "gender"))
 #' }
 setMethod("freqItems", signature(x = "DataFrame", cols = "character"),
-          function(x, cols, support) {
+          function(x, cols, support = 0.01) {
             statFunctions <- callJMethod(x@sdf, "stat")
             sct <- callJMethod(statFunctions, "freqItems", as.list(cols), support)
             collect(dataFrame(sct))
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index e59d3d7e8b0a..f77c17bb950b 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -1352,9 +1352,7 @@ test_that("freqItems() on a DataFrame", {
   expect_true("1" %in% multiColResults$letters[[1]])
   singleColResult <- freqItems(df, "negDoubles", support=0.1)
   expect_true(-1 %in% head(singleColResult$negDoubles)[[1]])
-})
 
-test_that("freqItems2() on a DataFrame", {
   l <- lapply(c(0:99), function(i) {
     if (i %% 2 == 0) { list(1L, -1.0) }
     else { list(i, i * -1.0) }})