From 876f2503fb95084374f624928c04c315785a0a1a Mon Sep 17 00:00:00 2001 From: Rerngvit Yanggratoke Date: Thu, 8 Oct 2015 19:06:16 +0200 Subject: [PATCH 1/4] [SPARK-10905] Rebase to master --- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 5 ++++- R/pkg/R/generics.R | 4 ++++ R/pkg/R/stats.R | 27 +++++++++++++++++++++++++++ R/pkg/inst/tests/test_sparkSQL.R | 23 +++++++++++++++++++++++ 5 files changed, 59 insertions(+), 1 deletion(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 9aad35469bbb..255be2e76ff4 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -40,6 +40,7 @@ exportMethods("arrange", "fillna", "filter", "first", + "freqItems", "group_by", "groupBy", "head", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 85db3a5ed370..5c576fd1c761 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1849,4 +1849,7 @@ setMethod("as.data.frame", stop(paste("Unused argument(s): ", paste(list(...), collapse=", "))) } collect(x) - }) + } +) + + diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index e9086fdbd18c..0b153e9e27e5 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -63,6 +63,10 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") }) # @export setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") }) +# @rdname freqItems +# @export +setGeneric("freqItems", function(x, ..., support = 0.01) { standardGeneric("freqItems") }) + # @rdname distinct # @export setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") }) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 06382d55d086..ba7010c171b2 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -100,3 +100,30 @@ setMethod("corr", statFunctions <- callJMethod(x@sdf, "stat") callJMethod(statFunctions, "corr", col1, col2, method) }) + +#' freqItems +#' +#' Finding frequent items for columns, possibly with false positives. 
+#' Using the frequent element count algorithm described in +#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou. +#' +#' @param x A SparkSQL DataFrame. +#' @param cols A vector of column names to search frequent items in. +#' @param support (Optional) The minimum frequency for an item to be considered `frequent`. +#' Should be greater than 1e-4. Default support = 0.01. +#' @return a local R data.frame with the frequent items in each column +#' +#' @rdname freqItems +#' @name freqItems +#' @export +#' @examples +#' \dontrun{ +#' df <- jsonFile(sqlCtx, "/path/to/file.json") +#' fi = freqItems(df, c("title", "gender")) +#' } +setMethod("freqItems", signature(x = "DataFrame", cols = "character"), + function(x, cols, support) { + statFunctions <- callJMethod(x@sdf, "stat") + sct <- callJMethod(statFunctions, "freqItems", as.list(cols), support) + collect(dataFrame(sct)) + }) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index bcf52b8fa788..5cb6417602cf 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1340,6 +1340,29 @@ test_that("cov() and corr() on a DataFrame", { result <- corr(df, "singles", "doubles", "pearson") expect_true(abs(result - 1.0) < 1e-12) }) + +test_that("freqItems() on a DataFrame", { + input <- 1:1000 + rdf <- data.frame(numbers = input, letters = as.character(input), + negDoubles = input * -1.0, stringsAsFactors = F) + rdf[ input %% 3 == 0, ] <- c(1, "1", -1) + df <- createDataFrame(sqlContext, rdf) + multiColResults <- freqItems(df, c("numbers", "letters"), support=0.1) + expect_true(1 %in% multiColResults$numbers[[1]]) + expect_true("1" %in% multiColResults$letters[[1]]) + singleColResult <- freqItems(df, "negDoubles", support=0.1) + expect_true(-1 %in% head(singleColResult$negDoubles)[[1]]) +}) + +test_that("freqItems2() on a DataFrame", { + l <- lapply(c(0:99), function(i) { + if (i %% 2 == 0) { list(1L, -1.0) } + else { list(i, i * 
-1.0) }}) + df <- createDataFrame(sqlContext, l, c("a", "b")) + result <- freqItems(df, c("a", "b"), 0.4) + expect_identical(result[[1]], list(list(1L, 99L))) + expect_identical(result[[2]], list(list(-1, -99))) +}) test_that("SQL error message is returned from JVM", { retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e) From 781a8b93d44d575b0025365912db35cb751605c8 Mon Sep 17 00:00:00 2001 From: Rerngvit Yanggratoke Date: Sat, 3 Oct 2015 11:30:05 +0200 Subject: [PATCH 2/4] [SPARK-10905] Fix R code style issues --- R/pkg/R/generics.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 0b153e9e27e5..f52b56e05c1f 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -65,7 +65,7 @@ setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") }) # @rdname freqItems # @export -setGeneric("freqItems", function(x, ..., support = 0.01) { standardGeneric("freqItems") }) +setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") }) # @rdname distinct # @export From 1339e24e45eae6c09964d4e0808445e176c6feee Mon Sep 17 00:00:00 2001 From: Rerngvit Yanggratoke Date: Thu, 8 Oct 2015 19:42:56 +0200 Subject: [PATCH 3/4] [SPARK-10905] Revised according to comments - Move code to stats.R - Revised @rdname document - Document the x dataframe - Add additional testcase - Convert input columns to a vector instead of R ellipsis --- R/pkg/R/DataFrame.R | 5 +---- R/pkg/R/generics.R | 2 +- R/pkg/R/stats.R | 2 +- R/pkg/inst/tests/test_sparkSQL.R | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 5c576fd1c761..85db3a5ed370 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1849,7 +1849,4 @@ setMethod("as.data.frame", stop(paste("Unused argument(s): ", paste(list(...), collapse=", "))) } collect(x) - } -) - - + }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 
f52b56e05c1f..c4474131804b 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -63,7 +63,7 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") }) # @export setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") }) -# @rdname freqItems +# @rdname statfunctions # @export setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") }) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index ba7010c171b2..756934bef27f 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -113,7 +113,7 @@ setMethod("corr", #' Should be greater than 1e-4. Default support = 0.01. #' @return a local R data.frame with the frequent items in each column #' -#' @rdname freqItems +#' @rdname statfunctions #' @name freqItems #' @export #' @examples diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 5cb6417602cf..e59d3d7e8b0a 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1340,7 +1340,7 @@ test_that("cov() and corr() on a DataFrame", { result <- corr(df, "singles", "doubles", "pearson") expect_true(abs(result - 1.0) < 1e-12) }) - + test_that("freqItems() on a DataFrame", { input <- 1:1000 rdf <- data.frame(numbers = input, letters = as.character(input), From 404c8b2e99397bd62c92cc817cdad4651aec8b42 Mon Sep 17 00:00:00 2001 From: Rerngvit Yanggratoke Date: Fri, 9 Oct 2015 09:45:33 +0200 Subject: [PATCH 4/4] [SPARK-10905] revised according to comments --- R/pkg/R/stats.R | 4 ++-- R/pkg/inst/tests/test_sparkSQL.R | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 756934bef27f..4928cf4d4367 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -118,11 +118,11 @@ setMethod("corr", #' @export #' @examples #' \dontrun{ -#' df <- jsonFile(sqlCtx, "/path/to/file.json") +#' df <- jsonFile(sqlContext, "/path/to/file.json") #' fi = freqItems(df, c("title", "gender")) #' } setMethod("freqItems", 
signature(x = "DataFrame", cols = "character"), - function(x, cols, support) { + function(x, cols, support = 0.01) { statFunctions <- callJMethod(x@sdf, "stat") sct <- callJMethod(statFunctions, "freqItems", as.list(cols), support) collect(dataFrame(sct)) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index e59d3d7e8b0a..f77c17bb950b 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1352,9 +1352,7 @@ test_that("freqItems() on a DataFrame", { expect_true("1" %in% multiColResults$letters[[1]]) singleColResult <- freqItems(df, "negDoubles", support=0.1) expect_true(-1 %in% head(singleColResult$negDoubles)[[1]]) -}) -test_that("freqItems2() on a DataFrame", { l <- lapply(c(0:99), function(i) { if (i %% 2 == 0) { list(1L, -1.0) } else { list(i, i * -1.0) }})