Skip to content

Commit 70f44ad

Browse files
rerngvitshivaram
authored andcommitted
[SPARK-10905] [SPARKR] Export freqItems() for DataFrameStatFunctions
[SPARK-10905][SparkR]: Export freqItems() for DataFrameStatFunctions - Add function (together with roxygen2 doc) to DataFrame.R and generics.R - Expose the function in NAMESPACE - Add unit test for the function Author: Rerngvit Yanggratoke <[email protected]> Closes #8962 from rerngvit/SPARK-10905.
1 parent 5994cfe commit 70f44ad

File tree

4 files changed

+53
-0
lines changed

4 files changed

+53
-0
lines changed

R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ exportMethods("arrange",
4040
"fillna",
4141
"filter",
4242
"first",
43+
"freqItems",
4344
"group_by",
4445
"groupBy",
4546
"head",

R/pkg/R/generics.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") })
6363
# @export
6464
setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
6565

66+
# @rdname statfunctions
67+
# @export
68+
setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") })
69+
6670
# @rdname distinct
6771
# @export
6872
setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })

R/pkg/R/stats.R

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,3 +100,30 @@ setMethod("corr",
100100
statFunctions <- callJMethod(x@sdf, "stat")
101101
callJMethod(statFunctions, "corr", col1, col2, method)
102102
})
103+
104+
#' freqItems
105+
#'
106+
#' Finds frequent items for columns, possibly with false positives.
107+
#' Uses the frequent element count algorithm described in
108+
#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou.
109+
#'
110+
#' @param x A SparkSQL DataFrame.
111+
#' @param cols A character vector of column names to search for frequent items.
112+
#' @param support (Optional) The minimum frequency for an item to be considered `frequent`.
113+
#' Should be greater than 1e-4. Default support = 0.01.
114+
#' @return A local R data.frame with the frequent items in each column.
115+
#'
116+
#' @rdname statfunctions
117+
#' @name freqItems
118+
#' @export
119+
#' @examples
120+
#' \dontrun{
121+
#' df <- jsonFile(sqlContext, "/path/to/file.json")
122+
#' fi <- freqItems(df, c("title", "gender"))
123+
#' }
124+
setMethod("freqItems", signature(x = "DataFrame", cols = "character"),
125+
function(x, cols, support = 0.01) {
126+
statFunctions <- callJMethod(x@sdf, "stat")
127+
sct <- callJMethod(statFunctions, "freqItems", as.list(cols), support)
128+
collect(dataFrame(sct))
129+
})

R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1350,6 +1350,27 @@ test_that("cov() and corr() on a DataFrame", {
13501350
expect_true(abs(result - 1.0) < 1e-12)
13511351
})
13521352

1353+
test_that("freqItems() on a DataFrame", {
1354+
input <- 1:1000
1355+
rdf <- data.frame(numbers = input, letters = as.character(input),
1356+
negDoubles = input * -1.0, stringsAsFactors = FALSE)
1357+
rdf[ input %% 3 == 0, ] <- c(1, "1", -1)  # every third row gets the same values
1358+
df <- createDataFrame(sqlContext, rdf)
1359+
multiColResults <- freqItems(df, c("numbers", "letters"), support = 0.1)
1360+
expect_true(1 %in% multiColResults$numbers[[1]])
1361+
expect_true("1" %in% multiColResults$letters[[1]])
1362+
singleColResult <- freqItems(df, "negDoubles", support = 0.1)
1363+
expect_true(-1 %in% head(singleColResult$negDoubles)[[1]])
1364+
1365+
l <- lapply(0:99, function(i) {
1366+
if (i %% 2 == 0) { list(1L, -1.0) }
1367+
else { list(i, i * -1.0) }})
1368+
df <- createDataFrame(sqlContext, l, c("a", "b"))
1369+
result <- freqItems(df, c("a", "b"), 0.4)
1370+
expect_identical(result[[1]], list(list(1L, 99L)))
1371+
expect_identical(result[[2]], list(list(-1, -99)))
1372+
})
1373+
13531374
test_that("SQL error message is returned from JVM", {
13541375
retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e)
13551376
expect_equal(grepl("Table Not Found: blah", retError), TRUE)

0 commit comments

Comments
 (0)