Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ exportMethods("arrange",
"fillna",
"filter",
"first",
"freqItems",
"group_by",
"groupBy",
"head",
Expand Down
4 changes: 4 additions & 0 deletions R/pkg/R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") })
# @export
# S4 generic for crosstab(x, col1, col2); methods are dispatched through
# standardGeneric().
setGeneric(
  "crosstab",
  function(x, col1, col2) {
    standardGeneric("crosstab")
  }
)

# @rdname statfunctions
# @export
# S4 generic for freqItems(x, cols, support); `support` defaults to 0.01 and
# methods are dispatched through standardGeneric().
setGeneric(
  "freqItems",
  function(x, cols, support = 0.01) {
    standardGeneric("freqItems")
  }
)

# @rdname distinct
# @export
# S4 generic for distinct(x, numPartitions); `numPartitions` defaults to 1 and
# methods are dispatched through standardGeneric().
setGeneric(
  "distinct",
  function(x, numPartitions = 1) {
    standardGeneric("distinct")
  }
)
Expand Down
27 changes: 27 additions & 0 deletions R/pkg/R/stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,30 @@ setMethod("corr",
statFunctions <- callJMethod(x@sdf, "stat")
callJMethod(statFunctions, "corr", col1, col2, method)
})

#' freqItems
#'
#' Finds frequent items in the given columns; the result may contain false
#' positives. Uses the frequent element count algorithm described in
#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker,
#' and Papadimitriou.
#'
#' @param x A SparkSQL DataFrame.
#' @param cols A character vector of column names to search for frequent items in.
#' @param support (Optional) The minimum frequency for an item to be considered
#'                `frequent`. Should be greater than 1e-4. Default support = 0.01.
#' @return a local R data.frame with the frequent items in each column
#'
#' @rdname statfunctions
#' @name freqItems
#' @export
#' @examples
#' \dontrun{
#' df <- jsonFile(sqlContext, "/path/to/file.json")
#' fi <- freqItems(df, c("title", "gender"))
#' }
setMethod("freqItems", signature(x = "DataFrame", cols = "character"),
          function(x, cols, support = 0.01) {
            # Obtain the stat-functions handle from the backing Java DataFrame.
            jStat <- callJMethod(x@sdf, "stat")
            # The column names are handed to the JVM call as a list.
            colList <- as.list(cols)
            jResult <- callJMethod(jStat, "freqItems", colList, support)
            # Wrap the returned Java object as a DataFrame and collect it locally.
            resultDF <- dataFrame(jResult)
            collect(resultDF)
          })
21 changes: 21 additions & 0 deletions R/pkg/inst/tests/test_sparkSQL.R
Original file line number Diff line number Diff line change
Expand Up @@ -1341,6 +1341,27 @@ test_that("cov() and corr() on a DataFrame", {
expect_true(abs(result - 1.0) < 1e-12)
})

test_that("freqItems() on a DataFrame", {
  input <- 1:1000
  rdf <- data.frame(numbers = input, letters = as.character(input),
                    negDoubles = input * -1.0, stringsAsFactors = FALSE)
  # Make one value frequent in every column by overwriting each third row.
  # Assign column by column with correctly typed values: assigning the atomic
  # vector c(1, "1", -1) to the row slice would be recycled column-major and
  # coerce every column to character, so the integer and double columns would
  # never actually be tested.
  replaced <- input %% 3 == 0
  rdf$numbers[replaced] <- 1L
  rdf$letters[replaced] <- "1"
  rdf$negDoubles[replaced] <- -1.0
  df <- createDataFrame(sqlContext, rdf)

  # Several columns at once.
  multiColResults <- freqItems(df, c("numbers", "letters"), support = 0.1)
  expect_true(1 %in% multiColResults$numbers[[1]])
  expect_true("1" %in% multiColResults$letters[[1]])

  # A single column.
  singleColResult <- freqItems(df, "negDoubles", support = 0.1)
  expect_true(-1 %in% head(singleColResult$negDoubles)[[1]])

  # Rows built from R lists: even i contributes (1L, -1.0), odd i contributes
  # (i, -i), so 1 / -1 fill at least half of the rows.
  l <- lapply(0:99, function(i) {
    if (i %% 2 == 0) {
      list(1L, -1.0)
    } else {
      list(i, i * -1.0)
    }
  })
  df <- createDataFrame(sqlContext, l, c("a", "b"))
  result <- freqItems(df, c("a", "b"), 0.4)
  expect_identical(result[[1]], list(list(1L, 99L)))
  expect_identical(result[[2]], list(list(-1, -99)))
})

test_that("SQL error message is returned from JVM", {
retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e)
expect_equal(grepl("Table Not Found: blah", retError), TRUE)
Expand Down