Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions R/pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ exportMethods("%in%",
"rank",
"regexp_extract",
"regexp_replace",
"repeat_string",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good call on these names!

"reverse",
"rint",
"rlike",
Expand All @@ -323,6 +324,7 @@ exportMethods("%in%",
"sort_array",
"soundex",
"spark_partition_id",
"split_string",
"stddev",
"stddev_pop",
"stddev_samp",
Expand Down
58 changes: 58 additions & 0 deletions R/pkg/R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -3745,3 +3745,61 @@ setMethod("collect_set",
jc <- callJStatic("org.apache.spark.sql.functions", "collect_set", x@jc)
column(jc)
})

#' split_string
#'
#' Splits the string column \code{x} around matches of the regular expression
#' \code{pattern}.
#'
#' Equivalent to the \code{split} SQL function.
#'
#' @param x Column to compute on.
#' @param pattern Java regular expression to split on.
#'
#' @rdname split_string
#' @family string_funcs
#' @aliases split_string,Column-method
#' @export
#' @examples \dontrun{
#' df <- read.text("README.md")
#'
#' head(select(df, split_string(df$value, "\\s+")))
#'
#' # This is equivalent to the following SQL expression
#' head(selectExpr(df, "split(value, '\\\\s+')"))
#' }
#' @note split_string 2.3.0
setMethod(
  "split_string",
  signature(x = "Column", pattern = "character"),
  function(x, pattern) {
    # Delegate to the JVM-side functions.split and wrap the returned handle.
    column(
      callJStatic("org.apache.spark.sql.functions", "split", x@jc, pattern)
    )
  }
)

#' repeat_string
#'
#' Repeats the string column \code{x} \code{n} times.
#'
#' Equivalent to the \code{repeat} SQL function.
#'
#' @param x Column to compute on.
#' @param n Number of repetitions.
#'
#' @rdname repeat_string
#' @family string_funcs
#' @aliases repeat_string,Column-method
#' @export
#' @examples \dontrun{
#' df <- read.text("README.md")
#'
#' first(select(df, repeat_string(df$value, 3)))
#'
#' # This is equivalent to the following SQL expression
#' first(selectExpr(df, "repeat(value, 3)"))
#' }
#' @note repeat_string 2.3.0
setMethod(
  "repeat_string",
  signature(x = "Column", n = "numeric"),
  function(x, n) {
    # n is passed through numToInt before it is handed to the JVM call.
    times <- numToInt(n)
    column(
      callJStatic("org.apache.spark.sql.functions", "repeat", x@jc, times)
    )
  }
)
8 changes: 8 additions & 0 deletions R/pkg/R/generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -1192,6 +1192,10 @@ setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp
setGeneric("regexp_replace",
function(x, pattern, replacement) { standardGeneric("regexp_replace") })

#' @rdname repeat_string
#' @export
setGeneric(
  "repeat_string",
  function(x, n) {
    standardGeneric("repeat_string")
  }
)

#' @rdname reverse
#' @export
setGeneric("reverse", function(x) { standardGeneric("reverse") })
Expand Down Expand Up @@ -1257,6 +1261,10 @@ setGeneric("skewness", function(x) { standardGeneric("skewness") })
#' @export
setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array") })

#' @rdname split_string
#' @export
setGeneric(
  "split_string",
  function(x, pattern) {
    standardGeneric("split_string")
  }
)

#' @rdname soundex
#' @export
setGeneric("soundex", function(x) { standardGeneric("soundex") })
Expand Down
34 changes: 34 additions & 0 deletions R/pkg/inst/tests/testthat/test_sparkSQL.R
Original file line number Diff line number Diff line change
Expand Up @@ -1546,6 +1546,40 @@ test_that("string operators", {
expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b")
expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d")
expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d")

# split_string takes a Java regular expression, so the fixture mixes a dot,
# an "@", whitespace and a literal backslash to exercise each kind of pattern.
l4 <- list(list(a = "a.b@c.d 1\\b"))
df4 <- createDataFrame(l4)
# Split on a run of whitespace.
expect_equal(
  collect(select(df4, split_string(df4$a, "\\s+")))[1, 1],
  list(list("a.b@c.d", "1\\b"))
)
# Split on a literal dot (escaped, since "." is a regex metacharacter).
expect_equal(
  collect(select(df4, split_string(df4$a, "\\.")))[1, 1],
  list(list("a", "b@c", "d 1\\b"))
)
# Split on a plain character that needs no escaping.
expect_equal(
  collect(select(df4, split_string(df4$a, "@")))[1, 1],
  list(list("a.b", "c.d 1\\b"))
)
# Split on a literal backslash: "\\\\" in R source is the regex \\.
expect_equal(
  collect(select(df4, split_string(df4$a, "\\\\")))[1, 1],
  list(list("a.b@c.d 1", "b"))
)

# repeat_string: the repetition count is given once as an integer literal
# and once as a double literal.
l5 <- list(list(a = "abc"))
df5 <- createDataFrame(l5)
repeated_once <- collect(select(df5, repeat_string(df5$a, 1L)))
expect_equal(repeated_once[1, 1], "abc")
repeated_thrice <- collect(select(df5, repeat_string(df5$a, 3)))
expect_equal(repeated_thrice[1, 1], "abcabcabc")
expect_equal(
collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:) ahh, -1 works?!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right? I think we should keep it this way to avoid any confusion when users switch between SQL and the DSL. If anything changes, it will cause a test failure, and then we can add R-side checks.

""
)
})

test_that("date functions on a DataFrame", {
Expand Down