Skip to content

Commit 8a272dd

Browse files
zero323 authored and Felix Cheung committed
[SPARK-20438][R] SparkR wrappers for split and repeat
## What changes were proposed in this pull request?

Add wrappers for `o.a.s.sql.functions`:

- `split` as `split_string`
- `repeat` as `repeat_string`

## How was this patch tested?

Existing tests, additional unit tests, `check-cran.sh`

Author: zero323 <[email protected]>

Closes #17729 from zero323/SPARK-20438.
1 parent 90264ac commit 8a272dd

File tree

4 files changed

+102
-0
lines changed

4 files changed

+102
-0
lines changed

R/pkg/NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,7 @@ exportMethods("%in%",
300300
"rank",
301301
"regexp_extract",
302302
"regexp_replace",
303+
"repeat_string",
303304
"reverse",
304305
"rint",
305306
"rlike",
@@ -323,6 +324,7 @@ exportMethods("%in%",
323324
"sort_array",
324325
"soundex",
325326
"spark_partition_id",
327+
"split_string",
326328
"stddev",
327329
"stddev_pop",
328330
"stddev_samp",

R/pkg/R/functions.R

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3745,3 +3745,61 @@ setMethod("collect_set",
37453745
jc <- callJStatic("org.apache.spark.sql.functions", "collect_set", x@jc)
37463746
column(jc)
37473747
})
3748+
3749+
#' split_string
#'
#' Splits a string column on a regular expression.
#'
#' Equivalent to the \code{split} SQL function.
#'
#' @param x Column to compute on
#' @param pattern Java regular expression
#'
#' @rdname split_string
#' @family string_funcs
#' @aliases split_string,Column-method
#' @export
#' @examples \dontrun{
#' df <- read.text("README.md")
#'
#' head(select(df, split_string(df$value, "\\s+")))
#'
#' # This is equivalent to the following SQL expression
#' head(selectExpr(df, "split(value, '\\\\s+')"))
#' }
#' @note split_string 2.3.0
setMethod("split_string",
          signature(x = "Column", pattern = "character"),
          function(x, pattern) {
            # Hand the column's JVM reference and the pattern to the static
            # `split` function, then wrap what comes back with column().
            split_jc <- callJStatic(
              "org.apache.spark.sql.functions", "split",
              x@jc, pattern
            )
            column(split_jc)
          })
3777+
3778+
#' repeat_string
#'
#' Repeats a string column n times.
#'
#' Equivalent to the \code{repeat} SQL function.
#'
#' @param x Column to compute on
#' @param n Number of repetitions
#'
#' @rdname repeat_string
#' @family string_funcs
#' @aliases repeat_string,Column-method
#' @export
#' @examples \dontrun{
#' df <- read.text("README.md")
#'
#' first(select(df, repeat_string(df$value, 3)))
#'
#' # This is equivalent to the following SQL expression
#' first(selectExpr(df, "repeat(value, 3)"))
#' }
#' @note repeat_string 2.3.0
setMethod("repeat_string",
          signature(x = "Column", n = "numeric"),
          function(x, n) {
            # `n` is dispatched as "numeric"; it is passed through numToInt()
            # before being handed to the static `repeat` function.
            times <- numToInt(n)
            repeat_jc <- callJStatic(
              "org.apache.spark.sql.functions", "repeat",
              x@jc, times
            )
            column(repeat_jc)
          })

R/pkg/R/generics.R

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1192,6 +1192,10 @@ setGeneric("regexp_extract", function(x, pattern, idx) { standardGeneric("regexp
11921192
setGeneric("regexp_replace",
11931193
function(x, pattern, replacement) { standardGeneric("regexp_replace") })
11941194

1195+
#' @rdname repeat_string
#' @export
setGeneric("repeat_string", function(x, n) {
  standardGeneric("repeat_string")
})
1198+
11951199
#' @rdname reverse
11961200
#' @export
11971201
setGeneric("reverse", function(x) { standardGeneric("reverse") })
@@ -1257,6 +1261,10 @@ setGeneric("skewness", function(x) { standardGeneric("skewness") })
12571261
#' @export
12581262
setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array") })
12591263

1264+
#' @rdname split_string
#' @export
setGeneric("split_string", function(x, pattern) {
  standardGeneric("split_string")
})
1267+
12601268
#' @rdname soundex
12611269
#' @export
12621270
setGeneric("soundex", function(x) { standardGeneric("soundex") })

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1546,6 +1546,40 @@ test_that("string operators", {
15461546
expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b")
15471547
expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d")
15481548
expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d")
1549+
1550+
l4 <- list(list(a = "a.b@c.d 1\\b"))
1551+
df4 <- createDataFrame(l4)
1552+
expect_equal(
1553+
collect(select(df4, split_string(df4$a, "\\s+")))[1, 1],
1554+
list(list("a.b@c.d", "1\\b"))
1555+
)
1556+
expect_equal(
1557+
collect(select(df4, split_string(df4$a, "\\.")))[1, 1],
1558+
list(list("a", "b@c", "d 1\\b"))
1559+
)
1560+
expect_equal(
1561+
collect(select(df4, split_string(df4$a, "@")))[1, 1],
1562+
list(list("a.b", "c.d 1\\b"))
1563+
)
1564+
expect_equal(
1565+
collect(select(df4, split_string(df4$a, "\\\\")))[1, 1],
1566+
list(list("a.b@c.d 1", "b"))
1567+
)
1568+
1569+
l5 <- list(list(a = "abc"))
1570+
df5 <- createDataFrame(l5)
1571+
expect_equal(
1572+
collect(select(df5, repeat_string(df5$a, 1L)))[1, 1],
1573+
"abc"
1574+
)
1575+
expect_equal(
1576+
collect(select(df5, repeat_string(df5$a, 3)))[1, 1],
1577+
"abcabcabc"
1578+
)
1579+
expect_equal(
1580+
collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
1581+
""
1582+
)
15491583
})
15501584

15511585
test_that("date functions on a DataFrame", {

0 commit comments

Comments
 (0)