
Commit d5a9ca3

Merge branch 'master' into SPARK-24319

2 parents b03e3de + 01452ea
182 files changed: +7720 −1195 lines changed


R/pkg/NAMESPACE

Lines changed: 4 additions & 0 deletions

@@ -201,10 +201,13 @@ exportMethods("%<=>%",
               "approxCountDistinct",
               "approxQuantile",
               "array_contains",
+              "array_join",
               "array_max",
               "array_min",
               "array_position",
+              "array_repeat",
               "array_sort",
+              "arrays_overlap",
               "asc",
               "ascii",
               "asin",
@@ -302,6 +305,7 @@ exportMethods("%<=>%",
               "lower",
               "lpad",
               "ltrim",
+              "map_entries",
               "map_keys",
               "map_values",
               "max",

R/pkg/R/DataFrame.R

Lines changed: 2 additions & 0 deletions

@@ -2297,6 +2297,8 @@ setMethod("rename",
 
 setClassUnion("characterOrColumn", c("character", "Column"))
 
+setClassUnion("numericOrColumn", c("numeric", "Column"))
+
 #' Arrange Rows by Variables
 #'
 #' Sort a SparkDataFrame by the specified column(s).
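The new numericOrColumn class union is what lets a single S4 method accept either a plain number or a Column object (used by array_repeat in functions.R below). A standalone sketch of the dispatch pattern — the Column class and describeCount generic here are bare stand-ins for illustration, not SparkR's real internals:

    library(methods)

    # Stand-in for SparkR's Column class, just to demonstrate dispatch.
    setClass("Column", representation(jc = "character"))
    setClassUnion("numericOrColumn", c("numeric", "Column"))

    setGeneric("describeCount", function(count) standardGeneric("describeCount"))
    setMethod("describeCount", signature(count = "numericOrColumn"), function(count) {
      if (is(count, "Column")) "got a Column" else paste("got the number", count)
    })

    describeCount(3)                        # "got the number 3"
    describeCount(new("Column", jc = "c1")) # "got a Column"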

R/pkg/R/functions.R

Lines changed: 79 additions & 8 deletions

@@ -189,6 +189,7 @@ NULL
 #' the map or array of maps.
 #' \item \code{from_json}: it is the column containing the JSON string.
 #' }
+#' @param y Column to compute on.
 #' @param value A value to compute on.
 #' \itemize{
 #' \item \code{array_contains}: a value to be checked if contained in the column.
@@ -207,7 +208,7 @@ NULL
 #' tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp))
 #' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1)))
 #' head(select(tmp, array_max(tmp$v1), array_min(tmp$v1)))
-#' head(select(tmp, array_position(tmp$v1, 21), array_sort(tmp$v1)))
+#' head(select(tmp, array_position(tmp$v1, 21), array_repeat(df$mpg, 3), array_sort(tmp$v1)))
 #' head(select(tmp, flatten(tmp$v1), reverse(tmp$v1)))
 #' tmp2 <- mutate(tmp, v2 = explode(tmp$v1))
 #' head(tmp2)
@@ -216,12 +217,13 @@ NULL
 #' head(select(tmp, sort_array(tmp$v1)))
 #' head(select(tmp, sort_array(tmp$v1, asc = FALSE)))
 #' tmp3 <- mutate(df, v3 = create_map(df$model, df$cyl))
-#' head(select(tmp3, map_keys(tmp3$v3)))
-#' head(select(tmp3, map_values(tmp3$v3)))
+#' head(select(tmp3, map_entries(tmp3$v3), map_keys(tmp3$v3), map_values(tmp3$v3)))
 #' head(select(tmp3, element_at(tmp3$v3, "Valiant")))
-#' tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$hp))
-#' head(select(tmp4, concat(tmp4$v4, tmp4$v5)))
-#' head(select(tmp, concat(df$mpg, df$cyl, df$hp)))}
+#' tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp))
+#' head(select(tmp4, concat(tmp4$v4, tmp4$v5), arrays_overlap(tmp4$v4, tmp4$v5)))
+#' head(select(tmp, concat(df$mpg, df$cyl, df$hp)))
+#' tmp5 <- mutate(df, v6 = create_array(df$model, df$model))
+#' head(select(tmp5, array_join(tmp5$v6, "#"), array_join(tmp5$v6, "#", "NULL")))}
 NULL
 
 #' Window functions for Column operations
@@ -3006,6 +3008,27 @@ setMethod("array_contains",
             column(jc)
           })
 
+#' @details
+#' \code{array_join}: Concatenates the elements of column using the delimiter.
+#' Null values are replaced with nullReplacement if set, otherwise they are ignored.
+#'
+#' @param delimiter a character string that is used to concatenate the elements of column.
+#' @param nullReplacement an optional character string that is used to replace the Null values.
+#' @rdname column_collection_functions
+#' @aliases array_join array_join,Column-method
+#' @note array_join since 2.4.0
+setMethod("array_join",
+          signature(x = "Column", delimiter = "character"),
+          function(x, delimiter, nullReplacement = NULL) {
+            jc <- if (is.null(nullReplacement)) {
+              callJStatic("org.apache.spark.sql.functions", "array_join", x@jc, delimiter)
+            } else {
+              callJStatic("org.apache.spark.sql.functions", "array_join", x@jc, delimiter,
+                          as.character(nullReplacement))
+            }
+            column(jc)
+          })
+
 #' @details
 #' \code{array_max}: Returns the maximum value of the array.
 #'
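A usage sketch mirroring the tests added in test_sparkSQL.R below — with no third argument nulls are dropped from the result; with nullReplacement they are substituted:

    df <- createDataFrame(list(list(list("Hello", NA, "World!"))))
    head(select(df, array_join(df[[1]], "#")))               # "Hello#World!"
    head(select(df, array_join(df[[1]], "#", "Beautiful")))  # "Hello#Beautiful#World!"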
@@ -3048,6 +3071,26 @@ setMethod("array_position",
             column(jc)
           })
 
+#' @details
+#' \code{array_repeat}: Creates an array containing \code{x} repeated the number of times
+#' given by \code{count}.
+#'
+#' @param count a Column or constant determining the number of repetitions.
+#' @rdname column_collection_functions
+#' @aliases array_repeat array_repeat,Column,numericOrColumn-method
+#' @note array_repeat since 2.4.0
+setMethod("array_repeat",
+          signature(x = "Column", count = "numericOrColumn"),
+          function(x, count) {
+            if (class(count) == "Column") {
+              count <- count@jc
+            } else {
+              count <- as.integer(count)
+            }
+            jc <- callJStatic("org.apache.spark.sql.functions", "array_repeat", x@jc, count)
+            column(jc)
+          })
+
 #' @details
 #' \code{array_sort}: Sorts the input array in ascending order. The elements of the input array
 #' must be orderable. NA elements will be placed at the end of the returned array.
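Thanks to the numericOrColumn union added in DataFrame.R, count can be either a literal or another column; a sketch mirroring the new tests:

    df <- createDataFrame(list(list("a", 3L), list("b", 2L)))
    head(select(df, array_repeat(df[[1]], df[[2]])))  # list("a","a","a"), list("b","b")
    head(select(df, array_repeat(df[[1]], 2L)))       # list("a","a"), list("b","b")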
@@ -3062,6 +3105,21 @@ setMethod("array_sort",
             column(jc)
           })
 
+#' @details
+#' \code{arrays_overlap}: Returns true if the input arrays have at least one non-null element in
+#' common. If not and both arrays are non-empty and any of them contains a null, it returns null.
+#' It returns false otherwise.
+#'
+#' @rdname column_collection_functions
+#' @aliases arrays_overlap arrays_overlap,Column-method
+#' @note arrays_overlap since 2.4.0
+setMethod("arrays_overlap",
+          signature(x = "Column", y = "Column"),
+          function(x, y) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "arrays_overlap", x@jc, y@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{flatten}: Creates a single array from an array of arrays.
 #' If a structure of nested arrays is deeper than two levels, only one level of nesting is removed.
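The three-valued result (true / false / null) is easiest to see in a sketch mirroring the tests added below:

    df <- createDataFrame(list(list(list(1L, 2L), list(3L, 1L)),   # share 1L      -> TRUE
                               list(list(1L, 2L), list(3L, 4L)),   # disjoint      -> FALSE
                               list(list(1L, NA), list(3L, 4L))))  # disjoint + NA -> NA
    head(select(df, arrays_overlap(df[[1]], df[[2]])))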
@@ -3076,6 +3134,19 @@ setMethod("flatten",
             column(jc)
           })
 
+#' @details
+#' \code{map_entries}: Returns an unordered array of all entries in the given map.
+#'
+#' @rdname column_collection_functions
+#' @aliases map_entries map_entries,Column-method
+#' @note map_entries since 2.4.0
+setMethod("map_entries",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "map_entries", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{map_keys}: Returns an unordered array containing the keys of the map.
 #'
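A sketch of the result shape, matching the new test below — each map entry comes back as a (key, value) struct:

    df <- createDataFrame(list(list(map = as.environment(list(x = 1, y = 2)))))
    head(select(df, map_entries(df$map)))  # array of structs: (key = "x", value = 1), (key = "y", value = 2)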
@@ -3149,8 +3220,8 @@ setMethod("size",
 #' (or starting from the end if start is negative) with the specified length.
 #'
 #' @rdname column_collection_functions
-#' @param start an index indicating the first element occuring in the result.
-#' @param length a number of consecutive elements choosen to the result.
+#' @param start an index indicating the first element occurring in the result.
+#' @param length a number of consecutive elements chosen to the result.
 #' @aliases slice slice,Column-method
 #' @note slice since 2.4.0
 setMethod("slice",

R/pkg/R/generics.R

Lines changed: 16 additions & 0 deletions

@@ -757,6 +757,10 @@ setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCountDistinct") })
 #' @name NULL
 setGeneric("array_contains", function(x, value) { standardGeneric("array_contains") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("array_join", function(x, delimiter, ...) { standardGeneric("array_join") })
+
 #' @rdname column_collection_functions
 #' @name NULL
 setGeneric("array_max", function(x) { standardGeneric("array_max") })
@@ -769,10 +773,18 @@ setGeneric("array_min", function(x) { standardGeneric("array_min") })
 #' @name NULL
 setGeneric("array_position", function(x, value) { standardGeneric("array_position") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("array_repeat", function(x, count) { standardGeneric("array_repeat") })
+
 #' @rdname column_collection_functions
 #' @name NULL
 setGeneric("array_sort", function(x) { standardGeneric("array_sort") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("arrays_overlap", function(x, y) { standardGeneric("arrays_overlap") })
+
 #' @rdname column_string_functions
 #' @name NULL
 setGeneric("ascii", function(x) { standardGeneric("ascii") })
@@ -1034,6 +1046,10 @@ setGeneric("lpad", function(x, len, pad) { standardGeneric("lpad") })
 #' @name NULL
 setGeneric("ltrim", function(x, trimString) { standardGeneric("ltrim") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("map_entries", function(x) { standardGeneric("map_entries") })
+
 #' @rdname column_collection_functions
 #' @name NULL
 setGeneric("map_keys", function(x) { standardGeneric("map_keys") })

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 36 additions & 1 deletion

@@ -1503,6 +1503,36 @@ test_that("column functions", {
   result <- collect(select(df2, reverse(df2[[1]])))[[1]]
   expect_equal(result, "cba")
 
+  # Test array_repeat()
+  df <- createDataFrame(list(list("a", 3L), list("b", 2L)))
+  result <- collect(select(df, array_repeat(df[[1]], df[[2]])))[[1]]
+  expect_equal(result, list(list("a", "a", "a"), list("b", "b")))
+
+  result <- collect(select(df, array_repeat(df[[1]], 2L)))[[1]]
+  expect_equal(result, list(list("a", "a"), list("b", "b")))
+
+  # Test arrays_overlap()
+  df <- createDataFrame(list(list(list(1L, 2L), list(3L, 1L)),
+                             list(list(1L, 2L), list(3L, 4L)),
+                             list(list(1L, NA), list(3L, 4L))))
+  result <- collect(select(df, arrays_overlap(df[[1]], df[[2]])))[[1]]
+  expect_equal(result, c(TRUE, FALSE, NA))
+
+  # Test array_join()
+  df <- createDataFrame(list(list(list("Hello", "World!"))))
+  result <- collect(select(df, array_join(df[[1]], "#")))[[1]]
+  expect_equal(result, "Hello#World!")
+  df2 <- createDataFrame(list(list(list("Hello", NA, "World!"))))
+  result <- collect(select(df2, array_join(df2[[1]], "#", "Beautiful")))[[1]]
+  expect_equal(result, "Hello#Beautiful#World!")
+  result <- collect(select(df2, array_join(df2[[1]], "#")))[[1]]
+  expect_equal(result, "Hello#World!")
+  df3 <- createDataFrame(list(list(list("Hello", NULL, "World!"))))
+  result <- collect(select(df3, array_join(df3[[1]], "#", "Beautiful")))[[1]]
+  expect_equal(result, "Hello#Beautiful#World!")
+  result <- collect(select(df3, array_join(df3[[1]], "#")))[[1]]
+  expect_equal(result, "Hello#World!")
+
   # Test array_sort() and sort_array()
   df <- createDataFrame(list(list(list(2L, 1L, 3L, NA)), list(list(NA, 6L, 5L, NA, 4L))))
 
@@ -1531,8 +1561,13 @@ test_that("column functions", {
   result <- collect(select(df, flatten(df[[1]])))[[1]]
   expect_equal(result, list(list(1L, 2L, 3L, 4L), list(5L, 6L, 7L, 8L)))
 
-  # Test map_keys(), map_values() and element_at()
+  # Test map_entries(), map_keys(), map_values() and element_at()
   df <- createDataFrame(list(list(map = as.environment(list(x = 1, y = 2)))))
+  result <- collect(select(df, map_entries(df$map)))[[1]]
+  expected_entries <- list(listToStruct(list(key = "x", value = 1)),
+                           listToStruct(list(key = "y", value = 2)))
+  expect_equal(result, list(expected_entries))
+
   result <- collect(select(df, map_keys(df$map)))[[1]]
   expect_equal(result, list(list("x", "y")))

README.md

Lines changed: 2 additions & 0 deletions

@@ -81,6 +81,8 @@ can be run using:
 Please see the guidance on how to
 [run tests for a module, or individual tests](http://spark.apache.org/developer-tools.html#individual-tests).
 
+There is also a Kubernetes integration test, see resource-managers/kubernetes/integration-tests/README.md
+
 ## A Note About Hadoop Versions
 
 Spark uses the Hadoop core library to talk to HDFS and other Hadoop-supported

bin/docker-image-tool.sh

Lines changed: 17 additions & 6 deletions

@@ -63,12 +63,20 @@ function build {
   if [ ! -d "$IMG_PATH" ]; then
     error "Cannot find docker image. This script must be run from a runnable distribution of Apache Spark."
   fi
-
-  local DOCKERFILE=${DOCKERFILE:-"$IMG_PATH/spark/Dockerfile"}
+  local BINDING_BUILD_ARGS=(
+    --build-arg
+    base_img=$(image_ref spark)
+  )
+  local BASEDOCKERFILE=${BASEDOCKERFILE:-"$IMG_PATH/spark/Dockerfile"}
+  local PYDOCKERFILE=${PYDOCKERFILE:-"$IMG_PATH/spark/bindings/python/Dockerfile"}
 
   docker build "${BUILD_ARGS[@]}" \
     -t $(image_ref spark) \
-    -f "$DOCKERFILE" .
+    -f "$BASEDOCKERFILE" .
+
+  docker build "${BINDING_BUILD_ARGS[@]}" \
+    -t $(image_ref spark-py) \
+    -f "$PYDOCKERFILE" .
 }
 
 function push {
@@ -86,7 +94,8 @@ Commands:
   push        Push a pre-built image to a registry. Requires a repository address to be provided.
 
 Options:
-  -f file     Dockerfile to build. By default builds the Dockerfile shipped with Spark.
+  -f file     Dockerfile to build for JVM based Jobs. By default builds the Dockerfile shipped with Spark.
+  -p file     Dockerfile with Python baked in. By default builds the Dockerfile shipped with Spark.
   -r repo     Repository address.
   -t tag      Tag to apply to the built image, or to identify the image to be pushed.
   -m          Use minikube's Docker daemon.
@@ -116,12 +125,14 @@ fi
 
 REPO=
 TAG=
-DOCKERFILE=
+BASEDOCKERFILE=
+PYDOCKERFILE=
 while getopts f:mr:t: option
 do
  case "${option}"
  in
- f) DOCKERFILE=${OPTARG};;
+ f) BASEDOCKERFILE=${OPTARG};;
+ p) PYDOCKERFILE=${OPTARG};;
  r) REPO=${OPTARG};;
  t) TAG=${OPTARG};;
  m)
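With this change the tool builds two images per run: the JVM base image, and a spark-py image derived from it via the base_img build argument. A hypothetical invocation from an unpacked Spark distribution (repository name and tag are illustrative, not from the commit):

    # Build both the base JVM image and the Python-enabled spark-py image,
    # then push them to the given repository.
    ./bin/docker-image-tool.sh -r docker.io/myrepo -t v2.4.0 build
    ./bin/docker-image-tool.sh -r docker.io/myrepo -t v2.4.0 push

    # Point -p at a custom Python Dockerfile instead of the bundled one.
    ./bin/docker-image-tool.sh -r docker.io/myrepo -t v2.4.0 -p my/python/Dockerfile build

Note that the last hunk above leaves the getopts spec as f:mr:t:, so as merged here -p would only be parsed once the option string also lists p: (e.g. f:p:mr:t:).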
