
Commit b27245e (merge of 2 parents: d468821 + a75571b)

Resolve conflicts

File tree: 235 files changed, +8832 / -1255 lines


R/pkg/NAMESPACE

Lines changed: 4 additions & 0 deletions
@@ -201,13 +201,16 @@ exportMethods("%<=>%",
               "approxCountDistinct",
               "approxQuantile",
               "array_contains",
+              "array_distinct",
               "array_join",
               "array_max",
               "array_min",
               "array_position",
+              "array_remove",
               "array_repeat",
               "array_sort",
               "arrays_overlap",
+              "arrays_zip",
               "asc",
               "ascii",
               "asin",
@@ -306,6 +309,7 @@ exportMethods("%<=>%",
               "lpad",
               "ltrim",
               "map_entries",
+              "map_from_arrays",
               "map_keys",
               "map_values",
               "max",

R/pkg/R/client.R

Lines changed: 15 additions & 9 deletions
@@ -71,15 +71,20 @@ checkJavaVersion <- function() {
 
   # If java is missing from PATH, we get an error in Unix and a warning in Windows
   javaVersionOut <- tryCatch(
-    launchScript(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE),
-    error = function(e) {
-      stop("Java version check failed. Please make sure Java is installed",
-           " and set JAVA_HOME to point to the installation directory.", e)
-    },
-    warning = function(w) {
-      stop("Java version check failed. Please make sure Java is installed",
-           " and set JAVA_HOME to point to the installation directory.", w)
-    })
+    if (is_windows()) {
+      # See SPARK-24535
+      system2(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE)
+    } else {
+      launchScript(javaBin, "-version", wait = TRUE, stdout = TRUE, stderr = TRUE)
+    },
+    error = function(e) {
+      stop("Java version check failed. Please make sure Java is installed",
+           " and set JAVA_HOME to point to the installation directory.", e)
+    },
+    warning = function(w) {
+      stop("Java version check failed. Please make sure Java is installed",
+           " and set JAVA_HOME to point to the installation directory.", w)
+    })
   javaVersionFilter <- Filter(
     function(x) {
       grepl(" version", x)
@@ -93,6 +98,7 @@ checkJavaVersion <- function() {
     stop(paste("Java version", sparkJavaVersion, "is required for this package; found version:",
                javaVersionStr))
   }
+  return(javaVersionNum)
 }
 
 launchBackend <- function(args, sparkHome, jars, sparkSubmitOpts, packages) {
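A note on the shape of this change: in R, an `if`/`else` is itself an expression, so the whole platform branch can serve as the expression argument to `tryCatch()`. A minimal standalone sketch of that pattern (not the Spark code itself; plain `system2()` stands in for both branches, whereas the real code calls the internal `launchScript()` on non-Windows):

```r
# Minimal sketch: an if/else expression as the first argument to tryCatch(),
# mirroring the structure of checkJavaVersion() above.
java_version_out <- function(cmd = "java") {
  tryCatch(
    if (identical(.Platform$OS.type, "windows")) {
      system2(cmd, "-version", stdout = TRUE, stderr = TRUE)  # cf. SPARK-24535
    } else {
      system2(cmd, "-version", stdout = TRUE, stderr = TRUE)
    },
    error = function(e) stop("version check failed: ", conditionMessage(e)),
    warning = function(w) stop("version check failed: ", conditionMessage(w))
  )
}
```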

R/pkg/R/functions.R

Lines changed: 66 additions & 4 deletions
@@ -194,10 +194,12 @@ NULL
 #' \itemize{
 #' \item \code{array_contains}: a value to be checked if contained in the column.
 #' \item \code{array_position}: a value to locate in the given array.
+#' \item \code{array_remove}: a value to remove in the given array.
 #' }
 #' @param ... additional argument(s). In \code{to_json} and \code{from_json}, this contains
 #'            additional named properties to control how it is converted, accepts the same
-#'            options as the JSON data source.
+#'            options as the JSON data source. In \code{arrays_zip}, this contains additional
+#'            Columns of arrays to be merged.
 #' @name column_collection_functions
 #' @rdname column_collection_functions
 #' @family collection functions
@@ -207,9 +209,9 @@ NULL
 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
 #' tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp))
 #' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1)))
-#' head(select(tmp, array_max(tmp$v1), array_min(tmp$v1)))
+#' head(select(tmp, array_max(tmp$v1), array_min(tmp$v1), array_distinct(tmp$v1)))
 #' head(select(tmp, array_position(tmp$v1, 21), array_repeat(df$mpg, 3), array_sort(tmp$v1)))
-#' head(select(tmp, flatten(tmp$v1), reverse(tmp$v1)))
+#' head(select(tmp, flatten(tmp$v1), reverse(tmp$v1), array_remove(tmp$v1, 21)))
 #' tmp2 <- mutate(tmp, v2 = explode(tmp$v1))
 #' head(tmp2)
 #' head(select(tmp, posexplode(tmp$v1)))
@@ -221,6 +223,7 @@ NULL
 #' head(select(tmp3, element_at(tmp3$v3, "Valiant")))
 #' tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp))
 #' head(select(tmp4, concat(tmp4$v4, tmp4$v5), arrays_overlap(tmp4$v4, tmp4$v5)))
+#' head(select(tmp4, arrays_zip(tmp4$v4, tmp4$v5), map_from_arrays(tmp4$v4, tmp4$v5)))
 #' head(select(tmp, concat(df$mpg, df$cyl, df$hp)))
 #' tmp5 <- mutate(df, v6 = create_array(df$model, df$model))
 #' head(select(tmp5, array_join(tmp5$v6, "#"), array_join(tmp5$v6, "#", "NULL")))}
@@ -1978,7 +1981,7 @@ setMethod("levenshtein", signature(y = "Column"),
           })
 
 #' @details
-#' \code{months_between}: Returns number of months between dates \code{y} and \code{x}.
+#' \code{months_between}: Returns number of months between dates \code{y} and \code{x}.
 #' If \code{y} is later than \code{x}, then the result is positive. If \code{y} and \code{x}
 #' are on the same day of month, or both are the last day of month, time of day will be ignored.
 #' Otherwise, the difference is calculated based on 31 days per month, and rounded to 8 digits.
@@ -3008,6 +3011,19 @@ setMethod("array_contains",
             column(jc)
           })
 
+#' @details
+#' \code{array_distinct}: Removes duplicate values from the array.
+#'
+#' @rdname column_collection_functions
+#' @aliases array_distinct array_distinct,Column-method
+#' @note array_distinct since 2.4.0
+setMethod("array_distinct",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "array_distinct", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{array_join}: Concatenates the elements of column using the delimiter.
 #' Null values are replaced with nullReplacement if set, otherwise they are ignored.
@@ -3071,6 +3087,19 @@ setMethod("array_position",
             column(jc)
           })
 
+#' @details
+#' \code{array_remove}: Removes all elements that equal to element from the given array.
+#'
+#' @rdname column_collection_functions
+#' @aliases array_remove array_remove,Column-method
+#' @note array_remove since 2.4.0
+setMethod("array_remove",
+          signature(x = "Column", value = "ANY"),
+          function(x, value) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "array_remove", x@jc, value)
+            column(jc)
+          })
+
 #' @details
 #' \code{array_repeat}: Creates an array containing \code{x} repeated the number of times
 #' given by \code{count}.
@@ -3120,6 +3149,24 @@ setMethod("arrays_overlap",
             column(jc)
           })
 
+#' @details
+#' \code{arrays_zip}: Returns a merged array of structs in which the N-th struct contains all N-th
+#' values of input arrays.
+#'
+#' @rdname column_collection_functions
+#' @aliases arrays_zip arrays_zip,Column-method
+#' @note arrays_zip since 2.4.0
+setMethod("arrays_zip",
+          signature(x = "Column"),
+          function(x, ...) {
+            jcols <- lapply(list(x, ...), function(arg) {
+              stopifnot(class(arg) == "Column")
+              arg@jc
+            })
+            jc <- callJStatic("org.apache.spark.sql.functions", "arrays_zip", jcols)
+            column(jc)
+          })
+
 #' @details
 #' \code{flatten}: Creates a single array from an array of arrays.
 #' If a structure of nested arrays is deeper than two levels, only one level of nesting is removed.
@@ -3147,6 +3194,21 @@ setMethod("map_entries",
             column(jc)
           })
 
+#' @details
+#' \code{map_from_arrays}: Creates a new map column. The array in the first column is used for
+#' keys. The array in the second column is used for values. All elements in the array for key
+#' should not be null.
+#'
+#' @rdname column_collection_functions
+#' @aliases map_from_arrays map_from_arrays,Column-method
+#' @note map_from_arrays since 2.4.0
+setMethod("map_from_arrays",
+          signature(x = "Column", y = "Column"),
+          function(x, y) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "map_from_arrays", x@jc, y@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{map_keys}: Returns an unordered array containing the keys of the map.
 #'
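For context, a usage sketch of the four new collection functions, lifted from the roxygen examples added above (assumes an active SparkR session started with `sparkR.session()` and a local Spark installation):

```r
library(SparkR)
sparkR.session()  # assumes Spark is installed locally

df  <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp))

# De-duplicate each array, and drop every element equal to 21.
head(select(tmp, array_distinct(tmp$v1), array_remove(tmp$v1, 21)))

# Zip two arrays element-wise into structs; build a map from key/value arrays.
tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp))
head(select(tmp4, arrays_zip(tmp4$v4, tmp4$v5), map_from_arrays(tmp4$v4, tmp4$v5)))
```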

R/pkg/R/generics.R

Lines changed: 16 additions & 0 deletions
@@ -757,6 +757,10 @@ setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCoun
 #' @name NULL
 setGeneric("array_contains", function(x, value) { standardGeneric("array_contains") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("array_distinct", function(x) { standardGeneric("array_distinct") })
+
 #' @rdname column_collection_functions
 #' @name NULL
 setGeneric("array_join", function(x, delimiter, ...) { standardGeneric("array_join") })
@@ -773,6 +777,10 @@ setGeneric("array_min", function(x) { standardGeneric("array_min") })
 #' @name NULL
 setGeneric("array_position", function(x, value) { standardGeneric("array_position") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("array_remove", function(x, value) { standardGeneric("array_remove") })
+
 #' @rdname column_collection_functions
 #' @name NULL
 setGeneric("array_repeat", function(x, count) { standardGeneric("array_repeat") })
@@ -785,6 +793,10 @@ setGeneric("array_sort", function(x) { standardGeneric("array_sort") })
 #' @name NULL
 setGeneric("arrays_overlap", function(x, y) { standardGeneric("arrays_overlap") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("arrays_zip", function(x, ...) { standardGeneric("arrays_zip") })
+
 #' @rdname column_string_functions
 #' @name NULL
 setGeneric("ascii", function(x) { standardGeneric("ascii") })
@@ -1050,6 +1062,10 @@ setGeneric("ltrim", function(x, trimString) { standardGeneric("ltrim") })
 #' @name NULL
 setGeneric("map_entries", function(x) { standardGeneric("map_entries") })
 
+#' @rdname column_collection_functions
+#' @name NULL
+setGeneric("map_from_arrays", function(x, y) { standardGeneric("map_from_arrays") })
+
 #' @rdname column_collection_functions
 #' @name NULL
 setGeneric("map_keys", function(x) { standardGeneric("map_keys") })
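The split here is standard S4: `setGeneric()` in generics.R declares the interface, and the matching `setMethod()` in functions.R registers the `Column` implementation. A self-contained toy sketch of that pattern (toy `Wrapper` class, not SparkR code):

```r
# Toy S4 generic/method pair illustrating the generics.R / functions.R split.
setClass("Wrapper", slots = c(v = "numeric"))

# Declare the generic (the generics.R side).
setGeneric("distinct_vals", function(x) standardGeneric("distinct_vals"))

# Register an implementation for Wrapper (the functions.R side).
setMethod("distinct_vals", signature(x = "Wrapper"), function(x) {
  new("Wrapper", v = unique(x@v))
})

w <- new("Wrapper", v = c(1, 2, 2, 3))
distinct_vals(w)@v  # 1 2 3
```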

R/pkg/R/sparkR.R

Lines changed: 1 addition & 1 deletion
@@ -167,7 +167,7 @@ sparkR.sparkContext <- function(
   submitOps <- getClientModeSparkSubmitOpts(
     Sys.getenv("SPARKR_SUBMIT_ARGS", "sparkr-shell"),
     sparkEnvirMap)
-  checkJavaVersion()
+  invisible(checkJavaVersion())
   launchBackend(
     args = path,
     sparkHome = sparkHome,
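Since `checkJavaVersion()` now returns the parsed version number (see the client.R change above), the `invisible()` wrapper keeps that value from being auto-printed during session startup while leaving it available to callers. A quick base-R demonstration:

```r
# invisible() returns a value without auto-printing it at top level.
f <- function() invisible(8)
f()        # prints nothing
x <- f()
x          # 8 -- the value is still returned and assignable
```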

R/pkg/inst/tests/testthat/test_basic.R

Lines changed: 8 additions & 0 deletions
@@ -18,6 +18,10 @@
 context("basic tests for CRAN")
 
 test_that("create DataFrame from list or data.frame", {
+  tryCatch( checkJavaVersion(),
+           error = function(e) { skip("error on Java check") },
+           warning = function(e) { skip("warning on Java check") } )
+
   sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE,
                  sparkConfig = sparkRTestConfig)
 
@@ -50,6 +54,10 @@ test_that("create DataFrame from list or data.frame", {
 })
 
 test_that("spark.glm and predict", {
+  tryCatch( checkJavaVersion(),
+           error = function(e) { skip("error on Java check") },
+           warning = function(e) { skip("warning on Java check") } )
+
   sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE,
                  sparkConfig = sparkRTestConfig)
 
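These CRAN tests now probe Java up front and turn a failed probe into a testthat skip rather than a test failure. A minimal self-contained sketch of the pattern (`precondition()` is a hypothetical stand-in for `checkJavaVersion()`):

```r
library(testthat)

precondition <- function() stop("no Java here")  # hypothetical stand-in

test_that("needs Java", {
  tryCatch(precondition(),
           error = function(e) skip("error on Java check"),
           warning = function(w) skip("warning on Java check"))
  expect_true(TRUE)  # real assertions would run only when the probe passes
})
```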

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 21 additions & 0 deletions
@@ -1503,6 +1503,27 @@ test_that("column functions", {
   result <- collect(select(df2, reverse(df2[[1]])))[[1]]
   expect_equal(result, "cba")
 
+  # Test array_distinct() and array_remove()
+  df <- createDataFrame(list(list(list(1L, 2L, 3L, 1L, 2L)), list(list(6L, 5L, 5L, 4L, 6L))))
+  result <- collect(select(df, array_distinct(df[[1]])))[[1]]
+  expect_equal(result, list(list(1L, 2L, 3L), list(6L, 5L, 4L)))
+
+  result <- collect(select(df, array_remove(df[[1]], 2L)))[[1]]
+  expect_equal(result, list(list(1L, 3L, 1L), list(6L, 5L, 5L, 4L, 6L)))
+
+  # Test arrays_zip()
+  df <- createDataFrame(list(list(list(1L, 2L), list(3L, 4L))), schema = c("c1", "c2"))
+  result <- collect(select(df, arrays_zip(df[[1]], df[[2]])))[[1]]
+  expected_entries <- list(listToStruct(list(c1 = 1L, c2 = 3L)),
+                           listToStruct(list(c1 = 2L, c2 = 4L)))
+  expect_equal(result, list(expected_entries))
+
+  # Test map_from_arrays()
+  df <- createDataFrame(list(list(list("x", "y"), list(1, 2))), schema = c("k", "v"))
+  result <- collect(select(df, map_from_arrays(df$k, df$v)))[[1]]
+  expected_entries <- list(as.environment(list(x = 1, y = 2)))
+  expect_equal(result, expected_entries)
+
   # Test array_repeat()
   df <- createDataFrame(list(list("a", 3L), list("b", 2L)))
   result <- collect(select(df, array_repeat(df[[1]], df[[2]])))[[1]]
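The expected values spell out the semantics: `arrays_zip` builds the N-th output struct from the N-th element of every input array, and `map_from_arrays` pairs a key array with a value array. A plain-R analogue of both, with no Spark required:

```r
# arrays_zip analogue: Map() pairs the N-th elements of each list.
c1 <- list(1L, 2L)
c2 <- list(3L, 4L)
Map(function(a, b) list(c1 = a, c2 = b), c1, c2)
# -> list(list(c1 = 1L, c2 = 3L), list(c1 = 2L, c2 = 4L))

# map_from_arrays analogue: keys from one vector, values from another.
setNames(list(1, 2), c("x", "y"))
# -> $x = 1, $y = 2
```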

R/pkg/vignettes/sparkr-vignettes.Rmd

Lines changed: 5 additions & 0 deletions
@@ -590,6 +590,7 @@ summary(model)
 Predict values on training data
 ```{r}
 prediction <- predict(model, training)
+head(select(prediction, "Class", "Sex", "Age", "Freq", "Survived", "prediction"))
 ```
 
 #### Logistic Regression
@@ -613,6 +614,7 @@ summary(model)
 Predict values on training data
 ```{r}
 fitted <- predict(model, training)
+head(select(fitted, "Class", "Sex", "Age", "Freq", "Survived", "prediction"))
 ```
 
 Multinomial logistic regression against three classes
@@ -807,6 +809,7 @@ df <- createDataFrame(t)
 dtModel <- spark.decisionTree(df, Survived ~ ., type = "classification", maxDepth = 2)
 summary(dtModel)
 predictions <- predict(dtModel, df)
+head(select(predictions, "Class", "Sex", "Age", "Freq", "Survived", "prediction"))
 ```
 
 #### Gradient-Boosted Trees
@@ -822,6 +825,7 @@ df <- createDataFrame(t)
 gbtModel <- spark.gbt(df, Survived ~ ., type = "classification", maxDepth = 2, maxIter = 2)
 summary(gbtModel)
 predictions <- predict(gbtModel, df)
+head(select(predictions, "Class", "Sex", "Age", "Freq", "Survived", "prediction"))
 ```
 
 #### Random Forest
@@ -837,6 +841,7 @@ df <- createDataFrame(t)
 rfModel <- spark.randomForest(df, Survived ~ ., type = "classification", maxDepth = 2, numTrees = 2)
 summary(rfModel)
 predictions <- predict(rfModel, df)
+head(select(predictions, "Class", "Sex", "Age", "Freq", "Survived", "prediction"))
 ```
 
 #### Bisecting k-Means

core/pom.xml

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@
     </dependency>
     <dependency>
       <groupId>org.apache.xbean</groupId>
-      <artifactId>xbean-asm5-shaded</artifactId>
+      <artifactId>xbean-asm6-shaded</artifactId>
     </dependency>
     <dependency>
       <groupId>org.apache.hadoop</groupId>

core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala

Lines changed: 3 additions & 2 deletions
@@ -385,7 +385,7 @@ private[spark] class SparkSubmit extends Logging {
     val forceDownloadSchemes = sparkConf.get(FORCE_DOWNLOAD_SCHEMES)
 
     def shouldDownload(scheme: String): Boolean = {
-      forceDownloadSchemes.contains(scheme) ||
+      forceDownloadSchemes.contains("*") || forceDownloadSchemes.contains(scheme) ||
         Try { FileSystem.getFileSystemClass(scheme, hadoopConf) }.isFailure
     }
 
@@ -578,7 +578,8 @@ private[spark] class SparkSubmit extends Logging {
     }
     // Add the main application jar and any added jars to classpath in case YARN client
     // requires these jars.
-    // This assumes both primaryResource and user jars are local jars, otherwise it will not be
+    // This assumes both primaryResource and user jars are local jars, or already downloaded
+    // to local by configuring "spark.yarn.dist.forceDownloadSchemes", otherwise it will not be
     // added to the classpath of YARN client.
     if (isYarnCluster) {
       if (isUserJar(args.primaryResource)) {
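With the new wildcard, setting the scheme list to "*" forces every remote resource through the local download path. A hedged sketch of setting that property from SparkR at session start (the property name comes from the comment above; the YARN client-mode deployment is an assumption):

```r
library(SparkR)
# Assumption: a YARN client-mode deployment; "*" forces all schemes to be
# downloaded locally before submission.
sparkR.session(
  master = "yarn",
  sparkConfig = list(spark.yarn.dist.forceDownloadSchemes = "*")
)
```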
