Skip to content

Commit 59b45d1

Browse files
Merge branch 'master' into logBatches
2 parents 7f26798 + 887279c commit 59b45d1

File tree

293 files changed

+6682
-3935
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

293 files changed

+6682
-3935
lines changed

R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,7 @@ exportMethods("%<=>%",
408408
"weekofyear",
409409
"when",
410410
"window",
411+
"xxhash64",
411412
"year")
412413

413414
exportClasses("GroupedData")

R/pkg/R/functions.R

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -735,6 +735,25 @@ setMethod("hash",
735735
column(jc)
736736
})
737737

738+
#' @details
#' \code{xxhash64}: Calculates the hash code of given columns using the 64-bit
#' variant of the xxHash algorithm, and returns the result as a long
#' column.
#'
#' @rdname column_misc_functions
#' @aliases xxhash64 xxhash64,Column-method
#' @note xxhash64 since 3.0.0
setMethod("xxhash64",
          signature(x = "Column"),
          function(x, ...) {
            # Collect the dispatched column plus any extra columns passed via
            # `...`, and unwrap each one to its `jc` slot.
            jcols <- lapply(list(x, ...), function(col) {
              # inherits() yields a single TRUE/FALSE and also accepts
              # subclasses of Column, unlike an elementwise comparison of the
              # class vector against "Column".
              stopifnot(inherits(col, "Column"))
              col@jc
            })
            jc <- callJStatic("org.apache.spark.sql.functions", "xxhash64", jcols)
            column(jc)
          })
756+
738757
#' @details
739758
#' \code{dayofmonth}: Extracts the day of the month as an integer from a
740759
#' given date/timestamp/string.

R/pkg/R/generics.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1394,6 +1394,10 @@ setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })
13941394
#' @name NULL
13951395
setGeneric("window", function(x, ...) { standardGeneric("window") })
13961396

1397+
#' @rdname column_misc_functions
#' @name NULL
setGeneric("xxhash64",
           function(x, ...) {
             # Dispatch to the method registered for the class of `x`.
             standardGeneric("xxhash64")
           })
1400+
13971401
#' @rdname column_datetime_functions
13981402
#' @name NULL
13991403
setGeneric("year", function(x) { standardGeneric("year") })

R/pkg/R/sparkR.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ sparkR.sparkContext <- function(
269269
#' sparkR.session("yarn-client", "SparkR", "/home/spark",
270270
#' list(spark.executor.memory="4g"),
271271
#' c("one.jar", "two.jar", "three.jar"),
272-
#' c("com.databricks:spark-avro_2.11:2.0.1"))
272+
#' c("com.databricks:spark-avro_2.12:2.0.1"))
273273
#' sparkR.session(spark.master = "yarn-client", spark.executor.memory = "4g")
274274
#'}
275275
#' @note sparkR.session since 2.0.0

R/pkg/tests/fulltests/test_client.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ test_that("multiple packages don't produce a warning", {
3737

3838
test_that("sparkJars sparkPackages as character vectors", {
3939
args <- generateSparkSubmitArgs("", "", c("one.jar", "two.jar", "three.jar"), "",
40-
c("com.databricks:spark-avro_2.11:2.0.1"))
40+
c("com.databricks:spark-avro_2.12:2.0.1"))
4141
expect_match(args, "--jars one.jar,two.jar,three.jar")
42-
expect_match(args, "--packages com.databricks:spark-avro_2.11:2.0.1")
42+
expect_match(args, "--packages com.databricks:spark-avro_2.12:2.0.1")
4343
})

R/pkg/tests/fulltests/test_mllib_classification.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -299,21 +299,21 @@ test_that("spark.mlp", {
299299
df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
300300
source = "libsvm")
301301
model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3),
302-
solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)
302+
solver = "l-bfgs", maxIter = 100, tol = 0.00001, stepSize = 1, seed = 1)
303303

304304
# Test summary method
305305
summary <- summary(model)
306306
expect_equal(summary$numOfInputs, 4)
307307
expect_equal(summary$numOfOutputs, 3)
308308
expect_equal(summary$layers, c(4, 5, 4, 3))
309309
expect_equal(length(summary$weights), 64)
310-
expect_equal(head(summary$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825),
310+
expect_equal(head(summary$weights, 5), list(-24.28415, 107.8701, 16.86376, 1.103736, 9.244488),
311311
tolerance = 1e-6)
312312

313313
# Test predict method
314314
mlpTestDF <- df
315315
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
316-
expect_equal(head(mlpPredictions$prediction, 6), c("0.0", "1.0", "1.0", "1.0", "1.0", "1.0"))
316+
expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0"))
317317

318318
# Test model save/load
319319
if (windows_with_hadoop()) {

R/pkg/tests/fulltests/test_mllib_clustering.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ test_that("spark.kmeans", {
153153
model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
154154
sample <- take(select(predict(model, training), "prediction"), 1)
155155
expect_equal(typeof(sample$prediction), "integer")
156-
expect_equal(sample$prediction, 1)
156+
expect_equal(sample$prediction, 0)
157157

158158
# Test stats::kmeans is working
159159
statsModel <- kmeans(x = newIris, centers = 2)

R/pkg/tests/fulltests/test_mllib_recommendation.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,13 @@ test_that("spark.als", {
2727
list(2, 1, 1.0), list(2, 2, 5.0))
2828
df <- createDataFrame(data, c("user", "item", "score"))
2929
model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item",
30-
rank = 10, maxIter = 5, seed = 0, regParam = 0.1)
30+
rank = 10, maxIter = 15, seed = 0, regParam = 0.1)
3131
stats <- summary(model)
3232
expect_equal(stats$rank, 10)
3333
test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item"))
3434
predictions <- collect(predict(model, test))
3535

36-
expect_equal(predictions$prediction, c(-0.1380762, 2.6258414, -1.5018409),
36+
expect_equal(predictions$prediction, c(0.6324540, 3.6218479, -0.4568263),
3737
tolerance = 1e-4)
3838

3939
# Test model save/load

R/pkg/tests/fulltests/test_mllib_tree.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -148,10 +148,10 @@ test_that("spark.randomForest", {
148148
model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
149149
numTrees = 20, seed = 123)
150150
predictions <- collect(predict(model, data))
151-
expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070,
152-
63.53160, 64.05470, 65.12710, 64.30450,
153-
66.70910, 67.86125, 68.08700, 67.21865,
154-
68.89275, 69.53180, 69.39640, 69.68250),
151+
expect_equal(predictions$prediction, c(60.32495, 61.06495, 60.52120, 61.98500,
152+
63.64450, 64.21910, 65.00810, 64.30450,
153+
66.70910, 67.96875, 68.22140, 67.21865,
154+
68.89275, 69.55900, 69.30160, 69.93050),
155155
tolerance = 1e-4)
156156
stats <- summary(model)
157157
expect_equal(stats$numTrees, 20)

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1390,7 +1390,7 @@ test_that("column functions", {
13901390
c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c)
13911391
c10 <- sumDistinct(c) + tan(c) + tanh(c) + degrees(c) + radians(c)
13921392
c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
1393-
c12 <- variance(c) + ltrim(c, "a") + rtrim(c, "b") + trim(c, "c")
1393+
c12 <- variance(c) + xxhash64(c) + ltrim(c, "a") + rtrim(c, "b") + trim(c, "c")
13941394
c13 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
13951395
c14 <- cume_dist() + ntile(1) + corr(c, c1)
13961396
c15 <- dense_rank() + percent_rank() + rank() + row_number()
@@ -1786,9 +1786,9 @@ test_that("column binary mathfunctions", {
17861786
expect_equal(collect(select(df, shiftRight(df$b, 1)))[4, 1], 4)
17871787
expect_equal(collect(select(df, shiftRightUnsigned(df$b, 1)))[4, 1], 4)
17881788
expect_equal(class(collect(select(df, rand()))[2, 1]), "numeric")
1789-
expect_equal(collect(select(df, rand(1)))[1, 1], 0.134, tolerance = 0.01)
1789+
expect_equal(collect(select(df, rand(1)))[1, 1], 0.636, tolerance = 0.01)
17901790
expect_equal(class(collect(select(df, randn()))[2, 1]), "numeric")
1791-
expect_equal(collect(select(df, randn(1)))[1, 1], -1.03, tolerance = 0.01)
1791+
expect_equal(collect(select(df, randn(1)))[1, 1], 1.68, tolerance = 0.01)
17921792
})
17931793

17941794
test_that("string operators", {
@@ -2360,7 +2360,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
23602360
expect_equal(names(joined3), c("age", "name", "name", "test"))
23612361
expect_equal(count(joined3), 4)
23622362
expect_true(is.na(collect(orderBy(joined3, joined3$age))$age[2]))
2363-
2363+
23642364
joined4 <- join(df, df2, df$name == df2$name, "right_outer")
23652365
expect_equal(names(joined4), c("age", "name", "name", "test"))
23662366
expect_equal(count(joined4), 4)
@@ -2377,19 +2377,19 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
23772377
expect_equal(names(joined6), c("newAge", "name", "test"))
23782378
expect_equal(count(joined6), 4)
23792379
expect_equal(collect(orderBy(joined6, joined6$name))$newAge[3], 24)
2380-
2380+
23812381
joined7 <- select(join(df, df2, df$name == df2$name, "full"),
23822382
alias(df$age + 5, "newAge"), df$name, df2$test)
23832383
expect_equal(names(joined7), c("newAge", "name", "test"))
23842384
expect_equal(count(joined7), 4)
23852385
expect_equal(collect(orderBy(joined7, joined7$name))$newAge[3], 24)
2386-
2386+
23872387
joined8 <- select(join(df, df2, df$name == df2$name, "fullouter"),
23882388
alias(df$age + 5, "newAge"), df$name, df2$test)
23892389
expect_equal(names(joined8), c("newAge", "name", "test"))
23902390
expect_equal(count(joined8), 4)
23912391
expect_equal(collect(orderBy(joined8, joined8$name))$newAge[3], 24)
2392-
2392+
23932393
joined9 <- select(join(df, df2, df$name == df2$name, "full_outer"),
23942394
alias(df$age + 5, "newAge"), df$name, df2$test)
23952395
expect_equal(names(joined9), c("newAge", "name", "test"))
@@ -2400,12 +2400,12 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
24002400
expect_equal(names(joined10), c("age", "name", "name", "test"))
24012401
expect_equal(count(joined10), 3)
24022402
expect_true(is.na(collect(orderBy(joined10, joined10$age))$age[1]))
2403-
2403+
24042404
joined11 <- join(df, df2, df$name == df2$name, "leftouter")
24052405
expect_equal(names(joined11), c("age", "name", "name", "test"))
24062406
expect_equal(count(joined11), 3)
24072407
expect_true(is.na(collect(orderBy(joined11, joined11$age))$age[1]))
2408-
2408+
24092409
joined12 <- join(df, df2, df$name == df2$name, "left_outer")
24102410
expect_equal(names(joined12), c("age", "name", "name", "test"))
24112411
expect_equal(count(joined12), 3)
@@ -2418,23 +2418,23 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
24182418
joined14 <- join(df, df2, df$name == df2$name, "semi")
24192419
expect_equal(names(joined14), c("age", "name"))
24202420
expect_equal(count(joined14), 3)
2421-
2421+
24222422
joined14 <- join(df, df2, df$name == df2$name, "leftsemi")
24232423
expect_equal(names(joined14), c("age", "name"))
24242424
expect_equal(count(joined14), 3)
2425-
2425+
24262426
joined15 <- join(df, df2, df$name == df2$name, "left_semi")
24272427
expect_equal(names(joined15), c("age", "name"))
24282428
expect_equal(count(joined15), 3)
2429-
2429+
24302430
joined16 <- join(df2, df, df2$name == df$name, "anti")
24312431
expect_equal(names(joined16), c("name", "test"))
24322432
expect_equal(count(joined16), 1)
2433-
2433+
24342434
joined17 <- join(df2, df, df2$name == df$name, "leftanti")
24352435
expect_equal(names(joined17), c("name", "test"))
24362436
expect_equal(count(joined17), 1)
2437-
2437+
24382438
joined18 <- join(df2, df, df2$name == df$name, "left_anti")
24392439
expect_equal(names(joined18), c("name", "test"))
24402440
expect_equal(count(joined18), 1)
@@ -2444,7 +2444,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
24442444
"'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',",
24452445
"'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'.")
24462446
expect_error(join(df2, df, df2$name == df$name, "invalid"), error_msg)
2447-
2447+
24482448
merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE)
24492449
expect_equal(count(merged), 4)
24502450
expect_equal(names(merged), c("age", "name_x", "name_y", "test"))
@@ -3026,7 +3026,7 @@ test_that("sampleBy() on a DataFrame", {
30263026
sample <- sampleBy(df, "key", fractions, 0)
30273027
result <- collect(orderBy(count(groupBy(sample, "key")), "key"))
30283028
expect_identical(as.list(result[1, ]), list(key = "0", count = 3))
3029-
expect_identical(as.list(result[2, ]), list(key = "1", count = 7))
3029+
expect_identical(as.list(result[2, ]), list(key = "1", count = 8))
30303030
})
30313031

30323032
test_that("approxQuantile() on a DataFrame", {

0 commit comments

Comments
 (0)