apache
diff --git a/‎R/pkg/NAMESPACE‎
Lines changed: 1 addition & 0 deletions b/‎R/pkg/NAMESPACE‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎R/pkg/R/functions.R‎
Lines changed: 19 additions & 0 deletions b/‎R/pkg/R/functions.R‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎R/pkg/R/generics.R‎
Lines changed: 4 additions & 0 deletions b/‎R/pkg/R/generics.R‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎R/pkg/R/sparkR.R‎
Lines changed: 1 addition & 1 deletion b/‎R/pkg/R/sparkR.R‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎R/pkg/tests/fulltests/test_client.R‎
Lines changed: 2 additions & 2 deletions b/‎R/pkg/tests/fulltests/test_client.R‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/pkg/tests/fulltests/test_mllib_classification.R‎
Lines changed: 3 additions & 3 deletions b/‎R/pkg/tests/fulltests/test_mllib_classification.R‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎R/pkg/tests/fulltests/test_mllib_clustering.R‎
Lines changed: 1 addition & 1 deletion b/‎R/pkg/tests/fulltests/test_mllib_clustering.R‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎R/pkg/tests/fulltests/test_mllib_recommendation.R‎
Lines changed: 2 additions & 2 deletions b/‎R/pkg/tests/fulltests/test_mllib_recommendation.R‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/pkg/tests/fulltests/test_mllib_tree.R‎
Lines changed: 4 additions & 4 deletions b/‎R/pkg/tests/fulltests/test_mllib_tree.R‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎R/pkg/tests/fulltests/test_sparkSQL.R‎
Lines changed: 16 additions & 16 deletions b/‎R/pkg/tests/fulltests/test_sparkSQL.R‎
Lines changed: 16 additions & 16 deletions
@@ -408,6 +408,7 @@ exportMethods("%<=>%",
               "weekofyear",
               "when",
               "window",
+              "xxhash64",
               "year")
 
 exportClasses("GroupedData")
 
@@ -735,6 +735,25 @@ setMethod("hash",
             column(jc)
           })
 
+#' @details
+#' \code{xxhash64}: Calculates the hash code of given columns using the 64-bit
+#' variant of the xxHash algorithm, and returns the result as a long
+#' column.
+#'
+#' @rdname column_misc_functions
+#' @aliases xxhash64 xxhash64,Column-method
+#' @note xxhash64 since 3.0.0
+setMethod("xxhash64",
+          signature(x = "Column"),
+          function(x, ...) {
+            jcols <- lapply(list(x, ...), function(x) {
+              stopifnot(class(x) == "Column")
+              x@jc
+            })
+            jc <- callJStatic("org.apache.spark.sql.functions", "xxhash64", jcols)
+            column(jc)
+          })
+
 #' @details
 #' \code{dayofmonth}: Extracts the day of the month as an integer from a
 #' given date/timestamp/string.
 
@@ -1394,6 +1394,10 @@ setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })
 #' @name NULL
 setGeneric("window", function(x, ...) { standardGeneric("window") })
 
+#' @rdname column_misc_functions
+#' @name NULL
+setGeneric("xxhash64", function(x, ...) { standardGeneric("xxhash64") })
+
 #' @rdname column_datetime_functions
 #' @name NULL
 setGeneric("year", function(x) { standardGeneric("year") })
 
@@ -269,7 +269,7 @@ sparkR.sparkContext <- function(
 #' sparkR.session("yarn-client", "SparkR", "/home/spark",
 #'                list(spark.executor.memory="4g"),
 #'                c("one.jar", "two.jar", "three.jar"),
-#'                c("com.databricks:spark-avro_2.11:2.0.1"))
+#'                c("com.databricks:spark-avro_2.12:2.0.1"))
 #' sparkR.session(spark.master = "yarn-client", spark.executor.memory = "4g")
 #'}
 #' @note sparkR.session since 2.0.0
 
@@ -37,7 +37,7 @@ test_that("multiple packages don't produce a warning", {
 
 test_that("sparkJars sparkPackages as character vectors", {
   args <- generateSparkSubmitArgs("", "", c("one.jar", "two.jar", "three.jar"), "",
-                                  c("com.databricks:spark-avro_2.11:2.0.1"))
+                                  c("com.databricks:spark-avro_2.12:2.0.1"))
   expect_match(args, "--jars one.jar,two.jar,three.jar")
-  expect_match(args, "--packages com.databricks:spark-avro_2.11:2.0.1")
+  expect_match(args, "--packages com.databricks:spark-avro_2.12:2.0.1")
 })
@@ -299,21 +299,21 @@ test_that("spark.mlp", {
   df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
                 source = "libsvm")
   model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3),
-                     solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)
+                     solver = "l-bfgs", maxIter = 100, tol = 0.00001, stepSize = 1, seed = 1)
 
   # Test summary method
   summary <- summary(model)
   expect_equal(summary$numOfInputs, 4)
   expect_equal(summary$numOfOutputs, 3)
   expect_equal(summary$layers, c(4, 5, 4, 3))
   expect_equal(length(summary$weights), 64)
-  expect_equal(head(summary$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825),
+  expect_equal(head(summary$weights, 5), list(-24.28415, 107.8701, 16.86376, 1.103736, 9.244488),
                tolerance = 1e-6)
 
   # Test predict method
   mlpTestDF <- df
   mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
-  expect_equal(head(mlpPredictions$prediction, 6), c("0.0", "1.0", "1.0", "1.0", "1.0", "1.0"))
+  expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0"))
 
   # Test model save/load
   if (windows_with_hadoop()) {
 
@@ -153,7 +153,7 @@ test_that("spark.kmeans", {
   model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
   sample <- take(select(predict(model, training), "prediction"), 1)
   expect_equal(typeof(sample$prediction), "integer")
-  expect_equal(sample$prediction, 1)
+  expect_equal(sample$prediction, 0)
 
   # Test stats::kmeans is working
   statsModel <- kmeans(x = newIris, centers = 2)
 
@@ -27,13 +27,13 @@ test_that("spark.als", {
                list(2, 1, 1.0), list(2, 2, 5.0))
   df <- createDataFrame(data, c("user", "item", "score"))
   model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item",
-                     rank = 10, maxIter = 5, seed = 0, regParam = 0.1)
+                     rank = 10, maxIter = 15, seed = 0, regParam = 0.1)
   stats <- summary(model)
   expect_equal(stats$rank, 10)
   test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item"))
   predictions <- collect(predict(model, test))
 
-  expect_equal(predictions$prediction, c(-0.1380762, 2.6258414, -1.5018409),
+  expect_equal(predictions$prediction, c(0.6324540, 3.6218479, -0.4568263),
   tolerance = 1e-4)
 
   # Test model save/load
 
@@ -148,10 +148,10 @@ test_that("spark.randomForest", {
   model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
                               numTrees = 20, seed = 123)
   predictions <- collect(predict(model, data))
-  expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070,
-                                         63.53160, 64.05470, 65.12710, 64.30450,
-                                         66.70910, 67.86125, 68.08700, 67.21865,
-                                         68.89275, 69.53180, 69.39640, 69.68250),
+  expect_equal(predictions$prediction, c(60.32495, 61.06495, 60.52120, 61.98500,
+                                         63.64450, 64.21910, 65.00810, 64.30450,
+                                         66.70910, 67.96875, 68.22140, 67.21865,
+                                         68.89275, 69.55900, 69.30160, 69.93050),
                tolerance = 1e-4)
   stats <- summary(model)
   expect_equal(stats$numTrees, 20)
 
@@ -1390,7 +1390,7 @@ test_that("column functions", {
   c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c)
   c10 <- sumDistinct(c) + tan(c) + tanh(c) + degrees(c) + radians(c)
   c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
-  c12 <- variance(c) + ltrim(c, "a") + rtrim(c, "b") + trim(c, "c")
+  c12 <- variance(c) + xxhash64(c) + ltrim(c, "a") + rtrim(c, "b") + trim(c, "c")
   c13 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
   c14 <- cume_dist() + ntile(1) + corr(c, c1)
   c15 <- dense_rank() + percent_rank() + rank() + row_number()
@@ -1786,9 +1786,9 @@ test_that("column binary mathfunctions", {
   expect_equal(collect(select(df, shiftRight(df$b, 1)))[4, 1], 4)
   expect_equal(collect(select(df, shiftRightUnsigned(df$b, 1)))[4, 1], 4)
   expect_equal(class(collect(select(df, rand()))[2, 1]), "numeric")
-  expect_equal(collect(select(df, rand(1)))[1, 1], 0.134, tolerance = 0.01)
+  expect_equal(collect(select(df, rand(1)))[1, 1], 0.636, tolerance = 0.01)
   expect_equal(class(collect(select(df, randn()))[2, 1]), "numeric")
-  expect_equal(collect(select(df, randn(1)))[1, 1], -1.03, tolerance = 0.01)
+  expect_equal(collect(select(df, randn(1)))[1, 1], 1.68, tolerance = 0.01)
 })
 
 test_that("string operators", {
@@ -2360,7 +2360,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
   expect_equal(names(joined3), c("age", "name", "name", "test"))
   expect_equal(count(joined3), 4)
   expect_true(is.na(collect(orderBy(joined3, joined3$age))$age[2]))
-  
+
   joined4 <- join(df, df2, df$name == df2$name, "right_outer")
   expect_equal(names(joined4), c("age", "name", "name", "test"))
   expect_equal(count(joined4), 4)
@@ -2377,19 +2377,19 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
   expect_equal(names(joined6), c("newAge", "name", "test"))
   expect_equal(count(joined6), 4)
   expect_equal(collect(orderBy(joined6, joined6$name))$newAge[3], 24)
-  
+
   joined7 <- select(join(df, df2, df$name == df2$name, "full"),
                     alias(df$age + 5, "newAge"), df$name, df2$test)
   expect_equal(names(joined7), c("newAge", "name", "test"))
   expect_equal(count(joined7), 4)
   expect_equal(collect(orderBy(joined7, joined7$name))$newAge[3], 24)
-  
+
   joined8 <- select(join(df, df2, df$name == df2$name, "fullouter"),
                     alias(df$age + 5, "newAge"), df$name, df2$test)
   expect_equal(names(joined8), c("newAge", "name", "test"))
   expect_equal(count(joined8), 4)
   expect_equal(collect(orderBy(joined8, joined8$name))$newAge[3], 24)
-  
+
   joined9 <- select(join(df, df2, df$name == df2$name, "full_outer"),
                     alias(df$age + 5, "newAge"), df$name, df2$test)
   expect_equal(names(joined9), c("newAge", "name", "test"))
@@ -2400,12 +2400,12 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
   expect_equal(names(joined10), c("age", "name", "name", "test"))
   expect_equal(count(joined10), 3)
   expect_true(is.na(collect(orderBy(joined10, joined10$age))$age[1]))
-  
+
   joined11 <- join(df, df2, df$name == df2$name, "leftouter")
   expect_equal(names(joined11), c("age", "name", "name", "test"))
   expect_equal(count(joined11), 3)
   expect_true(is.na(collect(orderBy(joined11, joined11$age))$age[1]))
-  
+
   joined12 <- join(df, df2, df$name == df2$name, "left_outer")
   expect_equal(names(joined12), c("age", "name", "name", "test"))
   expect_equal(count(joined12), 3)
@@ -2418,23 +2418,23 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
   joined14 <- join(df, df2, df$name == df2$name, "semi")
   expect_equal(names(joined14), c("age", "name"))
   expect_equal(count(joined14), 3)
-  
+
   joined14 <- join(df, df2, df$name == df2$name, "leftsemi")
   expect_equal(names(joined14), c("age", "name"))
   expect_equal(count(joined14), 3)
-  
+
   joined15 <- join(df, df2, df$name == df2$name, "left_semi")
   expect_equal(names(joined15), c("age", "name"))
   expect_equal(count(joined15), 3)
-  
+
   joined16 <- join(df2, df, df2$name == df$name, "anti")
   expect_equal(names(joined16), c("name", "test"))
   expect_equal(count(joined16), 1)
-  
+
   joined17 <- join(df2, df, df2$name == df$name, "leftanti")
   expect_equal(names(joined17), c("name", "test"))
   expect_equal(count(joined17), 1)
-  
+
   joined18 <- join(df2, df, df2$name == df$name, "left_anti")
   expect_equal(names(joined18), c("name", "test"))
   expect_equal(count(joined18), 1)
@@ -2444,7 +2444,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", {
                  "'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',",
                  "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'.")
   expect_error(join(df2, df, df2$name == df$name, "invalid"), error_msg)
-  
+
   merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE)
   expect_equal(count(merged), 4)
   expect_equal(names(merged), c("age", "name_x", "name_y", "test"))
@@ -3026,7 +3026,7 @@ test_that("sampleBy() on a DataFrame", {
   sample <- sampleBy(df, "key", fractions, 0)
   result <- collect(orderBy(count(groupBy(sample, "key")), "key"))
   expect_identical(as.list(result[1, ]), list(key = "0", count = 3))
-  expect_identical(as.list(result[2, ]), list(key = "1", count = 7))
+  expect_identical(as.list(result[2, ]), list(key = "1", count = 8))
 })
 
 test_that("approxQuantile() on a DataFrame", {