Skip to content

Commit d9eaf5a

Browse files
committed
Merge branch 'master' of https://github.com/apache/spark into fred-16963
2 parents 46f6411 + dcdda19 commit d9eaf5a

File tree

164 files changed

+3749
-1289
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

164 files changed

+3749
-1289
lines changed

R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ exportMethods("arrange",
124124
"selectExpr",
125125
"show",
126126
"showDF",
127+
"storageLevel",
127128
"subset",
128129
"summarize",
129130
"summary",

R/pkg/R/DataFrame.R

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ setMethod("colnames<-",
365365

366366
# Check if the column names have . in it
367367
if (any(regexec(".", value, fixed = TRUE)[[1]][1] != -1)) {
368-
stop("Colum names cannot contain the '.' symbol.")
368+
stop("Column names cannot contain the '.' symbol.")
369369
}
370370

371371
sdf <- callJMethod(x@sdf, "toDF", as.list(value))
@@ -633,7 +633,7 @@ setMethod("persist",
633633
#' @param ... further arguments to be passed to or from other methods.
634634
#'
635635
#' @family SparkDataFrame functions
636-
#' @rdname unpersist-methods
636+
#' @rdname unpersist
637637
#' @aliases unpersist,SparkDataFrame-method
638638
#' @name unpersist
639639
#' @export
@@ -654,6 +654,32 @@ setMethod("unpersist",
654654
x
655655
})
656656

657+
#' StorageLevel
658+
#'
659+
#' Get the storage level of this SparkDataFrame.
660+
#'
661+
#' @param x the SparkDataFrame whose storage level is to be returned.
662+
#'
663+
#' @family SparkDataFrame functions
664+
#' @rdname storageLevel
665+
#' @aliases storageLevel,SparkDataFrame-method
666+
#' @name storageLevel
667+
#' @export
668+
#' @examples
669+
#'\dontrun{
670+
#' sparkR.session()
671+
#' path <- "path/to/file.json"
672+
#' df <- read.json(path)
673+
#' persist(df, "MEMORY_AND_DISK")
674+
#' storageLevel(df)
675+
#'}
676+
#' @note storageLevel since 2.1.0
677+
setMethod("storageLevel",
678+
signature(x = "SparkDataFrame"),
679+
function(x) {
680+
storageLevelToString(callJMethod(x@sdf, "storageLevel"))
681+
})
682+
657683
#' Repartition
658684
#'
659685
#' The following options for repartition are possible:

R/pkg/R/RDD.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ setMethod("persistRDD",
261261
#' cache(rdd) # rdd@@env$isCached == TRUE
262262
#' unpersistRDD(rdd) # rdd@@env$isCached == FALSE
263263
#'}
264-
#' @rdname unpersist-methods
264+
#' @rdname unpersist
265265
#' @aliases unpersist,RDD-method
266266
#' @noRd
267267
setMethod("unpersistRDD",

R/pkg/R/generics.R

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -691,6 +691,10 @@ setGeneric("selectExpr", function(x, expr, ...) { standardGeneric("selectExpr")
691691
#' @export
692692
setGeneric("showDF", function(x, ...) { standardGeneric("showDF") })
693693

694+
# @rdname storageLevel
695+
# @export
696+
setGeneric("storageLevel", function(x) { standardGeneric("storageLevel") })
697+
694698
#' @rdname subset
695699
#' @export
696700
setGeneric("subset", function(x, ...) { standardGeneric("subset") })
@@ -715,7 +719,7 @@ setGeneric("union", function(x, y) { standardGeneric("union") })
715719
#' @export
716720
setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })
717721

718-
#' @rdname unpersist-methods
722+
#' @rdname unpersist
719723
#' @export
720724
setGeneric("unpersist", function(x, ...) { standardGeneric("unpersist") })
721725

R/pkg/R/mllib.R

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,8 @@ setMethod("predict", signature(object = "KMeansModel"),
665665
#' @param tol convergence tolerance of iterations.
666666
#' @param stepSize stepSize parameter.
667667
#' @param seed seed parameter for weights initialization.
668+
#' @param initialWeights initialWeights parameter for weights initialization, it should be a
669+
#' numeric vector.
668670
#' @param ... additional arguments passed to the method.
669671
#' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model.
670672
#' @rdname spark.mlp
@@ -677,8 +679,9 @@ setMethod("predict", signature(object = "KMeansModel"),
677679
#' df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm")
678680
#'
679681
#' # fit a Multilayer Perceptron Classification Model
680-
#' model <- spark.mlp(df, blockSize = 128, layers = c(4, 5, 4, 3), solver = "l-bfgs",
681-
#' maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)
682+
#' model <- spark.mlp(df, blockSize = 128, layers = c(4, 3), solver = "l-bfgs",
683+
#' maxIter = 100, tol = 0.5, stepSize = 1, seed = 1,
684+
#' initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
682685
#'
683686
#' # get the summary of the model
684687
#' summary(model)
@@ -695,7 +698,7 @@ setMethod("predict", signature(object = "KMeansModel"),
695698
#' @note spark.mlp since 2.1.0
696699
setMethod("spark.mlp", signature(data = "SparkDataFrame"),
697700
function(data, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100,
698-
tol = 1E-6, stepSize = 0.03, seed = NULL) {
701+
tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL) {
699702
if (is.null(layers)) {
700703
stop ("layers must be a integer vector with length > 1.")
701704
}
@@ -706,10 +709,13 @@ setMethod("spark.mlp", signature(data = "SparkDataFrame"),
706709
if (!is.null(seed)) {
707710
seed <- as.character(as.integer(seed))
708711
}
712+
if (!is.null(initialWeights)) {
713+
initialWeights <- as.array(as.numeric(na.omit(initialWeights)))
714+
}
709715
jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper",
710716
"fit", data@sdf, as.integer(blockSize), as.array(layers),
711717
as.character(solver), as.integer(maxIter), as.numeric(tol),
712-
as.numeric(stepSize), seed)
718+
as.numeric(stepSize), seed, initialWeights)
713719
new("MultilayerPerceptronClassificationModel", jobj = jobj)
714720
})
715721

R/pkg/R/utils.R

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,47 @@ getStorageLevel <- function(newLevel = c("DISK_ONLY",
385385
"OFF_HEAP" = callJStatic(storageLevelClass, "OFF_HEAP"))
386386
}
387387

388+
storageLevelToString <- function(levelObj) {
389+
useDisk <- callJMethod(levelObj, "useDisk")
390+
useMemory <- callJMethod(levelObj, "useMemory")
391+
useOffHeap <- callJMethod(levelObj, "useOffHeap")
392+
deserialized <- callJMethod(levelObj, "deserialized")
393+
replication <- callJMethod(levelObj, "replication")
394+
shortName <- if (!useDisk && !useMemory && !useOffHeap && !deserialized && replication == 1) {
395+
"NONE"
396+
} else if (useDisk && !useMemory && !useOffHeap && !deserialized && replication == 1) {
397+
"DISK_ONLY"
398+
} else if (useDisk && !useMemory && !useOffHeap && !deserialized && replication == 2) {
399+
"DISK_ONLY_2"
400+
} else if (!useDisk && useMemory && !useOffHeap && deserialized && replication == 1) {
401+
"MEMORY_ONLY"
402+
} else if (!useDisk && useMemory && !useOffHeap && deserialized && replication == 2) {
403+
"MEMORY_ONLY_2"
404+
} else if (!useDisk && useMemory && !useOffHeap && !deserialized && replication == 1) {
405+
"MEMORY_ONLY_SER"
406+
} else if (!useDisk && useMemory && !useOffHeap && !deserialized && replication == 2) {
407+
"MEMORY_ONLY_SER_2"
408+
} else if (useDisk && useMemory && !useOffHeap && deserialized && replication == 1) {
409+
"MEMORY_AND_DISK"
410+
} else if (useDisk && useMemory && !useOffHeap && deserialized && replication == 2) {
411+
"MEMORY_AND_DISK_2"
412+
} else if (useDisk && useMemory && !useOffHeap && !deserialized && replication == 1) {
413+
"MEMORY_AND_DISK_SER"
414+
} else if (useDisk && useMemory && !useOffHeap && !deserialized && replication == 2) {
415+
"MEMORY_AND_DISK_SER_2"
416+
} else if (useDisk && useMemory && useOffHeap && !deserialized && replication == 1) {
417+
"OFF_HEAP"
418+
} else {
419+
NULL
420+
}
421+
fullInfo <- callJMethod(levelObj, "toString")
422+
if (is.null(shortName)) {
423+
fullInfo
424+
} else {
425+
paste(shortName, "-", fullInfo)
426+
}
427+
}
428+
388429
# Utility function for functions where an argument needs to be integer but we want to allow
389430
# the user to type (for example) `5` instead of `5L` to avoid a confusing error message.
390431
numToInt <- function(num) {

R/pkg/inst/tests/testthat/test_mllib.R

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,21 @@ test_that("spark.mlp", {
410410
model <- spark.mlp(df, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10)
411411
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
412412
expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 2, 1, 2, 2, 1, 0, 0, 1))
413+
414+
# test initialWeights
415+
model <- spark.mlp(df, layers = c(4, 3), maxIter = 2, initialWeights =
416+
c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
417+
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
418+
expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1))
419+
420+
model <- spark.mlp(df, layers = c(4, 3), maxIter = 2, initialWeights =
421+
c(0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 5.0, 5.0, 5.0, 5.0, 9.0, 9.0, 9.0, 9.0, 9.0))
422+
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
423+
expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1))
424+
425+
model <- spark.mlp(df, layers = c(4, 3), maxIter = 2)
426+
mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
427+
expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 0, 2, 1, 0, 0, 1))
413428
})
414429

415430
test_that("spark.naiveBayes", {

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -796,7 +796,7 @@ test_that("multiple pipeline transformations result in an RDD with the correct v
796796
expect_false(collectRDD(second)[[3]]$testCol)
797797
})
798798

799-
test_that("cache(), persist(), and unpersist() on a DataFrame", {
799+
test_that("cache(), storageLevel(), persist(), and unpersist() on a DataFrame", {
800800
df <- read.json(jsonPath)
801801
expect_false(df@env$isCached)
802802
cache(df)
@@ -808,6 +808,9 @@ test_that("cache(), persist(), and unpersist() on a DataFrame", {
808808
persist(df, "MEMORY_AND_DISK")
809809
expect_true(df@env$isCached)
810810

811+
expect_equal(storageLevel(df),
812+
"MEMORY_AND_DISK - StorageLevel(disk, memory, deserialized, 1 replicas)")
813+
811814
unpersist(df)
812815
expect_false(df@env$isCached)
813816

@@ -845,7 +848,7 @@ test_that("names() colnames() set the column names", {
845848
expect_equal(names(df)[1], "col3")
846849

847850
expect_error(colnames(df) <- c("sepal.length", "sepal_width"),
848-
"Colum names cannot contain the '.' symbol.")
851+
"Column names cannot contain the '.' symbol.")
849852
expect_error(colnames(df) <- c(1, 2), "Invalid column names.")
850853
expect_error(colnames(df) <- c("a"),
851854
"Column names must have the same length as the number of columns in the dataset.")

core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ object WriteInputFormatTestDataGenerator {
144144

145145
// Create test data for ArrayWritable
146146
val data = Seq(
147-
(1, Array()),
147+
(1, Array.empty[Double]),
148148
(2, Array(3.0, 4.0, 5.0)),
149149
(3, Array(4.0, 5.0, 6.0))
150150
)

0 commit comments

Comments
 (0)