Skip to content

Commit 39a4c4c

Browse files
committed
make SparkR model params and default values consistent with MLlib
1 parent 58f6e27 commit 39a4c4c

File tree

4 files changed

+43
-45
lines changed

4 files changed

+43
-45
lines changed

R/pkg/R/mllib.R

Lines changed: 36 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ setClass("KMeansModel", representation(jobj = "jobj"))
6464
#' This can be a character string naming a family function, a family function or
6565
#' the result of a call to a family function. Refer to the R documentation of family functions at
6666
#' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
67-
#' @param epsilon Positive convergence tolerance of iterations.
68-
#' @param maxit Integer giving the maximal number of IRLS iterations.
67+
#' @param tol Positive convergence tolerance of iterations.
68+
#' @param maxIter Integer giving the maximal number of IRLS iterations.
6969
#' @return a fitted generalized linear model
7070
#' @rdname spark.glm
7171
#' @export
@@ -74,32 +74,30 @@ setClass("KMeansModel", representation(jobj = "jobj"))
7474
#' sparkR.session()
7575
#' data(iris)
7676
#' df <- createDataFrame(iris)
77-
#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family="gaussian")
77+
#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
7878
#' summary(model)
7979
#' }
8080
#' @note spark.glm since 2.0.0
81-
setMethod(
82-
"spark.glm",
83-
signature(data = "SparkDataFrame", formula = "formula"),
84-
function(data, formula, family = gaussian, epsilon = 1e-06, maxit = 25) {
85-
if (is.character(family)) {
86-
family <- get(family, mode = "function", envir = parent.frame())
87-
}
88-
if (is.function(family)) {
89-
family <- family()
90-
}
91-
if (is.null(family$family)) {
92-
print(family)
93-
stop("'family' not recognized")
94-
}
81+
setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
82+
function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25) {
83+
if (is.character(family)) {
84+
family <- get(family, mode = "function", envir = parent.frame())
85+
}
86+
if (is.function(family)) {
87+
family <- family()
88+
}
89+
if (is.null(family$family)) {
90+
print(family)
91+
stop("'family' not recognized")
92+
}
9593

96-
formula <- paste(deparse(formula), collapse = "")
94+
formula <- paste(deparse(formula), collapse = "")
9795

98-
jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
99-
"fit", formula, data@sdf, family$family, family$link,
100-
epsilon, as.integer(maxit))
101-
return(new("GeneralizedLinearRegressionModel", jobj = jobj))
102-
})
96+
jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
97+
"fit", formula, data@sdf, family$family, family$link,
98+
tol, as.integer(maxIter))
99+
return(new("GeneralizedLinearRegressionModel", jobj = jobj))
100+
})
103101

104102
#' Fits a generalized linear model (R-compliant).
105103
#'
@@ -122,13 +120,13 @@ setMethod(
122120
#' sparkR.session()
123121
#' data(iris)
124122
#' df <- createDataFrame(iris)
125-
#' model <- glm(Sepal_Length ~ Sepal_Width, df, family="gaussian")
123+
#' model <- glm(Sepal_Length ~ Sepal_Width, df, family = "gaussian")
126124
#' summary(model)
127125
#' }
128126
#' @note glm since 1.5.0
129127
setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"),
130-
function(formula, family = gaussian, data, epsilon = 1e-06, maxit = 25) {
131-
spark.glm(data, formula, family, epsilon, maxit)
128+
function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25) {
129+
spark.glm(data, formula, family, tol = epsilon, maxIter = maxit)
132130
})
133131

134132
#' Get the summary of a generalized linear model
@@ -298,17 +296,17 @@ setMethod("summary", signature(object = "NaiveBayesModel"),
298296
#' @export
299297
#' @examples
300298
#' \dontrun{
301-
#' model <- spark.kmeans(data, ~ ., k=2, initMode="random")
299+
#' model <- spark.kmeans(data, ~ ., k = 4, initMode = "random")
302300
#' }
303301
#' @note spark.kmeans since 2.0.0
304302
setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = "formula"),
305-
function(data, formula, k, maxIter = 10, initMode = c("random", "k-means||")) {
303+
function(data, formula, k = 2, maxIter = 20, initMode = c("k-means||", "random")) {
306304
formula <- paste(deparse(formula), collapse = "")
307305
initMode <- match.arg(initMode)
308306
jobj <- callJStatic("org.apache.spark.ml.r.KMeansWrapper", "fit", data@sdf, formula,
309307
as.integer(k), as.integer(maxIter), initMode)
310308
return(new("KMeansModel", jobj = jobj))
311-
})
309+
})
312310

313311
#' Get fitted result from a k-means model
314312
#'
@@ -401,24 +399,24 @@ setMethod("predict", signature(object = "KMeansModel"),
401399
#' @param data SparkDataFrame for training
402400
#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
403401
#' operators are supported, including '~', '.', ':', '+', and '-'.
404-
#' @param laplace Smoothing parameter
402+
#' @param smoothing Smoothing parameter
405403
#' @return a fitted naive Bayes model
406404
#' @rdname spark.naiveBayes
407405
#' @seealso e1071: \url{https://cran.r-project.org/web/packages/e1071/}
408406
#' @export
409407
#' @examples
410408
#' \dontrun{
411409
#' df <- createDataFrame(infert)
412-
#' model <- spark.naiveBayes(df, education ~ ., laplace = 0)
410+
#' model <- spark.naiveBayes(df, education ~ ., smoothing = 0)
413411
#'}
414412
#' @note spark.naiveBayes since 2.0.0
415413
setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "formula"),
416-
function(data, formula, laplace = 0, ...) {
417-
formula <- paste(deparse(formula), collapse = "")
418-
jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit",
419-
formula, data@sdf, laplace)
420-
return(new("NaiveBayesModel", jobj = jobj))
421-
})
414+
function(data, formula, smoothing = 1.0, ...) {
415+
formula <- paste(deparse(formula), collapse = "")
416+
jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit",
417+
formula, data@sdf, smoothing)
418+
return(new("NaiveBayesModel", jobj = jobj))
419+
})
422420

423421
#' Save fitted MLlib model to the input path
424422
#'
@@ -435,7 +433,7 @@ setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "form
435433
#' @examples
436434
#' \dontrun{
437435
#' df <- createDataFrame(infert)
438-
#' model <- spark.naiveBayes(df, education ~ ., laplace = 0)
436+
#' model <- spark.naiveBayes(df, education ~ ., smoothing = 0)
439437
#' path <- "path/to/model"
440438
#' write.ml(model, path)
441439
#' }

R/pkg/inst/tests/testthat/test_mllib.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,7 @@ test_that("spark.naiveBayes", {
363363
t <- as.data.frame(Titanic)
364364
t1 <- t[t$Freq > 0, -5]
365365
df <- suppressWarnings(createDataFrame(t1))
366-
m <- spark.naiveBayes(df, Survived ~ .)
366+
m <- spark.naiveBayes(df, Survived ~ ., smoothing = 0.0)
367367
s <- summary(m)
368368
expect_equal(as.double(s$apriori[1, "Yes"]), 0.5833333, tolerance = 1e-6)
369369
expect_equal(sum(s$apriori), 1)

mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ private[r] object GeneralizedLinearRegressionWrapper
6767
data: DataFrame,
6868
family: String,
6969
link: String,
70-
epsilon: Double,
71-
maxit: Int): GeneralizedLinearRegressionWrapper = {
70+
tol: Double,
71+
maxIter: Int): GeneralizedLinearRegressionWrapper = {
7272
val rFormula = new RFormula()
7373
.setFormula(formula)
7474
val rFormulaModel = rFormula.fit(data)
@@ -82,8 +82,8 @@ private[r] object GeneralizedLinearRegressionWrapper
8282
.setFamily(family)
8383
.setLink(link)
8484
.setFitIntercept(rFormula.hasIntercept)
85-
.setTol(epsilon)
86-
.setMaxIter(maxit)
85+
.setTol(tol)
86+
.setMaxIter(maxIter)
8787
val pipeline = new Pipeline()
8888
.setStages(Array(rFormulaModel, glr))
8989
.fit(data)

mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ private[r] object NaiveBayesWrapper extends MLReadable[NaiveBayesWrapper] {
5656
val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"
5757
val PREDICTED_LABEL_COL = "prediction"
5858

59-
def fit(formula: String, data: DataFrame, laplace: Double): NaiveBayesWrapper = {
59+
def fit(formula: String, data: DataFrame, smoothing: Double): NaiveBayesWrapper = {
6060
val rFormula = new RFormula()
6161
.setFormula(formula)
6262
.fit(data)
@@ -70,7 +70,7 @@ private[r] object NaiveBayesWrapper extends MLReadable[NaiveBayesWrapper] {
7070
val features = featureAttrs.map(_.name.get)
7171
// assemble and fit the pipeline
7272
val naiveBayes = new NaiveBayes()
73-
.setSmoothing(laplace)
73+
.setSmoothing(smoothing)
7474
.setModelType("bernoulli")
7575
.setPredictionCol(PREDICTED_LABEL_INDEX_COL)
7676
val idxToStr = new IndexToString()

0 commit comments

Comments (0)