diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index f46681149d5a..02380900dd63 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -19,11 +19,11 @@
 # Integration with R's standard functions.
 
 # Most of MLlib's argorithms are provided in two flavours:
-# - a specialization of the default R methods (glm). These methods try to respect
+# - a specialization of the default R methods (such as glm). These methods try to respect
 #   the inputs and the outputs of R's method to the largest extent, but some small differences
 #   may exist.
-# - a set of methods that reflect the arguments of the other languages supported by Spark. These
-#   methods are prefixed with the `spark.` prefix: spark.glm, spark.kmeans, etc.
+# - a set of methods that reflect the arguments of other languages supported by Spark. These
+#   methods are prefixed with `spark.`: spark.glm, spark.kmeans, etc.
 
 #' @title S4 class that represents a generalized linear model
 #' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper
@@ -35,7 +35,7 @@ setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj"))
 #' @export
 setClass("NaiveBayesModel", representation(jobj = "jobj"))
 
-#' @title S4 class that represents a AFTSurvivalRegressionModel
+#' @title S4 class that represents an AFTSurvivalRegressionModel
 #' @param jobj a Java object reference to the backing Scala AFTSurvivalRegressionWrapper
 #' @export
 setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj"))
@@ -45,7 +45,7 @@ setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj"))
 #' @export
 setClass("KMeansModel", representation(jobj = "jobj"))
 
-#' Fits a generalized linear model
+#' Fit a generalized linear model
 #'
 #' Fits a generalized linear model against a Spark DataFrame.
 #'
@@ -56,10 +56,11 @@ setClass("KMeansModel", representation(jobj = "jobj"))
 #'               This can be a character string naming a family function, a family function or
 #'               the result of a call to a family function. Refer R family at
 #'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
-#' @param epsilon Positive convergence tolerance of iterations.
-#' @param maxit Integer giving the maximal number of IRLS iterations.
+#' @param tol Positive convergence tolerance of iterations.
+#' @param maxIter Integer giving the maximal number of IRLS iterations.
 #' @return a fitted generalized linear model
 #' @rdname spark.glm
+#' @name spark.glm
 #' @export
 #' @examples
 #' \dontrun{
@@ -67,33 +68,31 @@ setClass("KMeansModel", representation(jobj = "jobj"))
 #' sqlContext <- sparkRSQL.init(sc)
 #' data(iris)
 #' df <- createDataFrame(sqlContext, iris)
-#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family="gaussian")
+#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
 #' summary(model)
 #' }
-setMethod(
-  "spark.glm",
-  signature(data = "SparkDataFrame", formula = "formula"),
-  function(data, formula, family = gaussian, epsilon = 1e-06, maxit = 25) {
-    if (is.character(family)) {
-      family <- get(family, mode = "function", envir = parent.frame())
-    }
-    if (is.function(family)) {
-      family <- family()
-    }
-    if (is.null(family$family)) {
-      print(family)
-      stop("'family' not recognized")
-    }
+setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
+          function(data, formula, family = gaussian, tol = 1e-06, maxIter = 25) {
+            if (is.character(family)) {
+              family <- get(family, mode = "function", envir = parent.frame())
+            }
+            if (is.function(family)) {
+              family <- family()
+            }
+            if (is.null(family$family)) {
+              print(family)
+              stop("'family' not recognized")
+            }
 
-    formula <- paste(deparse(formula), collapse = "")
+            formula <- paste(deparse(formula), collapse = "")
 
-    jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
-                        "fit", formula, data@sdf, family$family, family$link,
-                        epsilon, as.integer(maxit))
-    return(new("GeneralizedLinearRegressionModel", jobj = jobj))
-})
+            jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
+                                "fit", formula, data@sdf, family$family, family$link,
+                                tol, as.integer(maxIter))
+            return(new("GeneralizedLinearRegressionModel", jobj = jobj))
+          })
 
-#' Fits a generalized linear model (R-compliant).
+#' Fit a generalized linear model (R-compliant)
 #'
 #' Fits a generalized linear model, similarly to R's glm().
 #'
@@ -108,6 +107,7 @@ setMethod(
 #' @param maxit Integer giving the maximal number of IRLS iterations.
 #' @return a fitted generalized linear model
 #' @rdname glm
+#' @name glm
 #' @export
 #' @examples
 #' \dontrun{
@@ -115,7 +115,7 @@ setMethod(
 #' sqlContext <- sparkRSQL.init(sc)
 #' data(iris)
 #' df <- createDataFrame(sqlContext, iris)
-#' model <- glm(Sepal_Length ~ Sepal_Width, df, family="gaussian")
+#' model <- glm(Sepal_Length ~ Sepal_Width, df, family = "gaussian")
 #' summary(model)
 #' }
 setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"),
@@ -128,8 +128,13 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat
 #' Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().
 #'
 #' @param object A fitted generalized linear model
-#' @return coefficients the model's coefficients, intercept
+#' @return a list with 'deviance.resid', 'coefficients', 'dispersion', 'null.deviance', 'deviance',
+#'         'df.null', 'df.residual', 'aic', 'iter', 'family', 'is.loaded' components.
+#'         The 'coefficients' gives the estimated coefficients, their estimated standard errors,
+#'         t values and p-values. Note 'deviance.resid' in the summary of a saved-loaded model
+#'         is NULL.
 #' @rdname summary
+#' @name summary
 #' @export
 #' @examples
 #' \dontrun{
@@ -168,7 +173,7 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
 
 #' Print the summary of GeneralizedLinearRegressionModel
 #'
-#' @rdname print
+#' @rdname print.summary.GeneralizedLinearRegressionModel
 #' @name print.summary.GeneralizedLinearRegressionModel
 #' @export
 print.summary.GeneralizedLinearRegressionModel <- function(x, ...) {
@@ -195,9 +200,9 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) {
       "Number of Fisher Scoring iterations: ", x$iter, "\n", sep = "")
   cat("\n")
   invisible(x)
-  }
+}
 
-#' Make predictions from a generalized linear model
+#' Make predictions from a fitted MLlib model
 #'
 #' Makes predictions from a generalized linear model produced by glm() or spark.glm(),
 #' similarly to R's predict().
@@ -206,6 +211,7 @@ print.summary.GeneralizedLinearRegressionModel <- function(x, ...) {
 #' @param newData SparkDataFrame for testing
 #' @return SparkDataFrame containing predicted labels in a column named "prediction"
 #' @rdname predict
+#' @name predict
 #' @export
 #' @examples
 #' \dontrun{
@@ -218,29 +224,102 @@ setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"),
             return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
           })
 
-#' Make predictions from a naive Bayes model
+#' Fit an accelerated failure time (AFT) survival regression model
 #'
-#' Makes predictions from a model produced by spark.naiveBayes(),
-#' similarly to R package e1071's predict.
+#' Fits an accelerated failure time (AFT) survival regression model on a Spark DataFrame.
 #'
-#' @param object A fitted naive Bayes model
+#' @param data SparkDataFrame for training.
+#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
+#'                operators are supported, including '~', ':', '+', and '-'.
+#'                Note that operator '.' is not supported currently.
+#' @return a fitted AFT survival regression model
+#' @rdname spark.survreg
+#' @name spark.survreg
+#' @seealso survival: \url{https://cran.r-project.org/web/packages/survival/}
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- createDataFrame(sqlContext, ovarian)
+#' model <- spark.survreg(df, Surv(futime, fustat) ~ ecog_ps + rx)
+#' }
+setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = "formula"),
+          function(data, formula, ...) {
+            formula <- paste(deparse(formula), collapse = "")
+            jobj <- callJStatic("org.apache.spark.ml.r.AFTSurvivalRegressionWrapper",
+                                "fit", formula, data@sdf)
+            return(new("AFTSurvivalRegressionModel", jobj = jobj))
+          })
+
+#' Returns the summary of an AFT survival regression model produced by spark.survreg(),
+#' similarly to R's summary().
+#'
+#' @param object a fitted AFT survival regression model
+#' @return coefficients the model's coefficients, intercept and log(scale).
+#' @rdname summary
+#' @name summary
+#' @export
+#' @examples
+#' \dontrun{
+#' model <- spark.survreg(trainingData, Surv(futime, fustat) ~ ecog_ps + rx)
+#' summary(model)
+#' }
+setMethod("summary", signature(object = "AFTSurvivalRegressionModel"),
+          function(object, ...) {
+            jobj <- object@jobj
+            features <- callJMethod(jobj, "rFeatures")
+            coefficients <- callJMethod(jobj, "rCoefficients")
+            coefficients <- as.matrix(unlist(coefficients))
+            colnames(coefficients) <- c("Value")
+            rownames(coefficients) <- unlist(features)
+            return(list(coefficients = coefficients))
+          })
+
+#' Makes predictions from a model produced by spark.survreg(),
+#' similarly to R package survival's predict.
+#'
+#' @param object A fitted AFT survival regression model
 #' @param newData SparkDataFrame for testing
 #' @return SparkDataFrame containing predicted labels in a column named "prediction"
 #' @rdname predict
+#' @name predict
 #' @export
 #' @examples
 #' \dontrun{
-#' model <- spark.naiveBayes(trainingData, y ~ x)
+#' model <- spark.survreg(trainingData, Surv(futime, fustat) ~ ecog_ps + rx)
 #' predicted <- predict(model, testData)
 #' showDF(predicted)
-#'}
-setMethod("predict", signature(object = "NaiveBayesModel"),
+#' }
+setMethod("predict", signature(object = "AFTSurvivalRegressionModel"),
           function(object, newData) {
             return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
           })
 
-#' Get the summary of a naive Bayes model
+#' Fit a Bernoulli naive Bayes model
 #'
+#' Fits a Bernoulli naive Bayes model on a Spark DataFrame (only categorical data is supported).
+#'
+#' @param data SparkDataFrame for training
+#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
+#'                operators are supported, including '~', '.', ':', '+', and '-'.
+#' @param smoothing Smoothing parameter
+#' @return a fitted naive Bayes model
+#' @rdname spark.naiveBayes
+#' @name spark.naiveBayes
+#' @seealso e1071: \url{https://cran.r-project.org/web/packages/e1071/}
+#' @export
+#' @examples
#' \dontrun{
+#' df <- createDataFrame(sqlContext, infert)
+#' model <- spark.naiveBayes(df, education ~ ., smoothing = 0)
+#' }
+setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "formula"),
+          function(data, formula, smoothing = 1.0, ...) {
+            formula <- paste(deparse(formula), collapse = "")
+            jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit",
+                                formula, data@sdf, smoothing)
+            return(new("NaiveBayesModel", jobj = jobj))
+          })
+
 #' Returns the summary of a naive Bayes model produced by spark.naiveBayes(),
 #' similarly to R's summary().
 #'
@@ -248,12 +327,13 @@ setMethod("predict", signature(object = "NaiveBayesModel"),
 #' @return a list containing 'apriori', the label distribution, and 'tables', conditional
 #          probabilities given the target label
 #' @rdname summary
+#' @name summary
 #' @export
 #' @examples
 #' \dontrun{
 #' model <- spark.naiveBayes(trainingData, y ~ x)
 #' summary(model)
-#'}
+#' }
 setMethod("summary", signature(object = "NaiveBayesModel"),
           function(object, ...) {
             jobj <- object@jobj
@@ -269,9 +349,29 @@ setMethod("summary", signature(object = "NaiveBayesModel"),
             return(list(apriori = apriori, tables = tables))
           })
 
+#' Makes predictions from a model produced by spark.naiveBayes(),
+#' similarly to R package e1071's predict.
+#'
+#' @param object A fitted naive Bayes model
+#' @param newData SparkDataFrame for testing
+#' @return SparkDataFrame containing predicted labels in a column named "prediction"
+#' @rdname predict
+#' @name predict
+#' @export
+#' @examples
+#' \dontrun{
+#' model <- spark.naiveBayes(trainingData, y ~ x)
+#' predicted <- predict(model, testData)
+#' showDF(predicted)
+#'}
+setMethod("predict", signature(object = "NaiveBayesModel"),
+          function(object, newData) {
+            return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
+          })
+
 #' Fit a k-means model
 #'
-#' Fit a k-means model, similarly to R's kmeans().
+#' Fits a k-means model, similarly to R's kmeans().
 #'
 #' @param data SparkDataFrame for training
 #' @param formula A symbolic description of the model to be fitted. Currently only a few formula
@@ -282,13 +382,14 @@ setMethod("summary", signature(object = "NaiveBayesModel"),
 #' @param initMode The initialization algorithm choosen to fit the model
 #' @return A fitted k-means model
 #' @rdname spark.kmeans
+#' @name spark.kmeans
 #' @export
 #' @examples
 #' \dontrun{
-#' model <- spark.kmeans(data, ~ ., k=2, initMode="random")
+#' model <- spark.kmeans(data, ~ ., k = 2, initMode = "random")
 #' }
 setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = "formula"),
-          function(data, formula, k, maxIter = 10, initMode = c("random", "k-means||")) {
+          function(data, formula, k = 2, maxIter = 20, initMode = c("random", "k-means||")) {
             formula <- paste(deparse(formula), collapse = "")
             initMode <- match.arg(initMode)
             jobj <- callJStatic("org.apache.spark.ml.r.KMeansWrapper", "fit", data@sdf, formula,
@@ -298,19 +399,20 @@ setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = "formula"
 
 #' Get fitted result from a k-means model
 #'
-#' Get fitted result from a k-means model, similarly to R's fitted().
+#' Gets fitted result from a k-means model, similarly to R's fitted().
 #' Note: A saved-loaded model does not support this method.
 #'
 #' @param object A fitted k-means model
 #' @return SparkDataFrame containing fitted values
 #' @rdname fitted
+#' @name fitted
 #' @export
 #' @examples
 #' \dontrun{
-#' model <- spark.kmeans(trainingData, ~ ., 2)
+#' model <- spark.kmeans(trainingData, ~ ., k = 2)
 #' fitted.model <- fitted(model)
 #' showDF(fitted.model)
-#'}
+#' }
 setMethod("fitted", signature(object = "KMeansModel"),
           function(object, method = c("centers", "classes"), ...) {
             method <- match.arg(method)
@@ -323,18 +425,18 @@ setMethod("fitted", signature(object = "KMeansModel"),
             }
           })
 
-#' Get the summary of a k-means model
-#'
 #' Returns the summary of a k-means model produced by spark.kmeans(),
 #' similarly to R's summary().
 #'
 #' @param object a fitted k-means model
-#' @return the model's coefficients, size and cluster
+#' @return the model's 'coefficients', 'size', 'cluster' and 'is.loaded'.
+#'         Note 'cluster' in the summary of a saved-loaded model is NULL.
 #' @rdname summary
+#' @name summary
 #' @export
 #' @examples
 #' \dontrun{
-#' model <- spark.kmeans(trainingData, ~ ., 2)
+#' model <- spark.kmeans(trainingData, ~ ., k = 2)
 #' summary(model)
 #' }
 setMethod("summary", signature(object = "KMeansModel"),
@@ -357,14 +459,13 @@ setMethod("summary", signature(object = "KMeansModel"),
                         cluster = cluster, is.loaded = is.loaded))
           })
 
-#' Make predictions from a k-means model
-#'
-#' Make predictions from a model produced by spark.kmeans().
+#' Makes predictions from a model produced by spark.kmeans().
 #'
 #' @param object A fitted k-means model
 #' @param newData SparkDataFrame for testing
 #' @return SparkDataFrame containing predicted labels in a column named "prediction"
 #' @rdname predict
+#' @name predict
 #' @export
 #' @examples
 #' \dontrun{
@@ -377,32 +478,9 @@ setMethod("predict", signature(object = "KMeansModel"),
             return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
           })
 
-#' Fit a Bernoulli naive Bayes model
-#'
-#' Fit a Bernoulli naive Bayes model on a Spark DataFrame (only categorical data is supported).
+#' Save a fitted MLlib model to the input path
 #'
-#' @param data SparkDataFrame for training
-#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
-#'               operators are supported, including '~', '.', ':', '+', and '-'.
-#' @param laplace Smoothing parameter
-#' @return a fitted naive Bayes model
-#' @rdname spark.naiveBayes
-#' @seealso e1071: \url{https://cran.r-project.org/web/packages/e1071/}
-#' @export
-#' @examples
-#' \dontrun{
-#' df <- createDataFrame(sqlContext, infert)
-#' model <- spark.naiveBayes(df, education ~ ., laplace = 0)
-#'}
-setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "formula"),
-          function(data, formula, laplace = 0, ...) {
-            formula <- paste(deparse(formula), collapse = "")
-            jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit",
-                                formula, data@sdf, laplace)
-            return(new("NaiveBayesModel", jobj = jobj))
-          })
-
-#' Save the Bernoulli naive Bayes model to the input path.
+#' Saves the Bernoulli naive Bayes model to the input path.
 #'
 #' @param object A fitted Bernoulli naive Bayes model
 #' @param path The directory where the model is saved
@@ -415,7 +493,7 @@ setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "form
 #' @examples
 #' \dontrun{
 #' df <- createDataFrame(sqlContext, infert)
-#' model <- spark.naiveBayes(df, education ~ ., laplace = 0)
+#' model <- spark.naiveBayes(df, education ~ ., smoothing = 0)
 #' path <- "path/to/model"
 #' write.ml(model, path)
 #' }
@@ -428,7 +506,7 @@ setMethod("write.ml", signature(object = "NaiveBayesModel", path = "character"),
             invisible(callJMethod(writer, "save", path))
           })
 
-#' Save the AFT survival regression model to the input path.
+#' Saves the AFT survival regression model to the input path.
 #'
 #' @param object A fitted AFT survival regression model
 #' @param path The directory where the model is saved
@@ -453,7 +531,7 @@ setMethod("write.ml", signature(object = "AFTSurvivalRegressionModel", path = "c
             invisible(callJMethod(writer, "save", path))
           })
 
-#' Save the generalized linear model to the input path.
+#' Saves the generalized linear model to the input path.
 #'
 #' @param object A fitted generalized linear model
 #' @param path The directory where the model is saved
@@ -478,7 +556,7 @@ setMethod("write.ml", signature(object = "GeneralizedLinearRegressionModel", pat
             invisible(callJMethod(writer, "save", path))
           })
 
-#' Save the k-means model to the input path.
+#' Saves the k-means model to the input path.
 #'
 #' @param object A fitted k-means model
 #' @param path The directory where the model is saved
@@ -503,7 +581,7 @@ setMethod("write.ml", signature(object = "KMeansModel", path = "character"),
             invisible(callJMethod(writer, "save", path))
          })
 
-#' Load a fitted MLlib model from the input path.
+#' Load a fitted MLlib model from the input path
 #'
 #' @param path Path of the model to read.
 #' @return a fitted MLlib model
@@ -530,75 +608,3 @@ read.ml <- function(path) {
     stop(paste("Unsupported model: ", jobj))
   }
 }
-
-#' Fit an accelerated failure time (AFT) survival regression model.
-#'
-#' Fit an accelerated failure time (AFT) survival regression model on a Spark DataFrame.
-#'
-#' @param data SparkDataFrame for training.
-#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
-#'                operators are supported, including '~', ':', '+', and '-'.
-#'                Note that operator '.' is not supported currently.
-#' @return a fitted AFT survival regression model
-#' @rdname spark.survreg
-#' @seealso survival: \url{https://cran.r-project.org/web/packages/survival/}
-#' @export
-#' @examples
-#' \dontrun{
-#' df <- createDataFrame(sqlContext, ovarian)
-#' model <- spark.survreg(df, Surv(futime, fustat) ~ ecog_ps + rx)
-#' }
-setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = "formula"),
-          function(data, formula, ...) {
-            formula <- paste(deparse(formula), collapse = "")
-            jobj <- callJStatic("org.apache.spark.ml.r.AFTSurvivalRegressionWrapper",
-                                "fit", formula, data@sdf)
-            return(new("AFTSurvivalRegressionModel", jobj = jobj))
-          })
-
-
-#' Get the summary of an AFT survival regression model
-#'
-#' Returns the summary of an AFT survival regression model produced by spark.survreg(),
-#' similarly to R's summary().
-#'
-#' @param object a fitted AFT survival regression model
-#' @return coefficients the model's coefficients, intercept and log(scale).
-#' @rdname summary
-#' @export
-#' @examples
-#' \dontrun{
-#' model <- spark.survreg(trainingData, Surv(futime, fustat) ~ ecog_ps + rx)
-#' summary(model)
-#' }
-setMethod("summary", signature(object = "AFTSurvivalRegressionModel"),
-          function(object, ...) {
-            jobj <- object@jobj
-            features <- callJMethod(jobj, "rFeatures")
-            coefficients <- callJMethod(jobj, "rCoefficients")
-            coefficients <- as.matrix(unlist(coefficients))
-            colnames(coefficients) <- c("Value")
-            rownames(coefficients) <- unlist(features)
-            return(list(coefficients = coefficients))
-          })
-
-#' Make predictions from an AFT survival regression model
-#'
-#' Make predictions from a model produced by spark.survreg(),
-#' similarly to R package survival's predict.
-#'
-#' @param object A fitted AFT survival regression model
-#' @param newData SparkDataFrame for testing
-#' @return SparkDataFrame containing predicted labels in a column named "prediction"
-#' @rdname predict
-#' @export
-#' @examples
-#' \dontrun{
-#' model <- spark.survreg(trainingData, Surv(futime, fustat) ~ ecog_ps + rx)
-#' predicted <- predict(model, testData)
-#' showDF(predicted)
-#' }
-setMethod("predict", signature(object = "AFTSurvivalRegressionModel"),
-          function(object, newData) {
-            return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
-          })
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 5f8a27d4e094..ce04de9f0907 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -28,7 +28,7 @@ sqlContext <- sparkRSQL.init(sc)
 test_that("formula of spark.glm", {
   training <- suppressWarnings(createDataFrame(sqlContext, iris))
   # directly calling the spark API
-  # dot minus and intercept vs native glm
+  # dot, minus and intercept vs native glm
   model <- spark.glm(training, Sepal_Width ~ . - Species + 0)
   vals <- collect(select(predict(model, training), "prediction"))
   rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
@@ -40,7 +40,7 @@ test_that("formula of spark.glm", {
   rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
-  # glm should work with long formula
+  # spark.glm should work with long formula
   training <- suppressWarnings(createDataFrame(sqlContext, iris))
   training$LongLongLongLongLongName <- training$Sepal_Width
   training$VeryLongLongLongLonLongName <- training$Sepal_Length
@@ -69,7 +69,7 @@ test_that("spark.glm and predict", {
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
   rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
-                            data = iris, family = poisson(link = identity)), iris))
+                                        data = iris, family = poisson(link = identity)), iris))
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
   # Test stats::predict is working
@@ -120,11 +120,6 @@ test_that("spark.glm summary", {
   expect_equal(stats$df.null, rStats$df.null)
   expect_equal(stats$df.residual, rStats$df.residual)
   expect_equal(stats$aic, rStats$aic)
-
-  # Test summary works on base GLM models
-  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
-  baseSummary <- summary(baseModel)
-  expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
 })
 
 test_that("spark.glm save/load", {
@@ -154,66 +149,21 @@ test_that("spark.glm save/load", {
   unlink(modelPath)
 })
 
-
-
-test_that("formula of glm", {
-  training <- suppressWarnings(createDataFrame(sqlContext, iris))
-  # dot minus and intercept vs native glm
-  model <- glm(Sepal_Width ~ . - Species + 0, data = training)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
-  # feature interaction vs native glm
-  model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
-  # glm should work with long formula
-  training <- suppressWarnings(createDataFrame(sqlContext, iris))
-  training$LongLongLongLongLongName <- training$Sepal_Width
-  training$VeryLongLongLongLonLongName <- training$Sepal_Length
-  training$AnotherLongLongLongLongName <- training$Species
-  model <- glm(LongLongLongLongLongName ~ VeryLongLongLongLonLongName + AnotherLongLongLongLongName,
-               data = training)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
-
-test_that("glm and predict", {
+test_that("glm, predict and summary", {
+  # Test R-compliant glm() which is another entrance of spark.glm().
   training <- suppressWarnings(createDataFrame(sqlContext, iris))
-  # gaussian family
   model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
-  prediction <- predict(model, training)
-  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
-  vals <- collect(select(prediction, "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
-  # poisson family
-  model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
-               family = poisson(link = identity))
+  rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
+
+  prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
-  rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
-                            data = iris, family = poisson(link = identity)), iris))
+  rVals <- predict(rModel, iris)
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
-  # Test stats::predict is working
-  x <- rnorm(15)
-  y <- x + rnorm(15)
-  expect_equal(length(predict(lm(y ~ x))), 15)
-})
-
-test_that("glm summary", {
-  # gaussian family
-  training <- suppressWarnings(createDataFrame(sqlContext, iris))
-  stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
-
-  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+  stats <- summary(model)
+  rStats <- summary(rModel)
 
   coefs <- unlist(stats$coefficients)
   rCoefs <- unlist(rStats$coefficients)
@@ -228,62 +178,12 @@
   expect_equal(stats$df.residual, rStats$df.residual)
   expect_equal(stats$aic, rStats$aic)
 
-  # binomial family
-  df <- suppressWarnings(createDataFrame(sqlContext, iris))
-  training <- df[df$Species %in% c("versicolor", "virginica"), ]
-  stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
-                       family = binomial(link = "logit")))
-
-  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
-  rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
-                        family = binomial(link = "logit")))
-
-  coefs <- unlist(stats$coefficients)
-  rCoefs <- unlist(rStats$coefficients)
-  expect_true(all(abs(rCoefs - coefs) < 1e-4))
-  expect_true(all(
-    rownames(stats$coefficients) ==
-    c("(Intercept)", "Sepal_Length", "Sepal_Width")))
-  expect_equal(stats$dispersion, rStats$dispersion)
-  expect_equal(stats$null.deviance, rStats$null.deviance)
-  expect_equal(stats$deviance, rStats$deviance)
-  expect_equal(stats$df.null, rStats$df.null)
-  expect_equal(stats$df.residual, rStats$df.residual)
-  expect_equal(stats$aic, rStats$aic)
-
   # Test summary works on base GLM models
   baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
   baseSummary <- summary(baseModel)
   expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
 })
 
-test_that("glm save/load", {
-  training <- suppressWarnings(createDataFrame(sqlContext, iris))
-  m <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
-  s <- summary(m)
-
-  modelPath <- tempfile(pattern = "glm", fileext = ".tmp")
-  write.ml(m, modelPath)
-  expect_error(write.ml(m, modelPath))
-  write.ml(m, modelPath, overwrite = TRUE)
-  m2 <- read.ml(modelPath)
-  s2 <- summary(m2)
-
-  expect_equal(s$coefficients, s2$coefficients)
-  expect_equal(rownames(s$coefficients), rownames(s2$coefficients))
-  expect_equal(s$dispersion, s2$dispersion)
-  expect_equal(s$null.deviance, s2$null.deviance)
-  expect_equal(s$deviance, s2$deviance)
-  expect_equal(s$df.null, s2$df.null)
-  expect_equal(s$df.residual, s2$df.residual)
-  expect_equal(s$aic, s2$aic)
-  expect_equal(s$iter, s2$iter)
-  expect_true(!s$is.loaded)
-  expect_true(s2$is.loaded)
-
-  unlink(modelPath)
-})
-
 test_that("spark.kmeans", {
   newIris <- iris
   newIris$Species <- NULL
@@ -366,7 +266,7 @@ test_that("spark.naiveBayes", {
   t <- as.data.frame(Titanic)
   t1 <- t[t$Freq > 0, -5]
   df <- suppressWarnings(createDataFrame(sqlContext, t1))
-  m <- spark.naiveBayes(df, Survived ~ .)
+  m <- spark.naiveBayes(df, Survived ~ ., smoothing = 0.0)
   s <- summary(m)
   expect_equal(as.double(s$apriori[1, "Yes"]), 0.5833333, tolerance = 1e-6)
   expect_equal(sum(s$apriori), 1)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
index 9618a3423e9a..5642abc6450f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -67,8 +67,8 @@ private[r] object GeneralizedLinearRegressionWrapper
       data: DataFrame,
       family: String,
       link: String,
-      epsilon: Double,
-      maxit: Int): GeneralizedLinearRegressionWrapper = {
+      tol: Double,
+      maxIter: Int): GeneralizedLinearRegressionWrapper = {
     val rFormula = new RFormula()
       .setFormula(formula)
     val rFormulaModel = rFormula.fit(data)
@@ -82,8 +82,8 @@ private[r] object GeneralizedLinearRegressionWrapper
       .setFamily(family)
       .setLink(link)
       .setFitIntercept(rFormula.hasIntercept)
-      .setTol(epsilon)
-      .setMaxIter(maxit)
+      .setTol(tol)
+      .setMaxIter(maxIter)
     val pipeline = new Pipeline()
       .setStages(Array(rFormulaModel, glr))
       .fit(data)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala
index 28925c79da66..1dac246b0332 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/NaiveBayesWrapper.scala
@@ -56,7 +56,7 @@ private[r] object NaiveBayesWrapper extends MLReadable[NaiveBayesWrapper] {
   val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"
   val PREDICTED_LABEL_COL = "prediction"
 
-  def fit(formula: String, data: DataFrame, laplace: Double): NaiveBayesWrapper = {
+  def fit(formula: String, data: DataFrame, smoothing: Double): NaiveBayesWrapper = {
     val rFormula = new RFormula()
       .setFormula(formula)
       .fit(data)
@@ -70,7 +70,7 @@ private[r] object NaiveBayesWrapper extends MLReadable[NaiveBayesWrapper] {
     val features = featureAttrs.map(_.name.get)
     // assemble and fit the pipeline
     val naiveBayes = new NaiveBayes()
-      .setSmoothing(laplace)
+      .setSmoothing(smoothing)
       .setModelType("bernoulli")
       .setPredictionCol(PREDICTED_LABEL_INDEX_COL)
     val idxToStr = new IndexToString()
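
Reviewer note (not part of the patch): a minimal usage sketch of the renamed arguments, assuming an initialized SparkR session where `sc` and `sqlContext` already exist, as in the roxygen examples above. Column names follow createDataFrame's dot-to-underscore conversion (Sepal.Length becomes Sepal_Length).

# spark.glm: `tol` and `maxIter` replace the old `epsilon` and `maxit`;
# the values below just spell out the documented defaults.
df <- createDataFrame(sqlContext, iris)
glmModel <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian",
                      tol = 1e-06, maxIter = 25)
summary(glmModel)

# spark.naiveBayes: `smoothing` replaces the old `laplace`, and its default
# changes from 0 to 1.0 with this patch.
nbDf <- createDataFrame(sqlContext, infert)
nbModel <- spark.naiveBayes(nbDf, education ~ ., smoothing = 1.0)
head(predict(nbModel, nbDf))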
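
A similar sketch for the relocated spark.survreg() block, assuming the survival package is attached so that the ovarian data set is available on the driver (its ecog.ps column becomes ecog_ps after conversion):

# spark.survreg() deparses the formula, including the Surv() left-hand side,
# and passes it to AFTSurvivalRegressionWrapper; summary() returns the
# coefficients, intercept and log(scale), per the roxygen docs above.
library(survival)
ovarianDf <- createDataFrame(sqlContext, ovarian)
aftModel <- spark.survreg(ovarianDf, Surv(futime, fustat) ~ ecog_ps + rx)
summary(aftModel)
showDF(predict(aftModel, ovarianDf))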
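
And one for the new spark.kmeans() defaults plus the write.ml()/read.ml() round trip; the path and variable names are illustrative only:

# spark.kmeans() now defaults to k = 2 and maxIter = 20 (previously maxIter
# was 10 and k had no default); initMode is validated with match.arg().
kmDf <- createDataFrame(sqlContext, iris[, 1:4])
kmModel <- spark.kmeans(kmDf, ~ ., k = 2, initMode = "random")
summary(kmModel)

# Round trip through write.ml()/read.ml(); per the updated docs, 'cluster' in
# the summary of a saved-loaded k-means model is NULL and is.loaded is TRUE.
modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
write.ml(kmModel, modelPath)
loaded <- read.ml(modelPath)
summary(loaded)
unlink(modelPath)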