Commit 90b59d1
[SPARK-18686][SPARKR][ML] Several cleanup and improvements for spark.logit.
## What changes were proposed in this pull request?

Several cleanups and improvements for ```spark.logit```:
* ```summary``` should return the coefficients matrix, and should output labels for each class if the model is a multinomial logistic regression model.
* ```summary``` should not return ```areaUnderROC, roc, pr, ...```, since most of them are DataFrames that are less important for R users. Moreover, these metrics ignore instance weights (setting all to 1.0), which will be changed in a later Spark version. Since that could introduce breaking changes, we do not expose them currently.
* SparkR test improvement: compare the training result with native R glmnet.
* Remove the argument ```aggregationDepth``` from ```spark.logit```, since it is an expert Param (related to Spark architecture and job execution) that would rarely be used by R users.

## How was this patch tested?

Unit tests.

The ```summary``` output after this change:

multinomial logistic regression:
```
> df <- suppressWarnings(createDataFrame(iris))
> model <- spark.logit(df, Species ~ ., regParam = 0.5)
> summary(model)
$coefficients
             versicolor   virginica    setosa
(Intercept)  1.514031     -2.609108    1.095077
Sepal_Length 0.02511006   0.2649821    -0.2900921
Sepal_Width  -0.5291215   -0.02016446  0.549286
Petal_Length 0.03647411   0.1544119    -0.190886
Petal_Width  0.000236092  0.4195804    -0.4198165
```
binomial logistic regression:
```
> df <- suppressWarnings(createDataFrame(iris))
> training <- df[df$Species %in% c("versicolor", "virginica"), ]
> model <- spark.logit(training, Species ~ ., regParam = 0.5)
> summary(model)
$coefficients
             Estimate
(Intercept)  -6.053815
Sepal_Length 0.2449379
Sepal_Width  0.1648321
Petal_Length 0.4730718
Petal_Width  1.031947
```

Author: Yanbo Liang <[email protected]>

Closes #16117 from yanboliang/spark-18686.
1 parent 5c6bcdb commit 90b59d1
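For illustration only (not part of the patch): a minimal SparkR session sketching the workflow the commit message describes, fitting `spark.logit`, reading the coefficients matrix from the new `summary` output, and sanity-checking it against a native R `glmnet` fit. It assumes a local Spark installation and the `glmnet` package; the 0.1 tolerance simply mirrors the loose comparison used in the unit tests below.

```r
library(SparkR)
library(glmnet)

sparkR.session()

# Multinomial fit in SparkR; summary() now returns only $coefficients.
df <- suppressWarnings(createDataFrame(iris))
model <- spark.logit(df, Species ~ ., regParam = 0.5)
sparkCoefs <- summary(model)$coefficients

# Reference fit on the same data with glmnet (ridge penalty, lambda = 0.5).
fit <- glmnet(as.matrix(iris[, 1:4]), iris$Species,
              family = "multinomial", alpha = 0, lambda = 0.5)
glmnetVersicolor <- as.numeric(coef(fit)$versicolor)

# The two sets of coefficients should agree to within a loose tolerance.
all(abs(sparkCoefs[, "versicolor"] - glmnetVersicolor) < 0.1)

sparkR.session.stop()
```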

File tree

3 files changed: +203, -147 lines changed


R/pkg/R/mllib.R

Lines changed: 29 additions & 57 deletions
@@ -733,8 +733,6 @@ setMethod("predict", signature(object = "KMeansModel"),
 #' excepting that at most one value may be 0. The class with largest value p/t is predicted, where p
 #' is the original probability of that class and t is the class's threshold.
 #' @param weightCol The weight column name.
-#' @param aggregationDepth depth for treeAggregate (>= 2). If the dimensions of features or the number of partitions
-#'                         are large, this param could be adjusted to a larger size.
 #' @param probabilityCol column name for predicted class conditional probabilities.
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.logit} returns a fitted logistic regression model
@@ -746,45 +744,35 @@ setMethod("predict", signature(object = "KMeansModel"),
 #' \dontrun{
 #' sparkR.session()
 #' # binary logistic regression
-#' label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
-#' features <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
-#' binary_data <- as.data.frame(cbind(label, features))
-#' binary_df <- createDataFrame(binary_data)
-#' blr_model <- spark.logit(binary_df, label ~ features, thresholds = 1.0)
-#' blr_predict <- collect(select(predict(blr_model, binary_df), "prediction"))
-#'
-#' # summary of binary logistic regression
-#' blr_summary <- summary(blr_model)
-#' blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure"))
+#' df <- createDataFrame(iris)
+#' training <- df[df$Species %in% c("versicolor", "virginica"), ]
+#' model <- spark.logit(training, Species ~ ., regParam = 0.5)
+#' summary <- summary(model)
+#'
+#' # fitted values on training data
+#' fitted <- predict(model, training)
+#'
 #' # save fitted model to input path
 #' path <- "path/to/model"
-#' write.ml(blr_model, path)
+#' write.ml(model, path)
 #'
 #' # can also read back the saved model and predict
 #' # Note that summary deos not work on loaded model
 #' savedModel <- read.ml(path)
-#' blr_predict2 <- collect(select(predict(savedModel, binary_df), "prediction"))
+#' summary(savedModel)
 #'
 #' # multinomial logistic regression
 #'
-#' label <- c(0.0, 1.0, 2.0, 0.0, 0.0)
-#' feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
-#' feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
-#' feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
-#' feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
-#' data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4))
-#' df <- createDataFrame(data)
+#' df <- createDataFrame(iris)
+#' model <- spark.logit(df, Species ~ ., regParam = 0.5)
+#' summary <- summary(model)
 #'
-#' # Note that summary of multinomial logistic regression is not implemented yet
-#' model <- spark.logit(df, label ~ ., family = "multinomial", thresholds = c(0, 1, 1))
-#' predict1 <- collect(select(predict(model, df), "prediction"))
 #' }
 #' @note spark.logit since 2.1.0
 setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
                    tol = 1E-6, family = "auto", standardization = TRUE,
-                   thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
-                   probabilityCol = "probability") {
+                   thresholds = 0.5, weightCol = NULL, probabilityCol = "probability") {
             formula <- paste(deparse(formula), collapse = "")
 
             if (is.null(weightCol)) {
@@ -796,8 +784,7 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
                                 as.numeric(elasticNetParam), as.integer(maxIter),
                                 as.numeric(tol), as.character(family),
                                 as.logical(standardization), as.array(thresholds),
-                                as.character(weightCol), as.integer(aggregationDepth),
-                                as.character(probabilityCol))
+                                as.character(weightCol), as.character(probabilityCol))
             new("LogisticRegressionModel", jobj = jobj)
           })
 
@@ -817,44 +804,29 @@ setMethod("predict", signature(object = "LogisticRegressionModel"),
 # Get the summary of an LogisticRegressionModel
 
 #' @param object an LogisticRegressionModel fitted by \code{spark.logit}
-#' @return \code{summary} returns the Binary Logistic regression results of a given model as list,
-#'         including roc, areaUnderROC, pr, fMeasureByThreshold, precisionByThreshold,
-#'         recallByThreshold, totalIterations, objectiveHistory. Note that Multinomial logistic
-#'         regression summary is not available now.
+#' @return \code{summary} returns coefficients matrix of the fitted model
 #' @rdname spark.logit
 #' @aliases summary,LogisticRegressionModel-method
 #' @export
 #' @note summary(LogisticRegressionModel) since 2.1.0
 setMethod("summary", signature(object = "LogisticRegressionModel"),
           function(object) {
             jobj <- object@jobj
-            is.loaded <- callJMethod(jobj, "isLoaded")
-
-            if (is.loaded) {
-              stop("Loaded model doesn't have training summary.")
+            features <- callJMethod(jobj, "rFeatures")
+            labels <- callJMethod(jobj, "labels")
+            coefficients <- callJMethod(jobj, "rCoefficients")
+            nCol <- length(coefficients) / length(features)
+            coefficients <- matrix(coefficients, ncol = nCol)
+            # If nCol == 1, means this is a binomial logistic regression model with pivoting.
+            # Otherwise, it's a multinomial logistic regression model without pivoting.
+            if (nCol == 1) {
+              colnames(coefficients) <- c("Estimate")
+            } else {
+              colnames(coefficients) <- unlist(labels)
             }
+            rownames(coefficients) <- unlist(features)
 
-            roc <- dataFrame(callJMethod(jobj, "roc"))
-
-            areaUnderROC <- callJMethod(jobj, "areaUnderROC")
-
-            pr <- dataFrame(callJMethod(jobj, "pr"))
-
-            fMeasureByThreshold <- dataFrame(callJMethod(jobj, "fMeasureByThreshold"))
-
-            precisionByThreshold <- dataFrame(callJMethod(jobj, "precisionByThreshold"))
-
-            recallByThreshold <- dataFrame(callJMethod(jobj, "recallByThreshold"))
-
-            totalIterations <- callJMethod(jobj, "totalIterations")
-
-            objectiveHistory <- callJMethod(jobj, "objectiveHistory")
-
-            list(roc = roc, areaUnderROC = areaUnderROC, pr = pr,
-                 fMeasureByThreshold = fMeasureByThreshold,
-                 precisionByThreshold = precisionByThreshold,
-                 recallByThreshold = recallByThreshold,
-                 totalIterations = totalIterations, objectiveHistory = objectiveHistory)
+            list(coefficients = coefficients)
           })
 
 #' Multilayer Perceptron Classification Model
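
As a side note on the new `summary` body above: the JVM backend hands the coefficients back as one flat vector, and the method infers the matrix shape from the number of features. Below is a standalone, plain-R sketch of that reshaping using made-up values; only the column-per-class layout assumption comes from the wrapper above, everything else is illustrative.

```r
# Stand-ins for what callJMethod(jobj, ...) would return; values are made up.
features <- c("(Intercept)", "Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width")
labels <- c("versicolor", "virginica", "setosa")
flat <- rnorm(length(features) * length(labels))

# One column in the binomial (pivoted) case, one column per class otherwise.
nCol <- length(flat) / length(features)
coefficients <- matrix(flat, ncol = nCol)
colnames(coefficients) <- if (nCol == 1) "Estimate" else labels
rownames(coefficients) <- features
coefficients
```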

R/pkg/inst/tests/testthat/test_mllib.R

Lines changed: 128 additions & 55 deletions
@@ -635,68 +635,141 @@ test_that("spark.isotonicRegression", {
 })
 
 test_that("spark.logit", {
-  # test binary logistic regression
-  label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
-  feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
-  binary_data <- as.data.frame(cbind(label, feature))
-  binary_df <- createDataFrame(binary_data)
-
-  blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0)
-  blr_predict <- collect(select(predict(blr_model, binary_df), "prediction"))
-  expect_equal(blr_predict$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0"))
-  blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0)
-  blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction"))
-  expect_equal(blr_predict1$prediction, c("1.0", "1.0", "1.0", "1.0", "1.0"))
-
-  # test summary of binary logistic regression
-  blr_summary <- summary(blr_model)
-  blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure"))
-  expect_equal(blr_fmeasure$threshold, c(0.6565513, 0.6214563, 0.3325291, 0.2115995, 0.1778653),
-               tolerance = 1e-4)
-  expect_equal(blr_fmeasure$"F-Measure", c(0.6666667, 0.5000000, 0.8000000, 0.6666667, 0.5714286),
-               tolerance = 1e-4)
-  blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision"))
-  expect_equal(blr_precision$precision, c(1.0000000, 0.5000000, 0.6666667, 0.5000000, 0.4000000),
-               tolerance = 1e-4)
-  blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall"))
-  expect_equal(blr_recall$recall, c(0.5000000, 0.5000000, 1.0000000, 1.0000000, 1.0000000),
-               tolerance = 1e-4)
+  # R code to reproduce the result.
+  # nolint start
+  #' library(glmnet)
+  #' iris.x = as.matrix(iris[, 1:4])
+  #' iris.y = as.factor(as.character(iris[, 5]))
+  #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
+  #' coef(logit)
+  #
+  # $setosa
+  # 5 x 1 sparse Matrix of class "dgCMatrix"
+  #                       s0
+  #               1.0981324
+  # Sepal.Length -0.2909860
+  # Sepal.Width   0.5510907
+  # Petal.Length -0.1915217
+  # Petal.Width  -0.4211946
+  #
+  # $versicolor
+  # 5 x 1 sparse Matrix of class "dgCMatrix"
+  #                         s0
+  #               1.520061e+00
+  # Sepal.Length  2.524501e-02
+  # Sepal.Width  -5.310313e-01
+  # Petal.Length  3.656543e-02
+  # Petal.Width  -3.144464e-05
+  #
+  # $virginica
+  # 5 x 1 sparse Matrix of class "dgCMatrix"
+  #                       s0
+  #              -2.61819385
+  # Sepal.Length  0.26574097
+  # Sepal.Width  -0.02005932
+  # Petal.Length  0.15495629
+  # Petal.Width   0.42122607
+  # nolint end
 
-  # test model save and read
-  modelPath <- tempfile(pattern = "spark-logisticRegression", fileext = ".tmp")
-  write.ml(blr_model, modelPath)
-  expect_error(write.ml(blr_model, modelPath))
-  write.ml(blr_model, modelPath, overwrite = TRUE)
-  blr_model2 <- read.ml(modelPath)
-  blr_predict2 <- collect(select(predict(blr_model2, binary_df), "prediction"))
-  expect_equal(blr_predict$prediction, blr_predict2$prediction)
-  expect_error(summary(blr_model2))
+  # Test multinomial logistic regression againt three classes
+  df <- suppressWarnings(createDataFrame(iris))
+  model <- spark.logit(df, Species ~ ., regParam = 0.5)
+  summary <- summary(model)
+  versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00)
+  virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42)
+  setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42)
+  versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
+  virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
+  setosaCoefs <- unlist(summary$coefficients[, "setosa"])
+  expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
+  expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
+  expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1))
+
+  # Test model save and load
+  modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
+  write.ml(model, modelPath)
+  expect_error(write.ml(model, modelPath))
+  write.ml(model, modelPath, overwrite = TRUE)
+  model2 <- read.ml(modelPath)
+  coefs <- summary(model)$coefficients
+  coefs2 <- summary(model2)$coefficients
+  expect_equal(coefs, coefs2)
   unlink(modelPath)
 
-  # test prediction label as text
-  training <- suppressWarnings(createDataFrame(iris))
-  binomial_training <- training[training$Species %in% c("versicolor", "virginica"), ]
-  binomial_model <- spark.logit(binomial_training, Species ~ Sepal_Length + Sepal_Width)
-  prediction <- predict(binomial_model, binomial_training)
+  # R code to reproduce the result.
+  # nolint start
+  #' library(glmnet)
+  #' iris2 <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+  #' iris.x = as.matrix(iris2[, 1:4])
+  #' iris.y = as.factor(as.character(iris2[, 5]))
+  #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
+  #' coef(logit)
+  #
+  # $versicolor
+  # 5 x 1 sparse Matrix of class "dgCMatrix"
+  #                       s0
+  #               3.93844796
+  # Sepal.Length -0.13538675
+  # Sepal.Width  -0.02386443
+  # Petal.Length -0.35076451
+  # Petal.Width  -0.77971954
+  #
+  # $virginica
+  # 5 x 1 sparse Matrix of class "dgCMatrix"
+  #                       s0
+  #              -3.93844796
+  # Sepal.Length  0.13538675
+  # Sepal.Width   0.02386443
+  # Petal.Length  0.35076451
+  # Petal.Width   0.77971954
+  #
+  #' logit = glmnet(iris.x, iris.y, family="binomial", alpha=0, lambda=0.5)
+  #' coef(logit)
+  #
+  # 5 x 1 sparse Matrix of class "dgCMatrix"
+  #                      s0
+  # (Intercept)  -6.0824412
+  # Sepal.Length  0.2458260
+  # Sepal.Width   0.1642093
+  # Petal.Length  0.4759487
+  # Petal.Width   1.0383948
+  #
+  # nolint end
+
+  # Test multinomial logistic regression againt two classes
+  df <- suppressWarnings(createDataFrame(iris))
+  training <- df[df$Species %in% c("versicolor", "virginica"), ]
+  model <- spark.logit(training, Species ~ ., regParam = 0.5, family = "multinomial")
+  summary <- summary(model)
+  versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78)
+  virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78)
+  versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
+  virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
+  expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
+  expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
+
+  # Test binomial logistic regression againt two classes
+  model <- spark.logit(training, Species ~ ., regParam = 0.5)
+  summary <- summary(model)
+  coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04)
+  coefs <- unlist(summary$coefficients[, "Estimate"])
+  expect_true(all(abs(coefsR - coefs) < 0.1))
+
+  # Test prediction with string label
+  prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
-  expected <- c("virginica", "virginica", "virginica", "versicolor", "virginica",
-                "versicolor", "virginica", "versicolor", "virginica", "versicolor")
+  expected <- c("versicolor", "versicolor", "virginica", "versicolor", "versicolor",
+                "versicolor", "versicolor", "versicolor", "versicolor", "versicolor")
   expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)
 
-  # test multinomial logistic regression
-  label <- c(0.0, 1.0, 2.0, 0.0, 0.0)
-  feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
-  feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
-  feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
-  feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
-  data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4))
+  # Test prediction with numeric label
+  label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
+  feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
+  data <- as.data.frame(cbind(label, feature))
   df <- createDataFrame(data)
-
-  model <- spark.logit(df, label ~., family = "multinomial", thresholds = c(0, 1, 1))
-  predict1 <- collect(select(predict(model, df), "prediction"))
-  expect_equal(predict1$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0"))
-  # Summary of multinomial logistic regression is not implemented yet
-  expect_error(summary(model))
+  model <- spark.logit(df, label ~ feature)
+  prediction <- collect(select(predict(model, df), "prediction"))
+  expect_equal(prediction$prediction, c("0.0", "0.0", "1.0", "1.0", "0.0"))
 })
 
 test_that("spark.gaussianMixture", {
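
All of the new assertions above follow the same pattern: each Spark coefficient must land within a loose 0.1 tolerance of a glmnet reference value. Below is a small, hedged sketch of that pattern in plain testthat; the helper name and the numbers are illustrative and not part of the patch.

```r
library(testthat)

# Hypothetical helper capturing the comparison pattern used in the tests above.
expect_coefs_close <- function(actual, reference, tol = 0.1) {
  expect_true(all(abs(reference - actual) < tol))
}

# In the real tests, `actual` comes from summary(spark.logit(...))$coefficients
# and `reference` from a glmnet fit; the numbers here are just placeholders.
expect_coefs_close(actual = c(-6.05, 0.24, 0.16, 0.47, 1.03),
                   reference = c(-6.08, 0.25, 0.16, 0.48, 1.04))
```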
