Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 29 additions & 57 deletions R/pkg/R/mllib.R
Original file line number Diff line number Diff line change
Expand Up @@ -733,8 +733,6 @@ setMethod("predict", signature(object = "KMeansModel"),
#' excepting that at most one value may be 0. The class with largest value p/t is predicted, where p
#' is the original probability of that class and t is the class's threshold.
#' @param weightCol The weight column name.
#' @param aggregationDepth depth for treeAggregate (>= 2). If the dimensions of features or the number of partitions
#' are large, this param could be adjusted to a larger size.
#' @param probabilityCol column name for predicted class conditional probabilities.
#' @param ... additional arguments passed to the method.
#' @return \code{spark.logit} returns a fitted logistic regression model
Expand All @@ -746,45 +744,35 @@ setMethod("predict", signature(object = "KMeansModel"),
#' \dontrun{
#' sparkR.session()
#' # binary logistic regression
#' label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
#' features <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
#' binary_data <- as.data.frame(cbind(label, features))
#' binary_df <- createDataFrame(binary_data)
#' blr_model <- spark.logit(binary_df, label ~ features, thresholds = 1.0)
#' blr_predict <- collect(select(predict(blr_model, binary_df), "prediction"))
#'
#' # summary of binary logistic regression
#' blr_summary <- summary(blr_model)
#' blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure"))
#' df <- createDataFrame(iris)
#' training <- df[df$Species %in% c("versicolor", "virginica"), ]
#' model <- spark.logit(training, Species ~ ., regParam = 0.5)
#' summary <- summary(model)
#'
#' # fitted values on training data
#' fitted <- predict(model, training)
#'
#' # save fitted model to input path
#' path <- "path/to/model"
#' write.ml(blr_model, path)
#' write.ml(model, path)
#'
#' # can also read back the saved model and predict
#' # Note that summary deos not work on loaded model
#' savedModel <- read.ml(path)
#' blr_predict2 <- collect(select(predict(savedModel, binary_df), "prediction"))
#' summary(savedModel)
#'
#' # multinomial logistic regression
#'
#' label <- c(0.0, 1.0, 2.0, 0.0, 0.0)
#' feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
#' feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
#' feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
#' feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
#' data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4))
#' df <- createDataFrame(data)
#' df <- createDataFrame(iris)
#' model <- spark.logit(df, Species ~ ., regParam = 0.5)
#' summary <- summary(model)
#'
#' # Note that summary of multinomial logistic regression is not implemented yet
#' model <- spark.logit(df, label ~ ., family = "multinomial", thresholds = c(0, 1, 1))
#' predict1 <- collect(select(predict(model, df), "prediction"))
#' }
#' @note spark.logit since 2.1.0
setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
tol = 1E-6, family = "auto", standardization = TRUE,
thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
probabilityCol = "probability") {
thresholds = 0.5, weightCol = NULL, probabilityCol = "probability") {
formula <- paste(deparse(formula), collapse = "")

if (is.null(weightCol)) {
Expand All @@ -796,8 +784,7 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
as.numeric(elasticNetParam), as.integer(maxIter),
as.numeric(tol), as.character(family),
as.logical(standardization), as.array(thresholds),
as.character(weightCol), as.integer(aggregationDepth),
as.character(probabilityCol))
as.character(weightCol), as.character(probabilityCol))
new("LogisticRegressionModel", jobj = jobj)
})

Expand All @@ -817,44 +804,29 @@ setMethod("predict", signature(object = "LogisticRegressionModel"),
# Get the summary of an LogisticRegressionModel

#' @param object an LogisticRegressionModel fitted by \code{spark.logit}
#' @return \code{summary} returns the Binary Logistic regression results of a given model as list,
#' including roc, areaUnderROC, pr, fMeasureByThreshold, precisionByThreshold,
#' recallByThreshold, totalIterations, objectiveHistory. Note that Multinomial logistic
#' regression summary is not available now.
#' @return \code{summary} returns coefficients matrix of the fitted model
#' @rdname spark.logit
#' @aliases summary,LogisticRegressionModel-method
#' @export
#' @note summary(LogisticRegressionModel) since 2.1.0
setMethod("summary", signature(object = "LogisticRegressionModel"),
function(object) {
jobj <- object@jobj
is.loaded <- callJMethod(jobj, "isLoaded")

if (is.loaded) {
stop("Loaded model doesn't have training summary.")
features <- callJMethod(jobj, "rFeatures")
labels <- callJMethod(jobj, "labels")
coefficients <- callJMethod(jobj, "rCoefficients")
nCol <- length(coefficients) / length(features)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we do the nCol calculation and column name on scala side? So, we don't have to call rFeatures and labels on R side, which makes the logic simpler.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, we could. The reason I did this way is we may add more model statistics which are different for binomial and multinomial logistic regression later, so we need to distinguish them at R side in any way.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good.

coefficients <- matrix(coefficients, ncol = nCol)
# If nCol == 1, means this is a binomial logistic regression model with pivoting.
# Otherwise, it's a multinomial logistic regression model without pivoting.
if (nCol == 1) {
colnames(coefficients) <- c("Estimate")
} else {
colnames(coefficients) <- unlist(labels)
}
rownames(coefficients) <- unlist(features)

roc <- dataFrame(callJMethod(jobj, "roc"))

areaUnderROC <- callJMethod(jobj, "areaUnderROC")

pr <- dataFrame(callJMethod(jobj, "pr"))

fMeasureByThreshold <- dataFrame(callJMethod(jobj, "fMeasureByThreshold"))

precisionByThreshold <- dataFrame(callJMethod(jobj, "precisionByThreshold"))

recallByThreshold <- dataFrame(callJMethod(jobj, "recallByThreshold"))

totalIterations <- callJMethod(jobj, "totalIterations")

objectiveHistory <- callJMethod(jobj, "objectiveHistory")

list(roc = roc, areaUnderROC = areaUnderROC, pr = pr,
fMeasureByThreshold = fMeasureByThreshold,
precisionByThreshold = precisionByThreshold,
recallByThreshold = recallByThreshold,
totalIterations = totalIterations, objectiveHistory = objectiveHistory)
list(coefficients = coefficients)
})

#' Multilayer Perceptron Classification Model
Expand Down
183 changes: 128 additions & 55 deletions R/pkg/inst/tests/testthat/test_mllib.R
Original file line number Diff line number Diff line change
Expand Up @@ -645,68 +645,141 @@ test_that("spark.isotonicRegression", {
})

test_that("spark.logit", {
# test binary logistic regression
label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
binary_data <- as.data.frame(cbind(label, feature))
binary_df <- createDataFrame(binary_data)

blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0)
blr_predict <- collect(select(predict(blr_model, binary_df), "prediction"))
expect_equal(blr_predict$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0"))
blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0)
blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction"))
expect_equal(blr_predict1$prediction, c("1.0", "1.0", "1.0", "1.0", "1.0"))

# test summary of binary logistic regression
blr_summary <- summary(blr_model)
blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure"))
expect_equal(blr_fmeasure$threshold, c(0.6565513, 0.6214563, 0.3325291, 0.2115995, 0.1778653),
tolerance = 1e-4)
expect_equal(blr_fmeasure$"F-Measure", c(0.6666667, 0.5000000, 0.8000000, 0.6666667, 0.5714286),
tolerance = 1e-4)
blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision"))
expect_equal(blr_precision$precision, c(1.0000000, 0.5000000, 0.6666667, 0.5000000, 0.4000000),
tolerance = 1e-4)
blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall"))
expect_equal(blr_recall$recall, c(0.5000000, 0.5000000, 1.0000000, 1.0000000, 1.0000000),
tolerance = 1e-4)
# R code to reproduce the result.
# nolint start
#' library(glmnet)
#' iris.x = as.matrix(iris[, 1:4])
#' iris.y = as.factor(as.character(iris[, 5]))
#' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
#' coef(logit)
#
# $setosa
# 5 x 1 sparse Matrix of class "dgCMatrix"
# s0
# 1.0981324
# Sepal.Length -0.2909860
# Sepal.Width 0.5510907
# Petal.Length -0.1915217
# Petal.Width -0.4211946
#
# $versicolor
# 5 x 1 sparse Matrix of class "dgCMatrix"
# s0
# 1.520061e+00
# Sepal.Length 2.524501e-02
# Sepal.Width -5.310313e-01
# Petal.Length 3.656543e-02
# Petal.Width -3.144464e-05
#
# $virginica
# 5 x 1 sparse Matrix of class "dgCMatrix"
# s0
# -2.61819385
# Sepal.Length 0.26574097
# Sepal.Width -0.02005932
# Petal.Length 0.15495629
# Petal.Width 0.42122607
# nolint end

# test model save and read
modelPath <- tempfile(pattern = "spark-logisticRegression", fileext = ".tmp")
write.ml(blr_model, modelPath)
expect_error(write.ml(blr_model, modelPath))
write.ml(blr_model, modelPath, overwrite = TRUE)
blr_model2 <- read.ml(modelPath)
blr_predict2 <- collect(select(predict(blr_model2, binary_df), "prediction"))
expect_equal(blr_predict$prediction, blr_predict2$prediction)
expect_error(summary(blr_model2))
# Test multinomial logistic regression againt three classes
df <- suppressWarnings(createDataFrame(iris))
model <- spark.logit(df, Species ~ ., regParam = 0.5)
summary <- summary(model)
versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00)
virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42)
setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42)
versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
setosaCoefs <- unlist(summary$coefficients[, "setosa"])
expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1))

# Test model save and load
modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
coefs <- summary(model)$coefficients
coefs2 <- summary(model2)$coefficients
expect_equal(coefs, coefs2)
unlink(modelPath)

# test prediction label as text
training <- suppressWarnings(createDataFrame(iris))
binomial_training <- training[training$Species %in% c("versicolor", "virginica"), ]
binomial_model <- spark.logit(binomial_training, Species ~ Sepal_Length + Sepal_Width)
prediction <- predict(binomial_model, binomial_training)
# R code to reproduce the result.
# nolint start
#' library(glmnet)
#' iris2 <- iris[iris$Species %in% c("versicolor", "virginica"), ]
#' iris.x = as.matrix(iris2[, 1:4])
#' iris.y = as.factor(as.character(iris2[, 5]))
#' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
#' coef(logit)
#
# $versicolor
# 5 x 1 sparse Matrix of class "dgCMatrix"
# s0
# 3.93844796
# Sepal.Length -0.13538675
# Sepal.Width -0.02386443
# Petal.Length -0.35076451
# Petal.Width -0.77971954
#
# $virginica
# 5 x 1 sparse Matrix of class "dgCMatrix"
# s0
# -3.93844796
# Sepal.Length 0.13538675
# Sepal.Width 0.02386443
# Petal.Length 0.35076451
# Petal.Width 0.77971954
#
#' logit = glmnet(iris.x, iris.y, family="binomial", alpha=0, lambda=0.5)
#' coef(logit)
#
# 5 x 1 sparse Matrix of class "dgCMatrix"
# s0
# (Intercept) -6.0824412
# Sepal.Length 0.2458260
# Sepal.Width 0.1642093
# Petal.Length 0.4759487
# Petal.Width 1.0383948
#
# nolint end

# Test multinomial logistic regression againt two classes
df <- suppressWarnings(createDataFrame(iris))
training <- df[df$Species %in% c("versicolor", "virginica"), ]
model <- spark.logit(training, Species ~ ., regParam = 0.5, family = "multinomial")
summary <- summary(model)
versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78)
virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78)
versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))

# Test binomial logistic regression againt two classes
model <- spark.logit(training, Species ~ ., regParam = 0.5)
summary <- summary(model)
coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04)
coefs <- unlist(summary$coefficients[, "Estimate"])
expect_true(all(abs(coefsR - coefs) < 0.1))

# Test prediction with string label
prediction <- predict(model, training)
expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
expected <- c("virginica", "virginica", "virginica", "versicolor", "virginica",
"versicolor", "virginica", "versicolor", "virginica", "versicolor")
expected <- c("versicolor", "versicolor", "virginica", "versicolor", "versicolor",
"versicolor", "versicolor", "versicolor", "versicolor", "versicolor")
expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)

# test multinomial logistic regression
label <- c(0.0, 1.0, 2.0, 0.0, 0.0)
feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4))
# Test prediction with numeric label
label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
data <- as.data.frame(cbind(label, feature))
df <- createDataFrame(data)

model <- spark.logit(df, label ~., family = "multinomial", thresholds = c(0, 1, 1))
predict1 <- collect(select(predict(model, df), "prediction"))
expect_equal(predict1$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0"))
# Summary of multinomial logistic regression is not implemented yet
expect_error(summary(model))
model <- spark.logit(df, label ~ feature)
prediction <- collect(select(predict(model, df), "prediction"))
expect_equal(prediction$prediction, c("0.0", "0.0", "1.0", "1.0", "0.0"))
})

test_that("spark.gaussianMixture", {
Expand Down
Loading