From 0ff84f0a59352f0ecce0bf88de4414b9e3d2e4fa Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Thu, 10 Nov 2016 14:40:57 -0800 Subject: [PATCH 1/7] remove unused import --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 1261e3e735761..7cb6c2da2b1ab 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -19,7 +19,7 @@ package org.apache.spark import java.io._ import java.lang.reflect.Constructor -import java.net.{MalformedURLException, URI} +import java.net.{URI} import java.util.{Arrays, Locale, Properties, ServiceLoader, UUID} import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap} import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger, AtomicReference} From 9a07aa15963378dbfe58e1641077dc4183a06872 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Fri, 11 Nov 2016 10:23:20 -0800 Subject: [PATCH 2/7] add label StringIndexer --- R/pkg/inst/tests/testthat/test_mllib.R | 24 ++++++++++++----- .../ml/r/LogisticRegressionWrapper.scala | 27 ++++++++++++++++--- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 467e00cf7919b..613a7ee6b3194 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -646,15 +646,15 @@ test_that("spark.isotonicRegression", { test_that("spark.logit", { # test binary logistic regression - label <- c(1.0, 1.0, 1.0, 0.0, 0.0) + labels <- c(1.0, 1.0, 1.0, 0.0, 0.0) feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776) - binary_data <- as.data.frame(cbind(label, feature)) + binary_data <- as.data.frame(cbind(labels, feature)) binary_df <- createDataFrame(binary_data) - blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0) + blr_model <- spark.logit(binary_df, labels ~ feature, thresholds = 1.0) blr_predict <- collect(select(predict(blr_model, binary_df), "prediction")) expect_equal(blr_predict$prediction, c(0, 0, 0, 0, 0)) - blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0) + blr_model1 <- spark.logit(binary_df, labels ~ feature, thresholds = 0.0) blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction")) expect_equal(blr_predict1$prediction, c(1, 1, 1, 1, 1)) @@ -683,16 +683,26 @@ test_that("spark.logit", { expect_error(summary(blr_model2)) unlink(modelPath) + # test prediction label as text + training <- suppressWarnings(createDataFrame(iris)) + binomial_training <- training[training$Species %in% c("versicolor", "virginica"), ] + binomial_model <- spark.logit(binomial_training, Species ~ Sepal_Length + Sepal_Width) + prediction <- predict(binomial_model, binomial_training) + expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character") + expected <- c("virginica", "virginica", "virginica", "versicolor", "virginica", + "versicolor", "virginica", "versicolor", "virginica", "versicolor") + expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected) + # test multinomial logistic regression - label <- c(0.0, 1.0, 2.0, 0.0, 0.0) + labels <- c(0.0, 1.0, 2.0, 0.0, 0.0) feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667) feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987) feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130) feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842) - data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4)) + data <- as.data.frame(cbind(labels, feature1, feature2, feature3, feature4)) df <- createDataFrame(data) - model <- spark.logit(df, label ~., family = "multinomial", thresholds = c(0, 1, 1)) + model <- spark.logit(df, labels ~., family = "multinomial", thresholds = c(0, 1, 1)) predict1 <- collect(select(predict(model, df), "prediction")) expect_equal(predict1$prediction, c(0, 0, 0, 0, 0)) # Summary of multinomial logistic regression is not implemented yet diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala index 9b352c9863114..2bda4be3155c3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala @@ -23,9 +23,9 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} -import org.apache.spark.ml.attribute.AttributeGroup +import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute} import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel} -import org.apache.spark.ml.feature.RFormula +import org.apache.spark.ml.feature.{IndexToString, RFormula} import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} @@ -34,6 +34,8 @@ private[r] class LogisticRegressionWrapper private ( val features: Array[String], val isLoaded: Boolean = false) extends MLWritable { + import LogisticRegressionWrapper._ + private val logisticRegressionModel: LogisticRegressionModel = pipeline.stages(1).asInstanceOf[LogisticRegressionModel] @@ -57,7 +59,9 @@ private[r] class LogisticRegressionWrapper private ( lazy val recallByThreshold: DataFrame = blrSummary.recallByThreshold def transform(dataset: Dataset[_]): DataFrame = { - pipeline.transform(dataset).drop(logisticRegressionModel.getFeaturesCol) + pipeline.transform(dataset) + .drop(PREDICTED_LABEL_INDEX_COL) + .drop(logisticRegressionModel.getFeaturesCol) } override def write: MLWriter = new LogisticRegressionWrapper.LogisticRegressionWrapperWriter(this) @@ -66,6 +70,9 @@ private[r] class LogisticRegressionWrapper private ( private[r] object LogisticRegressionWrapper extends MLReadable[LogisticRegressionWrapper] { + val PREDICTED_LABEL_INDEX_COL = "pred_label_idx" + val PREDICTED_LABEL_COL = "prediction" + def fit( // scalastyle:ignore data: DataFrame, formula: String, @@ -84,6 +91,7 @@ private[r] object LogisticRegressionWrapper val rFormula = new RFormula() .setFormula(formula) + .setForceIndexLabel(true) RWrapperUtils.checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) @@ -93,6 +101,11 @@ private[r] object LogisticRegressionWrapper .attributes.get val features = featureAttrs.map(_.name.get) + // get label names from output schema + val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol)) + .asInstanceOf[NominalAttribute] + val labels = labelAttr.values.get + // assemble and fit the pipeline val logisticRegression = new LogisticRegression() .setRegParam(regParam) @@ -106,6 +119,7 @@ private[r] object LogisticRegressionWrapper .setAggregationDepth(aggregationDepth) .setFeaturesCol(rFormula.getFeaturesCol) .setProbabilityCol(probability) + .setPredictionCol(PREDICTED_LABEL_INDEX_COL) if (thresholds.length > 1) { logisticRegression.setThresholds(thresholds) @@ -113,8 +127,13 @@ private[r] object LogisticRegressionWrapper logisticRegression.setThreshold(thresholds(0)) } + val idxToStr = new IndexToString() + .setInputCol(PREDICTED_LABEL_INDEX_COL) + .setOutputCol(PREDICTED_LABEL_COL) + .setLabels(labels) + val pipeline = new Pipeline() - .setStages(Array(rFormulaModel, logisticRegression)) + .setStages(Array(rFormulaModel, logisticRegression, idxToStr)) .fit(data) new LogisticRegressionWrapper(pipeline, features) From 9d19284dd8b68eaacc6cb0d476eeb9ebfd7cc4c7 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Wed, 16 Nov 2016 16:28:40 -0800 Subject: [PATCH 3/7] modify test case; add output label --- R/pkg/R/mllib.R | 2 +- R/pkg/inst/tests/testthat/test_mllib.R | 36 +++++++++---------- .../ml/r/LogisticRegressionWrapper.scala | 19 ++++------ 3 files changed, 26 insertions(+), 31 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 02bc6456de4d0..54ea8373f2094 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -747,7 +747,7 @@ setMethod("predict", signature(object = "KMeansModel"), #' \dontrun{ #' sparkR.session() #' # binary logistic regression -#' label <- c(1.0, 1.0, 1.0, 0.0, 0.0) +#' label <- c(0.0, 0.0, 0.0, 1.0, 1.0) #' feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776) #' binary_data <- as.data.frame(cbind(label, feature)) #' binary_df <- createDataFrame(binary_data) diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 613a7ee6b3194..be3087e77b083 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -646,31 +646,31 @@ test_that("spark.isotonicRegression", { test_that("spark.logit", { # test binary logistic regression - labels <- c(1.0, 1.0, 1.0, 0.0, 0.0) + label <- c(0.0, 0.0, 0.0, 1.0, 1.0) feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776) - binary_data <- as.data.frame(cbind(labels, feature)) + binary_data <- as.data.frame(cbind(label, feature)) binary_df <- createDataFrame(binary_data) - blr_model <- spark.logit(binary_df, labels ~ feature, thresholds = 1.0) + blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0) blr_predict <- collect(select(predict(blr_model, binary_df), "prediction")) - expect_equal(blr_predict$prediction, c(0, 0, 0, 0, 0)) - blr_model1 <- spark.logit(binary_df, labels ~ feature, thresholds = 0.0) + expect_equal(blr_predict$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0")) + blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0) blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction")) - expect_equal(blr_predict1$prediction, c(1, 1, 1, 1, 1)) + expect_equal(blr_predict1$prediction, c("1.0", "1.0", "1.0", "1.0", "1.0")) # test summary of binary logistic regression blr_summary <- summary(blr_model) blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure")) - expect_equal(blr_fmeasure$threshold, c(0.8221347, 0.7884005, 0.6674709, 0.3785437, 0.3434487), - tolerance = 1e-4) - expect_equal(blr_fmeasure$"F-Measure", c(0.5000000, 0.8000000, 0.6666667, 0.8571429, 0.7500000), - tolerance = 1e-4) + expect_equal(blr_fmeasure$threshold, c(0.6565513, 0.6214563, 0.3325291, 0.2115995, 0.1778653), + tolerance = 1e-4) + expect_equal(blr_fmeasure$"F-Measure", c(0.6666667, 0.5000000, 0.8000000, 0.6666667, 0.5714286), + tolerance = 1e-4) blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision")) - expect_equal(blr_precision$precision, c(1.0000000, 1.0000000, 0.6666667, 0.7500000, 0.6000000), - tolerance = 1e-4) + expect_equal(blr_precision$precision, c(1.0000000, 0.5000000, 0.6666667, 0.5000000, 0.4000000), + tolerance = 1e-4) blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall")) - expect_equal(blr_recall$recall, c(0.3333333, 0.6666667, 0.6666667, 1.0000000, 1.0000000), - tolerance = 1e-4) + expect_equal(blr_recall$recall, c(0.5000000, 0.5000000, 1.0000000, 1.0000000, 1.0000000), + tolerance = 1e-4) # test model save and read modelPath <- tempfile(pattern = "spark-logisticRegression", fileext = ".tmp") @@ -694,17 +694,17 @@ test_that("spark.logit", { expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected) # test multinomial logistic regression - labels <- c(0.0, 1.0, 2.0, 0.0, 0.0) + label <- c(0.0, 1.0, 2.0, 0.0, 0.0) feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667) feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987) feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130) feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842) - data <- as.data.frame(cbind(labels, feature1, feature2, feature3, feature4)) + data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4)) df <- createDataFrame(data) - model <- spark.logit(df, labels ~., family = "multinomial", thresholds = c(0, 1, 1)) + model <- spark.logit(df, label ~., family = "multinomial", thresholds = c(0, 1, 1)) predict1 <- collect(select(predict(model, df), "prediction")) - expect_equal(predict1$prediction, c(0, 0, 0, 0, 0)) + expect_equal(predict1$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0")) # Summary of multinomial logistic regression is not implemented yet expect_error(summary(model)) }) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala index 2bda4be3155c3..42f376c425402 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala @@ -23,9 +23,9 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} -import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute} import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.feature.{IndexToString, RFormula} +import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} @@ -62,6 +62,8 @@ private[r] class LogisticRegressionWrapper private ( pipeline.transform(dataset) .drop(PREDICTED_LABEL_INDEX_COL) .drop(logisticRegressionModel.getFeaturesCol) + .drop(logisticRegressionModel.getLabelCol) + } override def write: MLWriter = new LogisticRegressionWrapper.LogisticRegressionWrapperWriter(this) @@ -92,19 +94,11 @@ private[r] object LogisticRegressionWrapper val rFormula = new RFormula() .setFormula(formula) .setForceIndexLabel(true) - RWrapperUtils.checkDataColumns(rFormula, data) + checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) - // get feature names from output schema - val schema = rFormulaModel.transform(data).schema - val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol)) - .attributes.get - val features = featureAttrs.map(_.name.get) - - // get label names from output schema - val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol)) - .asInstanceOf[NominalAttribute] - val labels = labelAttr.values.get + // get labels and feature names from output schema + val (features, labels) = getFeaturesAndLabels(rFormulaModel, data) // assemble and fit the pipeline val logisticRegression = new LogisticRegression() @@ -118,6 +112,7 @@ private[r] object LogisticRegressionWrapper .setWeightCol(weightCol) .setAggregationDepth(aggregationDepth) .setFeaturesCol(rFormula.getFeaturesCol) + .setLabelCol(rFormula.getLabelCol) .setProbabilityCol(probability) .setPredictionCol(PREDICTED_LABEL_INDEX_COL) From 71f7de283625ee9b123fc2c81b5e969160b9a4a6 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Wed, 16 Nov 2016 16:45:01 -0800 Subject: [PATCH 4/7] fix scala style error --- .../scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala index 42f376c425402..4a86fe0e73531 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala @@ -63,7 +63,7 @@ private[r] class LogisticRegressionWrapper private ( .drop(PREDICTED_LABEL_INDEX_COL) .drop(logisticRegressionModel.getFeaturesCol) .drop(logisticRegressionModel.getLabelCol) - + } override def write: MLWriter = new LogisticRegressionWrapper.LogisticRegressionWrapperWriter(this) From 57cf430743b836f9e145285a203146ceef9d4ad8 Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Mon, 28 Nov 2016 10:58:33 -0800 Subject: [PATCH 5/7] address review comments --- R/pkg/R/mllib.R | 16 ++++++++-------- R/pkg/inst/tests/testthat/test_mllib.R | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 54ea8373f2094..7a04708739251 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -748,10 +748,10 @@ setMethod("predict", signature(object = "KMeansModel"), #' sparkR.session() #' # binary logistic regression #' label <- c(0.0, 0.0, 0.0, 1.0, 1.0) -#' feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776) -#' binary_data <- as.data.frame(cbind(label, feature)) +#' features <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776) +#' binary_data <- as.data.frame(cbind(label, features)) #' binary_df <- createDataFrame(binary_data) -#' blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0) +#' blr_model <- spark.logit(binary_df, label ~ features, thresholds = 1.0) #' blr_predict <- collect(select(predict(blr_model, binary_df), "prediction")) #' #' # summary of binary logistic regression @@ -769,11 +769,11 @@ setMethod("predict", signature(object = "KMeansModel"), #' # multinomial logistic regression #' #' label <- c(0.0, 1.0, 2.0, 0.0, 0.0) -#' feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667) -#' feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987) -#' feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130) -#' feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842) -#' data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4)) +#' features1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667) +#' features2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987) +#' features3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130) +#' features4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842) +#' data <- as.data.frame(cbind(label, features1, features2, features3, features4)) #' df <- createDataFrame(data) #' #' # Note that summary of multinomial logistic regression is not implemented yet diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index be3087e77b083..0553e704bde9f 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -662,15 +662,15 @@ test_that("spark.logit", { blr_summary <- summary(blr_model) blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure")) expect_equal(blr_fmeasure$threshold, c(0.6565513, 0.6214563, 0.3325291, 0.2115995, 0.1778653), - tolerance = 1e-4) + tolerance = 1e-4) expect_equal(blr_fmeasure$"F-Measure", c(0.6666667, 0.5000000, 0.8000000, 0.6666667, 0.5714286), - tolerance = 1e-4) + tolerance = 1e-4) blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision")) expect_equal(blr_precision$precision, c(1.0000000, 0.5000000, 0.6666667, 0.5000000, 0.4000000), - tolerance = 1e-4) + tolerance = 1e-4) blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall")) expect_equal(blr_recall$recall, c(0.5000000, 0.5000000, 1.0000000, 1.0000000, 1.0000000), - tolerance = 1e-4) + tolerance = 1e-4) # test model save and read modelPath <- tempfile(pattern = "spark-logisticRegression", fileext = ".tmp") From de749062bcca36c6a331639a7f891008d970422d Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Mon, 28 Nov 2016 14:55:01 -0800 Subject: [PATCH 6/7] fix bug of fitintercept --- R/pkg/R/mllib.R | 11 +++++------ .../apache/spark/ml/r/LogisticRegressionWrapper.scala | 3 ++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 7a04708739251..3e467ad964f9e 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -712,7 +712,6 @@ setMethod("predict", signature(object = "KMeansModel"), #' of L1 and L2. Default is 0.0 which is an L2 penalty. #' @param maxIter maximum iteration number. #' @param tol convergence tolerance of iterations. -#' @param fitIntercept whether to fit an intercept term. #' @param family the name of family which is a description of the label distribution to be used in the model. #' Supported options: #' \itemize{ @@ -783,7 +782,7 @@ setMethod("predict", signature(object = "KMeansModel"), #' @note spark.logit since 2.1.0 setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100, - tol = 1E-6, fitIntercept = TRUE, family = "auto", standardization = TRUE, + tol = 1E-6, family = "auto", standardization = TRUE, thresholds = 0.5, weightCol = NULL, aggregationDepth = 2, probabilityCol = "probability") { formula <- paste(deparse(formula), collapse = "") @@ -795,10 +794,10 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit", data@sdf, formula, as.numeric(regParam), as.numeric(elasticNetParam), as.integer(maxIter), - as.numeric(tol), as.logical(fitIntercept), - as.character(family), as.logical(standardization), - as.array(thresholds), as.character(weightCol), - as.integer(aggregationDepth), as.character(probabilityCol)) + as.numeric(tol), as.character(family), + as.logical(standardization), as.array(thresholds), + as.character(weightCol), as.integer(aggregationDepth), + as.character(probabilityCol)) new("LogisticRegressionModel", jobj = jobj) }) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala index 4a86fe0e73531..9fe6202980fca 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala @@ -82,7 +82,6 @@ private[r] object LogisticRegressionWrapper elasticNetParam: Double, maxIter: Int, tol: Double, - fitIntercept: Boolean, family: String, standardization: Boolean, thresholds: Array[Double], @@ -97,6 +96,8 @@ private[r] object LogisticRegressionWrapper checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) + val fitIntercept = rFormula.hasIntercept + // get labels and feature names from output schema val (features, labels) = getFeaturesAndLabels(rFormulaModel, data) From b1f7b238a4655e5f4493a741daff1d0a45521e5a Mon Sep 17 00:00:00 2001 From: "wm624@hotmail.com" Date: Wed, 30 Nov 2016 09:27:30 -0800 Subject: [PATCH 7/7] address review comments --- R/pkg/R/mllib.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 3e467ad964f9e..eed829356f2be 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -768,11 +768,11 @@ setMethod("predict", signature(object = "KMeansModel"), #' # multinomial logistic regression #' #' label <- c(0.0, 1.0, 2.0, 0.0, 0.0) -#' features1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667) -#' features2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987) -#' features3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130) -#' features4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842) -#' data <- as.data.frame(cbind(label, features1, features2, features3, features4)) +#' feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667) +#' feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987) +#' feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130) +#' feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842) +#' data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4)) #' df <- createDataFrame(data) #' #' # Note that summary of multinomial logistic regression is not implemented yet