From 0ff84f0a59352f0ecce0bf88de4414b9e3d2e4fa Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Thu, 10 Nov 2016 14:40:57 -0800
Subject: [PATCH 1/7] remove unused import

---
 core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 1261e3e735761..7cb6c2da2b1ab 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -19,7 +19,7 @@ package org.apache.spark
 
 import java.io._
 import java.lang.reflect.Constructor
-import java.net.{MalformedURLException, URI}
+import java.net.{URI}
 import java.util.{Arrays, Locale, Properties, ServiceLoader, UUID}
 import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap}
 import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger, AtomicReference}

From 9a07aa15963378dbfe58e1641077dc4183a06872 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Fri, 11 Nov 2016 10:23:20 -0800
Subject: [PATCH 2/7] add label StringIndexer

---
 R/pkg/inst/tests/testthat/test_mllib.R        | 24 ++++++++++++-----
 .../ml/r/LogisticRegressionWrapper.scala      | 27 ++++++++++++++++---
 2 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 467e00cf7919b..613a7ee6b3194 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -646,15 +646,15 @@ test_that("spark.isotonicRegression", {
 
 test_that("spark.logit", {
   # test binary logistic regression
-  label <- c(1.0, 1.0, 1.0, 0.0, 0.0)
+  labels <- c(1.0, 1.0, 1.0, 0.0, 0.0)
   feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
-  binary_data <- as.data.frame(cbind(label, feature))
+  binary_data <- as.data.frame(cbind(labels, feature))
   binary_df <- createDataFrame(binary_data)
 
-  blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0)
+  blr_model <- spark.logit(binary_df, labels ~ feature, thresholds = 1.0)
   blr_predict <- collect(select(predict(blr_model, binary_df), "prediction"))
   expect_equal(blr_predict$prediction, c(0, 0, 0, 0, 0))
-  blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0)
+  blr_model1 <- spark.logit(binary_df, labels ~ feature, thresholds = 0.0)
   blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction"))
   expect_equal(blr_predict1$prediction, c(1, 1, 1, 1, 1))
 
@@ -683,16 +683,26 @@ test_that("spark.logit", {
   expect_error(summary(blr_model2))
   unlink(modelPath)
 
+  # test prediction label as text
+  training <- suppressWarnings(createDataFrame(iris))
+  binomial_training <- training[training$Species %in% c("versicolor", "virginica"), ]
+  binomial_model <- spark.logit(binomial_training, Species ~ Sepal_Length + Sepal_Width)
+  prediction <- predict(binomial_model, binomial_training)
+  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
+  expected <- c("virginica", "virginica", "virginica", "versicolor", "virginica",
+                "versicolor", "virginica", "versicolor", "virginica", "versicolor")
+  expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)
+
   # test multinomial logistic regression
-  label <- c(0.0, 1.0, 2.0, 0.0, 0.0)
+  labels <- c(0.0, 1.0, 2.0, 0.0, 0.0)
   feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
   feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
   feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
   feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
-  data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4))
+  data <- as.data.frame(cbind(labels, feature1, feature2, feature3, feature4))
   df <- createDataFrame(data)
 
-  model <- spark.logit(df, label ~., family = "multinomial", thresholds = c(0, 1, 1))
+  model <- spark.logit(df, labels ~., family = "multinomial", thresholds = c(0, 1, 1))
   predict1 <- collect(select(predict(model, df), "prediction"))
   expect_equal(predict1$prediction, c(0, 0, 0, 0, 0))
   # Summary of multinomial logistic regression is not implemented yet
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
index 9b352c9863114..2bda4be3155c3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
@@ -23,9 +23,9 @@ import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.ml.{Pipeline, PipelineModel}
-import org.apache.spark.ml.attribute.AttributeGroup
+import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute}
 import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel}
-import org.apache.spark.ml.feature.RFormula
+import org.apache.spark.ml.feature.{IndexToString, RFormula}
 import org.apache.spark.ml.util._
 import org.apache.spark.sql.{DataFrame, Dataset}
 
@@ -34,6 +34,8 @@ private[r] class LogisticRegressionWrapper private (
     val features: Array[String],
     val isLoaded: Boolean = false) extends MLWritable {
 
+  import LogisticRegressionWrapper._
+
   private val logisticRegressionModel: LogisticRegressionModel =
     pipeline.stages(1).asInstanceOf[LogisticRegressionModel]
 
@@ -57,7 +59,9 @@ private[r] class LogisticRegressionWrapper private (
   lazy val recallByThreshold: DataFrame = blrSummary.recallByThreshold
 
   def transform(dataset: Dataset[_]): DataFrame = {
-    pipeline.transform(dataset).drop(logisticRegressionModel.getFeaturesCol)
+    pipeline.transform(dataset)
+      .drop(PREDICTED_LABEL_INDEX_COL)
+      .drop(logisticRegressionModel.getFeaturesCol)
   }
 
   override def write: MLWriter = new LogisticRegressionWrapper.LogisticRegressionWrapperWriter(this)
@@ -66,6 +70,9 @@ private[r] class LogisticRegressionWrapper private (
 private[r] object LogisticRegressionWrapper
     extends MLReadable[LogisticRegressionWrapper] {
 
+  val PREDICTED_LABEL_INDEX_COL = "pred_label_idx"
+  val PREDICTED_LABEL_COL = "prediction"
+
   def fit( // scalastyle:ignore
       data: DataFrame,
       formula: String,
@@ -84,6 +91,7 @@ private[r] object LogisticRegressionWrapper
 
     val rFormula = new RFormula()
       .setFormula(formula)
+      .setForceIndexLabel(true)
     RWrapperUtils.checkDataColumns(rFormula, data)
     val rFormulaModel = rFormula.fit(data)
 
@@ -93,6 +101,11 @@ private[r] object LogisticRegressionWrapper
       .attributes.get
     val features = featureAttrs.map(_.name.get)
 
+    // get label names from output schema
+    val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol))
+      .asInstanceOf[NominalAttribute]
+    val labels = labelAttr.values.get
+
     // assemble and fit the pipeline
     val logisticRegression = new LogisticRegression()
       .setRegParam(regParam)
@@ -106,6 +119,7 @@ private[r] object LogisticRegressionWrapper
       .setAggregationDepth(aggregationDepth)
       .setFeaturesCol(rFormula.getFeaturesCol)
       .setProbabilityCol(probability)
+      .setPredictionCol(PREDICTED_LABEL_INDEX_COL)
 
     if (thresholds.length > 1) {
       logisticRegression.setThresholds(thresholds)
@@ -113,8 +127,13 @@ private[r] object LogisticRegressionWrapper
       logisticRegression.setThreshold(thresholds(0))
     }
 
+    val idxToStr = new IndexToString()
+      .setInputCol(PREDICTED_LABEL_INDEX_COL)
+      .setOutputCol(PREDICTED_LABEL_COL)
+      .setLabels(labels)
+
     val pipeline = new Pipeline()
-      .setStages(Array(rFormulaModel, logisticRegression))
+      .setStages(Array(rFormulaModel, logisticRegression, idxToStr))
       .fit(data)
 
     new LogisticRegressionWrapper(pipeline, features)

From 9d19284dd8b68eaacc6cb0d476eeb9ebfd7cc4c7 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Wed, 16 Nov 2016 16:28:40 -0800
Subject: [PATCH 3/7] modify test case; add output label

---
 R/pkg/R/mllib.R                               |  2 +-
 R/pkg/inst/tests/testthat/test_mllib.R        | 36 +++++++++----------
 .../ml/r/LogisticRegressionWrapper.scala      | 19 ++++------
 3 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 02bc6456de4d0..54ea8373f2094 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -747,7 +747,7 @@ setMethod("predict", signature(object = "KMeansModel"),
 #' \dontrun{
 #' sparkR.session()
 #' # binary logistic regression
-#' label <- c(1.0, 1.0, 1.0, 0.0, 0.0)
+#' label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
 #' feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
 #' binary_data <- as.data.frame(cbind(label, feature))
 #' binary_df <- createDataFrame(binary_data)
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 613a7ee6b3194..be3087e77b083 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -646,31 +646,31 @@ test_that("spark.isotonicRegression", {
 
 test_that("spark.logit", {
   # test binary logistic regression
-  labels <- c(1.0, 1.0, 1.0, 0.0, 0.0)
+  label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
   feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
-  binary_data <- as.data.frame(cbind(labels, feature))
+  binary_data <- as.data.frame(cbind(label, feature))
   binary_df <- createDataFrame(binary_data)
 
-  blr_model <- spark.logit(binary_df, labels ~ feature, thresholds = 1.0)
+  blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0)
   blr_predict <- collect(select(predict(blr_model, binary_df), "prediction"))
-  expect_equal(blr_predict$prediction, c(0, 0, 0, 0, 0))
-  blr_model1 <- spark.logit(binary_df, labels ~ feature, thresholds = 0.0)
+  expect_equal(blr_predict$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0"))
+  blr_model1 <- spark.logit(binary_df, label ~ feature, thresholds = 0.0)
   blr_predict1 <- collect(select(predict(blr_model1, binary_df), "prediction"))
-  expect_equal(blr_predict1$prediction, c(1, 1, 1, 1, 1))
+  expect_equal(blr_predict1$prediction, c("1.0", "1.0", "1.0", "1.0", "1.0"))
 
   # test summary of binary logistic regression
   blr_summary <- summary(blr_model)
   blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure"))
-  expect_equal(blr_fmeasure$threshold, c(0.8221347, 0.7884005, 0.6674709, 0.3785437, 0.3434487),
-               tolerance = 1e-4)
-  expect_equal(blr_fmeasure$"F-Measure", c(0.5000000, 0.8000000, 0.6666667, 0.8571429, 0.7500000),
-               tolerance = 1e-4)
+  expect_equal(blr_fmeasure$threshold, c(0.6565513, 0.6214563, 0.3325291, 0.2115995, 0.1778653),
+  tolerance = 1e-4)
+  expect_equal(blr_fmeasure$"F-Measure", c(0.6666667, 0.5000000, 0.8000000, 0.6666667, 0.5714286),
+  tolerance = 1e-4)
   blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision"))
-  expect_equal(blr_precision$precision, c(1.0000000, 1.0000000, 0.6666667, 0.7500000, 0.6000000),
-               tolerance = 1e-4)
+  expect_equal(blr_precision$precision, c(1.0000000, 0.5000000, 0.6666667, 0.5000000, 0.4000000),
+  tolerance = 1e-4)
   blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall"))
-  expect_equal(blr_recall$recall, c(0.3333333, 0.6666667, 0.6666667, 1.0000000, 1.0000000),
-               tolerance = 1e-4)
+  expect_equal(blr_recall$recall, c(0.5000000, 0.5000000, 1.0000000, 1.0000000, 1.0000000),
+  tolerance = 1e-4)
 
   # test model save and read
   modelPath <- tempfile(pattern = "spark-logisticRegression", fileext = ".tmp")
@@ -694,17 +694,17 @@ test_that("spark.logit", {
   expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)
 
   # test multinomial logistic regression
-  labels <- c(0.0, 1.0, 2.0, 0.0, 0.0)
+  label <- c(0.0, 1.0, 2.0, 0.0, 0.0)
   feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
   feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
   feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
   feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
-  data <- as.data.frame(cbind(labels, feature1, feature2, feature3, feature4))
+  data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4))
   df <- createDataFrame(data)
 
-  model <- spark.logit(df, labels ~., family = "multinomial", thresholds = c(0, 1, 1))
+  model <- spark.logit(df, label ~., family = "multinomial", thresholds = c(0, 1, 1))
   predict1 <- collect(select(predict(model, df), "prediction"))
-  expect_equal(predict1$prediction, c(0, 0, 0, 0, 0))
+  expect_equal(predict1$prediction, c("0.0", "0.0", "0.0", "0.0", "0.0"))
   # Summary of multinomial logistic regression is not implemented yet
   expect_error(summary(model))
 })
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
index 2bda4be3155c3..42f376c425402 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
@@ -23,9 +23,9 @@ import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.ml.{Pipeline, PipelineModel}
-import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute}
 import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel}
 import org.apache.spark.ml.feature.{IndexToString, RFormula}
+import org.apache.spark.ml.r.RWrapperUtils._
 import org.apache.spark.ml.util._
 import org.apache.spark.sql.{DataFrame, Dataset}
 
@@ -62,6 +62,8 @@ private[r] class LogisticRegressionWrapper private (
     pipeline.transform(dataset)
       .drop(PREDICTED_LABEL_INDEX_COL)
       .drop(logisticRegressionModel.getFeaturesCol)
+      .drop(logisticRegressionModel.getLabelCol)
+    
   }
 
   override def write: MLWriter = new LogisticRegressionWrapper.LogisticRegressionWrapperWriter(this)
@@ -92,19 +94,11 @@ private[r] object LogisticRegressionWrapper
     val rFormula = new RFormula()
       .setFormula(formula)
       .setForceIndexLabel(true)
-    RWrapperUtils.checkDataColumns(rFormula, data)
+    checkDataColumns(rFormula, data)
     val rFormulaModel = rFormula.fit(data)
 
-    // get feature names from output schema
-    val schema = rFormulaModel.transform(data).schema
-    val featureAttrs = AttributeGroup.fromStructField(schema(rFormulaModel.getFeaturesCol))
-      .attributes.get
-    val features = featureAttrs.map(_.name.get)
-
-    // get label names from output schema
-    val labelAttr = Attribute.fromStructField(schema(rFormulaModel.getLabelCol))
-      .asInstanceOf[NominalAttribute]
-    val labels = labelAttr.values.get
+    // get labels and feature names from output schema
+    val (features, labels) = getFeaturesAndLabels(rFormulaModel, data)
 
     // assemble and fit the pipeline
     val logisticRegression = new LogisticRegression()
@@ -118,6 +112,7 @@ private[r] object LogisticRegressionWrapper
       .setWeightCol(weightCol)
       .setAggregationDepth(aggregationDepth)
       .setFeaturesCol(rFormula.getFeaturesCol)
+      .setLabelCol(rFormula.getLabelCol)
       .setProbabilityCol(probability)
       .setPredictionCol(PREDICTED_LABEL_INDEX_COL)
 

From 71f7de283625ee9b123fc2c81b5e969160b9a4a6 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Wed, 16 Nov 2016 16:45:01 -0800
Subject: [PATCH 4/7] fix scala style error

---
 .../scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
index 42f376c425402..4a86fe0e73531 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
@@ -63,7 +63,7 @@ private[r] class LogisticRegressionWrapper private (
       .drop(PREDICTED_LABEL_INDEX_COL)
       .drop(logisticRegressionModel.getFeaturesCol)
       .drop(logisticRegressionModel.getLabelCol)
-    
+
   }
 
   override def write: MLWriter = new LogisticRegressionWrapper.LogisticRegressionWrapperWriter(this)

From 57cf430743b836f9e145285a203146ceef9d4ad8 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Mon, 28 Nov 2016 10:58:33 -0800
Subject: [PATCH 5/7] address review comments

---
 R/pkg/R/mllib.R                        | 16 ++++++++--------
 R/pkg/inst/tests/testthat/test_mllib.R |  8 ++++----
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 54ea8373f2094..7a04708739251 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -748,10 +748,10 @@ setMethod("predict", signature(object = "KMeansModel"),
 #' sparkR.session()
 #' # binary logistic regression
 #' label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
-#' feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
-#' binary_data <- as.data.frame(cbind(label, feature))
+#' features <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
+#' binary_data <- as.data.frame(cbind(label, features))
 #' binary_df <- createDataFrame(binary_data)
-#' blr_model <- spark.logit(binary_df, label ~ feature, thresholds = 1.0)
+#' blr_model <- spark.logit(binary_df, label ~ features, thresholds = 1.0)
 #' blr_predict <- collect(select(predict(blr_model, binary_df), "prediction"))
 #'
 #' # summary of binary logistic regression
@@ -769,11 +769,11 @@ setMethod("predict", signature(object = "KMeansModel"),
 #' # multinomial logistic regression
 #'
 #' label <- c(0.0, 1.0, 2.0, 0.0, 0.0)
-#' feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
-#' feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
-#' feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
-#' feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
-#' data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4))
+#' features1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
+#' features2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
+#' features3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
+#' features4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
+#' data <- as.data.frame(cbind(label, features1, features2, features3, features4))
 #' df <- createDataFrame(data)
 #'
 #' # Note that summary of multinomial logistic regression is not implemented yet
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index be3087e77b083..0553e704bde9f 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -662,15 +662,15 @@ test_that("spark.logit", {
   blr_summary <- summary(blr_model)
   blr_fmeasure <- collect(select(blr_summary$fMeasureByThreshold, "threshold", "F-Measure"))
   expect_equal(blr_fmeasure$threshold, c(0.6565513, 0.6214563, 0.3325291, 0.2115995, 0.1778653),
-  tolerance = 1e-4)
+               tolerance = 1e-4)
   expect_equal(blr_fmeasure$"F-Measure", c(0.6666667, 0.5000000, 0.8000000, 0.6666667, 0.5714286),
-  tolerance = 1e-4)
+               tolerance = 1e-4)
   blr_precision <- collect(select(blr_summary$precisionByThreshold, "threshold", "precision"))
   expect_equal(blr_precision$precision, c(1.0000000, 0.5000000, 0.6666667, 0.5000000, 0.4000000),
-  tolerance = 1e-4)
+               tolerance = 1e-4)
   blr_recall <- collect(select(blr_summary$recallByThreshold, "threshold", "recall"))
   expect_equal(blr_recall$recall, c(0.5000000, 0.5000000, 1.0000000, 1.0000000, 1.0000000),
-  tolerance = 1e-4)
+               tolerance = 1e-4)
 
   # test model save and read
   modelPath <- tempfile(pattern = "spark-logisticRegression", fileext = ".tmp")

From de749062bcca36c6a331639a7f891008d970422d Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Mon, 28 Nov 2016 14:55:01 -0800
Subject: [PATCH 6/7] fix bug of fitintercept

---
 R/pkg/R/mllib.R                                       | 11 +++++------
 .../apache/spark/ml/r/LogisticRegressionWrapper.scala |  3 ++-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 7a04708739251..3e467ad964f9e 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -712,7 +712,6 @@ setMethod("predict", signature(object = "KMeansModel"),
 #'                        of L1 and L2. Default is 0.0 which is an L2 penalty.
 #' @param maxIter maximum iteration number.
 #' @param tol convergence tolerance of iterations.
-#' @param fitIntercept whether to fit an intercept term.
 #' @param family the name of family which is a description of the label distribution to be used in the model.
 #'               Supported options:
 #'                 \itemize{
@@ -783,7 +782,7 @@ setMethod("predict", signature(object = "KMeansModel"),
 #' @note spark.logit since 2.1.0
 setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
-                   tol = 1E-6, fitIntercept = TRUE, family = "auto", standardization = TRUE,
+                   tol = 1E-6, family = "auto", standardization = TRUE,
                    thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
                    probabilityCol = "probability") {
             formula <- paste(deparse(formula), collapse = "")
@@ -795,10 +794,10 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
             jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
                                 data@sdf, formula, as.numeric(regParam),
                                 as.numeric(elasticNetParam), as.integer(maxIter),
-                                as.numeric(tol), as.logical(fitIntercept),
-                                as.character(family), as.logical(standardization),
-                                as.array(thresholds), as.character(weightCol),
-                                as.integer(aggregationDepth), as.character(probabilityCol))
+                                as.numeric(tol), as.character(family),
+                                as.logical(standardization), as.array(thresholds),
+                                as.character(weightCol), as.integer(aggregationDepth),
+                                as.character(probabilityCol))
             new("LogisticRegressionModel", jobj = jobj)
           })
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
index 4a86fe0e73531..9fe6202980fca 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
@@ -82,7 +82,6 @@ private[r] object LogisticRegressionWrapper
       elasticNetParam: Double,
       maxIter: Int,
       tol: Double,
-      fitIntercept: Boolean,
       family: String,
       standardization: Boolean,
       thresholds: Array[Double],
@@ -97,6 +96,8 @@ private[r] object LogisticRegressionWrapper
     checkDataColumns(rFormula, data)
     val rFormulaModel = rFormula.fit(data)
 
+    val fitIntercept = rFormula.hasIntercept
+
     // get labels and feature names from output schema
     val (features, labels) = getFeaturesAndLabels(rFormulaModel, data)
 

From b1f7b238a4655e5f4493a741daff1d0a45521e5a Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Wed, 30 Nov 2016 09:27:30 -0800
Subject: [PATCH 7/7] address review comments

---
 R/pkg/R/mllib.R | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 3e467ad964f9e..eed829356f2be 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -768,11 +768,11 @@ setMethod("predict", signature(object = "KMeansModel"),
 #' # multinomial logistic regression
 #'
 #' label <- c(0.0, 1.0, 2.0, 0.0, 0.0)
-#' features1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
-#' features2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
-#' features3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
-#' features4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
-#' data <- as.data.frame(cbind(label, features1, features2, features3, features4))
+#' feature1 <- c(4.845940, 5.64480, 7.430381, 6.464263, 5.555667)
+#' feature2 <- c(2.941319, 2.614812, 2.162451, 3.339474, 2.970987)
+#' feature3 <- c(1.322733, 1.348044, 3.861237, 9.686976, 3.447130)
+#' feature4 <- c(1.3246388, 0.5510444, 0.9225810, 1.2147881, 1.6020842)
+#' data <- as.data.frame(cbind(label, feature1, feature2, feature3, feature4))
 #' df <- createDataFrame(data)
 #'
 #' # Note that summary of multinomial logistic regression is not implemented yet