From 4d27123926ee87231a73aea9dc34555c404c7f1b Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Fri, 12 May 2017 00:52:13 -0700
Subject: [PATCH 01/10] add stringOrderType to RFormula

---
 .../apache/spark/ml/feature/RFormula.scala    | 30 +++++++++++++
 .../spark/ml/feature/RFormulaSuite.scala      | 44 +++++++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index 5a3e2929f5f52..5eeb282b35da6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -26,6 +26,7 @@ import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer}
 import org.apache.spark.ml.attribute.AttributeGroup
 import org.apache.spark.ml.linalg.VectorUDT
+import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap}
 import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol}
 import org.apache.spark.ml.util._
@@ -37,6 +38,29 @@ import org.apache.spark.sql.types._
  */
 private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
 
+  /**
+    * Param for how to order labels of string column. The first label after ordering is assigned
+    * an index of 0.
+    * Options are:
+    *   - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0)
+    *   - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0)
+    *   - 'alphabetDesc': descending alphabetical order
+    *   - 'alphabetAsc': ascending alphabetical order
+    * Default is 'frequencyDesc'.
+    *
+    * @group param
+    */
+  @Since("2.3.0")
+  final val stringOrderType: Param[String] = new Param(this, "stringOrderType",
+    "how to order labels of string column. " +
+      "The first label after ordering is assigned an index of 0. " +
+      s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.",
+    ParamValidators.inArray(StringIndexer.supportedStringOrderType))
+
+  /** @group getParam */
+  @Since("2.3.0")
+  def getStringOrderType: String = $(stringOrderType)
+
   protected def hasLabelCol(schema: StructType): Boolean = {
     schema.map(_.name).contains($(labelCol))
   }
@@ -125,6 +149,11 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
   @Since("2.1.0")
   def setForceIndexLabel(value: Boolean): this.type = set(forceIndexLabel, value)
 
+  /** @group setParam */
+  @Since("2.3.0")
+  def setStringOrderType(value: String): this.type = set(stringOrderType, value)
+  setDefault(stringOrderType, StringIndexer.frequencyDesc)
+
   /** Whether the formula specifies fitting an intercept. */
   private[ml] def hasIntercept: Boolean = {
     require(isDefined(formula), "Formula must be defined first.")
@@ -155,6 +184,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
           encoderStages += new StringIndexer()
             .setInputCol(term)
             .setOutputCol(indexCol)
+            .setStringOrderType($(stringOrderType))
           prefixesToRewrite(indexCol + "_") = term + "_"
           (term, indexCol)
         case _ =>
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
index fbebd75d70ac5..68708e5d60538 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -129,6 +129,50 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
     assert(result.collect() === expected.collect())
   }
 
+  test("encodes string terms with string order type") {
+    val formula = new RFormula().setFormula("id ~ a + b")
+    val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5))
+      .toDF("id", "a", "b")
+
+    val expected = Seq(
+      Seq(
+        (1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0),
+        (2, "bar", 4, Vectors.dense(1.0, 0.0, 4.0), 2.0),
+        (3, "bar", 5, Vectors.dense(1.0, 0.0, 5.0), 3.0),
+        (4, "aaz", 5, Vectors.dense(0.0, 1.0, 5.0), 4.0)
+      ).toDF("id", "a", "b", "features", "label"),
+      Seq(
+        (1, "foo", 4, Vectors.dense(0.0, 1.0, 4.0), 1.0),
+        (2, "bar", 4, Vectors.dense(0.0, 0.0, 4.0), 2.0),
+        (3, "bar", 5, Vectors.dense(0.0, 0.0, 5.0), 3.0),
+        (4, "aaz", 5, Vectors.dense(1.0, 0.0, 5.0), 4.0)
+      ).toDF("id", "a", "b", "features", "label"),
+      Seq(
+        (1, "foo", 4, Vectors.dense(1.0, 0.0, 4.0), 1.0),
+        (2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0),
+        (3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0),
+        (4, "aaz", 5, Vectors.dense(0.0, 0.0, 5.0), 4.0)
+      ).toDF("id", "a", "b", "features", "label"),
+      Seq(
+        (1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0),
+        (2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0),
+        (3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0),
+        (4, "aaz", 5, Vectors.dense(1.0, 0.0, 5.0), 4.0)
+      ).toDF("id", "a", "b", "features", "label")
+    )
+
+    var idx = 0
+    for (orderType <- StringIndexer.supportedStringOrderType) {
+      val model = formula.setStringOrderType(orderType).fit(original)
+      val result = model.transform(original)
+      val resultSchema = model.transformSchema(original.schema)
+
+      assert(result.schema.toString == resultSchema.toString)
+      assert(result.collect() === expected(idx).collect())
+      idx += 1
+    }
+  }
+
   test("index string label") {
     val formula = new RFormula().setFormula("id ~ a + b")
     val original =

From 6841c33768adf1b1397dc5aa36e34abdb8d6ff8a Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Fri, 12 May 2017 09:30:12 -0700
Subject: [PATCH 02/10] clean up import

---
 .../src/main/scala/org/apache/spark/ml/feature/RFormula.scala  | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index 5eeb282b35da6..b6ec58df62db8 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -26,8 +26,7 @@ import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer}
 import org.apache.spark.ml.attribute.AttributeGroup
 import org.apache.spark.ml.linalg.VectorUDT
-import org.apache.spark.ml.param._
-import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap}
+import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, ParamValidators}
 import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol}
 import org.apache.spark.ml.util._
 import org.apache.spark.sql.{DataFrame, Dataset}

From 77fe864770420719d396715479fc1f452a80b8da Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Fri, 12 May 2017 10:48:44 -0700
Subject: [PATCH 03/10] add comparison to R

---
 .../spark/ml/feature/RFormulaSuite.scala      | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
index 68708e5d60538..b010a80c28c2f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -173,6 +173,46 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
     }
   }
 
+  test("test consistency with R when encoding string terms") {
+    /*
+     R code:
+
+     df <- list(list(1, "foo", 4), list(2, "bar", 4), list(3, "bar", 5), list(4, "aaz", 5))
+     df <- do.call(rbind, lapply(df, as.data.frame, col.names = c("id", "a", "b")))
+     model.matrix(id ~ a + b, df)[, -1]
+
+     abar aaaz b
+      0    0   4
+      1    0   4
+      1    0   5
+      0    1   5
+    */
+    val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5))
+      .toDF("id", "a", "b")
+    val formula = new RFormula().setFormula("id ~ a + b")
+      .setStringOrderType(StringIndexer.alphabetAsc)
+
+    /*
+     Note that the category dropped after encoding is the same between R and Spark
+     (i.e., "foo" is treated as the reference level).
+     However, the column order is still different:
+     R renders the columns in descending alphabetical order ("bar", "aaz"), while
+     RFormula renders the columns in ascending alphabetical order ("aaz", "bar").
+    */
+    val expected = Seq(
+      (1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0),
+      (2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0),
+      (3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0),
+      (4, "aaz", 5, Vectors.dense(1.0, 0.0, 5.0), 4.0)
+    ).toDF("id", "a", "b", "features", "label")
+
+    val model = formula.fit(original)
+    val result = model.transform(original)
+    val resultSchema = model.transformSchema(original.schema)
+    assert(result.schema.toString == resultSchema.toString)
+    assert(result.collect() === expected.collect())
+  }
+
   test("index string label") {
     val formula = new RFormula().setFormula("id ~ a + b")
     val original =

From a1be94cf92649ec553da3b47fd481f5a1ac37079 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Fri, 12 May 2017 11:04:59 -0700
Subject: [PATCH 04/10] fix style

---
 .../apache/spark/ml/feature/RFormula.scala    | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index b6ec58df62db8..0a7e1dca427a4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -38,22 +38,22 @@ import org.apache.spark.sql.types._
 private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
 
   /**
-    * Param for how to order labels of string column. The first label after ordering is assigned
-    * an index of 0.
-    * Options are:
-    *   - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0)
-    *   - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0)
-    *   - 'alphabetDesc': descending alphabetical order
-    *   - 'alphabetAsc': ascending alphabetical order
-    * Default is 'frequencyDesc'.
-    *
-    * @group param
-    */
+   * Param for how to order labels of string column. The first label after ordering is assigned
+   * an index of 0.
+   * Options are:
+   *   - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0)
+   *   - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0)
+   *   - 'alphabetDesc': descending alphabetical order
+   *   - 'alphabetAsc': ascending alphabetical order
+   * Default is 'frequencyDesc'.
+   *
+   * @group param
+   */
   @Since("2.3.0")
   final val stringOrderType: Param[String] = new Param(this, "stringOrderType",
     "how to order labels of string column. " +
-      "The first label after ordering is assigned an index of 0. " +
-      s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.",
+    "The first label after ordering is assigned an index of 0. " +
+    s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.",
     ParamValidators.inArray(StringIndexer.supportedStringOrderType))
 
   /** @group getParam */

From 698588e15b0407e987dad77fb060f0404c8276a9 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Sun, 14 May 2017 13:27:45 -0700
Subject: [PATCH 05/10] correct typo and update example

---
 .../apache/spark/ml/feature/RFormula.scala    |  2 +-
 .../spark/ml/feature/StringIndexer.scala      |  4 ++--
 .../spark/ml/feature/RFormulaSuite.scala      | 24 +++++++++----------
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index 0a7e1dca427a4..5d9916437d831 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -51,7 +51,7 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
    */
   @Since("2.3.0")
   final val stringOrderType: Param[String] = new Param(this, "stringOrderType",
-    "how to order labels of string column. " +
+    "How to order labels of string column. " +
     "The first label after ordering is assigned an index of 0. " +
     s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.",
     ParamValidators.inArray(StringIndexer.supportedStringOrderType))
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index b2dc4fcb61964..dfc902bd0b0f1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -47,7 +47,7 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha
    * @group param
    */
   @Since("1.6.0")
-  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
+  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "How to handle " +
     "invalid data (unseen labels or NULL values). " +
     "Options are 'skip' (filter out rows with invalid data), error (throw an error), " +
     "or 'keep' (put invalid data in a special additional bucket, at index numLabels).",
@@ -73,7 +73,7 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha
    */
   @Since("2.3.0")
   final val stringOrderType: Param[String] = new Param(this, "stringOrderType",
-    "how to order labels of string column. " +
+    "How to order labels of string column. " +
     "The first label after ordering is assigned an index of 0. " +
     s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.",
     ParamValidators.inArray(StringIndexer.supportedStringOrderType))
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
index b010a80c28c2f..89ad8fe460740 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -166,7 +166,6 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
       val model = formula.setStringOrderType(orderType).fit(original)
       val result = model.transform(original)
       val resultSchema = model.transformSchema(original.schema)
-
       assert(result.schema.toString == resultSchema.toString)
       assert(result.collect() === expected(idx).collect())
       idx += 1
@@ -177,33 +176,34 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
     /*
      R code:
 
-     df <- list(list(1, "foo", 4), list(2, "bar", 4), list(3, "bar", 5), list(4, "aaz", 5))
-     df <- do.call(rbind, lapply(df, as.data.frame, col.names = c("id", "a", "b")))
+     df <- data.frame(id = c(1, 2, 3, 4),
+                      a = c("foo", "bar", "bar", "aaz"),
+                      b = c(4, 4, 5, 5))
      model.matrix(id ~ a + b, df)[, -1]
 
-     abar aaaz b
-      0    0   4
+     abar afoo b
+      0    1   4
       1    0   4
       1    0   5
-      0    1   5
+      0    0   5
     */
     val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5))
       .toDF("id", "a", "b")
     val formula = new RFormula().setFormula("id ~ a + b")
-      .setStringOrderType(StringIndexer.alphabetAsc)
+      .setStringOrderType(StringIndexer.alphabetDesc)
 
     /*
      Note that the category dropped after encoding is the same between R and Spark
-     (i.e., "foo" is treated as the reference level).
+     (i.e., "aaz" is treated as the reference level).
      However, the column order is still different:
-     R renders the columns in descending alphabetical order ("bar", "aaz"), while
-     RFormula renders the columns in ascending alphabetical order ("aaz", "bar").
+     R renders the columns in ascending alphabetical order ("bar", "foo"), while
+     RFormula renders the columns in descending alphabetical order ("foo", "bar").
     */
     val expected = Seq(
-      (1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0),
+      (1, "foo", 4, Vectors.dense(1.0, 0.0, 4.0), 1.0),
       (2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0),
       (3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0),
-      (4, "aaz", 5, Vectors.dense(1.0, 0.0, 5.0), 4.0)
+      (4, "aaz", 5, Vectors.dense(0.0, 0.0, 5.0), 4.0)
     ).toDF("id", "a", "b", "features", "label")
 
     val model = formula.fit(original)

From 147311ba34db55f6aa6ffc3cf75f0c80c8c29cbf Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Fri, 19 May 2017 00:00:20 -0700
Subject: [PATCH 06/10] improve doc

---
 mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index 5d9916437d831..3e0388a522c8d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -46,6 +46,8 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
    *   - 'alphabetDesc': descending alphabetical order
    *   - 'alphabetAsc': ascending alphabetical order
    * Default is 'frequencyDesc'.
+   * When the ordering is set to 'alphabetDesc', `RFormula` drops the same category as R
+   * when encoding strings.
    *
    * @group param
    */

From 5f31d311c0c39da1968686dd4147376b3888cee3 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Fri, 19 May 2017 19:32:12 -0700
Subject: [PATCH 07/10] change param name and update doc

---
 .../apache/spark/ml/feature/RFormula.scala    | 40 +++++++++++--------
 .../spark/ml/feature/RFormulaSuite.scala      |  6 +--
 2 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index 3e0388a522c8d..7e73f02e99793 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -38,29 +38,35 @@ import org.apache.spark.sql.types._
 private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
 
   /**
-   * Param for how to order labels of string column. The first label after ordering is assigned
-   * an index of 0.
-   * Options are:
-   *   - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0)
-   *   - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0)
-   *   - 'alphabetDesc': descending alphabetical order
-   *   - 'alphabetAsc': ascending alphabetical order
-   * Default is 'frequencyDesc'.
-   * When the ordering is set to 'alphabetDesc', `RFormula` drops the same category as R
-   * when encoding strings.
+   * Param for how to order categories of a FEATURE string column used by `StringIndexer`.
+   * The last category after ordering is dropped when encoding strings.
+   * The options are explained using an example string: 'b', 'a', 'b', 'a', 'c', 'b'
+   * |
+   * | Option | Category mapped to 0 by StringIndexer |  Category dropped by RFormula
+   * | 'frequencyDesc' | most frequent category ('b') | least frequent category ('c')
+   * | 'frequencyAsc' | least frequent category ('c') | most frequent category ('b')
+   * | 'alphabetDesc' | first alphabetical category ('a') | last alphabetical category ('c')
+   * | 'alphabetAsc' | last alphabetical category ('c') | last alphabetical category ('a')
+   * |
+   * The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', `RFormula`
+   * drops the same category as R when encoding strings.
+   * Note that this ordering option is NOT used for the label column. When the label column is
+   * indexed, it uses the default descending frequency ordering in `StringIndexer`.
    *
    * @group param
    */
   @Since("2.3.0")
-  final val stringOrderType: Param[String] = new Param(this, "stringOrderType",
-    "How to order labels of string column. " +
-    "The first label after ordering is assigned an index of 0. " +
+  final val stringIndexerOrderType: Param[String] = new Param(this, "stringIndexerOrderType",
+    "How to order categories of a FEATURE string column used by StringIndexer. " +
+    "The last category after ordering is dropped when encoding strings. " +
+    "The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', " +
+    "RFormula drops the same category as R when encoding strings." +
     s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.",
     ParamValidators.inArray(StringIndexer.supportedStringOrderType))
 
   /** @group getParam */
   @Since("2.3.0")
-  def getStringOrderType: String = $(stringOrderType)
+  def getStringIndexerOrderType: String = $(stringIndexerOrderType)
 
   protected def hasLabelCol(schema: StructType): Boolean = {
     schema.map(_.name).contains($(labelCol))
@@ -152,8 +158,8 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
 
   /** @group setParam */
   @Since("2.3.0")
-  def setStringOrderType(value: String): this.type = set(stringOrderType, value)
-  setDefault(stringOrderType, StringIndexer.frequencyDesc)
+  def setStringIndexerOrderType(value: String): this.type = set(stringIndexerOrderType, value)
+  setDefault(stringIndexerOrderType, StringIndexer.frequencyDesc)
 
   /** Whether the formula specifies fitting an intercept. */
   private[ml] def hasIntercept: Boolean = {
@@ -185,7 +191,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
           encoderStages += new StringIndexer()
             .setInputCol(term)
             .setOutputCol(indexCol)
-            .setStringOrderType($(stringOrderType))
+            .setStringOrderType($(stringIndexerOrderType))
           prefixesToRewrite(indexCol + "_") = term + "_"
           (term, indexCol)
         case _ =>
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
index 89ad8fe460740..41d0062c2cabd 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -129,7 +129,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
     assert(result.collect() === expected.collect())
   }
 
-  test("encodes string terms with string order type") {
+  test("encodes string terms with string indexer order type") {
     val formula = new RFormula().setFormula("id ~ a + b")
     val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5))
       .toDF("id", "a", "b")
@@ -163,7 +163,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
 
     var idx = 0
     for (orderType <- StringIndexer.supportedStringOrderType) {
-      val model = formula.setStringOrderType(orderType).fit(original)
+      val model = formula.setStringIndexerOrderType(orderType).fit(original)
       val result = model.transform(original)
       val resultSchema = model.transformSchema(original.schema)
       assert(result.schema.toString == resultSchema.toString)
@@ -190,7 +190,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
     val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5))
       .toDF("id", "a", "b")
     val formula = new RFormula().setFormula("id ~ a + b")
-      .setStringOrderType(StringIndexer.alphabetDesc)
+      .setStringIndexerOrderType(StringIndexer.alphabetDesc)
 
     /*
      Note that the category dropped after encoding is the same between R and Spark

From 341949c4c1e09baa9478e54e06aa1133b3c6fc86 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Sat, 20 May 2017 00:12:22 -0700
Subject: [PATCH 08/10] fix scala doc style

---
 .../org/apache/spark/ml/feature/RFormula.scala  | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index 7e73f02e99793..92440a2af6aa4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -41,13 +41,16 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
    * Param for how to order categories of a FEATURE string column used by `StringIndexer`.
    * The last category after ordering is dropped when encoding strings.
    * The options are explained using an example string: 'b', 'a', 'b', 'a', 'c', 'b'
-   * |
-   * | Option | Category mapped to 0 by StringIndexer |  Category dropped by RFormula
-   * | 'frequencyDesc' | most frequent category ('b') | least frequent category ('c')
-   * | 'frequencyAsc' | least frequent category ('c') | most frequent category ('b')
-   * | 'alphabetDesc' | first alphabetical category ('a') | last alphabetical category ('c')
-   * | 'alphabetAsc' | last alphabetical category ('c') | last alphabetical category ('a')
-   * |
+   * {{{
+   * +-----------------+---------------------------------------+---------------------------------+
+   * |      Option     | Category mapped to 0 by StringIndexer |  Category dropped by RFormula   |
+   * +-----------------+---------------------------------------+---------------------------------+
+   * | 'frequencyDesc' | most frequent category ('b')          | least frequent category ('c')   |
+   * | 'frequencyAsc'  | least frequent category ('c')         | most frequent category ('b')    |
+   * | 'alphabetDesc'  | first alphabetical category ('a')     | last alphabetical category ('c')|
+   * | 'alphabetAsc'   | last alphabetical category ('c')      | last alphabetical category ('a')|
+   * +-----------------+---------------------------------------+---------------------------------+
+   * }}}
    * The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', `RFormula`
    * drops the same category as R when encoding strings.
    * Note that this ordering option is NOT used for the label column. When the label column is

From 24818a7b77676665f9e58a88f8cc59073e368062 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Sat, 20 May 2017 12:24:14 -0700
Subject: [PATCH 09/10] fix typo

---
 .../apache/spark/ml/feature/RFormula.scala    | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index 92440a2af6aa4..2f9f131f0bc2c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -38,21 +38,23 @@ import org.apache.spark.sql.types._
 private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
 
   /**
-   * Param for how to order categories of a FEATURE string column used by `StringIndexer`.
+   * Param for how to order categories of a string FEATURE column used by `StringIndexer`.
    * The last category after ordering is dropped when encoding strings.
-   * The options are explained using an example string: 'b', 'a', 'b', 'a', 'c', 'b'
-   * {{{
-   * +-----------------+---------------------------------------+---------------------------------+
-   * |      Option     | Category mapped to 0 by StringIndexer |  Category dropped by RFormula   |
-   * +-----------------+---------------------------------------+---------------------------------+
-   * | 'frequencyDesc' | most frequent category ('b')          | least frequent category ('c')   |
-   * | 'frequencyAsc'  | least frequent category ('c')         | most frequent category ('b')    |
-   * | 'alphabetDesc'  | first alphabetical category ('a')     | last alphabetical category ('c')|
-   * | 'alphabetAsc'   | last alphabetical category ('c')      | last alphabetical category ('a')|
-   * +-----------------+---------------------------------------+---------------------------------+
-   * }}}
+   * Supported options: 'frequencyDesc', 'frequencyAsc', 'alphabetDesc', 'alphabetAsc'.
    * The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', `RFormula`
    * drops the same category as R when encoding strings.
+   *
+   * The options are explained using an example column with values `'b', 'a', 'b', 'a', 'c', 'b'`:
+   * {{{
+   * +-----------------+---------------------------------------+----------------------------------+
+   * |      Option     | Category mapped to 0 by StringIndexer |  Category dropped by RFormula    |
+   * +-----------------+---------------------------------------+----------------------------------+
+   * | 'frequencyDesc' | most frequent category ('b')          | least frequent category ('c')    |
+   * | 'frequencyAsc'  | least frequent category ('c')         | most frequent category ('b')     |
+   * | 'alphabetDesc'  | first alphabetical category ('a')     | last alphabetical category ('c') |
+   * | 'alphabetAsc'   | last alphabetical category ('c')      | first alphabetical category ('a')|
+   * +-----------------+---------------------------------------+----------------------------------+
+   * }}}
    * Note that this ordering option is NOT used for the label column. When the label column is
    * indexed, it uses the default descending frequency ordering in `StringIndexer`.
    *
@@ -60,11 +62,11 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
    */
   @Since("2.3.0")
   final val stringIndexerOrderType: Param[String] = new Param(this, "stringIndexerOrderType",
-    "How to order categories of a FEATURE string column used by StringIndexer. " +
+    "How to order categories of a string FEATURE column used by StringIndexer. " +
     "The last category after ordering is dropped when encoding strings. " +
+    s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}. " +
     "The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', " +
-    "RFormula drops the same category as R when encoding strings." +
-    s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.",
+    "RFormula drops the same category as R when encoding strings.",
     ParamValidators.inArray(StringIndexer.supportedStringOrderType))
 
   /** @group getParam */

From 1a1e06c9f1690e0654f78313f674c07da2b6b6f2 Mon Sep 17 00:00:00 2001
From: Wayne Zhang <actuaryzhang@uber.com>
Date: Mon, 22 May 2017 15:31:24 -0700
Subject: [PATCH 10/10] fix error in doc example

---
 .../src/main/scala/org/apache/spark/ml/feature/RFormula.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index 2f9f131f0bc2c..1fad0a6fc9443 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -51,8 +51,8 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
    * +-----------------+---------------------------------------+----------------------------------+
    * | 'frequencyDesc' | most frequent category ('b')          | least frequent category ('c')    |
    * | 'frequencyAsc'  | least frequent category ('c')         | most frequent category ('b')     |
-   * | 'alphabetDesc'  | first alphabetical category ('a')     | last alphabetical category ('c') |
-   * | 'alphabetAsc'   | last alphabetical category ('c')      | first alphabetical category ('a')|
+   * | 'alphabetDesc'  | last alphabetical category ('c')      | first alphabetical category ('a')|
+   * | 'alphabetAsc'   | first alphabetical category ('a')     | last alphabetical category ('c') |
    * +-----------------+---------------------------------------+----------------------------------+
    * }}}
    * Note that this ordering option is NOT used for the label column. When the label column is