44 changes: 43 additions & 1 deletion mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -26,7 +26,7 @@ import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer}
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap}
import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset}
@@ -37,6 +37,42 @@ import org.apache.spark.sql.types._
 */
private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {

  /**
   * Param for how to order categories of a string FEATURE column used by `StringIndexer`.
   * The last category after ordering is dropped when encoding strings.
Contributor:

Is this correct? Do you have any references? AFAIK, the R formula drops the first category, in alphabetically ascending order, when encoding a string/category feature (which is consistent with your PR description). I think test("StringIndexer order types") in #17879 is correct. Could you double-check this?

   * Supported options: 'frequencyDesc', 'frequencyAsc', 'alphabetDesc', 'alphabetAsc'.
   * The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', `RFormula`
   * drops the same category as R when encoding strings.
Contributor:

The order should be alphabetAsc to match R.

   *
   * The options are explained using an example `'b', 'a', 'b', 'a', 'c', 'b'`:
   * {{{
   * +-----------------+---------------------------------------+----------------------------------+
Member:

I would like to suggest just writing this out as prose with a simple list, if we are all fine with that for now, which I guess we would generally agree with.

Contributor Author:

@HyukjinKwon Would you please clarify what you mean by a list? Thanks.
I would like to preserve the table structure because it helps show the difference.

Member:

Ah, sure, I initially meant an HTML list like the ones we are already using:

* <ul>
* <li>`primitivesAsString` (default `false`): infers all primitive values as a string type</li>
* <li>`prefersDecimal` (default `false`): infers all floating-point values as a decimal
* type. If the values do not fit in decimal, then it infers them as doubles.</li>
* <li>`allowComments` (default `false`): ignores Java/C++ style comment in JSON records</li>
* <li>`allowUnquotedFieldNames` (default `false`): allows unquoted JSON field names</li>
* <li>`allowSingleQuotes` (default `true`): allows single quotes in addition to double quotes
* </li>
* <li>`allowNumericLeadingZeros` (default `false`): allows leading zeros in numbers
* (e.g. 00012)</li>
* <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all
* character using backslash quoting mechanism</li>
* <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
* during parsing.
* <ul>
* <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record, and puts
* the malformed string into a field configured by `columnNameOfCorruptRecord`. To keep
* corrupt records, an user can set a string type field named `columnNameOfCorruptRecord`
* in an user-defined schema. If a schema does not have the field, it drops corrupt records
* during parsing. When inferring a schema, it implicitly adds a `columnNameOfCorruptRecord`
* field in an output schema.</li>
* <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>
* <li>`FAILFAST` : throws an exception when it meets corrupted records.</li>
* </ul>
* </li>
* <li>`columnNameOfCorruptRecord` (default is the value specified in
* `spark.sql.columnNameOfCorruptRecord`): allows renaming the new field having malformed string
* created by `PERMISSIVE` mode. This overrides `spark.sql.columnNameOfCorruptRecord`.</li>
* <li>`dateFormat` (default `yyyy-MM-dd`): sets the string that indicates a date format.
* Custom date formats follow the formats at `java.text.SimpleDateFormat`. This applies to
* date type.</li>
* <li>`timestampFormat` (default `yyyy-MM-dd'T'HH:mm:ss.SSSXXX`): sets the string that
* indicates a timestamp format. Custom date formats follow the formats at
* `java.text.SimpleDateFormat`. This applies to timestamp type.</li>
* <li>`wholeFile` (default `false`): parse one record, which may span multiple lines,
* per file</li>
* </ul>
...

<ul>
  <li> abc </li>
  <li> abc </li>
</ul>

I just tested a wiki-style list ( - ) to double-check - see http://subnormalnumbers.blogspot.kr/2011/08/scaladoc-wiki-syntax.html. It does not render correctly, as shown below (but please go ahead if you know a way compatible with both Scaladoc and Javadoc):

   *  1. item one
   *
   *  1. item two
   *    - sublist
   *    - next item
   *
   *  1. now for broken sub-numbered list, the leading item must be one of
   *     `-`, `1.`, `I.`, `i.`, `A.`, or `a.`. And it must be followed by a space.
   *    1. one
   *    2. two
   *    3. three
   *
   *  1. list types
   *    I. one
   *      i. one
   *      i. two
   *    I. two
   *      A. one
   *      A. two
   *    I. three
   *      a. one
   *      a. two

Scaladoc: (screenshot of the rendered output omitted)

Javadoc: (screenshot of the rendered output omitted)

My worry is that it draws attention with a different format. I believe we have similar instances, but I wonder if it is worth changing only this one. I would not be strongly against it, but {{{ ... }}} basically means code. If we can't find a better way to render this, I would leave it as prose with a list.

Member:

I guess I am not supposed to make the decision call, though. Please let me know, @felixcheung and @yanboliang, if you have any preference.

Contributor Author:

@HyukjinKwon Thanks for the clarification. I don't think a list paints a clear picture here; I would rather keep the table structure.

Member:

According to this, tables are covered: https://wiki.scala-lang.org/display/SW/Syntax

Contributor Author:

@felixcheung @HyukjinKwon The scaladoc compiled, but the javadoc failed... Not sure if there is additional config needed for Java?

(screenshot of the javadoc errors omitted)

Member:

I think javadoc 8 complains about the HTML. It looks like this works:

   * <table summary="abc">
   * <tr>
   *   <th>Firstname</th>
   *   <th>Lastname</th>
   *   <th>Age</th>
   * </tr>
   * <tr>
   *   <td>Jill</td>
   *   <td>Smith</td>
   *   <td>50</td>
   * </tr>
   * <tr>
   *   <td>Eve</td>
   *   <td>Jackson</td>
   *   <td>94</td>
   *  </tr>
   * </table>

Scaladoc: (screenshot of the rendered table omitted)

Javadoc: (screenshot of the rendered table omitted)

Other errors are probably spurious (please refer to https://issues.apache.org/jira/browse/SPARK-20840, which I am fighting with right now).

Contributor Author:

@HyukjinKwon Nice, thanks much. One issue: in the scaladoc, the columns are very close to each other. How can I add spacing between columns in the scaladoc?
I tried <table cellspacing="4"> but it does not seem to take any effect.

Member:

Not sure; it did not work for me either...

   * |      Option     | Category mapped to 0 by StringIndexer |  Category dropped by RFormula    |
   * +-----------------+---------------------------------------+----------------------------------+
   * | 'frequencyDesc' | most frequent category ('b')          | least frequent category ('c')    |
   * | 'frequencyAsc'  | least frequent category ('c')         | most frequent category ('b')     |
   * | 'alphabetDesc'  | last alphabetical category ('c')      | first alphabetical category ('a')|
   * | 'alphabetAsc'   | first alphabetical category ('a')     | last alphabetical category ('c') |
   * +-----------------+---------------------------------------+----------------------------------+
   * }}}
   * Note that this ordering option is NOT used for the label column. When the label column is
   * indexed, it uses the default descending frequency ordering in `StringIndexer`.
   *
   * @group param
   */
@Since("2.3.0")
final val stringIndexerOrderType: Param[String] = new Param(this, "stringIndexerOrderType",
"How to order categories of a string FEATURE column used by StringIndexer. " +
"The last category after ordering is dropped when encoding strings. " +
s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}. " +
"The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', " +
"RFormula drops the same category as R when encoding strings.",
ParamValidators.inArray(StringIndexer.supportedStringOrderType))

/** @group getParam */
@Since("2.3.0")
def getStringIndexerOrderType: String = $(stringIndexerOrderType)

protected def hasLabelCol(schema: StructType): Boolean = {
schema.map(_.name).contains($(labelCol))
}
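For context, here is a minimal usage sketch of the new param (not part of the diff): it assumes a SparkSession named `spark` is in scope, and the toy data and column names are made up for illustration.

    import org.apache.spark.ml.feature.RFormula

    // Toy data: 'b' is the most frequent category, 'c' the least frequent.
    val df = spark.createDataFrame(Seq(
      (1.0, "b"), (2.0, "a"), (3.0, "b"), (4.0, "a"), (5.0, "c"), (6.0, "b")
    )).toDF("id", "category")

    val model = new RFormula()
      .setFormula("id ~ category")
      .setStringIndexerOrderType("alphabetDesc") // default is "frequencyDesc"
      .fit(df)

    // With 'alphabetDesc' the categories are ordered c, b, a; the last one ('a')
    // is dropped when encoding, so 'category' contributes two vector slots.
    model.transform(df).select("category", "features").show()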
@@ -125,6 +161,11 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
@Since("2.1.0")
def setForceIndexLabel(value: Boolean): this.type = set(forceIndexLabel, value)

/** @group setParam */
@Since("2.3.0")
def setStringIndexerOrderType(value: String): this.type = set(stringIndexerOrderType, value)
setDefault(stringIndexerOrderType, StringIndexer.frequencyDesc)

/** Whether the formula specifies fitting an intercept. */
private[ml] def hasIntercept: Boolean = {
require(isDefined(formula), "Formula must be defined first.")
@@ -155,6 +196,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
          encoderStages += new StringIndexer()
            .setInputCol(term)
            .setOutputCol(indexCol)
            .setStringOrderType($(stringIndexerOrderType))
          prefixesToRewrite(indexCol + "_") = term + "_"
          (term, indexCol)
        case _ =>
@@ -47,7 +47,7 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha
   * @group param
   */
  @Since("1.6.0")
  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " +
  val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "How to handle " +
    "invalid data (unseen labels or NULL values). " +
    "Options are 'skip' (filter out rows with invalid data), error (throw an error), " +
    "or 'keep' (put invalid data in a special additional bucket, at index numLabels).",
@@ -73,7 +73,7 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha
   */
  @Since("2.3.0")
  final val stringOrderType: Param[String] = new Param(this, "stringOrderType",
    "how to order labels of string column. " +
    "How to order labels of string column. " +
    "The first label after ordering is assigned an index of 0. " +
    s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.",
    ParamValidators.inArray(StringIndexer.supportedStringOrderType))
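For completeness, a quick sketch of `stringOrderType` on `StringIndexer` itself, reusing the `'b', 'a', 'b', 'a', 'c', 'b'` example from the RFormula doc above (again assuming a SparkSession named `spark`; the data and column names are made up):

    import org.apache.spark.ml.feature.StringIndexer

    val df = spark.createDataFrame(
      Seq((0, "b"), (1, "a"), (2, "b"), (3, "a"), (4, "c"), (5, "b"))
    ).toDF("id", "value")

    val indexed = new StringIndexer()
      .setInputCol("value")
      .setOutputCol("valueIndex")
      .setStringOrderType("alphabetAsc") // 'a' -> 0.0, 'b' -> 1.0, 'c' -> 2.0
      .fit(df)
      .transform(df)

    // With the default 'frequencyDesc', the most frequent value ('b') would get 0.0 instead.
    indexed.select("value", "valueIndex").distinct().show()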
@@ -129,6 +129,90 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
    assert(result.collect() === expected.collect())
  }

test("encodes string terms with string indexer order type") {
val formula = new RFormula().setFormula("id ~ a + b")
val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5))
.toDF("id", "a", "b")

val expected = Seq(
Seq(
(1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0),
(2, "bar", 4, Vectors.dense(1.0, 0.0, 4.0), 2.0),
(3, "bar", 5, Vectors.dense(1.0, 0.0, 5.0), 3.0),
(4, "aaz", 5, Vectors.dense(0.0, 1.0, 5.0), 4.0)
).toDF("id", "a", "b", "features", "label"),
Seq(
(1, "foo", 4, Vectors.dense(0.0, 1.0, 4.0), 1.0),
(2, "bar", 4, Vectors.dense(0.0, 0.0, 4.0), 2.0),
(3, "bar", 5, Vectors.dense(0.0, 0.0, 5.0), 3.0),
(4, "aaz", 5, Vectors.dense(1.0, 0.0, 5.0), 4.0)
).toDF("id", "a", "b", "features", "label"),
Seq(
(1, "foo", 4, Vectors.dense(1.0, 0.0, 4.0), 1.0),
(2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0),
(3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0),
(4, "aaz", 5, Vectors.dense(0.0, 0.0, 5.0), 4.0)
).toDF("id", "a", "b", "features", "label"),
Seq(
(1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0),
(2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0),
(3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0),
(4, "aaz", 5, Vectors.dense(1.0, 0.0, 5.0), 4.0)
).toDF("id", "a", "b", "features", "label")
)

var idx = 0
for (orderType <- StringIndexer.supportedStringOrderType) {
val model = formula.setStringIndexerOrderType(orderType).fit(original)
val result = model.transform(original)
val resultSchema = model.transformSchema(original.schema)
assert(result.schema.toString == resultSchema.toString)
assert(result.collect() === expected(idx).collect())
idx += 1
}
}

test("test consistency with R when encoding string terms") {
/*
R code:

df <- data.frame(id = c(1, 2, 3, 4),
a = c("foo", "bar", "bar", "aaz"),
b = c(4, 4, 5, 5))
model.matrix(id ~ a + b, df)[, -1]

abar afoo b
0 1 4
1 0 4
1 0 5
0 0 5
*/
val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5))
.toDF("id", "a", "b")
val formula = new RFormula().setFormula("id ~ a + b")
.setStringIndexerOrderType(StringIndexer.alphabetDesc)

/*
Note that the category dropped after encoding is the same between R and Spark
(i.e., "aaz" is treated as the reference level).
However, the column order is still different:
R renders the columns in ascending alphabetical order ("bar", "foo"), while
RFormula renders the columns in descending alphabetical order ("foo", "bar").
Contributor:

R and RFormula should behave consistently if you fix the issue I mentioned above.

    */
    val expected = Seq(
      (1, "foo", 4, Vectors.dense(1.0, 0.0, 4.0), 1.0),
      (2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0),
      (3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0),
      (4, "aaz", 5, Vectors.dense(0.0, 0.0, 5.0), 4.0)
    ).toDF("id", "a", "b", "features", "label")

    val model = formula.fit(original)
    val result = model.transform(original)
    val resultSchema = model.transformSchema(original.schema)
    assert(result.schema.toString == resultSchema.toString)
    assert(result.collect() === expected.collect())
  }

test("index string label") {
val formula = new RFormula().setFormula("id ~ a + b")
val original =