From 4d27123926ee87231a73aea9dc34555c404c7f1b Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Fri, 12 May 2017 00:52:13 -0700 Subject: [PATCH 01/10] add stringOrderType to RFormula --- .../apache/spark/ml/feature/RFormula.scala | 30 +++++++++++++ .../spark/ml/feature/RFormulaSuite.scala | 44 +++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 5a3e2929f5f52..5eeb282b35da6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -26,6 +26,7 @@ import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer} import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.VectorUDT +import org.apache.spark.ml.param._ import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap} import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol} import org.apache.spark.ml.util._ @@ -37,6 +38,29 @@ import org.apache.spark.sql.types._ */ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { + /** + * Param for how to order labels of string column. The first label after ordering is assigned + * an index of 0. + * Options are: + * - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0) + * - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0) + * - 'alphabetDesc': descending alphabetical order + * - 'alphabetAsc': ascending alphabetical order + * Default is 'frequencyDesc'. + * + * @group param + */ + @Since("2.3.0") + final val stringOrderType: Param[String] = new Param(this, "stringOrderType", + "how to order labels of string column. " + + "The first label after ordering is assigned an index of 0. " + + s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.", + ParamValidators.inArray(StringIndexer.supportedStringOrderType)) + + /** @group getParam */ + @Since("2.3.0") + def getStringOrderType: String = $(stringOrderType) + protected def hasLabelCol(schema: StructType): Boolean = { schema.map(_.name).contains($(labelCol)) } @@ -125,6 +149,11 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) @Since("2.1.0") def setForceIndexLabel(value: Boolean): this.type = set(forceIndexLabel, value) + /** @group setParam */ + @Since("2.3.0") + def setStringOrderType(value: String): this.type = set(stringOrderType, value) + setDefault(stringOrderType, StringIndexer.frequencyDesc) + /** Whether the formula specifies fitting an intercept. */ private[ml] def hasIntercept: Boolean = { require(isDefined(formula), "Formula must be defined first.") @@ -155,6 +184,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) encoderStages += new StringIndexer() .setInputCol(term) .setOutputCol(indexCol) + .setStringOrderType($(stringOrderType)) prefixesToRewrite(indexCol + "_") = term + "_" (term, indexCol) case _ => diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index fbebd75d70ac5..68708e5d60538 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -129,6 +129,50 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul assert(result.collect() === expected.collect()) } + test("encodes string terms with string order type") { + val formula = new RFormula().setFormula("id ~ a + b") + val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5)) + .toDF("id", "a", "b") + + val expected = Seq( + Seq( + (1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0), + (2, "bar", 4, Vectors.dense(1.0, 0.0, 4.0), 2.0), + (3, "bar", 5, Vectors.dense(1.0, 0.0, 5.0), 3.0), + (4, "aaz", 5, Vectors.dense(0.0, 1.0, 5.0), 4.0) + ).toDF("id", "a", "b", "features", "label"), + Seq( + (1, "foo", 4, Vectors.dense(0.0, 1.0, 4.0), 1.0), + (2, "bar", 4, Vectors.dense(0.0, 0.0, 4.0), 2.0), + (3, "bar", 5, Vectors.dense(0.0, 0.0, 5.0), 3.0), + (4, "aaz", 5, Vectors.dense(1.0, 0.0, 5.0), 4.0) + ).toDF("id", "a", "b", "features", "label"), + Seq( + (1, "foo", 4, Vectors.dense(1.0, 0.0, 4.0), 1.0), + (2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0), + (3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0), + (4, "aaz", 5, Vectors.dense(0.0, 0.0, 5.0), 4.0) + ).toDF("id", "a", "b", "features", "label"), + Seq( + (1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0), + (2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0), + (3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0), + (4, "aaz", 5, Vectors.dense(1.0, 0.0, 5.0), 4.0) + ).toDF("id", "a", "b", "features", "label") + ) + + var idx = 0 + for (orderType <- StringIndexer.supportedStringOrderType) { + val model = formula.setStringOrderType(orderType).fit(original) + val result = model.transform(original) + val resultSchema = model.transformSchema(original.schema) + + assert(result.schema.toString == resultSchema.toString) + assert(result.collect() === expected(idx).collect()) + idx += 1 + } + } + test("index string label") { val formula = new RFormula().setFormula("id ~ a + b") val original = From 6841c33768adf1b1397dc5aa36e34abdb8d6ff8a Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Fri, 12 May 2017 09:30:12 -0700 Subject: [PATCH 02/10] clean up import --- .../src/main/scala/org/apache/spark/ml/feature/RFormula.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 5eeb282b35da6..b6ec58df62db8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -26,8 +26,7 @@ import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer} import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.VectorUDT -import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap} +import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol} import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} From 77fe864770420719d396715479fc1f452a80b8da Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Fri, 12 May 2017 10:48:44 -0700 Subject: [PATCH 03/10] add comparison to R --- .../spark/ml/feature/RFormulaSuite.scala | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index 68708e5d60538..b010a80c28c2f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -173,6 +173,46 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul } } + test("test consistency with R when encoding string terms") { + /* + R code: + + df <- list(list(1, "foo", 4), list(2, "bar", 4), list(3, "bar", 5), list(4, "aaz", 5)) + df <- do.call(rbind, lapply(df, as.data.frame, col.names = c("id", "a", "b"))) + model.matrix(id ~ a + b, df)[, -1] + + abar aaaz b + 0 0 4 + 1 0 4 + 1 0 5 + 0 1 5 + */ + val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5)) + .toDF("id", "a", "b") + val formula = new RFormula().setFormula("id ~ a + b") + .setStringOrderType(StringIndexer.alphabetAsc) + + /* + Note that the category dropped after encoding is the same between R and Spark + (i.e., "foo" is treated as the reference level). + However, the column order is still different: + R renders the columns in descending alphabetical order ("bar", "aaz"), while + RFormula renders the columns in ascending alphabetical order ("aaz", "bar"). + */ + val expected = Seq( + (1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0), + (2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0), + (3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0), + (4, "aaz", 5, Vectors.dense(1.0, 0.0, 5.0), 4.0) + ).toDF("id", "a", "b", "features", "label") + + val model = formula.fit(original) + val result = model.transform(original) + val resultSchema = model.transformSchema(original.schema) + assert(result.schema.toString == resultSchema.toString) + assert(result.collect() === expected.collect()) + } + test("index string label") { val formula = new RFormula().setFormula("id ~ a + b") val original = From a1be94cf92649ec553da3b47fd481f5a1ac37079 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Fri, 12 May 2017 11:04:59 -0700 Subject: [PATCH 04/10] fix style --- .../apache/spark/ml/feature/RFormula.scala | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index b6ec58df62db8..0a7e1dca427a4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -38,22 +38,22 @@ import org.apache.spark.sql.types._ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { /** - * Param for how to order labels of string column. The first label after ordering is assigned - * an index of 0. - * Options are: - * - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0) - * - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0) - * - 'alphabetDesc': descending alphabetical order - * - 'alphabetAsc': ascending alphabetical order - * Default is 'frequencyDesc'. - * - * @group param - */ + * Param for how to order labels of string column. The first label after ordering is assigned + * an index of 0. + * Options are: + * - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0) + * - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0) + * - 'alphabetDesc': descending alphabetical order + * - 'alphabetAsc': ascending alphabetical order + * Default is 'frequencyDesc'. + * + * @group param + */ @Since("2.3.0") final val stringOrderType: Param[String] = new Param(this, "stringOrderType", "how to order labels of string column. " + - "The first label after ordering is assigned an index of 0. " + - s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.", + "The first label after ordering is assigned an index of 0. " + + s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.", ParamValidators.inArray(StringIndexer.supportedStringOrderType)) /** @group getParam */ From 698588e15b0407e987dad77fb060f0404c8276a9 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Sun, 14 May 2017 13:27:45 -0700 Subject: [PATCH 05/10] correct typo and update example --- .../apache/spark/ml/feature/RFormula.scala | 2 +- .../spark/ml/feature/StringIndexer.scala | 4 ++-- .../spark/ml/feature/RFormulaSuite.scala | 24 +++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 0a7e1dca427a4..5d9916437d831 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -51,7 +51,7 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { */ @Since("2.3.0") final val stringOrderType: Param[String] = new Param(this, "stringOrderType", - "how to order labels of string column. " + + "How to order labels of string column. " + "The first label after ordering is assigned an index of 0. " + s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.", ParamValidators.inArray(StringIndexer.supportedStringOrderType)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index b2dc4fcb61964..dfc902bd0b0f1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -47,7 +47,7 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha * @group param */ @Since("1.6.0") - val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "how to handle " + + val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", "How to handle " + "invalid data (unseen labels or NULL values). " + "Options are 'skip' (filter out rows with invalid data), error (throw an error), " + "or 'keep' (put invalid data in a special additional bucket, at index numLabels).", @@ -73,7 +73,7 @@ private[feature] trait StringIndexerBase extends Params with HasInputCol with Ha */ @Since("2.3.0") final val stringOrderType: Param[String] = new Param(this, "stringOrderType", - "how to order labels of string column. " + + "How to order labels of string column. " + "The first label after ordering is assigned an index of 0. " + s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.", ParamValidators.inArray(StringIndexer.supportedStringOrderType)) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index b010a80c28c2f..89ad8fe460740 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -166,7 +166,6 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul val model = formula.setStringOrderType(orderType).fit(original) val result = model.transform(original) val resultSchema = model.transformSchema(original.schema) - assert(result.schema.toString == resultSchema.toString) assert(result.collect() === expected(idx).collect()) idx += 1 @@ -177,33 +176,34 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul /* R code: - df <- list(list(1, "foo", 4), list(2, "bar", 4), list(3, "bar", 5), list(4, "aaz", 5)) - df <- do.call(rbind, lapply(df, as.data.frame, col.names = c("id", "a", "b"))) + df <- data.frame(id = c(1, 2, 3, 4), + a = c("foo", "bar", "bar", "aaz"), + b = c(4, 4, 5, 5)) model.matrix(id ~ a + b, df)[, -1] - abar aaaz b - 0 0 4 + abar afoo b + 0 1 4 1 0 4 1 0 5 - 0 1 5 + 0 0 5 */ val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5)) .toDF("id", "a", "b") val formula = new RFormula().setFormula("id ~ a + b") - .setStringOrderType(StringIndexer.alphabetAsc) + .setStringOrderType(StringIndexer.alphabetDesc) /* Note that the category dropped after encoding is the same between R and Spark - (i.e., "foo" is treated as the reference level). + (i.e., "aaz" is treated as the reference level). However, the column order is still different: - R renders the columns in descending alphabetical order ("bar", "aaz"), while - RFormula renders the columns in ascending alphabetical order ("aaz", "bar"). + R renders the columns in ascending alphabetical order ("bar", "foo"), while + RFormula renders the columns in descending alphabetical order ("foo", "bar"). */ val expected = Seq( - (1, "foo", 4, Vectors.dense(0.0, 0.0, 4.0), 1.0), + (1, "foo", 4, Vectors.dense(1.0, 0.0, 4.0), 1.0), (2, "bar", 4, Vectors.dense(0.0, 1.0, 4.0), 2.0), (3, "bar", 5, Vectors.dense(0.0, 1.0, 5.0), 3.0), - (4, "aaz", 5, Vectors.dense(1.0, 0.0, 5.0), 4.0) + (4, "aaz", 5, Vectors.dense(0.0, 0.0, 5.0), 4.0) ).toDF("id", "a", "b", "features", "label") val model = formula.fit(original) From 147311ba34db55f6aa6ffc3cf75f0c80c8c29cbf Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Fri, 19 May 2017 00:00:20 -0700 Subject: [PATCH 06/10] improve doc --- mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 5d9916437d831..3e0388a522c8d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -46,6 +46,8 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { * - 'alphabetDesc': descending alphabetical order * - 'alphabetAsc': ascending alphabetical order * Default is 'frequencyDesc'. + * When the ordering is set to 'alphabetDesc', `RFormula` drops the same category as R + * when encoding strings. * * @group param */ From 5f31d311c0c39da1968686dd4147376b3888cee3 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Fri, 19 May 2017 19:32:12 -0700 Subject: [PATCH 07/10] change param name and update doc --- .../apache/spark/ml/feature/RFormula.scala | 40 +++++++++++-------- .../spark/ml/feature/RFormulaSuite.scala | 6 +-- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 3e0388a522c8d..7e73f02e99793 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -38,29 +38,35 @@ import org.apache.spark.sql.types._ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { /** - * Param for how to order labels of string column. The first label after ordering is assigned - * an index of 0. - * Options are: - * - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0) - * - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0) - * - 'alphabetDesc': descending alphabetical order - * - 'alphabetAsc': ascending alphabetical order - * Default is 'frequencyDesc'. - * When the ordering is set to 'alphabetDesc', `RFormula` drops the same category as R - * when encoding strings. + * Param for how to order categories of a FEATURE string column used by `StringIndexer`. + * The last category after ordering is dropped when encoding strings. + * The options are explained using an example string: 'b', 'a', 'b', 'a', 'c', 'b' + * | + * | Option | Category mapped to 0 by StringIndexer | Category dropped by RFormula + * | 'frequencyDesc' | most frequent category ('b') | least frequent category ('c') + * | 'frequencyAsc' | least frequent category ('c') | most frequent category ('b') + * | 'alphabetDesc' | first alphabetical category ('a') | last alphabetical category ('c') + * | 'alphabetAsc' | last alphabetical category ('c') | last alphabetical category ('a') + * | + * The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', `RFormula` + * drops the same category as R when encoding strings. + * Note that this ordering option is NOT used for the label column. When the label column is + * indexed, it uses the default descending frequency ordering in `StringIndexer`. * * @group param */ @Since("2.3.0") - final val stringOrderType: Param[String] = new Param(this, "stringOrderType", - "How to order labels of string column. " + - "The first label after ordering is assigned an index of 0. " + + final val stringIndexerOrderType: Param[String] = new Param(this, "stringIndexerOrderType", + "How to order categories of a FEATURE string column used by StringIndexer. " + + "The last category after ordering is dropped when encoding strings. " + + "The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', " + + "RFormula drops the same category as R when encoding strings." + s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.", ParamValidators.inArray(StringIndexer.supportedStringOrderType)) /** @group getParam */ @Since("2.3.0") - def getStringOrderType: String = $(stringOrderType) + def getStringIndexerOrderType: String = $(stringIndexerOrderType) protected def hasLabelCol(schema: StructType): Boolean = { schema.map(_.name).contains($(labelCol)) @@ -152,8 +158,8 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) /** @group setParam */ @Since("2.3.0") - def setStringOrderType(value: String): this.type = set(stringOrderType, value) - setDefault(stringOrderType, StringIndexer.frequencyDesc) + def setStringIndexerOrderType(value: String): this.type = set(stringIndexerOrderType, value) + setDefault(stringIndexerOrderType, StringIndexer.frequencyDesc) /** Whether the formula specifies fitting an intercept. */ private[ml] def hasIntercept: Boolean = { @@ -185,7 +191,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) encoderStages += new StringIndexer() .setInputCol(term) .setOutputCol(indexCol) - .setStringOrderType($(stringOrderType)) + .setStringOrderType($(stringIndexerOrderType)) prefixesToRewrite(indexCol + "_") = term + "_" (term, indexCol) case _ => diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index 89ad8fe460740..41d0062c2cabd 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -129,7 +129,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul assert(result.collect() === expected.collect()) } - test("encodes string terms with string order type") { + test("encodes string terms with string indexer order type") { val formula = new RFormula().setFormula("id ~ a + b") val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5)) .toDF("id", "a", "b") @@ -163,7 +163,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul var idx = 0 for (orderType <- StringIndexer.supportedStringOrderType) { - val model = formula.setStringOrderType(orderType).fit(original) + val model = formula.setStringIndexerOrderType(orderType).fit(original) val result = model.transform(original) val resultSchema = model.transformSchema(original.schema) assert(result.schema.toString == resultSchema.toString) @@ -190,7 +190,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5)) .toDF("id", "a", "b") val formula = new RFormula().setFormula("id ~ a + b") - .setStringOrderType(StringIndexer.alphabetDesc) + .setStringIndexerOrderType(StringIndexer.alphabetDesc) /* Note that the category dropped after encoding is the same between R and Spark From 341949c4c1e09baa9478e54e06aa1133b3c6fc86 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Sat, 20 May 2017 00:12:22 -0700 Subject: [PATCH 08/10] fix scala doc style --- .../org/apache/spark/ml/feature/RFormula.scala | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 7e73f02e99793..92440a2af6aa4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -41,13 +41,16 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { * Param for how to order categories of a FEATURE string column used by `StringIndexer`. * The last category after ordering is dropped when encoding strings. * The options are explained using an example string: 'b', 'a', 'b', 'a', 'c', 'b' - * | - * | Option | Category mapped to 0 by StringIndexer | Category dropped by RFormula - * | 'frequencyDesc' | most frequent category ('b') | least frequent category ('c') - * | 'frequencyAsc' | least frequent category ('c') | most frequent category ('b') - * | 'alphabetDesc' | first alphabetical category ('a') | last alphabetical category ('c') - * | 'alphabetAsc' | last alphabetical category ('c') | last alphabetical category ('a') - * | + * {{{ + * +-----------------+---------------------------------------+---------------------------------+ + * | Option | Category mapped to 0 by StringIndexer | Category dropped by RFormula | + * +-----------------+---------------------------------------+---------------------------------+ + * | 'frequencyDesc' | most frequent category ('b') | least frequent category ('c') | + * | 'frequencyAsc' | least frequent category ('c') | most frequent category ('b') | + * | 'alphabetDesc' | first alphabetical category ('a') | last alphabetical category ('c')| + * | 'alphabetAsc' | last alphabetical category ('c') | last alphabetical category ('a')| + * +-----------------+---------------------------------------+---------------------------------+ + * }}} * The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', `RFormula` * drops the same category as R when encoding strings. * Note that this ordering option is NOT used for the label column. When the label column is From 24818a7b77676665f9e58a88f8cc59073e368062 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Sat, 20 May 2017 12:24:14 -0700 Subject: [PATCH 09/10] fix typo --- .../apache/spark/ml/feature/RFormula.scala | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 92440a2af6aa4..2f9f131f0bc2c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -38,21 +38,23 @@ import org.apache.spark.sql.types._ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { /** - * Param for how to order categories of a FEATURE string column used by `StringIndexer`. + * Param for how to order categories of a string FEATURE column used by `StringIndexer`. * The last category after ordering is dropped when encoding strings. - * The options are explained using an example string: 'b', 'a', 'b', 'a', 'c', 'b' - * {{{ - * +-----------------+---------------------------------------+---------------------------------+ - * | Option | Category mapped to 0 by StringIndexer | Category dropped by RFormula | - * +-----------------+---------------------------------------+---------------------------------+ - * | 'frequencyDesc' | most frequent category ('b') | least frequent category ('c') | - * | 'frequencyAsc' | least frequent category ('c') | most frequent category ('b') | - * | 'alphabetDesc' | first alphabetical category ('a') | last alphabetical category ('c')| - * | 'alphabetAsc' | last alphabetical category ('c') | last alphabetical category ('a')| - * +-----------------+---------------------------------------+---------------------------------+ - * }}} + * Supported options: 'frequencyDesc', 'frequencyAsc', 'alphabetDesc', 'alphabetAsc'. * The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', `RFormula` * drops the same category as R when encoding strings. + * + * The options are explained using an example `'b', 'a', 'b', 'a', 'c', 'b'`: + * {{{ + * +-----------------+---------------------------------------+----------------------------------+ + * | Option | Category mapped to 0 by StringIndexer | Category dropped by RFormula | + * +-----------------+---------------------------------------+----------------------------------+ + * | 'frequencyDesc' | most frequent category ('b') | least frequent category ('c') | + * | 'frequencyAsc' | least frequent category ('c') | most frequent category ('b') | + * | 'alphabetDesc' | first alphabetical category ('a') | last alphabetical category ('c') | + * | 'alphabetAsc' | last alphabetical category ('c') | first alphabetical category ('a')| + * +-----------------+---------------------------------------+----------------------------------+ + * }}} * Note that this ordering option is NOT used for the label column. When the label column is * indexed, it uses the default descending frequency ordering in `StringIndexer`. * @@ -60,11 +62,11 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { */ @Since("2.3.0") final val stringIndexerOrderType: Param[String] = new Param(this, "stringIndexerOrderType", - "How to order categories of a FEATURE string column used by StringIndexer. " + + "How to order categories of a string FEATURE column used by StringIndexer. " + "The last category after ordering is dropped when encoding strings. " + + s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}. " + "The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', " + - "RFormula drops the same category as R when encoding strings." + - s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.", + "RFormula drops the same category as R when encoding strings.", ParamValidators.inArray(StringIndexer.supportedStringOrderType)) /** @group getParam */ From 1a1e06c9f1690e0654f78313f674c07da2b6b6f2 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Mon, 22 May 2017 15:31:24 -0700 Subject: [PATCH 10/10] fix error in doc example --- .../src/main/scala/org/apache/spark/ml/feature/RFormula.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 2f9f131f0bc2c..1fad0a6fc9443 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -51,8 +51,8 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol { * +-----------------+---------------------------------------+----------------------------------+ * | 'frequencyDesc' | most frequent category ('b') | least frequent category ('c') | * | 'frequencyAsc' | least frequent category ('c') | most frequent category ('b') | - * | 'alphabetDesc' | first alphabetical category ('a') | last alphabetical category ('c') | - * | 'alphabetAsc' | last alphabetical category ('c') | first alphabetical category ('a')| + * | 'alphabetDesc' | last alphabetical category ('c') | first alphabetical category ('a')| + * | 'alphabetAsc' | first alphabetical category ('a') | last alphabetical category ('c') | * +-----------------+---------------------------------------+----------------------------------+ * }}} * Note that this ordering option is NOT used for the label column. When the label column is