add test

ericl · ericl · commit b01c7c5c90ef · 2015-07-20T18:29:39.000-07:00
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -83,12 +83,12 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R
       dataset.schema(term) match {
         case column if column.dataType == StringType =>
           val idxTerm = term + "_idx_" + uid
-          val indexer = new StringIndexer(uid).setInputCol(term).setOutputCol(idxTerm))
-          Some(Map(term -> indexer.fit(dataset)))
+          val indexer = new StringIndexer().setInputCol(term).setOutputCol(idxTerm)
+          Some(term -> indexer.fit(dataset))
         case _ =>
           None
       }
-    }
+    }.toMap
     copyValues(new RFormulaModel(uid, parsedFormula.get, factorLevels).setParent(this))
   }
 
@@ -109,6 +109,8 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R
 
 /**
  * A fitted RFormula. Fitting is required to determine the factor levels of formula terms.
+ * @param parsedFormula a pre-parsed R formula
+ * @param factorLevels the fitted factor-to-index mappings from the training dataset.
  */
 private[feature] class RFormulaModel(
     override val uid: String,
@@ -136,7 +138,7 @@ private[feature] class RFormulaModel(
   }
 
   override def copy(extra: ParamMap): RFormulaModel = copyValues(
-    new RFormulaModel(uid, parsedFormula))
+    new RFormulaModel(uid, parsedFormula, factorLevels))
 
   override def toString: String = s"RFormulaModel(${parsedFormula})"
 
@@ -172,12 +174,13 @@ private[feature] class RFormulaModel(
         case column if column.dataType == StringType =>
           val encodedTerm = term + "_onehot_" + uid
           val indexer = factorLevels(term)
+          val indexCol = indexer.getOrDefault(indexer.outputCol)
           encoderStages :+= indexer
-          encoderStages :+= new OneHotEncoder(uid)
-            .setInputCol($(indexer.outputCol))
+          encoderStages :+= new OneHotEncoder()
+            .setInputCol(indexCol)
             .setOutputCol(encodedTerm)
           tempColumns :+= encodedTerm
-          tempColumns :+= $(indexer.outputCol)
+          tempColumns :+= indexCol
           encodedTerm
         case _ =>
           term
@@ -186,16 +189,16 @@ private[feature] class RFormulaModel(
     encoderStages :+= new VectorAssembler(uid)
       .setInputCols(encodedTerms.toArray)
       .setOutputCol($(featuresCol))
-    encoderStages :+= new ColumnPruner(uid, tempColumns.toSet)
+    encoderStages :+= new ColumnPruner(tempColumns.toSet)
     new PipelineModel(uid, encoderStages.toArray)
   }
 }
 
 /**
- * Utility class for removing temporary columns from a DataFrame.
+ * Utility transformer for removing temporary columns from a DataFrame.
  */
-private[ml] class ColumnPruner(
-    override val uid: String, columnsToPrune: Set[String]) extends Transformer {
+private class ColumnPruner(columnsToPrune: Set[String]) extends Transformer {
+  override val uid = Identifiable.randomUID("columnPruner")
   override def transform(dataset: DataFrame): DataFrame = {
     var res: DataFrame = dataset
     for (column <- columnsToPrune) {
@@ -212,7 +215,7 @@ private[ml] class ColumnPruner(
 /**
  * Represents a parsed R formula.
  */
-private[ml] case class ParsedRFormula(label: String, terms: Seq[String])
+private[ml] case class ParsedRFormula(label: String, terms: Set[String])
 
 /**
  * Limited implementation of R formula parsing. Currently supports: '~', '+'.
@@ -223,7 +226,7 @@ private[ml] object RFormulaParser extends RegexParsers {
   def expr: Parser[List[String]] = term ~ rep("+" ~> term) ^^ { case a ~ list => a :: list }
 
   def formula: Parser[ParsedRFormula] =
-    (term ~ "~" ~ expr) ^^ { case r ~ "~" ~ t => ParsedRFormula(r, t) }
+    (term ~ "~" ~ expr) ^^ { case r ~ "~" ~ t => ParsedRFormula(r, t.toSet) }
 
   def parse(value: String): ParsedRFormula = parseAll(formula, value) match {
     case Success(result, _) => result
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
@@ -20,15 +20,16 @@ package org.apache.spark.ml.feature
 import org.apache.spark.SparkFunSuite
 
 class RFormulaParserSuite extends SparkFunSuite {
-  private def checkParse(formula: String, label: String, terms: Seq[String]) {
+  private def checkParse(formula: String, label: String, terms: Set[String]) {
     val parsed = RFormulaParser.parse(formula)
     assert(parsed.label == label)
     assert(parsed.terms == terms)
   }
 
   test("parse simple formulas") {
-    checkParse("y ~ x", "y", Seq("x"))
-    checkParse("y ~   ._foo  ", "y", Seq("._foo"))
-    checkParse("resp ~ A_VAR + B + c123", "resp", Seq("A_VAR", "B", "c123"))
+    checkParse("y ~ x", "y", Set("x"))
+    checkParse("y ~ x + x", "y", Set("x"))
+    checkParse("y ~   ._foo  ", "y", Set("._foo"))
+    checkParse("resp ~ A_VAR + B + c123", "resp", Set("A_VAR", "B", "c123"))
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -78,20 +78,21 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
-// TODO(ekl) enable after we implement string label support
-//  test("transform string label") {
-//    val formula = new RFormula().setFormula("name ~ id")
-//    val original = sqlContext.createDataFrame(
-//      Seq((1, "foo"), (2, "bar"), (3, "bar"))).toDF("id", "name")
-//    val result = formula.transform(original)
-//    val resultSchema = formula.transformSchema(original.schema)
-//    val expected = sqlContext.createDataFrame(
-//      Seq(
-//        (1, "foo", Vectors.dense(Array(1.0)), 1.0),
-//        (2, "bar", Vectors.dense(Array(2.0)), 0.0),
-//        (3, "bar", Vectors.dense(Array(3.0)), 0.0))
-//      ).toDF("id", "name", "features", "label")
-//    assert(result.schema.toString == resultSchema.toString)
-//    assert(result.collect().toSeq == expected.collect().toSeq)
-//  }
+  test("encodes string terms") {
+    // The string column `category` is the only term; numeric `id` is reused as the label.
+    val formula = new RFormula().setFormula("id ~ category")
+    val original = sqlContext.createDataFrame(
+      Seq((1, "foo"), (2, "bar"), (3, "bar"), (4, "baz"))).toDF("id", "category")
+    val model = formula.fit(original)
+    val result = model.transform(original)
+    val resultSchema = model.transformSchema(original.schema)
+    val expected = sqlContext.createDataFrame(Seq(  // column names mirror `original`
+        (1, "foo", Vectors.dense(Array(0.0, 1.0)), 1.0),
+        (2, "bar", Vectors.dense(Array(1.0, 0.0)), 2.0),
+        (3, "bar", Vectors.dense(Array(1.0, 0.0)), 3.0),
+        (4, "baz", Vectors.dense(Array(0.0, 0.0)), 4.0))
+      ).toDF("id", "category", "features", "label")
+    assert(result.schema.toString == resultSchema.toString)
+    assert(result.collect().toSeq == expected.collect().toSeq)
+  }
 }

Original file line number	Diff line number	Diff line change
`@@ -20,15 +20,16 @@ package org.apache.spark.ml.feature`
`20`	`20`	`import org.apache.spark.SparkFunSuite`
`21`	`21`
`22`	`22`	`class RFormulaParserSuite extends SparkFunSuite {`
`23`		`- private def checkParse(formula: String, label: String, terms: Seq[String]) {`
	`23`	`+ private def checkParse(formula: String, label: String, terms: Set[String]) {`
`24`	`24`	`val parsed = RFormulaParser.parse(formula)`
`25`	`25`	`assert(parsed.label == label)`
`26`	`26`	`assert(parsed.terms == terms)`
`27`	`27`	`}`
`28`	`28`
`29`	`29`	`test("parse simple formulas") {`
`30`		`- checkParse("y ~ x", "y", Seq("x"))`
`31`		`- checkParse("y ~ ._foo ", "y", Seq("._foo"))`
`32`		`- checkParse("resp ~ A_VAR + B + c123", "resp", Seq("A_VAR", "B", "c123"))`
	`30`	`+ checkParse("y ~ x", "y", Set("x"))`
	`31`	`+ checkParse("y ~ x + x", "y", Set("x"))`
	`32`	`+ checkParse("y ~ ._foo ", "y", Set("._foo"))`
	`33`	`+ checkParse("resp ~ A_VAR + B + c123", "resp", Set("A_VAR", "B", "c123"))`
`33`	`34`	`}`
`34`	`35`	`}`