small nits

ericl · ericl · commit 4c11a773e74e · 2015-08-06T00:32:24.000-07:00
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
@@ -27,7 +27,7 @@ setClass("PipelineModel", representation(model = "jobj"))
 #' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package.
 #'
 #' @param formula A symbolic description of the model to be fitted. Currently only a few formula
-#'                operators are supported, including '~', '+', '-', and '.'.
+#'                operators are supported, including '~', '.', ':', '+', and '-'.
 #' @param data DataFrame for training
 #' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg.
 #' @param lambda Regularization parameter
diff --git a/R/pkg/inst/tests/test_mllib.R b/R/pkg/inst/tests/test_mllib.R
@@ -49,6 +49,14 @@ test_that("dot minus and intercept vs native glm", {
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 })
 
+test_that("feature interaction vs native glm", {
+  training <- createDataFrame(sqlContext, iris)
+  model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
+  vals <- collect(select(predict(model, training), "prediction"))
+  rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
+  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+})
+
 test_that("summary coefficients match with native glm", {
   training <- createDataFrame(sqlContext, iris)
   stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
@@ -57,5 +65,5 @@ test_that("summary coefficients match with native glm", {
   expect_true(all(abs(rCoefs - coefs) < 1e-6))
   expect_true(all(
     as.character(stats$features) ==
-    c("(Intercept)", "Sepal_Length", "Species__versicolor", "Species__virginica")))
+    c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
 })
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -47,8 +47,8 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
 /**
  * :: Experimental ::
  * Implements the transforms required for fitting a dataset against an R model formula. Currently
- * we support a limited subset of the R operators, including '~' and '+'. Also see the R formula
- * docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html
+ * we support a limited subset of the R operators, including '~', '.', ':', '+', and '-'. Also see
+ * the R formula docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html
  */
 @Experimental
 class RFormula(override val uid: String) extends Estimator[RFormulaModel] with RFormulaBase {
@@ -86,6 +86,8 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R
     val takenNames = mutable.Set(dataset.columns: _*)
     def encodeInteraction(terms: Seq[String]): String = {
       val outputCol = {
+        // TODO(ekl) this column naming should be unnecessary since we generate the right attr
+        // names in RInteraction, but the name is lost somewhere before VectorAssembler.
         var tmp = terms.mkString(":")
         while (takenNames.contains(tmp)) {
           tmp += "_"
@@ -99,7 +101,7 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R
       tempColumns += outputCol
       outputCol
     }
-    val encodedCols = resolvedFormula.terms.map {
+    val encodedTerms = resolvedFormula.terms.map {
       case terms @ Seq(value) =>
         dataset.schema(value) match {
           case column if column.dataType == StringType =>
@@ -111,7 +113,7 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R
         encodeInteraction(terms)
     }
     encoderStages += new VectorAssembler(uid)
-      .setInputCols(encodedCols.toArray)
+      .setInputCols(encodedTerms.toArray)
       .setOutputCol($(featuresCol))
     encoderStages += new ColumnPruner(tempColumns.toSet)
     val pipelineModel = new Pipeline(uid).setStages(encoderStages.toArray).fit(dataset)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala
@@ -32,7 +32,7 @@ private[ml] case class ParsedRFormula(label: ColumnRef, terms: Seq[Term]) {
    * of the special '.' term. Duplicate terms will be removed during resolution.
    */
   def resolve(schema: StructType): ResolvedRFormula = {
-    lazy val dotTerms = expandDot(schema)
+    val dotTerms = expandDot(schema)
     var includedTerms = Seq[Seq[String]]()
     terms.foreach {
       case term: ColumnRef =>
@@ -80,29 +80,30 @@ private[ml] case class ParsedRFormula(label: ColumnRef, terms: Seq[Term]) {
   private def expandInteraction(
       schema: StructType, terms: Seq[InteractionComponent]): Seq[Seq[String]] = {
     if (terms.isEmpty) {
-      Seq(Nil)
-    } else {
-      val rest = expandInteraction(schema, terms.tail)
-      val validInteractions = (terms.head match {
-        case Dot =>
-          expandDot(schema).filter(_ != label.value).flatMap { t =>
-            rest.map { r =>
-              Seq(t) ++ r
-            }
-          }
-        case ColumnRef(value) =>
-          rest.map(Seq(value) ++ _)
-      }).map(_.distinct)
-      // Deduplicates feature interactions, for example, a:b is the same as b:a.
-      var seen = mutable.Set[Set[String]]()
-      validInteractions.flatMap {
-        case t if seen.contains(t.toSet) =>
-          None
-        case t =>
-          seen += t.toSet
-          Some(t)
-      }.sortBy(_.length)
+      return Seq(Nil)
     }
+
+    val rest = expandInteraction(schema, terms.tail)
+    val validInteractions = (terms.head match {
+      case Dot =>
+        expandDot(schema).filter(_ != label.value).flatMap { t =>
+          rest.map { r =>
+            Seq(t) ++ r
+          }
+        }
+      case ColumnRef(value) =>
+        rest.map(Seq(value) ++ _)
+    }).map(_.distinct)
+
+    // Deduplicates feature interactions, for example, a:b is the same as b:a.
+    var seen = mutable.Set[Set[String]]()
+    validInteractions.flatMap {
+      case t if seen.contains(t.toSet) =>
+        None
+      case t =>
+        seen += t.toSet
+        Some(t)
+    }.sortBy(_.length)
   }
 
   // the dot operator excludes complex column types
@@ -116,6 +117,9 @@ private[ml] case class ParsedRFormula(label: ColumnRef, terms: Seq[Term]) {
 
 /**
  * Represents a fully evaluated and simplified R formula.
+ * @param label the column name of the R formula label (response variable).
+ * @param terms the simplified terms of the R formula. Interaction terms are represented as Seqs
+ *              of column names; non-interaction terms as length 1 Seqs.
  */
 private[ml] case class ResolvedRFormula(label: String, terms: Seq[Seq[String]])
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RInteraction.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RInteraction.scala
@@ -41,6 +41,7 @@ import org.apache.spark.sql.types._
  * See https://stat.ethz.ch/R-manual/R-devel/library/base/html/formula.html for more
  * information about factor interactions in R formulae.
  */
+// TODO(ekl) it might be nice to have standalone tests for RInteraction.
 @Experimental
 class RInteraction(override val uid: String) extends Estimator[PipelineModel]
   with HasInputCols with HasOutputCol {
@@ -127,8 +128,8 @@ class RInteraction(override val uid: String) extends Estimator[PipelineModel]
 }
 
 /**
- * This helper class combines the output of multiple string-indexed columns to simulate
- * the joint indexing of tuples containing all the column values.
+ * This helper class computes the joint index of multiple string-indexed columns such that the
+ * combined index covers the Cartesian product of column values.
  */
 private class IndexCombiner(
     inputCols: Array[String], attrNames: Array[String], outputCol: String)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
@@ -25,7 +25,7 @@ class RFormulaParserSuite extends SparkFunSuite {
       formula: String,
       label: String,
       terms: Seq[String],
-      schema: StructType = null) {
+      schema: StructType = new StructType) {
     val resolved = RFormulaParser.parse(formula).resolve(schema)
     assert(resolved.label == label)
     val simpleTerms = terms.map { t =>
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -125,26 +125,26 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext {
   }
 
   test("numeric interaction") {
-    val formula = new RFormula().setFormula("a ~ b:c")
+    val formula = new RFormula().setFormula("a ~ b:c:d")
     val original = sqlContext.createDataFrame(
-      Seq((1, 2, 4), (2, 3, 4))
-    ).toDF("a", "b", "c")
+      Seq((1, 2, 4, 2), (2, 3, 4, 1))
+    ).toDF("a", "b", "c", "d")
     val model = formula.fit(original)
     val result = model.transform(original)
     val expected = sqlContext.createDataFrame(
       Seq(
-        (1, 2, 4, Vectors.dense(8.0), 1.0),
-        (2, 3, 4, Vectors.dense(12.0), 2.0))
-      ).toDF("id", "a", "b", "features", "label")
+        (1, 2, 4, 2, Vectors.dense(16.0), 1.0),
+        (2, 3, 4, 1, Vectors.dense(12.0), 2.0))
+      ).toDF("a", "b", "c", "d", "features", "label")
     assert(result.collect() === expected.collect())
     val attrs = AttributeGroup.fromStructField(result.schema("features"))
     val expectedAttrs = new AttributeGroup(
       "features",
-      Array[Attribute](new NumericAttribute(Some("b:c"), Some(1))))
+      Array[Attribute](new NumericAttribute(Some("b:c:d"), Some(1))))
     assert(attrs === expectedAttrs)
   }
 
-  test("numeric:factor interaction") {
+  test("factor numeric interaction") {
     val formula = new RFormula().setFormula("id ~ a:b")
     val original = sqlContext.createDataFrame(
       Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "baz", 5), (4, "baz", 5), (4, "baz", 5))
@@ -171,7 +171,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(attrs === expectedAttrs)
   }
 
-  test("factor:factor interaction") {
+  test("factor factor interaction") {
     val formula = new RFormula().setFormula("id ~ a:b")
     val original = sqlContext.createDataFrame(
       Seq((1, "foo", "zq"), (2, "bar", "zq"), (3, "bar", "zz"))
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
@@ -1117,7 +1117,7 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol):
 
     Implements the transforms required for fitting a dataset against an
     R model formula. Currently we support a limited subset of the R
-    operators, including '~', '+', '-', and '.'. Also see the R formula
+    operators, including '~', '.', ':', '+', and '-'. Also see the R formula
     docs:
     http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html