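Second round of comments: rename RModelFormula to RFormula (parsed form is now ParsedRFormula), rename response to label, validate output columns before transforming, defer string-label support, and add test suites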

ericl · ericl · commit 2db68aaa26d2 · 2015-07-15T16:36:16.000-07:00
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -32,21 +32,21 @@ import org.apache.spark.sql.types._
  * :: Experimental ::
  * Implements the transforms required for fitting a dataset against an R model formula. Currently
  * we support a limited subset of the R operators, including '~' and '+'. Also see the R formula
- * docs here: http://www.inside-r.org/r-doc/stats/formula
+ * docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html
  */
 @Experimental
-class RModelFormula(override val uid: String)
+class RFormula(override val uid: String)
   extends Transformer with HasFeaturesCol with HasLabelCol {
 
-  def this() = this(Identifiable.randomUID("rModelFormula"))
+  def this() = this(Identifiable.randomUID("rFormula"))
 
   /**
    * R formula parameter. The formula is provided in string form.
    * @group setParam
    */
   val formula: Param[String] = new Param(this, "formula", "R model formula")
 
-  private var parsedFormula: Option[RFormula] = None
+  private var parsedFormula: Option[ParsedRFormula] = None
 
   /**
    * Sets the formula to use for this transformer. Must be called before use.
@@ -63,60 +63,74 @@ class RModelFormula(override val uid: String)
   def getFormula: String = $(formula)
 
   /** @group getParam */
-  def setFeaturesCol(col: String): this.type = set(featuresCol, col)
+  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
 
   /** @group getParam */
-  def setLabelCol(col: String): this.type = set(labelCol, col)
+  def setLabelCol(value: String): this.type = set(labelCol, value)
 
   override def transformSchema(schema: StructType): StructType = {
-    require(parsedFormula.isDefined, "Must call setFormula() first.")
-    val withFeatures = featureTransformer.transformSchema(schema)
-    val nullable = schema(parsedFormula.get.response).dataType match {
-      case _: NumericType | BooleanType => false
-      case _ => true
+    checkCanTransform(schema)
+    val withFeatures = transformFeatures.transformSchema(schema)
+    if (hasLabelCol(schema)) {
+      withFeatures
+    } else {
+      val nullable = schema(parsedFormula.get.label).dataType match {
+        case _: NumericType | BooleanType => false
+        case _ => true
+      }
+      StructType(withFeatures.fields :+ StructField($(labelCol), DoubleType, nullable))
     }
-    StructType(withFeatures.fields :+ StructField($(labelCol), DoubleType, nullable))
   }
 
   override def transform(dataset: DataFrame): DataFrame = {
-    require(parsedFormula.isDefined, "Must call setFormula() first.")
-    transformLabel(featureTransformer.transform(dataset))
+    checkCanTransform(dataset.schema)
+    transformLabel(transformFeatures.transform(dataset))
   }
 
-  override def copy(extra: ParamMap): RModelFormula = defaultCopy(extra)
+  override def copy(extra: ParamMap): RFormula = defaultCopy(extra)
 
-  override def toString: String = s"RModelFormula(${get(formula)})"
+  override def toString: String = s"RFormula(${get(formula)})"
 
   private def transformLabel(dataset: DataFrame): DataFrame = {
-    val responseName = parsedFormula.get.response
-    dataset.schema(responseName).dataType match {
-      case _: NumericType | BooleanType =>
-        dataset.select(
-          col("*"),
-          dataset(responseName).cast(DoubleType).as($(labelCol)))
-      case StringType =>
-        new StringIndexer()
-          .setInputCol(responseName)
-          .setOutputCol($(labelCol))
-          .fit(dataset)
-          .transform(dataset)
-      case other =>
-        throw new IllegalArgumentException("Unsupported type for response: " + other)
+    if (hasLabelCol(dataset.schema)) {
+      dataset
+    } else {
+      val labelName = parsedFormula.get.label
+      dataset.schema(labelName).dataType match {
+        case _: NumericType | BooleanType =>
+          dataset.withColumn($(labelCol), dataset(labelName).cast(DoubleType))
+        // TODO(ekl) add support for string-type labels
+        case other =>
+          throw new IllegalArgumentException("Unsupported type for label: " + other)
+      }
     }
   }
 
-  private def featureTransformer: Transformer = {
+  private def transformFeatures: Transformer = {
     // TODO(ekl) add support for non-numeric features and feature interactions
     new VectorAssembler(uid)
       .setInputCols(parsedFormula.get.terms.toArray)
       .setOutputCol($(featuresCol))
   }
+
+  private def checkCanTransform(schema: StructType) {
+    require(parsedFormula.isDefined, "Must call setFormula() first.")
+    val columnNames = schema.map(_.name)
+    require(!columnNames.contains($(featuresCol)), "Features column already exists.")
+    require(
+      !columnNames.contains($(labelCol)) || schema($(labelCol)).dataType == DoubleType,
+      "Label column already exists and is not of type DoubleType.")
+  }
+
+  private def hasLabelCol(schema: StructType): Boolean = {
+    schema.map(_.name).contains($(labelCol))
+  }
 }
 
 /**
  * Represents a parsed R formula.
  */
-private[ml] case class RFormula(response: String, terms: Seq[String])
+private[ml] case class ParsedRFormula(label: String, terms: Seq[String])
 
 /**
  * Limited implementation of R formula parsing. Currently supports: '~', '+'.
@@ -126,9 +140,10 @@ private[ml] object RFormulaParser extends RegexParsers {
 
   def expr: Parser[List[String]] = term ~ rep("+" ~> term) ^^ { case a ~ list => a :: list }
 
-  def formula: Parser[RFormula] = (term ~ "~" ~ expr) ^^ { case r ~ "~" ~ t => RFormula(r, t) }
+  def formula: Parser[ParsedRFormula] =
+    (term ~ "~" ~ expr) ^^ { case r ~ "~" ~ t => ParsedRFormula(r, t) }
 
-  def parse(value: String): RFormula = parseAll(formula, value) match {
+  def parse(value: String): ParsedRFormula = parseAll(formula, value) match {
     case Success(result, _) => result
     case failure: NoSuccess => throw new IllegalArgumentException(
       "Could not parse formula: " + value)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.SparkFunSuite
+
+class RFormulaParserSuite extends SparkFunSuite {
+  private def checkParse(formula: String, label: String, terms: Seq[String]) {
+    val parsed = RFormulaParser.parse(formula)
+    assert(parsed.label == label)
+    assert(parsed.terms == terms)
+  }
+
+  test("parse simple formulas") {
+    checkParse("y ~ x", "y", Seq("x"))
+    checkParse("y ~   ._foo  ", "y", Seq("._foo"))
+    checkParse("resp ~ A_VAR + B + c123", "resp", Seq("A_VAR", "B", "c123"))
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.ml.param.ParamsSuite
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.util.MLlibTestSparkContext
+
+class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext {
+  test("params") {
+    ParamsSuite.checkParams(new RFormula())
+  }
+
+  test("transform numeric data") {
+    val formula = new RFormula().setFormula("id ~ v1 + v2")
+    val original = sqlContext.createDataFrame(
+      Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")
+    val result = formula.transform(original)
+    val resultSchema = formula.transformSchema(original.schema)
+    val expected = sqlContext.createDataFrame(
+      Seq(
+        (0, 1.0, 3.0, Vectors.dense(Array(1.0, 3.0)), 0.0),
+        (2, 2.0, 5.0, Vectors.dense(Array(2.0, 5.0)), 2.0))
+      ).toDF("id", "v1", "v2", "features", "label")
+    // TODO(ekl) make schema comparisons check metadata, to avoid .toString
+    assert(result.schema.toString == resultSchema.toString)
+    assert(resultSchema == expected.schema)
+    assert(result.collect().toSeq == expected.collect().toSeq)
+  }
+
+  test("features column already exists") {
+    val formula = new RFormula().setFormula("y ~ x").setFeaturesCol("x")
+    val original = sqlContext.createDataFrame(Seq((0, 1.0), (2, 2.0))).toDF("x", "y")
+    intercept[IllegalArgumentException] {
+      formula.transformSchema(original.schema)
+    }
+    intercept[IllegalArgumentException] {
+      formula.transform(original)
+    }
+  }
+
+  test("label column already exists") {
+    val formula = new RFormula().setFormula("y ~ x").setLabelCol("y")
+    val original = sqlContext.createDataFrame(Seq((0, 1.0), (2, 2.0))).toDF("x", "y")
+    val resultSchema = formula.transformSchema(original.schema)
+    assert(resultSchema.length == 3)
+    assert(resultSchema.toString == formula.transform(original).schema.toString)
+  }
+
+  test("label column already exists but is not double type") {
+    val formula = new RFormula().setFormula("y ~ x").setLabelCol("y")
+    val original = sqlContext.createDataFrame(Seq((0, 1), (2, 2))).toDF("x", "y")
+    intercept[IllegalArgumentException] {
+      formula.transformSchema(original.schema)
+    }
+    intercept[IllegalArgumentException] {
+      formula.transform(original)
+    }
+  }
+
+// TODO(ekl) enable after we implement string label support
+//  test("transform string label") {
+//    val formula = new RFormula().setFormula("name ~ id")
+//    val original = sqlContext.createDataFrame(
+//      Seq((1, "foo"), (2, "bar"), (3, "bar"))).toDF("id", "name")
+//    val result = formula.transform(original)
+//    val resultSchema = formula.transformSchema(original.schema)
+//    val expected = sqlContext.createDataFrame(
+//      Seq(
+//        (1, "foo", Vectors.dense(Array(1.0)), 1.0),
+//        (2, "bar", Vectors.dense(Array(2.0)), 0.0),
+//        (3, "bar", Vectors.dense(Array(3.0)), 0.0))
+//      ).toDF("id", "name", "features", "label")
+//    assert(result.schema.toString == resultSchema.toString)
+//    assert(result.collect().toSeq == expected.collect().toSeq)
+//  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RModelFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RModelFormulaSuite.scala