Revert user-facing R changes

ericl · ericl · commit 26b692522a21 · 2015-08-06T17:36:52.000-07:00
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
@@ -27,7 +27,7 @@ setClass("PipelineModel", representation(model = "jobj"))
 #' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package.
 #'
 #' @param formula A symbolic description of the model to be fitted. Currently only a few formula
-#'                operators are supported, including '~', '.', ':', '+', and '-'.
+#'                operators are supported, including '~', '+', '-', and '.'.
 #' @param data DataFrame for training
 #' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg.
 #' @param lambda Regularization parameter
diff --git a/R/pkg/inst/tests/test_mllib.R b/R/pkg/inst/tests/test_mllib.R
@@ -49,14 +49,6 @@ test_that("dot minus and intercept vs native glm", {
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 })
 
-test_that("feature interaction vs native glm", {
-  training <- createDataFrame(sqlContext, iris)
-  model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
-  vals <- collect(select(predict(model, training), "prediction"))
-  rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
-
 test_that("summary coefficients match with native glm", {
   training <- createDataFrame(sqlContext, iris)
   stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
@@ -65,5 +57,5 @@ test_that("summary coefficients match with native glm", {
   expect_true(all(abs(rCoefs - coefs) < 1e-6))
   expect_true(all(
     as.character(stats$features) ==
-    c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
+    c("(Intercept)", "Sepal_Length", "Species__versicolor", "Species__virginica")))
 })
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -47,8 +47,8 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
 /**
  * :: Experimental ::
  * Implements the transforms required for fitting a dataset against an R model formula. Currently
- * we support a limited subset of the R operators, including '~', '.', ':', '+', and '-'. Also see
- * the R formula docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html
+ * we support a limited subset of the R operators, including '~', '.', '+', and '-'. Also see
+ * the R formula docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html
  */
 @Experimental
 class RFormula(override val uid: String) extends Estimator[RFormulaModel] with RFormulaBase {
@@ -81,26 +81,32 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R
     require(isDefined(formula), "Formula must be defined first.")
     val parsedFormula = RFormulaParser.parse($(formula))
     val resolvedFormula = parsedFormula.resolve(dataset.schema)
+    // StringType terms need to be string-indexed and one-hot encoded before assembly.
+    // TODO(ekl) add support for feature interactions
     val encoderStages = ArrayBuffer[PipelineStage]()
     val tempColumns = ArrayBuffer[String]()
-    def encodeInteraction(terms: Seq[String]): String = {
-      val outputCol = "interaction_" + uid + "_" + terms.mkString(":")
-      encoderStages += new RInteraction()
-        .setInputCols(terms.toArray)
-        .setOutputCol(outputCol)
-      tempColumns += outputCol
-      outputCol
-    }
-    val encodedTerms = resolvedFormula.terms.map {
-      case terms @ Seq(value) =>
-        dataset.schema(value) match {
-          case column if column.dataType == StringType =>
-            encodeInteraction(terms)
-          case _ =>
-            value
-        }
-      case terms =>
-        encodeInteraction(terms)
+    val takenNames = mutable.Set(dataset.columns: _*)
+    val encodedTerms = resolvedFormula.terms.map { term =>
+      dataset.schema(term) match {
+        case column if column.dataType == StringType =>
+          val indexCol = term + "_idx_" + uid
+          val encodedCol = {
+            var tmp = term
+            while (takenNames.contains(tmp)) {
+              tmp += "_"
+            }
+            tmp
+          }
+          takenNames.add(indexCol)
+          takenNames.add(encodedCol)
+          encoderStages += new StringIndexer().setInputCol(term).setOutputCol(indexCol)
+          encoderStages += new OneHotEncoder().setInputCol(indexCol).setOutputCol(encodedCol)
+          tempColumns += indexCol
+          tempColumns += encodedCol
+          encodedCol
+        case _ =>
+          term
+      }
     }
     encoderStages += new VectorAssembler(uid)
       .setInputCols(encodedTerms.toArray)
@@ -197,7 +203,7 @@ class RFormulaModel private[feature](
  * Utility transformer for removing temporary columns from a DataFrame.
  * TODO(ekl) make this a public transformer
  */
-private[feature] class ColumnPruner(columnsToPrune: Set[String]) extends Transformer {
+private class ColumnPruner(columnsToPrune: Set[String]) extends Transformer {
   override val uid = Identifiable.randomUID("columnPruner")
 
   override def transform(dataset: DataFrame): DataFrame = {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormulaParser.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.ml.feature
 
-import scala.collection.mutable
 import scala.util.parsing.combinator.RegexParsers
 
 import org.apache.spark.mllib.linalg.VectorUDT
@@ -32,28 +31,20 @@ private[ml] case class ParsedRFormula(label: ColumnRef, terms: Seq[Term]) {
    * of the special '.' term. Duplicate terms will be removed during resolution.
    */
   def resolve(schema: StructType): ResolvedRFormula = {
-    val dotTerms = expandDot(schema)
-    var includedTerms = Seq[Seq[String]]()
+    var includedTerms = Seq[String]()
     terms.foreach {
-      case term: ColumnRef =>
-        includedTerms :+= Seq(term.value)
-      case ColumnInteraction(terms) =>
-        includedTerms ++= expandInteraction(schema, terms)
       case Dot =>
-        includedTerms ++= dotTerms.map(Seq(_))
+        includedTerms ++= simpleTypes(schema).filter(_ != label.value)
+      case ColumnRef(value) =>
+        includedTerms :+= value
       case Deletion(term: Term) =>
         term match {
-          case inner: ColumnRef =>
-            includedTerms = includedTerms.filter(_ != Seq(inner.value))
-          case ColumnInteraction(terms) =>
-            val fromInteraction = expandInteraction(schema, terms).map(_.toSet)
-            includedTerms = includedTerms.filter(t => !fromInteraction.contains(t.toSet))
+          case ColumnRef(value) =>
+            includedTerms = includedTerms.filter(_ != value)
           case Dot =>
             // e.g. "- .", which removes all first-order terms
-            includedTerms = includedTerms.filter {
-              case Seq(t) => !dotTerms.contains(t)
-              case _ => true
-            }
+            val fromSchema = simpleTypes(schema)
+            includedTerms = includedTerms.filterNot(fromSchema.contains(_))
           case _: Deletion =>
             assert(false, "Deletion terms cannot be nested")
           case _: Intercept =>
@@ -76,70 +67,31 @@ private[ml] case class ParsedRFormula(label: ColumnRef, terms: Seq[Term]) {
     intercept
   }
 
-  // expands the Dot operators in interaction terms
-  private def expandInteraction(
-      schema: StructType, terms: Seq[InteractionComponent]): Seq[Seq[String]] = {
-    if (terms.isEmpty) {
-      return Seq(Nil)
-    }
-
-    val rest = expandInteraction(schema, terms.tail)
-    val validInteractions = (terms.head match {
-      case Dot =>
-        expandDot(schema).filter(_ != label.value).flatMap { t =>
-          rest.map { r =>
-            Seq(t) ++ r
-          }
-        }
-      case ColumnRef(value) =>
-        rest.map(Seq(value) ++ _)
-    }).map(_.distinct)
-
-    // Deduplicates feature interactions, for example, a:b is the same as b:a.
-    var seen = mutable.Set[Set[String]]()
-    validInteractions.flatMap {
-      case t if seen.contains(t.toSet) =>
-        None
-      case t =>
-        seen += t.toSet
-        Some(t)
-    }.sortBy(_.length)
-  }
-
   // the dot operator excludes complex column types
-  private def expandDot(schema: StructType): Seq[String] = {
+  private def simpleTypes(schema: StructType): Seq[String] = {
     schema.fields.filter(_.dataType match {
       case _: NumericType | StringType | BooleanType | _: VectorUDT => true
       case _ => false
-    }).map(_.name).filter(_ != label.value)
+    }).map(_.name)
   }
 }
 
 /**
  * Represents a fully evaluated and simplified R formula.
- * @param label the column name of the R formula label (response variable).
- * @param terms the simplified terms of the R formula. Interactions terms are represented as Seqs
- *              of column names; non-interaction terms as length 1 Seqs.
  */
-private[ml] case class ResolvedRFormula(label: String, terms: Seq[Seq[String]])
+private[ml] case class ResolvedRFormula(label: String, terms: Seq[String])
 
 /**
  * R formula terms. See the R formula docs here for more information:
  * http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html
  */
 private[ml] sealed trait Term
 
-/** A term that may be part of an interaction, e.g. 'x' in 'x:y' */
-private[ml] sealed trait InteractionComponent extends Term
-
 /* R formula reference to all available columns, e.g. "." in a formula */
-private[ml] case object Dot extends InteractionComponent
+private[ml] case object Dot extends Term
 
 /* R formula reference to a column, e.g. "+ Species" in a formula */
-private[ml] case class ColumnRef(value: String) extends InteractionComponent
-
-/* R formula interaction of several columns, e.g. "Sepal_Length:Species" in a formula */
-private[ml] case class ColumnInteraction(terms: Seq[InteractionComponent]) extends Term
+private[ml] case class ColumnRef(value: String) extends Term
 
 /* R formula intercept toggle, e.g. "+ 0" in a formula */
 private[ml] case class Intercept(enabled: Boolean) extends Term
@@ -157,15 +109,7 @@ private[ml] object RFormulaParser extends RegexParsers {
   def columnRef: Parser[ColumnRef] =
     "([a-zA-Z]|\\.[a-zA-Z_])[a-zA-Z0-9._]*".r ^^ { case a => ColumnRef(a) }
 
-  def dot: Parser[InteractionComponent] = "\\.".r ^^ { case _ => Dot }
-
-  def interaction: Parser[List[InteractionComponent]] = repsep(columnRef | dot, ":")
-
-  def term: Parser[Term] = intercept |
-    interaction ^^ {
-      case Seq(term) => term
-      case terms => ColumnInteraction(terms)
-    }
+  def term: Parser[Term] = intercept | columnRef | "\\.".r ^^ { case _ => Dot }
 
   def terms: Parser[List[Term]] = (term ~ rep("+" ~ term | "-" ~ term)) ^^ {
     case op ~ list => list.foldLeft(List(op)) {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RInteraction.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RInteraction.scala
@@ -127,8 +127,8 @@ class RInteraction(override val uid: String) extends Estimator[PipelineModel]
 }
 
 /**
- * This helper class computes the joint index of multiple string-indexed columns such that the
- * combined index covers the cartesian product of column values.
+ * Computes the joint index of multiple string-indexed columns such that the combined index
+ * covers the cartesian product of column values.
  */
 private class IndexCombiner(
     inputCols: Array[String], attrNames: Array[String], outputCol: String)
@@ -181,8 +181,8 @@ private class IndexCombiner(
 }
 
 /**
- * This helper class scales the input vector column by the product of the input numeric columns.
- * If no vector column is specified, the output is just the product of the numeric columns.
+ * Scales the input vector column by the product of the input numeric columns. If no vector column
+ * is specified, the output is just the product of the numeric columns.
  */
 private class NumericInteraction(
     inputCols: Array[String], vectorCol: Option[String], outputCol: String)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -56,12 +56,10 @@ class VectorAssembler(override val uid: String)
       val index = schema.fieldIndex(c)
       field.dataType match {
         case DoubleType =>
-          val attr = Attribute.decodeStructField(field, preserveName = true)
+          val attr = Attribute.fromStructField(field)
           // If the input column doesn't have ML attribute, assume numeric.
           if (attr == UnresolvedAttribute) {
             Some(NumericAttribute.defaultAttr.withName(c))
-          } else if (attr.name.isDefined) {
-            Some(attr)
           } else {
             Some(attr.withName(c))
           }
@@ -71,8 +69,15 @@ class VectorAssembler(override val uid: String)
         case _: VectorUDT =>
           val group = AttributeGroup.fromStructField(field)
           if (group.attributes.isDefined) {
-            // If attributes are defined, copy them.
-            group.attributes.get
+            // If attributes are defined, copy them with updated names.
+            group.attributes.get.map { attr =>
+              if (attr.name.isDefined) {
+                // TODO: Define a rigorous naming scheme.
+                attr.withName(c + "_" + attr.name.get)
+              } else {
+                attr
+              }
+            }
           } else {
             // Otherwise, treat all attributes as numeric. If we cannot get the number of attributes
             // from metadata, check the first row.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaParserSuite.scala
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py