From 2999b268192e244bd7a520d62a0914e4742ee45d Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Mon, 29 Feb 2016 09:46:04 -0800 Subject: [PATCH 01/25] initial commit for Imputer --- .../org/apache/spark/ml/feature/Imputer.scala | 145 ++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala new file mode 100644 index 0000000000000..5eb8c49f2d8d6 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import org.apache.hadoop.fs.Path + +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param.{Param, ParamMap} +import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.util._ +import org.apache.spark.mllib.linalg.{Vector, VectorUDT} +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{StructField, StructType} + +/** + * :: Experimental :: + * + */ +@Experimental +class Imputer private[ml]( + override val uid: String) + extends Transformer with HasInputCol with HasOutputCol with MLWritable { + + import Imputer._ + + /** @group setParam */ + def setInputCol(value: String): this.type = set(inputCol, value) + + /** @group setParam */ + def setOutputCol(value: String): this.type = set(outputCol, value) + + val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation") + + /** @group getParam */ + def getStrategy: String = $(strategy) + + /** @group setParam */ + def setStrategy(value: String): this.type = set(strategy, value) + + + override def transform(dataset: DataFrame): DataFrame = { + + val reScale = udf { (vector: Vector) => + if (vector == null) { + val replacement = $(strategy) match { + case "mean" => + val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } + val summary = Statistics.colStats(input) + summary.mean + case "median" => + val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } + Imputer.getMedian(input) + case "most" => + val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } + val most = input.map(v => (v, 1)).reduceByKey(_ + _).sortBy(-_._2).take(1)(0)._1 + most + } + } + } + + dataset.withColumn($(outputCol), reScale(col($(inputCol)))) + } + + override def transformSchema(schema: StructType): StructType = { + validateParams() + val inputType = 
schema($(inputCol)).dataType + require(inputType.isInstanceOf[VectorUDT], + s"Input column ${$(inputCol)} must be a vector column") + require(!schema.fieldNames.contains($(outputCol)), + s"Output column ${$(outputCol)} already exists.") + val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) + StructType(outputFields) + } + + override def copy(extra: ParamMap): Imputer = { + val copied = new Imputer(uid) + copyValues(copied, extra) + } + + @Since("1.6.0") + override def write: MLWriter = new ImputerWriter(this) +} + +@Since("1.6.0") +object Imputer extends MLReadable[Imputer] { + + private def getMedian(input: RDD[Vector]): Vector = { + val summary = Statistics.colStats(input) + summary.mean + } + + private[MinMaxScalerModel] + class ImputerWriter(instance: Imputer) extends MLWriter { + + private case class Data(strategy: String) + + override protected def saveImpl(path: String): Unit = { + DefaultParamsWriter.saveMetadata(instance, path, sc) + val data = new Data(instance.getStrategy) + val dataPath = new Path(path, "data").toString + sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) + } + } + + private class ImputerReader extends MLReader[Imputer] { + + private val className = classOf[Imputer].getName + + override def load(path: String): Imputer = { + val metadata = DefaultParamsReader.loadMetadata(path, sc, className) + val dataPath = new Path(path, "data").toString + val Row(strategy: String) = sqlContext.read.parquet(dataPath) + .select("strategy") + .head() + val model = new Imputer(metadata.uid).setStrategy(strategy) + DefaultParamsReader.getAndSetParams(model, metadata) + model + } + } + + @Since("1.6.0") + override def read: MLReader[Imputer] = new ImputerReader + + @Since("1.6.0") + override def load(path: String): Imputer = super.load(path) +} From 8335cf21ebde164a22f3447000a1c468a69f39fc Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Mon, 29 Feb 2016 10:27:40 -0800 Subject: [PATCH 02/25] adjust mean and most --- .../org/apache/spark/ml/feature/Imputer.scala | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 5eb8c49f2d8d6..db426ca01d732 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -40,6 +40,8 @@ class Imputer private[ml]( override val uid: String) extends Transformer with HasInputCol with HasOutputCol with MLWritable { + def this() = this(Identifiable.randomUID("tokenizer")) + import Imputer._ /** @group setParam */ @@ -56,25 +58,33 @@ class Imputer private[ml]( /** @group setParam */ def setStrategy(value: String): this.type = set(strategy, value) + setDefault(strategy -> "mean") override def transform(dataset: DataFrame): DataFrame = { - val reScale = udf { (vector: Vector) => if (vector == null) { val replacement = $(strategy) match { case "mean" => - val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } + val input = dataset.select($(inputCol)).rdd.filter(r => !r.anyNull) + .map { case Row(v: Vector) => v } val summary = Statistics.colStats(input) summary.mean case "median" => - val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } - Imputer.getMedian(input) + val df = dataset.select($(inputCol)) + df.registerTempTable("medianTable") + val median = df.sqlContext + .sql(s"select percentile(${$(inputCol)}, 
0.5) from medianTable") + .head().getAs[Vector](0) + median case "most" => - val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } + val input = dataset.select($(inputCol)).rdd.filter(r => !r.anyNull) + .map { case Row(v: Vector) => v } val most = input.map(v => (v, 1)).reduceByKey(_ + _).sortBy(-_._2).take(1)(0)._1 most } + replacement } + else vector } dataset.withColumn($(outputCol), reScale(col($(inputCol)))) @@ -103,12 +113,7 @@ class Imputer private[ml]( @Since("1.6.0") object Imputer extends MLReadable[Imputer] { - private def getMedian(input: RDD[Vector]): Vector = { - val summary = Statistics.colStats(input) - summary.mean - } - - private[MinMaxScalerModel] + private[Imputer] class ImputerWriter(instance: Imputer) extends MLWriter { private case class Data(strategy: String) From b949be5746608ca3861df672ccd76d9af4257ae2 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 8 Mar 2016 18:19:32 -0800 Subject: [PATCH 03/25] refine code and add ut --- .../org/apache/spark/ml/feature/Imputer.scala | 278 +++++++++++++----- .../spark/ml/feature/ImputerSuite.scala | 101 +++++++ 2 files changed, 306 insertions(+), 73 deletions(-) create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index db426ca01d732..ea4b6995aa890 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -20,29 +20,80 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} -import org.apache.spark.ml.Transformer -import org.apache.spark.ml.param.{Param, ParamMap} +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, Params} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ -import org.apache.spark.mllib.linalg.{Vector, VectorUDT} -import org.apache.spark.mllib.stat.Statistics -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.mllib.linalg._ +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} + +/** + * Params for [[Imputer]] and [[ImputerModel]]. + */ +private[feature] trait ImputerParams extends Params with HasInputCol with HasOutputCol { + + /** + * The imputation strategy. + * If "mean", then replace missing values using the mean along the axis. + * If "median", then replace missing values using the median along the axis. + * If "most", then replace missing using the most frequent value along the axis. + * Default: mean + * + * @group param + */ + val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + + "If mean, then replace missing values using the mean along the axis." + + "If median, then replace missing values using the median along the axis." + + "If most, then replace missing using the most frequent value along the axis.") + + /** @group getParam */ + def getStrategy: String = $(strategy) + + /** + * The placeholder for the missing values. All occurrences of missingvalues will be imputed. 
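+ * e.g. setMissingValue(-1.0) marks every occurrence of -1.0 in the input column as missing.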
+ * Default: Double.NaN + * + * @group param + */ + val missingValue: DoubleParam = new DoubleParam(this, "missingValue", + "The placeholder for the missing values. All occurrences of missingvalues will be imputed") + + /** @group getParam */ + def getMissingValue: Double = $(missingValue) + + /** Validates and transforms the input schema. */ + protected def validateAndTransformSchema(schema: StructType): StructType = { + validateParams() + val inputType = schema($(inputCol)).dataType + require(inputType.isInstanceOf[VectorUDT] || inputType.isInstanceOf[DoubleType], + s"Input column ${$(inputCol)} must of type vector or Double") + require(!schema.fieldNames.contains($(outputCol)), + s"Output column ${$(outputCol)} already exists.") + val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) + StructType(outputFields) + } + + override def validateParams(): Unit = { + require(Seq("mean", "median", "most").contains($(strategy)), + s"${$(strategy)} is not supported. Options are mean, median and most") + } +} /** * :: Experimental :: + * Imputation estimator for completing missing values, either using the mean, the median or + * the most frequent value of the column in which the missing values are located. This class + * also allows for different missing values encodings. * */ @Experimental -class Imputer private[ml]( - override val uid: String) - extends Transformer with HasInputCol with HasOutputCol with MLWritable { - - def this() = this(Identifiable.randomUID("tokenizer")) +class Imputer @Since("2.0.0")(override val uid: String) + extends Estimator[ImputerModel] with ImputerParams with DefaultParamsWritable { - import Imputer._ + @Since("2.0.0") + def this() = this(Identifiable.randomUID("imputer")) /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) @@ -50,101 +101,182 @@ class Imputer private[ml]( /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) - val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation") - - /** @group getParam */ - def getStrategy: String = $(strategy) - /** @group setParam */ def setStrategy(value: String): this.type = set(strategy, value) - setDefault(strategy -> "mean") + /** @group setParam */ + def setMissingValue(value: Double): this.type = set(missingValue, value) - override def transform(dataset: DataFrame): DataFrame = { - val reScale = udf { (vector: Vector) => - if (vector == null) { - val replacement = $(strategy) match { - case "mean" => - val input = dataset.select($(inputCol)).rdd.filter(r => !r.anyNull) - .map { case Row(v: Vector) => v } - val summary = Statistics.colStats(input) - summary.mean - case "median" => - val df = dataset.select($(inputCol)) - df.registerTempTable("medianTable") - val median = df.sqlContext - .sql(s"select percentile(${$(inputCol)}, 0.5) from medianTable") - .head().getAs[Vector](0) - median - case "most" => - val input = dataset.select($(inputCol)).rdd.filter(r => !r.anyNull) - .map { case Row(v: Vector) => v } - val most = input.map(v => (v, 1)).reduceByKey(_ + _).sortBy(-_._2).take(1)(0)._1 - most - } - replacement - } - else vector + setDefault(strategy -> "mean", missingValue -> Double.NaN) + + override def fit(dataset: DataFrame): ImputerModel = { + val alternate = dataset.select($(inputCol)).schema.fields(0).dataType match { + case DoubleType => + val colStatistics = getColStatistics(dataset, $(inputCol)) + Vectors.dense(Array(colStatistics)) + case _: VectorUDT => + val vl = 
dataset.first().getAs[Vector]($(inputCol)).size + val statisticsArray = new Array[Double](vl) + (0 until vl).foreach(i => { + val getI = udf((v: Vector) => v(i)) + val tempColName = $(inputCol) + i + val tempData = dataset.where(s"${$(inputCol)} is not null") + .select($(inputCol)).withColumn(tempColName, getI(col($(inputCol)))) + statisticsArray(i) = getColStatistics(tempData, tempColName) + }) + Vectors.dense(statisticsArray) } + copyValues(new ImputerModel(uid, alternate).setParent(this)) + } - dataset.withColumn($(outputCol), reScale(col($(inputCol)))) + private def getColStatistics(dataset: DataFrame, colName: String): Double = { + val missValue = $(missingValue) match { + case Double.NaN => "NaN" + case _ => $(missingValue).toString + } + val colStatistics = $(strategy) match { + case "mean" => + dataset.where(s"$colName != '$missValue'").selectExpr(s"avg($colName)").first().getDouble(0) + case "median" => + // TODO: optimize the sort with quick-select or Percentile(Hive) if required + val rddDouble = dataset.select(colName).where(s"$colName != $missValue").rdd + .map(_.getDouble(0)) + rddDouble.sortBy(d => d).zipWithIndex().map { + case (v, idx) => (idx, v) + }.lookup(rddDouble.count()/2).head + case "most" => + val input = dataset.where(s"$colName != $missValue").select(colName).rdd + .map(_.getDouble(0)) + val most = input.map(d => (d, 1)).reduceByKey(_ + _).sortBy(-_._2).first()._1 + most + } + colStatistics } override def transformSchema(schema: StructType): StructType = { - validateParams() - val inputType = schema($(inputCol)).dataType - require(inputType.isInstanceOf[VectorUDT], - s"Input column ${$(inputCol)} must be a vector column") - require(!schema.fieldNames.contains($(outputCol)), - s"Output column ${$(outputCol)} already exists.") - val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) - StructType(outputFields) + validateAndTransformSchema(schema) } override def copy(extra: ParamMap): Imputer = { val copied = new Imputer(uid) copyValues(copied, extra) } +} + +/** + * :: Experimental :: + * Model fitted by [[Imputer]]. 
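+ * At transform time, null vectors and entries matching missingValue are replaced using alternate.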
+ * + * @param alternate statistics value for each original column during fitting + */ +@Experimental +class ImputerModel private[ml] ( + override val uid: String, + val alternate: Vector) + extends Model[ImputerModel] with ImputerParams with MLWritable { + + import ImputerModel._ + + /** @group setParam */ + def setInputCol(value: String): this.type = set(inputCol, value) + + /** @group setParam */ + def setOutputCol(value: String): this.type = set(outputCol, value) + + private def matchMissingValue(value: Double): Boolean = { + val miss = $(missingValue) + value == miss || (value.isNaN && miss.isNaN) + } + + override def transform(dataset: DataFrame): DataFrame = { + dataset.select($(inputCol)).schema.fields(0).dataType match { + case DoubleType => + val impute = udf { (d: Double) => + if (matchMissingValue(d)) alternate(0) else d + } + dataset.withColumn($(outputCol), impute(col($(inputCol)))) + case _: VectorUDT => + val impute = udf { (vector: Vector) => + if (vector == null) { + alternate + } + else { + val vCopy = vector.copy + vCopy match { + case d: DenseVector => + var iter = 0 + while(iter < d.size) { + if (matchMissingValue(vCopy(iter))) { + d.values(iter) = alternate(iter) + } + + iter += 1 + } + case s: SparseVector => + var iter = 0 + while(iter < s.values.size) { + if (matchMissingValue(s.values(iter))) { + s.values(iter) = alternate(s.indices(iter)) + } + iter += 1 + } + } + vCopy + } + } + dataset.withColumn($(outputCol), impute(col($(inputCol)))) + } + } + + override def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema) + } - @Since("1.6.0") - override def write: MLWriter = new ImputerWriter(this) + override def copy(extra: ParamMap): ImputerModel = { + val copied = new ImputerModel(uid, alternate) + copyValues(copied, extra).setParent(parent) + } + + @Since("2.0.0") + override def write: MLWriter = new ImputerModelWriter(this) } -@Since("1.6.0") -object Imputer extends MLReadable[Imputer] { - private[Imputer] - class ImputerWriter(instance: Imputer) extends MLWriter { +@Since("2.0.0") +object ImputerModel extends MLReadable[ImputerModel] { - private case class Data(strategy: String) + private[ImputerModel] + class ImputerModelWriter(instance: ImputerModel) extends MLWriter { + + private case class Data(alternate: Vector) override protected def saveImpl(path: String): Unit = { DefaultParamsWriter.saveMetadata(instance, path, sc) - val data = new Data(instance.getStrategy) + val data = new Data(instance.alternate) val dataPath = new Path(path, "data").toString sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) } } - private class ImputerReader extends MLReader[Imputer] { + private class ImputerReader extends MLReader[ImputerModel] { - private val className = classOf[Imputer].getName + private val className = classOf[ImputerModel].getName - override def load(path: String): Imputer = { + override def load(path: String): ImputerModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString - val Row(strategy: String) = sqlContext.read.parquet(dataPath) - .select("strategy") + val Row(alternate: Vector) = sqlContext.read.parquet(dataPath) + .select("alternate") .head() - val model = new Imputer(metadata.uid).setStrategy(strategy) + val model = new ImputerModel(metadata.uid, alternate) DefaultParamsReader.getAndSetParams(model, metadata) model } } - @Since("1.6.0") - override def read: MLReader[Imputer] = new ImputerReader + @Since("2.0.0") + override 
def read: MLReader[ImputerModel] = new ImputerReader - @Since("1.6.0") - override def load(path: String): Imputer = super.load(path) -} + @Since("2.0.0") + override def load(path: String): ImputerModel = super.load(path) +} \ No newline at end of file diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala new file mode 100644 index 0000000000000..52af7b15108e5 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.feature + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ +import org.apache.spark.sql.Row + + +class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + test("Imputer for Double column") { + val df = sqlContext.createDataFrame( Seq( + (0, 1.0, 1.0, 1.0, 1.0), + (1, 1.0, 1.0, 1.0, 1.0), + (2, 3.0, 3.0, 3.0, 3.0), + (3, 4.0, 4.0, 4.0, 4.0), + (4, Double.NaN, 2.25, 3.0, 1.0 ) + )).toDF("id", "value", "mean", "median", "most") + Seq("mean", "median", "most").foreach { strategy => + val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + val model = imputer.fit(df) + model.transform(df).select(strategy, "out").collect() + .foreach { case Row(d1: Double, d2: Double) => + assert(d1 ~== d2 absTol 1e-5, s"Imputer ut error: $d2 should be $d1") + } + } + } + + test("Imputer for with missing Value -1.0") { + val df = sqlContext.createDataFrame( Seq( + (0, 1.0, 1.0, 1.0, 1.0), + (1, 1.0, 1.0, 1.0, 1.0), + (2, 3.0, 3.0, 3.0, 3.0), + (3, 4.0, 4.0, 4.0, 4.0), + (4, -1.0, 2.25, 3.0, 1.0 ) + )).toDF("id", "value", "mean", "median", "most") + Seq("mean", "median", "most").foreach { strategy => + val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + .setMissingValue(-1.0) + val model = imputer.fit(df) + model.transform(df).select(strategy, "out").collect() + .foreach { case Row(d1: Double, d2: Double) => + assert(d1 ~== d2 absTol 1e-5, s"Imputer ut error: $d2 should be $d1") + } + } + } + + test("Imputer for Vector column with NaN and null") { + val df = sqlContext.createDataFrame( Seq( + (0, Vector(1, 2), Vector(1, 2), Vector(1, 2), Vector(1, 2)), + (1, Vector(1, 2), Vector(1, 2), Vector(1, 2), Vector(1, 2)), + (2, Vector(3, 2), Vector(3, 2), Vector(3, 2), Vector(3, 2)), + (3, Vector(4, 2), Vector(4, 2), Vector(4, 2), Vector(4, 2)), + (4, Vector(Double.NaN, 2), Vector(2.25, 2), Vector(3.0, 
2), Vector(1.0, 2)), + (4, null, Vector(2.25, 2), Vector(3.0, 2), Vector(1.0, 2)) + )).toDF("id", "value", "mean", "median", "most") + Seq("mean", "median", "most").foreach { strategy => + val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + val model = imputer.fit(df) + model.transform(df).select(strategy, "out").collect() + .foreach { case Row(d1: Double, d2: Double) => + assert(d1 ~== d2 absTol 1e-5, s"Imputer ut error: $d2 should be $d1") + } + } + } + + test("Imputer read/write") { + val t = new Imputer() + .setInputCol("myInputCol") + .setOutputCol("myOutputCol") + testDefaultReadWrite(t) + } + + test("Imputer read/write") { + val instance = new ImputerModel( + "myImputer", Vectors.dense(1.0, 10.0)) + .setInputCol("myInputCol") + .setOutputCol("myOutputCol") + val newInstance = testDefaultReadWrite(instance) + assert(newInstance.alternate === instance.alternate) + } + +} From c3d5d554f5ee90a18d96ff043f03f51f49d2ca7f Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 8 Mar 2016 19:52:04 -0800 Subject: [PATCH 04/25] minor change --- .../src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 4 ++-- .../scala/org/apache/spark/ml/feature/ImputerSuite.scala | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index ea4b6995aa890..9b810f52b65c5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -68,7 +68,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut validateParams() val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[VectorUDT] || inputType.isInstanceOf[DoubleType], - s"Input column ${$(inputCol)} must of type vector or Double") + s"Input column ${$(inputCol)} must of type Vector or Double") require(!schema.fieldNames.contains($(outputCol)), s"Output column ${$(outputCol)} already exists.") val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) @@ -279,4 +279,4 @@ object ImputerModel extends MLReadable[ImputerModel] { @Since("2.0.0") override def load(path: String): ImputerModel = super.load(path) -} \ No newline at end of file +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 52af7b15108e5..a088b3b4b2386 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -18,12 +18,11 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row - class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("Imputer for Double column") { @@ -44,7 +43,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } - test("Imputer for with missing Value -1.0") { + test("Imputer for Double with missing Value -1.0") { val df = sqlContext.createDataFrame( Seq( (0, 1.0, 1.0, 1.0, 1.0), (1, 1.0, 1.0, 1.0, 1.0), From 
1b3966800982fa980307d1b6ded6e28e5f5985e8 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 8 Mar 2016 23:57:38 -0800 Subject: [PATCH 05/25] add object Imputer and ut refine --- .../org/apache/spark/ml/feature/Imputer.scala | 18 ++++++++----- .../spark/ml/feature/ImputerSuite.scala | 25 +++++++++++-------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 9b810f52b65c5..1678850f8fa35 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -134,19 +134,18 @@ class Imputer @Since("2.0.0")(override val uid: String) case Double.NaN => "NaN" case _ => $(missingValue).toString } + val filteredDF = dataset.select(colName).where(s"$colName != '$missValue'") val colStatistics = $(strategy) match { case "mean" => - dataset.where(s"$colName != '$missValue'").selectExpr(s"avg($colName)").first().getDouble(0) + filteredDF.selectExpr(s"avg($colName)").first().getDouble(0) case "median" => // TODO: optimize the sort with quick-select or Percentile(Hive) if required - val rddDouble = dataset.select(colName).where(s"$colName != $missValue").rdd - .map(_.getDouble(0)) + val rddDouble = filteredDF.rdd.map(_.getDouble(0)) rddDouble.sortBy(d => d).zipWithIndex().map { case (v, idx) => (idx, v) }.lookup(rddDouble.count()/2).head case "most" => - val input = dataset.where(s"$colName != $missValue").select(colName).rdd - .map(_.getDouble(0)) + val input = filteredDF.rdd.map(_.getDouble(0)) val most = input.map(d => (d, 1)).reduceByKey(_ + _).sortBy(-_._2).first()._1 most } @@ -163,6 +162,13 @@ class Imputer @Since("2.0.0")(override val uid: String) } } +@Since("1.6.0") +object Imputer extends DefaultParamsReadable[Imputer] { + + @Since("1.6.0") + override def load(path: String): Imputer = super.load(path) +} + /** * :: Experimental :: * Model fitted by [[Imputer]]. 
@@ -214,7 +220,7 @@ class ImputerModel private[ml] ( } case s: SparseVector => var iter = 0 - while(iter < s.values.size) { + while(iter < s.values.length) { if (matchMissingValue(s.values(iter))) { s.values(iter) = alternate(s.indices(iter)) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index a088b3b4b2386..c22adc48a1f0f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row @@ -64,19 +64,23 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default test("Imputer for Vector column with NaN and null") { val df = sqlContext.createDataFrame( Seq( - (0, Vector(1, 2), Vector(1, 2), Vector(1, 2), Vector(1, 2)), - (1, Vector(1, 2), Vector(1, 2), Vector(1, 2), Vector(1, 2)), - (2, Vector(3, 2), Vector(3, 2), Vector(3, 2), Vector(3, 2)), - (3, Vector(4, 2), Vector(4, 2), Vector(4, 2), Vector(4, 2)), - (4, Vector(Double.NaN, 2), Vector(2.25, 2), Vector(3.0, 2), Vector(1.0, 2)), - (4, null, Vector(2.25, 2), Vector(3.0, 2), Vector(1.0, 2)) + (0, Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2)), + (1, Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2)), + (2, Vectors.dense(3, 2), Vectors.dense(3, 2), Vectors.dense(3, 2), Vectors.dense(3, 2)), + (3, Vectors.dense(4, 2), Vectors.dense(4, 2), Vectors.dense(4, 2), Vectors.dense(4, 2)), + (4, Vectors.dense(Double.NaN, 2), Vectors.dense(2.25, 2), Vectors.dense(3.0, 2), + Vectors.dense(1.0, 2)), + (5, Vectors.sparse(2, Array(0, 1), Array(Double.NaN, 2.0)), Vectors.dense(2.25, 2), + Vectors.dense(3.0, 2), Vectors.dense(1.0, 2)), + (6, null.asInstanceOf[Vector], Vectors.dense(2.25, 2), Vectors.dense(3.0, 2), + Vectors.dense(1.0, 2)) )).toDF("id", "value", "mean", "median", "most") Seq("mean", "median", "most").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) val model = imputer.fit(df) model.transform(df).select(strategy, "out").collect() - .foreach { case Row(d1: Double, d2: Double) => - assert(d1 ~== d2 absTol 1e-5, s"Imputer ut error: $d2 should be $d1") + .foreach { case Row(v1: Vector, v2: Vector) => + assert(v1 == v2, s"$strategy Imputer ut error: $v2 should be $v1") } } } @@ -85,10 +89,11 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default val t = new Imputer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") + .setMissingValue(-1.0) testDefaultReadWrite(t) } - test("Imputer read/write") { + test("ImputerModel read/write") { val instance = new ImputerModel( "myImputer", Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") From 4e45f81f89f0b1ad13add524b3dd89fe52126bc0 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Thu, 10 Mar 2016 11:06:30 -0800 Subject: [PATCH 06/25] add options validate and some small changes --- .../org/apache/spark/ml/feature/Imputer.scala | 54 ++++++++++--------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 1678850f8fa35..70938691f536d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -21,7 +21,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, Params} +import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg._ @@ -36,29 +36,30 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * The imputation strategy. - * If "mean", then replace missing values using the mean along the axis. - * If "median", then replace missing values using the median along the axis. - * If "most", then replace missing using the most frequent value along the axis. + * If "mean", then replace missing values using the mean value of the feature. + * If "median", then replace missing values using the median value of the feature. + * If "most", then replace missing using the most frequent value of the feature. * Default: mean * * @group param */ val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + - "If mean, then replace missing values using the mean along the axis." + - "If median, then replace missing values using the median along the axis." + - "If most, then replace missing using the most frequent value along the axis.") + "If mean, then replace missing values using the mean value of the feature." + + "If median, then replace missing values using the median value of the feature." + + "If most, then replace missing using the most frequent value of the feature.", + ParamValidators.inArray[String](Imputer.supportedStrategyNames.toArray)) /** @group getParam */ def getStrategy: String = $(strategy) /** - * The placeholder for the missing values. All occurrences of missingvalues will be imputed. + * The placeholder for the missing values. All occurrences of missingValue will be imputed. * Default: Double.NaN * * @group param */ val missingValue: DoubleParam = new DoubleParam(this, "missingValue", - "The placeholder for the missing values. All occurrences of missingvalues will be imputed") + "The placeholder for the missing values. All occurrences of missingValue will be imputed") /** @group getParam */ def getMissingValue: Double = $(missingValue) @@ -75,18 +76,13 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut StructType(outputFields) } - override def validateParams(): Unit = { - require(Seq("mean", "median", "most").contains($(strategy)), - s"${$(strategy)} is not supported. Options are mean, median and most") - } } /** * :: Experimental :: * Imputation estimator for completing missing values, either using the mean, the median or * the most frequent value of the column in which the missing values are located. This class - * also allows for different missing values encodings. - * + * also allows for different missing values. */ @Experimental class Imputer @Since("2.0.0")(override val uid: String) @@ -101,7 +97,10 @@ class Imputer @Since("2.0.0")(override val uid: String) /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) - /** @group setParam */ + /** + * Imputation strategy. 
Available options are "mean", "median" and "most". + * @group setParam + */ def setStrategy(value: String): this.type = set(strategy, value) /** @group setParam */ @@ -112,15 +111,14 @@ class Imputer @Since("2.0.0")(override val uid: String) override def fit(dataset: DataFrame): ImputerModel = { val alternate = dataset.select($(inputCol)).schema.fields(0).dataType match { case DoubleType => - val colStatistics = getColStatistics(dataset, $(inputCol)) - Vectors.dense(Array(colStatistics)) + Vectors.dense(getColStatistics(dataset, $(inputCol))) case _: VectorUDT => val vl = dataset.first().getAs[Vector]($(inputCol)).size val statisticsArray = new Array[Double](vl) (0 until vl).foreach(i => { val getI = udf((v: Vector) => v(i)) val tempColName = $(inputCol) + i - val tempData = dataset.where(s"${$(inputCol)} is not null") + val tempData = dataset.where(s"${$(inputCol)} IS NOT NULL") .select($(inputCol)).withColumn(tempColName, getI(col($(inputCol)))) statisticsArray(i) = getColStatistics(tempData, tempColName) }) @@ -129,6 +127,7 @@ class Imputer @Since("2.0.0")(override val uid: String) copyValues(new ImputerModel(uid, alternate).setParent(this)) } + /** Extract the statistics info from a Double column according to the strategy */ private def getColStatistics(dataset: DataFrame, colName: String): Double = { val missValue = $(missingValue) match { case Double.NaN => "NaN" @@ -143,7 +142,7 @@ class Imputer @Since("2.0.0")(override val uid: String) val rddDouble = filteredDF.rdd.map(_.getDouble(0)) rddDouble.sortBy(d => d).zipWithIndex().map { case (v, idx) => (idx, v) - }.lookup(rddDouble.count()/2).head + }.lookup(rddDouble.count() / 2).head case "most" => val input = filteredDF.rdd.map(_.getDouble(0)) val most = input.map(d => (d, 1)).reduceByKey(_ + _).sortBy(-_._2).first()._1 @@ -165,6 +164,9 @@ class Imputer @Since("2.0.0")(override val uid: String) @Since("1.6.0") object Imputer extends DefaultParamsReadable[Imputer] { + /** Set of strategy names that Imputer currently supports. */ + private[ml] val supportedStrategyNames = Set("mean", "median", "most") + @Since("1.6.0") override def load(path: String): Imputer = super.load(path) } @@ -173,7 +175,7 @@ object Imputer extends DefaultParamsReadable[Imputer] { * :: Experimental :: * Model fitted by [[Imputer]]. * - * @param alternate statistics value for each original column during fitting + * @param alternate statistics value for each feature during fitting */ @Experimental class ImputerModel private[ml] ( @@ -189,7 +191,7 @@ class ImputerModel private[ml] ( /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) - private def matchMissingValue(value: Double): Boolean = { + private def isMissingValue(value: Double): Boolean = { val miss = $(missingValue) value == miss || (value.isNaN && miss.isNaN) } @@ -198,7 +200,7 @@ class ImputerModel private[ml] ( dataset.select($(inputCol)).schema.fields(0).dataType match { case DoubleType => val impute = udf { (d: Double) => - if (matchMissingValue(d)) alternate(0) else d + if (isMissingValue(d)) alternate(0) else d } dataset.withColumn($(outputCol), impute(col($(inputCol)))) case _: VectorUDT => @@ -208,20 +210,20 @@ class ImputerModel private[ml] ( } else { val vCopy = vector.copy + // TODO replace with update() since this hacks the internal implementation of Vector. 
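+          // Note: DenseVector.values and SparseVector.values expose the backing arrays,
+          // so the writes below mutate the copied vector in place.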
vCopy match { case d: DenseVector => var iter = 0 while(iter < d.size) { - if (matchMissingValue(vCopy(iter))) { + if (isMissingValue(vCopy(iter))) { d.values(iter) = alternate(iter) } - iter += 1 } case s: SparseVector => var iter = 0 while(iter < s.values.length) { - if (matchMissingValue(s.values(iter))) { + if (isMissingValue(s.values(iter))) { s.values(iter) = alternate(s.indices(iter)) } iter += 1 From 1b36deb3eb0391ec7080bafebd2dfb662d09a6e4 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 23 Mar 2016 17:45:35 +0800 Subject: [PATCH 07/25] optimize mean for vectors --- .../org/apache/spark/ml/feature/Imputer.scala | 86 +++++++++++-------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 70938691f536d..994b06a359e65 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path +import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} @@ -25,6 +26,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg._ +import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{DoubleType, StructField, StructType} @@ -64,9 +66,13 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** @group getParam */ def getMissingValue: Double = $(missingValue) + private[feature] def isMissingValue(value: Double): Boolean = { + val miss = $(missingValue) + value == miss || (value.isNaN && miss.isNaN) + } + /** Validates and transforms the input schema. 
*/ protected def validateAndTransformSchema(schema: StructType): StructType = { - validateParams() val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[VectorUDT] || inputType.isInstanceOf[DoubleType], s"Input column ${$(inputCol)} must of type Vector or Double") @@ -111,42 +117,54 @@ class Imputer @Since("2.0.0")(override val uid: String) override def fit(dataset: DataFrame): ImputerModel = { val alternate = dataset.select($(inputCol)).schema.fields(0).dataType match { case DoubleType => - Vectors.dense(getColStatistics(dataset, $(inputCol))) + val doubleRDD = dataset.select($(inputCol)).rdd.map(_.getDouble(0)) + Vectors.dense(getColStatistics(doubleRDD)) case _: VectorUDT => + val filteredDF = dataset.where(s"${$(inputCol)} IS NOT NULL").select($(inputCol)) + val vectorRDD = filteredDF.rdd.map(_.getAs[Vector](0)).cache() val vl = dataset.first().getAs[Vector]($(inputCol)).size - val statisticsArray = new Array[Double](vl) - (0 until vl).foreach(i => { - val getI = udf((v: Vector) => v(i)) - val tempColName = $(inputCol) + i - val tempData = dataset.where(s"${$(inputCol)} IS NOT NULL") - .select($(inputCol)).withColumn(tempColName, getI(col($(inputCol)))) - statisticsArray(i) = getColStatistics(tempData, tempColName) - }) - Vectors.dense(statisticsArray) + $(strategy) match { + case "mean" => + val summary = vectorRDD.treeAggregate((new Array[Double](vl), new Array[Int](vl)))( + (prev, data) => (prev, data) match { case ((mean, count), data) => + var i = 0 + while (i < mean.length) { + if (data(i) != 0 && !data(i).isNaN){ + count(i) += 1 + mean(i) = mean(i) + (data(i) - mean(i)) / count(i) + } + i += 1 + } + (mean, count) + }, (aggregator1, aggregator2) => (aggregator1, aggregator2) match { + case ((mean1, c1), (mean2, c2)) => + (0 until mean1.length).foreach{ i => + mean1(i) = mean1(i) + (mean2(i) - mean1(i)) * c2(i) / (c1(i) + c2(i)) + c1(i) += c2(i) + } + (mean1, c1) + }) + Vectors.dense(summary._1) + case _ => + val statisticsArray = new Array[Double](vl) + (0 until vl).foreach(i => { + statisticsArray(i) = getColStatistics(vectorRDD.map(v => v(i))) + }) + Vectors.dense(statisticsArray) + } } copyValues(new ImputerModel(uid, alternate).setParent(this)) } /** Extract the statistics info from a Double column according to the strategy */ - private def getColStatistics(dataset: DataFrame, colName: String): Double = { - val missValue = $(missingValue) match { - case Double.NaN => "NaN" - case _ => $(missingValue).toString - } - val filteredDF = dataset.select(colName).where(s"$colName != '$missValue'") + private def getColStatistics(data: RDD[Double]): Double = { + val filteredRDD = data.filter(!isMissingValue(_)) val colStatistics = $(strategy) match { - case "mean" => - filteredDF.selectExpr(s"avg($colName)").first().getDouble(0) - case "median" => - // TODO: optimize the sort with quick-select or Percentile(Hive) if required - val rddDouble = filteredDF.rdd.map(_.getDouble(0)) - rddDouble.sortBy(d => d).zipWithIndex().map { - case (v, idx) => (idx, v) - }.lookup(rddDouble.count() / 2).head - case "most" => - val input = filteredDF.rdd.map(_.getDouble(0)) - val most = input.map(d => (d, 1)).reduceByKey(_ + _).sortBy(-_._2).first()._1 - most + case "mean" => filteredRDD.mean() + case "median" => filteredRDD.sortBy(d => d).zipWithIndex() + .map(p => (p._2, p._1)).lookup(filteredRDD.count() / 2).head + case "most" => filteredRDD.map(d => (d, 1)).reduceByKey(_ + _).sortBy(-_._2).first()._1 + case _ => throw new SparkException(s"unsupported impute strategy: 
${$(strategy)}") } colStatistics } @@ -191,11 +209,6 @@ class ImputerModel private[ml] ( /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) - private def isMissingValue(value: Double): Boolean = { - val miss = $(missingValue) - value == miss || (value.isNaN && miss.isNaN) - } - override def transform(dataset: DataFrame): DataFrame = { dataset.select($(inputCol)).schema.fields(0).dataType match { case DoubleType => @@ -210,14 +223,11 @@ class ImputerModel private[ml] ( } else { val vCopy = vector.copy - // TODO replace with update() since this hacks the internal implementation of Vector. vCopy match { case d: DenseVector => var iter = 0 while(iter < d.size) { - if (isMissingValue(vCopy(iter))) { - d.values(iter) = alternate(iter) - } + if (isMissingValue(vCopy(iter))) { d.values(iter) = alternate(iter) } iter += 1 } case s: SparseVector => From 72d104d92a96ba03d60a72a5fa0b06e583a28bdc Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 23 Mar 2016 08:07:34 -0400 Subject: [PATCH 08/25] style fix --- .../src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 994b06a359e65..8f6d4605fbe55 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -18,8 +18,8 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path -import org.apache.spark.SparkException +import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ @@ -129,7 +129,7 @@ class Imputer @Since("2.0.0")(override val uid: String) (prev, data) => (prev, data) match { case ((mean, count), data) => var i = 0 while (i < mean.length) { - if (data(i) != 0 && !data(i).isNaN){ + if (data(i) != 0 && !data(i).isNaN) { count(i) += 1 mean(i) = mean(i) + (data(i) - mean(i)) / count(i) } From fdd6f943da2123aebaca4fe9d48ce6b6356bfa42 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 12 Apr 2016 13:53:07 +0800 Subject: [PATCH 09/25] refactor to support numeric only --- .../org/apache/spark/ml/feature/Imputer.scala | 196 ++++++------------ .../spark/ml/feature/ImputerSuite.scala | 56 +++-- 2 files changed, 99 insertions(+), 153 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 8f6d4605fbe55..4ba03d94d2a45 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -25,26 +25,25 @@ import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ -import org.apache.spark.mllib.linalg._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.sql.functions.{col, udf} -import org.apache.spark.sql.types.{DoubleType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.types.NumericType /** - * Params for [[Imputer]] and [[ImputerModel]]. - */ + * Params for [[Imputer]] and [[ImputerModel]]. 
+ */ private[feature] trait ImputerParams extends Params with HasInputCol with HasOutputCol { /** - * The imputation strategy. - * If "mean", then replace missing values using the mean value of the feature. - * If "median", then replace missing values using the median value of the feature. - * If "most", then replace missing using the most frequent value of the feature. - * Default: mean - * - * @group param - */ + * The imputation strategy. + * If "mean", then replace missing values using the mean value of the feature. + * If "median", then replace missing values using the median value of the feature. + * If "most", then replace missing using the most frequent value of the feature. + * Default: mean + * + * @group param + */ val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + "If mean, then replace missing values using the mean value of the feature." + "If median, then replace missing values using the median value of the feature." + @@ -55,30 +54,26 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut def getStrategy: String = $(strategy) /** - * The placeholder for the missing values. All occurrences of missingValue will be imputed. - * Default: Double.NaN - * - * @group param - */ + * The placeholder for the missing values. All occurrences of missingValue will be imputed. + * Default: Double.NaN + * + * @group param + */ val missingValue: DoubleParam = new DoubleParam(this, "missingValue", "The placeholder for the missing values. All occurrences of missingValue will be imputed") /** @group getParam */ def getMissingValue: Double = $(missingValue) - private[feature] def isMissingValue(value: Double): Boolean = { - val miss = $(missingValue) - value == miss || (value.isNaN && miss.isNaN) - } - /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType - require(inputType.isInstanceOf[VectorUDT] || inputType.isInstanceOf[DoubleType], - s"Input column ${$(inputCol)} must of type Vector or Double") + require(inputType.isInstanceOf[NumericType], + s"Input column ${$(inputCol)} must be of NumericType") require(!schema.fieldNames.contains($(outputCol)), s"Output column ${$(outputCol)} already exists.") - val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) + val outputFields = schema.fields :+ + StructField($(outputCol), inputType, schema($(inputCol)).nullable) StructType(outputFields) } @@ -86,9 +81,9 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * :: Experimental :: - * Imputation estimator for completing missing values, either using the mean, the median or - * the most frequent value of the column in which the missing values are located. This class - * also allows for different missing values. + * Imputation estimator for completing missing values, either using the mean("mean"), the + * median("median") or the most frequent value("most") of the column in which the missing + * values are located. */ @Experimental class Imputer @Since("2.0.0")(override val uid: String) @@ -104,7 +99,7 @@ class Imputer @Since("2.0.0")(override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * Imputation strategy. Available options are "mean", "median" and "most". + * Imputation strategy. Available options are ["mean", "median" and "most"]. 
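+ * With "median", fit() computes the surrogate via DataFrame.stat.approxQuantile with relative error 0.001.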
* @group setParam */ def setStrategy(value: String): this.type = set(strategy, value) @@ -114,59 +109,17 @@ class Imputer @Since("2.0.0")(override val uid: String) setDefault(strategy -> "mean", missingValue -> Double.NaN) - override def fit(dataset: DataFrame): ImputerModel = { - val alternate = dataset.select($(inputCol)).schema.fields(0).dataType match { - case DoubleType => - val doubleRDD = dataset.select($(inputCol)).rdd.map(_.getDouble(0)) - Vectors.dense(getColStatistics(doubleRDD)) - case _: VectorUDT => - val filteredDF = dataset.where(s"${$(inputCol)} IS NOT NULL").select($(inputCol)) - val vectorRDD = filteredDF.rdd.map(_.getAs[Vector](0)).cache() - val vl = dataset.first().getAs[Vector]($(inputCol)).size - $(strategy) match { - case "mean" => - val summary = vectorRDD.treeAggregate((new Array[Double](vl), new Array[Int](vl)))( - (prev, data) => (prev, data) match { case ((mean, count), data) => - var i = 0 - while (i < mean.length) { - if (data(i) != 0 && !data(i).isNaN) { - count(i) += 1 - mean(i) = mean(i) + (data(i) - mean(i)) / count(i) - } - i += 1 - } - (mean, count) - }, (aggregator1, aggregator2) => (aggregator1, aggregator2) match { - case ((mean1, c1), (mean2, c2)) => - (0 until mean1.length).foreach{ i => - mean1(i) = mean1(i) + (mean2(i) - mean1(i)) * c2(i) / (c1(i) + c2(i)) - c1(i) += c2(i) - } - (mean1, c1) - }) - Vectors.dense(summary._1) - case _ => - val statisticsArray = new Array[Double](vl) - (0 until vl).foreach(i => { - statisticsArray(i) = getColStatistics(vectorRDD.map(v => v(i))) - }) - Vectors.dense(statisticsArray) - } + override def fit(dataset: Dataset[_]): ImputerModel = { + val ic = col($(inputCol)) + val filtered = dataset.select(ic.cast(DoubleType)) + .filter(ic.isNotNull && !ic.isNaN && ic =!= $(missingValue)) + val surrogate = $(strategy) match { + case "mean" => filtered.select(avg($(inputCol))).first().getDouble(0) + case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0) + case "most" => filtered.rdd.map(r => r.getDouble(0)).map(d => (d, 1)).reduceByKey(_ + _) + .sortBy(-_._2).first()._1 } - copyValues(new ImputerModel(uid, alternate).setParent(this)) - } - - /** Extract the statistics info from a Double column according to the strategy */ - private def getColStatistics(data: RDD[Double]): Double = { - val filteredRDD = data.filter(!isMissingValue(_)) - val colStatistics = $(strategy) match { - case "mean" => filteredRDD.mean() - case "median" => filteredRDD.sortBy(d => d).zipWithIndex() - .map(p => (p._2, p._1)).lookup(filteredRDD.count() / 2).head - case "most" => filteredRDD.map(d => (d, 1)).reduceByKey(_ + _).sortBy(-_._2).first()._1 - case _ => throw new SparkException(s"unsupported impute strategy: ${$(strategy)}") - } - colStatistics + copyValues(new ImputerModel(uid, surrogate).setParent(this)) } override def transformSchema(schema: StructType): StructType = { @@ -179,26 +132,26 @@ class Imputer @Since("2.0.0")(override val uid: String) } } -@Since("1.6.0") +@Since("2.0.0") object Imputer extends DefaultParamsReadable[Imputer] { /** Set of strategy names that Imputer currently supports. */ private[ml] val supportedStrategyNames = Set("mean", "median", "most") - @Since("1.6.0") + @Since("2.0.0") override def load(path: String): Imputer = super.load(path) } /** - * :: Experimental :: - * Model fitted by [[Imputer]]. - * - * @param alternate statistics value for each feature during fitting - */ + * :: Experimental :: + * Model fitted by [[Imputer]]. 
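+ * transform() replaces null, NaN and occurrences of missingValue in the input column with the surrogate.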
+ * + * @param surrogate statistics value for each feature during fitting + */ @Experimental -class ImputerModel private[ml] ( +class ImputerModel private[ml]( override val uid: String, - val alternate: Vector) + val surrogate: Double) extends Model[ImputerModel] with ImputerParams with MLWritable { import ImputerModel._ @@ -209,40 +162,17 @@ class ImputerModel private[ml] ( /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) - override def transform(dataset: DataFrame): DataFrame = { - dataset.select($(inputCol)).schema.fields(0).dataType match { - case DoubleType => - val impute = udf { (d: Double) => - if (isMissingValue(d)) alternate(0) else d - } - dataset.withColumn($(outputCol), impute(col($(inputCol)))) - case _: VectorUDT => - val impute = udf { (vector: Vector) => - if (vector == null) { - alternate - } - else { - val vCopy = vector.copy - vCopy match { - case d: DenseVector => - var iter = 0 - while(iter < d.size) { - if (isMissingValue(vCopy(iter))) { d.values(iter) = alternate(iter) } - iter += 1 - } - case s: SparseVector => - var iter = 0 - while(iter < s.values.length) { - if (isMissingValue(s.values(iter))) { - s.values(iter) = alternate(s.indices(iter)) - } - iter += 1 - } - } - vCopy - } - } - dataset.withColumn($(outputCol), impute(col($(inputCol)))) + override def transform(dataset: Dataset[_]): DataFrame = { + val inputType = dataset.select($(inputCol)).schema.fields(0).dataType + inputType match { + case _: NumericType => + val ic = col($(inputCol)).cast(DoubleType) + dataset.withColumn($(outputCol), when(ic.isNull, surrogate) + .when(ic.isNaN, surrogate) + .when(ic === $(missingValue), surrogate) + .otherwise(ic) + .cast(inputType)) + case _ => throw new SparkException("imputer supports numeric type only") } } @@ -251,7 +181,7 @@ class ImputerModel private[ml] ( } override def copy(extra: ParamMap): ImputerModel = { - val copied = new ImputerModel(uid, alternate) + val copied = new ImputerModel(uid, surrogate) copyValues(copied, extra).setParent(parent) } @@ -266,11 +196,11 @@ object ImputerModel extends MLReadable[ImputerModel] { private[ImputerModel] class ImputerModelWriter(instance: ImputerModel) extends MLWriter { - private case class Data(alternate: Vector) + private case class Data(surrogate: Double) override protected def saveImpl(path: String): Unit = { DefaultParamsWriter.saveMetadata(instance, path, sc) - val data = new Data(instance.alternate) + val data = new Data(instance.surrogate) val dataPath = new Path(path, "data").toString sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) } @@ -283,10 +213,10 @@ object ImputerModel extends MLReadable[ImputerModel] { override def load(path: String): ImputerModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString - val Row(alternate: Vector) = sqlContext.read.parquet(dataPath) - .select("alternate") + val Row(surrogate: Double) = sqlContext.read.parquet(dataPath) + .select("surrogate") .head() - val model = new ImputerModel(metadata.uid, alternate) + val model = new ImputerModel(metadata.uid, surrogate) DefaultParamsReader.getAndSetParams(model, metadata) model } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index c22adc48a1f0f..00589b17a17c6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -17,21 +17,20 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.ml.util.{DefaultReadWriteTest} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { - test("Imputer for Double column") { + test("Imputer for Double with default missing Value NaN") { val df = sqlContext.createDataFrame( Seq( (0, 1.0, 1.0, 1.0, 1.0), (1, 1.0, 1.0, 1.0, 1.0), (2, 3.0, 3.0, 3.0, 3.0), (3, 4.0, 4.0, 4.0, 4.0), - (4, Double.NaN, 2.25, 3.0, 1.0 ) + (4, Double.NaN, 2.25, 1.0, 1.0 ) )).toDF("id", "value", "mean", "median", "most") Seq("mean", "median", "most").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) @@ -49,7 +48,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (1, 1.0, 1.0, 1.0, 1.0), (2, 3.0, 3.0, 3.0, 3.0), (3, 4.0, 4.0, 4.0, 4.0), - (4, -1.0, 2.25, 3.0, 1.0 ) + (4, -1.0, 2.25, 1.0, 1.0 ) )).toDF("id", "value", "mean", "median", "most") Seq("mean", "median", "most").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) @@ -62,29 +61,46 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } - test("Imputer for Vector column with NaN and null") { + test("Imputer for Int with missing Value -1") { val df = sqlContext.createDataFrame( Seq( - (0, Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2)), - (1, Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2)), - (2, Vectors.dense(3, 2), Vectors.dense(3, 2), Vectors.dense(3, 2), Vectors.dense(3, 2)), - (3, Vectors.dense(4, 2), Vectors.dense(4, 2), Vectors.dense(4, 2), Vectors.dense(4, 2)), - (4, Vectors.dense(Double.NaN, 2), Vectors.dense(2.25, 2), Vectors.dense(3.0, 2), - Vectors.dense(1.0, 2)), - (5, Vectors.sparse(2, Array(0, 1), Array(Double.NaN, 2.0)), Vectors.dense(2.25, 2), - Vectors.dense(3.0, 2), Vectors.dense(1.0, 2)), - (6, null.asInstanceOf[Vector], Vectors.dense(2.25, 2), Vectors.dense(3.0, 2), - Vectors.dense(1.0, 2)) + (0, 1, 1, 1, 1), + (1, 3, 3, 3, 3), + (2, 10, 10, 10, 10), + (3, 10, 10, 10, 10), + (4, -1, 6, 3, 10) )).toDF("id", "value", "mean", "median", "most") + Seq("mean", "median", "most").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + .setMissingValue(-1) val model = imputer.fit(df) model.transform(df).select(strategy, "out").collect() - .foreach { case Row(v1: Vector, v2: Vector) => - assert(v1 == v2, s"$strategy Imputer ut error: $v2 should be $v1") + .foreach { case Row(d1: Int, d2: Int) => + assert(d1 === d2, s"Imputer ut error: $d2 should be $d1") } } } + test("Imputer should impute null") { + val df = sqlContext.createDataFrame( Seq( + (0, 1, 1, 1, 1), + (1, 3, 3, 3, 3), + (2, 10, 10, 10, 10), + (3, 10, 10, 10, 10), + (4, -1, 6, 3, 10) + )).toDF("id", "value", "mean", "median", "most") + val df2 = df.selectExpr("*", "IF(value=-1, null, value) as nullable_value") + Seq("mean", "median", "most").foreach { strategy => + val imputer = new 
Imputer().setInputCol("nullable_value").setOutputCol("out") + .setStrategy(strategy) + val model = imputer.fit(df2) + model.transform(df2).select(strategy, "out").collect() + .foreach { case Row(d1: Int, d2: Int) => + assert(d1 == d2, s"Imputer ut error: $d2 should be $d1") + } + } + } + test("Imputer read/write") { val t = new Imputer() .setInputCol("myInputCol") @@ -95,11 +111,11 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default test("ImputerModel read/write") { val instance = new ImputerModel( - "myImputer", Vectors.dense(1.0, 10.0)) + "myImputer", 1.234) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) - assert(newInstance.alternate === instance.alternate) + assert(newInstance.surrogate === instance.surrogate) } } From 4bdf595f576fae76f710bfb21e1e3f71571c55c8 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 12 Apr 2016 10:12:08 -0400 Subject: [PATCH 10/25] change most to mode --- .../org/apache/spark/ml/feature/Imputer.scala | 12 ++++++------ .../apache/spark/ml/feature/ImputerSuite.scala | 16 ++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 4ba03d94d2a45..a6496d06f6799 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -39,7 +39,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut * The imputation strategy. * If "mean", then replace missing values using the mean value of the feature. * If "median", then replace missing values using the median value of the feature. - * If "most", then replace missing using the most frequent value of the feature. + * If "mode", then replace missing using the most frequent value of the feature. * Default: mean * * @group param @@ -47,7 +47,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + "If mean, then replace missing values using the mean value of the feature." + "If median, then replace missing values using the median value of the feature." + - "If most, then replace missing using the most frequent value of the feature.", + "If mode, then replace missing using the most frequent value of the feature.", ParamValidators.inArray[String](Imputer.supportedStrategyNames.toArray)) /** @group getParam */ @@ -82,7 +82,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * :: Experimental :: * Imputation estimator for completing missing values, either using the mean("mean"), the - * median("median") or the most frequent value("most") of the column in which the missing + * median("median") or the most frequent value("mode") of the column in which the missing * values are located. */ @Experimental @@ -99,7 +99,7 @@ class Imputer @Since("2.0.0")(override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * Imputation strategy. Available options are ["mean", "median" and "most"]. + * Imputation strategy. Available options are ["mean", "median" and "mode"]. 
* @group setParam */ def setStrategy(value: String): this.type = set(strategy, value) @@ -116,7 +116,7 @@ class Imputer @Since("2.0.0")(override val uid: String) val surrogate = $(strategy) match { case "mean" => filtered.select(avg($(inputCol))).first().getDouble(0) case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0) - case "most" => filtered.rdd.map(r => r.getDouble(0)).map(d => (d, 1)).reduceByKey(_ + _) + case "mode" => filtered.rdd.map(r => r.getDouble(0)).map(d => (d, 1)).reduceByKey(_ + _) .sortBy(-_._2).first()._1 } copyValues(new ImputerModel(uid, surrogate).setParent(this)) @@ -136,7 +136,7 @@ class Imputer @Since("2.0.0")(override val uid: String) object Imputer extends DefaultParamsReadable[Imputer] { /** Set of strategy names that Imputer currently supports. */ - private[ml] val supportedStrategyNames = Set("mean", "median", "most") + private[ml] val supportedStrategyNames = Set("mean", "median", "mode") @Since("2.0.0") override def load(path: String): Imputer = super.load(path) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 00589b17a17c6..c29614531ea8b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -31,8 +31,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 3.0, 3.0, 3.0, 3.0), (3, 4.0, 4.0, 4.0, 4.0), (4, Double.NaN, 2.25, 1.0, 1.0 ) - )).toDF("id", "value", "mean", "median", "most") - Seq("mean", "median", "most").foreach { strategy => + )).toDF("id", "value", "mean", "median", "mode") + Seq("mean", "median", "mode").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) val model = imputer.fit(df) model.transform(df).select(strategy, "out").collect() @@ -49,8 +49,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 3.0, 3.0, 3.0, 3.0), (3, 4.0, 4.0, 4.0, 4.0), (4, -1.0, 2.25, 1.0, 1.0 ) - )).toDF("id", "value", "mean", "median", "most") - Seq("mean", "median", "most").foreach { strategy => + )).toDF("id", "value", "mean", "median", "mode") + Seq("mean", "median", "mode").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) .setMissingValue(-1.0) val model = imputer.fit(df) @@ -68,9 +68,9 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 10, 10, 10, 10), (3, 10, 10, 10, 10), (4, -1, 6, 3, 10) - )).toDF("id", "value", "mean", "median", "most") + )).toDF("id", "value", "mean", "median", "mode") - Seq("mean", "median", "most").foreach { strategy => + Seq("mean", "median", "mode").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) .setMissingValue(-1) val model = imputer.fit(df) @@ -88,9 +88,9 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 10, 10, 10, 10), (3, 10, 10, 10, 10), (4, -1, 6, 3, 10) - )).toDF("id", "value", "mean", "median", "most") + )).toDF("id", "value", "mean", "median", "mode") val df2 = df.selectExpr("*", "IF(value=-1, null, value) as nullable_value") - Seq("mean", "median", "most").foreach { strategy => + Seq("mean", "median", "mode").foreach { strategy => val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out") .setStrategy(strategy) val model = imputer.fit(df2) 
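At this point in the series the single-column API is stable: setInputCol/setOutputCol, setStrategy with "mean", "median" or "mode", and setMissingValue (default Double.NaN). As a rough usage sketch of this revision — the sqlContext handle and the column names "value"/"imputed" are illustrative, not taken from the patches:

    import org.apache.spark.ml.feature.Imputer

    // Toy input: the NaN row is the one to be imputed.
    val df = sqlContext.createDataFrame(Seq(
      (0, 1.0),
      (1, 3.0),
      (2, Double.NaN)
    )).toDF("id", "value")

    val imputer = new Imputer()
      .setInputCol("value")
      .setOutputCol("imputed")
      .setStrategy("mean") // "median" and "mode" are the alternatives at this revision

    // fit() computes the surrogate over the non-missing rows (mean of 1.0 and 3.0 = 2.0);
    // transform() substitutes it wherever the input is null, NaN or equal to missingValue.
    val model = imputer.fit(df)
    model.transform(df).show()

Later patches in the series narrow this surface (dropping "mode", restricting input to Double/Float columns, and eventually moving to multiple input columns), but the fit/transform flow stays the same.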
From 171842210d3ea2e3c97fe803f0a8bb3831063f3f Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Sat, 16 Apr 2016 22:56:03 -0400 Subject: [PATCH 11/25] move filter to NaN --- .../src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index a6496d06f6799..494a2fbae161f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -112,9 +112,9 @@ class Imputer @Since("2.0.0")(override val uid: String) override def fit(dataset: Dataset[_]): ImputerModel = { val ic = col($(inputCol)) val filtered = dataset.select(ic.cast(DoubleType)) - .filter(ic.isNotNull && !ic.isNaN && ic =!= $(missingValue)) + .filter(ic.isNotNull && ic =!= $(missingValue)) val surrogate = $(strategy) match { - case "mean" => filtered.select(avg($(inputCol))).first().getDouble(0) + case "mean" => filtered.filter(!ic.isNaN).select(avg($(inputCol))).first().getDouble(0) case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0) case "mode" => filtered.rdd.map(r => r.getDouble(0)).map(d => (d, 1)).reduceByKey(_ + _) .sortBy(-_._2).first()._1 @@ -168,7 +168,6 @@ class ImputerModel private[ml]( case _: NumericType => val ic = col($(inputCol)).cast(DoubleType) dataset.withColumn($(outputCol), when(ic.isNull, surrogate) - .when(ic.isNaN, surrogate) .when(ic === $(missingValue), surrogate) .otherwise(ic) .cast(inputType)) From 594c501f85cad2a278caee5f08b85deb61272e5d Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 20 Apr 2016 11:55:25 -0400 Subject: [PATCH 12/25] add transformSchema --- mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 494a2fbae161f..57f8abcbcaeb0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -110,6 +110,7 @@ class Imputer @Since("2.0.0")(override val uid: String) setDefault(strategy -> "mean", missingValue -> Double.NaN) override def fit(dataset: Dataset[_]): ImputerModel = { + transformSchema(dataset.schema, logging = true) val ic = col($(inputCol)) val filtered = dataset.select(ic.cast(DoubleType)) .filter(ic.isNotNull && ic =!= $(missingValue)) @@ -163,6 +164,7 @@ class ImputerModel private[ml]( def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: Dataset[_]): DataFrame = { + transformSchema(dataset.schema, logging = true) val inputType = dataset.select($(inputCol)).schema.fields(0).dataType inputType match { case _: NumericType => From b3633e8dd0edf47a684aa344ba6a3c43ac0d91fe Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 27 Apr 2016 12:36:05 -0400 Subject: [PATCH 13/25] remove mode and change input type --- .../org/apache/spark/ml/feature/Imputer.scala | 46 +++----- .../spark/ml/feature/ImputerSuite.scala | 111 +++++++++++------- 2 files changed, 83 insertions(+), 74 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 57f8abcbcaeb0..68e0764e6225e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -28,7 +28,6 @@ import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.types.NumericType /** * Params for [[Imputer]] and [[ImputerModel]]. @@ -38,16 +37,14 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * The imputation strategy. * If "mean", then replace missing values using the mean value of the feature. - * If "median", then replace missing values using the median value of the feature. - * If "mode", then replace missing using the most frequent value of the feature. + * If "median", then replace missing values using the approximate median value of the feature. * Default: mean * * @group param */ - val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + + final val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + "If mean, then replace missing values using the mean value of the feature." + - "If median, then replace missing values using the median value of the feature." + - "If mode, then replace missing using the most frequent value of the feature.", + "If median, then replace missing values using the median value of the feature.", ParamValidators.inArray[String](Imputer.supportedStrategyNames.toArray)) /** @group getParam */ @@ -59,7 +56,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut * * @group param */ - val missingValue: DoubleParam = new DoubleParam(this, "missingValue", + final val missingValue: DoubleParam = new DoubleParam(this, "missingValue", "The placeholder for the missing values. All occurrences of missingValue will be imputed") /** @group getParam */ @@ -68,22 +65,19 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType - require(inputType.isInstanceOf[NumericType], - s"Input column ${$(inputCol)} must be of NumericType") + SchemaUtils.checkColumnTypes(schema, $(inputCol), Seq(DoubleType, FloatType)) require(!schema.fieldNames.contains($(outputCol)), s"Output column ${$(outputCol)} already exists.") - val outputFields = schema.fields :+ - StructField($(outputCol), inputType, schema($(inputCol)).nullable) - StructType(outputFields) + SchemaUtils.appendColumn(schema, $(outputCol), inputType) } - } /** * :: Experimental :: - * Imputation estimator for completing missing values, either using the mean("mean"), the - * median("median") or the most frequent value("mode") of the column in which the missing - * values are located. + * Imputation estimator for completing missing values, either using the mean("mean") or the + * median("median") of the column in which the missing values are located. + * + * Note that all the null values will be imputed as well. */ @Experimental class Imputer @Since("2.0.0")(override val uid: String) @@ -99,7 +93,7 @@ class Imputer @Since("2.0.0")(override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * Imputation strategy. Available options are ["mean", "median" and "mode"]. + * Imputation strategy. Available options are ["mean", "median"]. 
* @group setParam */ def setStrategy(value: String): this.type = set(strategy, value) @@ -117,8 +111,6 @@ class Imputer @Since("2.0.0")(override val uid: String) val surrogate = $(strategy) match { case "mean" => filtered.filter(!ic.isNaN).select(avg($(inputCol))).first().getDouble(0) case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0) - case "mode" => filtered.rdd.map(r => r.getDouble(0)).map(d => (d, 1)).reduceByKey(_ + _) - .sortBy(-_._2).first()._1 } copyValues(new ImputerModel(uid, surrogate).setParent(this)) } @@ -137,7 +129,7 @@ class Imputer @Since("2.0.0")(override val uid: String) object Imputer extends DefaultParamsReadable[Imputer] { /** Set of strategy names that Imputer currently supports. */ - private[ml] val supportedStrategyNames = Set("mean", "median", "mode") + private[ml] val supportedStrategyNames = Set("mean", "median") @Since("2.0.0") override def load(path: String): Imputer = super.load(path) @@ -166,15 +158,11 @@ class ImputerModel private[ml]( override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val inputType = dataset.select($(inputCol)).schema.fields(0).dataType - inputType match { - case _: NumericType => - val ic = col($(inputCol)).cast(DoubleType) - dataset.withColumn($(outputCol), when(ic.isNull, surrogate) - .when(ic === $(missingValue), surrogate) - .otherwise(ic) - .cast(inputType)) - case _ => throw new SparkException("imputer supports numeric type only") - } + val ic = col($(inputCol)) + dataset.withColumn($(outputCol), when(ic.isNull, surrogate) + .when(ic === $(missingValue), surrogate) + .otherwise(ic) + .cast(inputType)) } override def transformSchema(schema: StructType): StructType = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index c29614531ea8b..61e216998aad2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -26,77 +26,98 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default test("Imputer for Double with default missing Value NaN") { val df = sqlContext.createDataFrame( Seq( - (0, 1.0, 1.0, 1.0, 1.0), - (1, 1.0, 1.0, 1.0, 1.0), - (2, 3.0, 3.0, 3.0, 3.0), - (3, 4.0, 4.0, 4.0, 4.0), - (4, Double.NaN, 2.25, 1.0, 1.0 ) - )).toDF("id", "value", "mean", "median", "mode") - Seq("mean", "median", "mode").foreach { strategy => + (0, 1.0, 1.0, 1.0), + (1, 1.0, 1.0, 1.0), + (2, 3.0, 3.0, 3.0), + (3, 4.0, 4.0, 4.0), + (4, Double.NaN, 2.25, 1.0) + )).toDF("id", "value", "exp_mean", "exp_median") + Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) val model = imputer.fit(df) - model.transform(df).select(strategy, "out").collect() - .foreach { case Row(d1: Double, d2: Double) => - assert(d1 ~== d2 absTol 1e-5, s"Imputer ut error: $d2 should be $d1") - } + model.transform(df).select("exp_" + strategy, "out").collect().foreach { + case Row(exp: Double, out: Double) => + assert(exp ~== out absTol 1e-5, s"Imputed values differ. 
Expected: $exp, actual: $out") + } } } test("Imputer for Double with missing Value -1.0") { val df = sqlContext.createDataFrame( Seq( - (0, 1.0, 1.0, 1.0, 1.0), - (1, 1.0, 1.0, 1.0, 1.0), - (2, 3.0, 3.0, 3.0, 3.0), - (3, 4.0, 4.0, 4.0, 4.0), - (4, -1.0, 2.25, 1.0, 1.0 ) - )).toDF("id", "value", "mean", "median", "mode") - Seq("mean", "median", "mode").foreach { strategy => + (0, 1.0, 1.0, 1.0), + (1, 1.0, 1.0, 1.0), + (2, 3.0, 3.0, 3.0), + (3, 4.0, 4.0, 4.0), + (4, -1.0, 2.25, 1.0) + )).toDF("id", "value", "exp_mean", "exp_median") + Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) .setMissingValue(-1.0) val model = imputer.fit(df) - model.transform(df).select(strategy, "out").collect() - .foreach { case Row(d1: Double, d2: Double) => - assert(d1 ~== d2 absTol 1e-5, s"Imputer ut error: $d2 should be $d1") - } + model.transform(df).select("exp_" + strategy, "out").collect().foreach { + case Row(exp: Double, out: Double) => + assert(exp ~== out absTol 1e-5, s"Impute($strategy) error. Expected: $exp, actual: $out") + } } } - test("Imputer for Int with missing Value -1") { + test("Imputer for Double with missing Value -1.0 and contains NaN") { val df = sqlContext.createDataFrame( Seq( - (0, 1, 1, 1, 1), - (1, 3, 3, 3, 3), - (2, 10, 10, 10, 10), - (3, 10, 10, 10, 10), - (4, -1, 6, 3, 10) - )).toDF("id", "value", "mean", "median", "mode") + (0, 1.0, 1.0, 1.0), + (1, 3.0, 3.0, 3.0), + (2, Double.NaN, Double.NaN, Double.NaN), + (3, -1.0, 2.0, 3.0) + )).toDF("id", "value", "exp_mean", "exp_median") + Seq("mean", "median").foreach { strategy => + val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + .setMissingValue(-1.0) + val model = imputer.fit(df) + model.transform(df).select("exp_" + strategy, "out").collect().foreach { + case Row(exp: Double, out: Double) => + assert((exp.isNaN && out.isNaN) || (exp ~== out absTol 1e-5), + s"Imputed values differ. Expected: $exp, actual: $out") + } + } + } - Seq("mean", "median", "mode").foreach { strategy => + test("Imputer for Float with missing Value -1.0") { + val df = sqlContext.createDataFrame( Seq( + (0, 1.0F, 1.0F, 1.0F), + (1, 3.0F, 3.0F, 3.0F), + (2, 10.0F, 10.0F, 10.0F), + (3, 10.0F, 10.0F, 10.0F), + (4, -1.0F, 6.0F, 3.0F) + )).toDF("id", "value", "exp_mean", "exp_median") + + Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) .setMissingValue(-1) val model = imputer.fit(df) - model.transform(df).select(strategy, "out").collect() - .foreach { case Row(d1: Int, d2: Int) => - assert(d1 === d2, s"Imputer ut error: $d2 should be $d1") - } + val result = model.transform(df) + result.printSchema() + model.transform(df).select("exp_" + strategy, "out").collect().foreach { + case Row(exp: Float, out: Float) => + assert(exp == out, s"Imputed values differ. 
Expected: $exp, actual: $out") + } } } test("Imputer should impute null") { val df = sqlContext.createDataFrame( Seq( - (0, 1, 1, 1, 1), - (1, 3, 3, 3, 3), - (2, 10, 10, 10, 10), - (3, 10, 10, 10, 10), - (4, -1, 6, 3, 10) - )).toDF("id", "value", "mean", "median", "mode") - val df2 = df.selectExpr("*", "IF(value=-1, null, value) as nullable_value") - Seq("mean", "median", "mode").foreach { strategy => + (0, 4.0, 4.0, 4.0), + (1, 10.0, 10.0, 10.0), + (2, 10.0, 10.0, 10.0), + (3, Double.NaN, 8.0, 10.0), + (4, -1.0, 8.0, 10.0) + )).toDF("id", "value", "exp_mean", "exp_median") + val df2 = df.selectExpr("*", "IF(value=-1.0, null, value) as nullable_value") + Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out") .setStrategy(strategy) val model = imputer.fit(df2) - model.transform(df2).select(strategy, "out").collect() - .foreach { case Row(d1: Int, d2: Int) => - assert(d1 == d2, s"Imputer ut error: $d2 should be $d1") + model.transform(df2).select("exp_" + strategy, "out").collect().foreach { + case Row(exp: Double, out: Double) => + assert(exp ~== out absTol 1e-5, s"Imputed values differ. Expected: $exp, actual: $out") } } } From 053d489a70a28674029ee51a69f529e851261c96 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 27 Apr 2016 12:41:01 -0400 Subject: [PATCH 14/25] remove print --- .../test/scala/org/apache/spark/ml/feature/ImputerSuite.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 61e216998aad2..ebc17415e3eaf 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -94,7 +94,6 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default .setMissingValue(-1) val model = imputer.fit(df) val result = model.transform(df) - result.printSchema() model.transform(df).select("exp_" + strategy, "out").collect().foreach { case Row(exp: Float, out: Float) => assert(exp == out, s"Imputed values differ. Expected: $exp, actual: $out") From 4e1c34a77b8e4382c00a0438e00e34f544b591a3 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Thu, 28 Apr 2016 14:58:09 +0800 Subject: [PATCH 15/25] update document and remove a ut --- .../org/apache/spark/ml/feature/Imputer.scala | 9 ++++--- .../spark/ml/feature/ImputerSuite.scala | 27 +++---------------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 68e0764e6225e..40ecfe51bd9fe 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -19,7 +19,6 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path -import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ @@ -74,10 +73,12 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * :: Experimental :: - * Imputation estimator for completing missing values, either using the mean("mean") or the - * median("median") of the column in which the missing values are located. 
+ * Imputation estimator for completing missing values, either using the mean or the + * median of the column in which the missing values are located. InputCol should be + * of DoubleType or FloatType. * - * Note that all the null values will be imputed as well. + * Note that the mean/median value is computed after filtering out missing values. + * All Null values in the input column are treated as missing, and so are also imputed. */ @Experimental class Imputer @Since("2.0.0")(override val uid: String) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index ebc17415e3eaf..959e8583070ef 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -31,7 +31,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 3.0, 3.0, 3.0), (3, 4.0, 4.0, 4.0), (4, Double.NaN, 2.25, 1.0) - )).toDF("id", "value", "exp_mean", "exp_median") + )).toDF("id", "value", "expected_mean", "expected_median") Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) val model = imputer.fit(df) @@ -42,32 +42,13 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } - test("Imputer for Double with missing Value -1.0") { - val df = sqlContext.createDataFrame( Seq( - (0, 1.0, 1.0, 1.0), - (1, 1.0, 1.0, 1.0), - (2, 3.0, 3.0, 3.0), - (3, 4.0, 4.0, 4.0), - (4, -1.0, 2.25, 1.0) - )).toDF("id", "value", "exp_mean", "exp_median") - Seq("mean", "median").foreach { strategy => - val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) - .setMissingValue(-1.0) - val model = imputer.fit(df) - model.transform(df).select("exp_" + strategy, "out").collect().foreach { - case Row(exp: Double, out: Double) => - assert(exp ~== out absTol 1e-5, s"Impute($strategy) error. 
Expected: $exp, actual: $out") - } - } - } - test("Imputer for Double with missing Value -1.0 and contains NaN") { val df = sqlContext.createDataFrame( Seq( (0, 1.0, 1.0, 1.0), (1, 3.0, 3.0, 3.0), (2, Double.NaN, Double.NaN, Double.NaN), (3, -1.0, 2.0, 3.0) - )).toDF("id", "value", "exp_mean", "exp_median") + )).toDF("id", "value", "expected_mean", "expected_median") Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) .setMissingValue(-1.0) @@ -87,7 +68,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 10.0F, 10.0F, 10.0F), (3, 10.0F, 10.0F, 10.0F), (4, -1.0F, 6.0F, 3.0F) - )).toDF("id", "value", "exp_mean", "exp_median") + )).toDF("id", "value", "expected_mean", "expected_median") Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) @@ -108,7 +89,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 10.0, 10.0, 10.0), (3, Double.NaN, 8.0, 10.0), (4, -1.0, 8.0, 10.0) - )).toDF("id", "value", "exp_mean", "exp_median") + )).toDF("id", "value", "expected_mean", "expected_median") val df2 = df.selectExpr("*", "IF(value=-1.0, null, value) as nullable_value") Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out") From aef094bc7b7a00c0ded1b2998b7f98d2bc42c666 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Fri, 29 Apr 2016 10:15:21 +0800 Subject: [PATCH 16/25] fix ut --- .../main/scala/org/apache/spark/ml/feature/Imputer.scala | 5 ++--- .../scala/org/apache/spark/ml/feature/ImputerSuite.scala | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 40ecfe51bd9fe..0ca4b524184ea 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -42,7 +42,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut * @group param */ final val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + - "If mean, then replace missing values using the mean value of the feature." + + "If mean, then replace missing values using the mean value of the feature. 
" + "If median, then replace missing values using the median value of the feature.", ParamValidators.inArray[String](Imputer.supportedStrategyNames.toArray)) @@ -183,8 +183,7 @@ class ImputerModel private[ml]( @Since("2.0.0") object ImputerModel extends MLReadable[ImputerModel] { - private[ImputerModel] - class ImputerModelWriter(instance: ImputerModel) extends MLWriter { + private[ImputerModel] class ImputerModelWriter(instance: ImputerModel) extends MLWriter { private case class Data(surrogate: Double) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 959e8583070ef..06e754b21c6eb 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -35,7 +35,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) val model = imputer.fit(df) - model.transform(df).select("exp_" + strategy, "out").collect().foreach { + model.transform(df).select("expected_" + strategy, "out").collect().foreach { case Row(exp: Double, out: Double) => assert(exp ~== out absTol 1e-5, s"Imputed values differ. Expected: $exp, actual: $out") } @@ -53,7 +53,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) .setMissingValue(-1.0) val model = imputer.fit(df) - model.transform(df).select("exp_" + strategy, "out").collect().foreach { + model.transform(df).select("expected_" + strategy, "out").collect().foreach { case Row(exp: Double, out: Double) => assert((exp.isNaN && out.isNaN) || (exp ~== out absTol 1e-5), s"Imputed values differ. Expected: $exp, actual: $out") @@ -75,7 +75,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default .setMissingValue(-1) val model = imputer.fit(df) val result = model.transform(df) - model.transform(df).select("exp_" + strategy, "out").collect().foreach { + model.transform(df).select("expected_" + strategy, "out").collect().foreach { case Row(exp: Float, out: Float) => assert(exp == out, s"Imputed values differ. Expected: $exp, actual: $out") } @@ -95,7 +95,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out") .setStrategy(strategy) val model = imputer.fit(df2) - model.transform(df2).select("exp_" + strategy, "out").collect().foreach { + model.transform(df2).select("expected_" + strategy, "out").collect().foreach { case Row(exp: Double, out: Double) => assert(exp ~== out absTol 1e-5, s"Imputed values differ. 
Expected: $exp, actual: $out") } From cca8dd41714d79476c2bf23f706012a282c53bcb Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Sat, 30 Apr 2016 22:01:21 -0400 Subject: [PATCH 17/25] rename ut --- .../main/scala/org/apache/spark/ml/feature/Imputer.scala | 6 +++--- .../scala/org/apache/spark/ml/feature/ImputerSuite.scala | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 0ca4b524184ea..efb86ddcecfa0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -73,9 +73,9 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * :: Experimental :: - * Imputation estimator for completing missing values, either using the mean or the - * median of the column in which the missing values are located. InputCol should be - * of DoubleType or FloatType. + * Imputation estimator for completing missing values, either using the mean or the median + * of the column in which the missing values are located. The input column should be of + * DoubleType or FloatType. * * Note that the mean/median value is computed after filtering out missing values. * All Null values in the input column are treated as missing, and so are also imputed. diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 06e754b21c6eb..a2f4664e1a6b3 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -42,7 +42,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } - test("Imputer for Double with missing Value -1.0 and contains NaN") { + test("Imputer should handle NaNs when computing surrogate value, if missingValue is not NaN") { val df = sqlContext.createDataFrame( Seq( (0, 1.0, 1.0, 1.0), (1, 3.0, 3.0, 3.0), @@ -82,7 +82,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } - test("Imputer should impute null") { + test("Imputer should impute null as well as 'missingValue'") { val df = sqlContext.createDataFrame( Seq( (0, 4.0, 4.0, 4.0), (1, 10.0, 10.0, 10.0), From 4e0743139796ac53df2554cfa53736b8035bae15 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 3 May 2016 17:09:31 +0800 Subject: [PATCH 18/25] update parameter doc --- mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index efb86ddcecfa0..9030c87666a4e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -140,7 +140,7 @@ object Imputer extends DefaultParamsReadable[Imputer] { * :: Experimental :: * Model fitted by [[Imputer]]. * - * @param surrogate statistics value for each feature during fitting + * @param surrogate Value by which missing values in the input column will be replaced. 
*/ @Experimental class ImputerModel private[ml]( From 544a65c82a7d921bdff73998e8b350e11b51dcbe Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 7 Sep 2016 12:42:45 -0700 Subject: [PATCH 19/25] update version --- .../org/apache/spark/ml/feature/Imputer.scala | 16 ++++++++-------- .../apache/spark/ml/feature/ImputerSuite.scala | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 9030c87666a4e..c6ab55c9bfba7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -81,10 +81,10 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut * All Null values in the input column are treated as missing, and so are also imputed. */ @Experimental -class Imputer @Since("2.0.0")(override val uid: String) +class Imputer @Since("2.1.0")(override val uid: String) extends Estimator[ImputerModel] with ImputerParams with DefaultParamsWritable { - @Since("2.0.0") + @Since("2.1.0") def this() = this(Identifiable.randomUID("imputer")) /** @group setParam */ @@ -126,13 +126,13 @@ class Imputer @Since("2.0.0")(override val uid: String) } } -@Since("2.0.0") +@Since("2.1.0") object Imputer extends DefaultParamsReadable[Imputer] { /** Set of strategy names that Imputer currently supports. */ private[ml] val supportedStrategyNames = Set("mean", "median") - @Since("2.0.0") + @Since("2.1.0") override def load(path: String): Imputer = super.load(path) } @@ -175,12 +175,12 @@ class ImputerModel private[ml]( copyValues(copied, extra).setParent(parent) } - @Since("2.0.0") + @Since("2.1.0") override def write: MLWriter = new ImputerModelWriter(this) } -@Since("2.0.0") +@Since("2.1.0") object ImputerModel extends MLReadable[ImputerModel] { private[ImputerModel] class ImputerModelWriter(instance: ImputerModel) extends MLWriter { @@ -211,9 +211,9 @@ object ImputerModel extends MLReadable[ImputerModel] { } } - @Since("2.0.0") + @Since("2.1.0") override def read: MLReader[ImputerModel] = new ImputerReader - @Since("2.0.0") + @Since("2.1.0") override def load(path: String): ImputerModel = super.load(path) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index a2f4664e1a6b3..c61200f2224e6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.Row class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("Imputer for Double with default missing Value NaN") { - val df = sqlContext.createDataFrame( Seq( + val df = spark.createDataFrame( Seq( (0, 1.0, 1.0, 1.0), (1, 1.0, 1.0, 1.0), (2, 3.0, 3.0, 3.0), @@ -43,7 +43,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } test("Imputer should handle NaNs when computing surrogate value, if missingValue is not NaN") { - val df = sqlContext.createDataFrame( Seq( + val df = spark.createDataFrame( Seq( (0, 1.0, 1.0, 1.0), (1, 3.0, 3.0, 3.0), (2, Double.NaN, Double.NaN, Double.NaN), @@ -62,7 +62,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } test("Imputer for Float with missing Value -1.0") { - val df = sqlContext.createDataFrame( Seq( + val df = 
spark.createDataFrame( Seq( (0, 1.0F, 1.0F, 1.0F), (1, 3.0F, 3.0F, 3.0F), (2, 10.0F, 10.0F, 10.0F), @@ -83,7 +83,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } test("Imputer should impute null as well as 'missingValue'") { - val df = sqlContext.createDataFrame( Seq( + val df = spark.createDataFrame( Seq( (0, 4.0, 4.0, 4.0), (1, 10.0, 10.0, 10.0), (2, 10.0, 10.0, 10.0), From 91d4cee75a150ad2335dba0838c47cb4f0505ad8 Mon Sep 17 00:00:00 2001 From: Yuhao Date: Thu, 6 Oct 2016 17:39:51 -0700 Subject: [PATCH 20/25] throw exception --- .../org/apache/spark/ml/feature/Imputer.scala | 25 ++++++++++++------- .../spark/ml/feature/ImputerSuite.scala | 22 +++++++++++++--- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index c6ab55c9bfba7..e7477c616e6ee 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path +import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ @@ -51,6 +52,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * The placeholder for the missing values. All occurrences of missingValue will be imputed. + * Note that null values are always treated as missing. * Default: Double.NaN * * @group param @@ -65,8 +67,6 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut protected def validateAndTransformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType SchemaUtils.checkColumnTypes(schema, $(inputCol), Seq(DoubleType, FloatType)) - require(!schema.fieldNames.contains($(outputCol)), - s"Output column ${$(outputCol)} already exists.") SchemaUtils.appendColumn(schema, $(outputCol), inputType) } } @@ -75,7 +75,8 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut * :: Experimental :: * Imputation estimator for completing missing values, either using the mean or the median * of the column in which the missing values are located. The input column should be of - * DoubleType or FloatType. + * DoubleType or FloatType. Currently Imputer does not support categorical features yet + * and possibly creates incorrect values for a categorical feature. * * Note that the mean/median value is computed after filtering out missing values. * All Null values in the input column are treated as missing, and so are also imputed. @@ -88,18 +89,22 @@ class Imputer @Since("2.1.0")(override val uid: String) def this() = this(Identifiable.randomUID("imputer")) /** @group setParam */ + @Since("2.1.0") def setInputCol(value: String): this.type = set(inputCol, value) /** @group setParam */ + @Since("2.1.0") def setOutputCol(value: String): this.type = set(outputCol, value) /** * Imputation strategy. Available options are ["mean", "median"]. 
* @group setParam */
+ @Since("2.1.0")
 def setStrategy(value: String): this.type = set(strategy, value)
 /** @group setParam */
+ @Since("2.1.0")
 def setMissingValue(value: Double): this.type = set(missingValue, value)
 setDefault(strategy -> "mean", missingValue -> Double.NaN)
 override def fit(dataset: Dataset[_]): ImputerModel = {
 transformSchema(dataset.schema, logging = true)
 val ic = col($(inputCol))
 val filtered = dataset.select(ic.cast(DoubleType))
 .filter(ic.isNotNull && ic =!= $(missingValue))
+ .filter(!ic.isNaN)
+ if (filtered.count() == 0) {
+ throw new SparkException(s"surrogate cannot be computed. " +
+ s"All the values in ${$(inputCol)} are Null, NaN or missingValue (${$(missingValue)})")
+ }
 val surrogate = $(strategy) match {
- case "mean" => filtered.filter(!ic.isNaN).select(avg($(inputCol))).first().getDouble(0)
+ case "mean" => filtered.select(avg($(inputCol))).first().getDouble(0)
 case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0)
 }
 copyValues(new ImputerModel(uid, surrogate).setParent(this))
 @@ -120,10 +130,7 @@ class Imputer @Since("2.1.0")(override val uid: String)
 validateAndTransformSchema(schema)
 }
- override def copy(extra: ParamMap): Imputer = {
- val copied = new Imputer(uid)
- copyValues(copied, extra)
- }
+ override def copy(extra: ParamMap): Imputer = defaultCopy(extra)
 }
 @Since("2.1.0")
 @@ -158,7 +165,7 @@ class ImputerModel private[ml](
 override def transform(dataset: Dataset[_]): DataFrame = {
 transformSchema(dataset.schema, logging = true)
- val inputType = dataset.select($(inputCol)).schema.fields(0).dataType
+ val inputType = dataset.schema($(inputCol)).dataType
 val ic = col($(inputCol))
 dataset.withColumn($(outputCol), when(ic.isNull, surrogate)
 .when(ic === $(missingValue), surrogate)
 diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
 index c61200f2224e6..292595d29fb34 100644
 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
 +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
 @@ -16,8 +16,8 @@
 */
 package org.apache.spark.ml.feature
-import org.apache.spark.SparkFunSuite
-import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.ml.util.DefaultReadWriteTest
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.sql.Row
 @@ -30,7 +30,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
 (1, 1.0, 1.0, 1.0),
 (2, 3.0, 3.0, 3.0),
 (3, 4.0, 4.0, 4.0),
- (4, Double.NaN, 2.25, 1.0)
+ (4, Double.NaN, 2.25, 3.0)
 )).toDF("id", "value", "expected_mean", "expected_median")
 Seq("mean", "median").foreach { strategy =>
 val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy)
 val model = imputer.fit(df)
 @@ -74,7 +74,6 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
 val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy)
 .setMissingValue(-1)
 val model = imputer.fit(df)
- val result = model.transform(df)
 model.transform(df).select("expected_" + strategy, "out").collect().foreach {
 case Row(exp: Float, out: Float) =>
 assert(exp == out, s"Imputed values differ. 
Expected: $exp, actual: $out") @@ -102,6 +101,21 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } + + test("Imputer throws exception when surrogate cannot be computed") { + val df = spark.createDataFrame( Seq( + (0, Double.NaN, 1.0, 1.0), + (1, Double.NaN, 3.0, 3.0), + (2, Double.NaN, Double.NaN, Double.NaN) + )).toDF("id", "value", "expected_mean", "expected_median") + Seq("mean", "median").foreach { strategy => + val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + intercept[SparkException] { + val model = imputer.fit(df) + } + } + } + test("Imputer read/write") { val t = new Imputer() .setInputCol("myInputCol") From 8744524e8da174316207cb4c33b425cbbd78f68e Mon Sep 17 00:00:00 2001 From: Yuhao Date: Fri, 7 Oct 2016 11:34:42 -0700 Subject: [PATCH 21/25] change data format --- .../org/apache/spark/ml/feature/Imputer.scala | 31 ++++---- .../spark/ml/feature/ImputerSuite.scala | 79 +++++++++---------- 2 files changed, 54 insertions(+), 56 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index e7477c616e6ee..c39b8b243d0fb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -25,7 +25,7 @@ import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ @@ -76,7 +76,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut * Imputation estimator for completing missing values, either using the mean or the median * of the column in which the missing values are located. The input column should be of * DoubleType or FloatType. Currently Imputer does not support categorical features yet - * and possibly creates incorrect values for a categorical feature. + * (SPARK-15041) and possibly creates incorrect values for a categorical feature. * * Note that the mean/median value is computed after filtering out missing values. * All Null values in the input column are treated as missing, and so are also imputed. @@ -123,7 +123,9 @@ class Imputer @Since("2.1.0")(override val uid: String) case "mean" => filtered.select(avg($(inputCol))).first().getDouble(0) case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0) } - copyValues(new ImputerModel(uid, surrogate).setParent(this)) + import dataset.sparkSession.implicits._ + val surrogateDF = Seq(surrogate.asInstanceOf[Double]).toDF($(inputCol)) + copyValues(new ImputerModel(uid, surrogateDF).setParent(this)) } override def transformSchema(schema: StructType): StructType = { @@ -147,12 +149,13 @@ object Imputer extends DefaultParamsReadable[Imputer] { * :: Experimental :: * Model fitted by [[Imputer]]. * - * @param surrogate Value by which missing values in the input column will be replaced. + * @param surrogateDF Value by which missing values in the input columns will be replaced. This + * is stored using DataFrame with input column names and the corresponding surrogates. 
*/ @Experimental class ImputerModel private[ml]( override val uid: String, - val surrogate: Double) + val surrogateDF: DataFrame) extends Model[ImputerModel] with ImputerParams with MLWritable { import ImputerModel._ @@ -167,8 +170,9 @@ class ImputerModel private[ml]( transformSchema(dataset.schema, logging = true) val inputType = dataset.schema($(inputCol)).dataType val ic = col($(inputCol)) - dataset.withColumn($(outputCol), when(ic.isNull, surrogate) - .when(ic === $(missingValue), surrogate) + val icsurrogate = surrogateDF.head().getDouble(0) + dataset.withColumn($(outputCol), when(ic.isNull, icsurrogate) + .when(ic === $(missingValue), icsurrogate) .otherwise(ic) .cast(inputType)) } @@ -178,7 +182,7 @@ class ImputerModel private[ml]( } override def copy(extra: ParamMap): ImputerModel = { - val copied = new ImputerModel(uid, surrogate) + val copied = new ImputerModel(uid, surrogateDF) copyValues(copied, extra).setParent(parent) } @@ -192,13 +196,10 @@ object ImputerModel extends MLReadable[ImputerModel] { private[ImputerModel] class ImputerModelWriter(instance: ImputerModel) extends MLWriter { - private case class Data(surrogate: Double) - override protected def saveImpl(path: String): Unit = { DefaultParamsWriter.saveMetadata(instance, path, sc) - val data = new Data(instance.surrogate) val dataPath = new Path(path, "data").toString - sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) + instance.surrogateDF.repartition(1).write.parquet(dataPath) } } @@ -209,10 +210,8 @@ object ImputerModel extends MLReadable[ImputerModel] { override def load(path: String): ImputerModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString - val Row(surrogate: Double) = sqlContext.read.parquet(dataPath) - .select("surrogate") - .head() - val model = new ImputerModel(metadata.uid, surrogate) + val surrogateDF = sqlContext.read.parquet(dataPath) + val model = new ImputerModel(metadata.uid, surrogateDF) DefaultParamsReader.getAndSetParams(model, metadata) model } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 292595d29fb34..a53047f33ad7a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -20,7 +20,7 @@ import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ -import org.apache.spark.sql.Row +import org.apache.spark.sql.{DataFrame, Row} class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { @@ -32,14 +32,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (3, 4.0, 4.0, 4.0), (4, Double.NaN, 2.25, 3.0) )).toDF("id", "value", "expected_mean", "expected_median") - Seq("mean", "median").foreach { strategy => - val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) - val model = imputer.fit(df) - model.transform(df).select("expected_" + strategy, "out").collect().foreach { - case Row(exp: Double, out: Double) => - assert(exp ~== out absTol 1e-5, s"Imputed values differ. 
Expected: $exp, actual: $out")
- }
- }
+ val imputer = new Imputer().setInputCol("value").setOutputCol("out")
+ ImputerSuite.iterateStrategyTest(imputer, df)
 }
 test("Imputer should handle NaNs when computing surrogate value, if missingValue is not NaN") {
 @@ -49,16 +43,9 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
 (2, Double.NaN, Double.NaN, Double.NaN),
 (3, -1.0, 2.0, 3.0)
 )).toDF("id", "value", "expected_mean", "expected_median")
- Seq("mean", "median").foreach { strategy =>
- val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy)
- .setMissingValue(-1.0)
- val model = imputer.fit(df)
- model.transform(df).select("expected_" + strategy, "out").collect().foreach {
- case Row(exp: Double, out: Double) =>
- assert((exp.isNaN && out.isNaN) || (exp ~== out absTol 1e-5),
- s"Imputed values differ. Expected: $exp, actual: $out")
- }
- }
+ val imputer = new Imputer().setInputCol("value").setOutputCol("out")
+ .setMissingValue(-1.0)
+ ImputerSuite.iterateStrategyTest(imputer, df)
 }
 test("Imputer for Float with missing Value -1.0") {
 @@ -69,16 +56,9 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
 (3, 10.0F, 10.0F, 10.0F),
 (4, -1.0F, 6.0F, 3.0F)
 )).toDF("id", "value", "expected_mean", "expected_median")
-
- Seq("mean", "median").foreach { strategy =>
- val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy)
- .setMissingValue(-1)
- val model = imputer.fit(df)
- model.transform(df).select("expected_" + strategy, "out").collect().foreach {
- case Row(exp: Float, out: Float) =>
- assert(exp == out, s"Imputed values differ. Expected: $exp, actual: $out")
- }
- }
+ val imputer = new Imputer().setInputCol("value").setOutputCol("out")
+ .setMissingValue(-1)
+ ImputerSuite.iterateStrategyTest(imputer, df)
 }
 test("Imputer should impute null as well as 'missingValue'") {
 @@ -90,15 +70,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
 (4, -1.0, 8.0, 10.0)
 )).toDF("id", "value", "expected_mean", "expected_median")
 val df2 = df.selectExpr("*", "IF(value=-1.0, null, value) as nullable_value")
- Seq("mean", "median").foreach { strategy =>
- val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out")
- .setStrategy(strategy)
- val model = imputer.fit(df2)
- model.transform(df2).select("expected_" + strategy, "out").collect().foreach {
- case Row(exp: Double, out: Double) =>
- assert(exp ~== out absTol 1e-5, s"Imputed values differ. Expected: $exp, actual: $out")
+ val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out")
+ ImputerSuite.iterateStrategyTest(imputer, df2)
 }
 @@ -125,12 +98,38 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
 }
 test("ImputerModel read/write") {
+ val spark = this.spark
+ import spark.implicits._
+ val surrogateDF = Seq(1.234).toDF("myInputCol")
+
 val instance = new ImputerModel(
- "myImputer", 1.234)
+ "myImputer", surrogateDF)
 .setInputCol("myInputCol")
 .setOutputCol("myOutputCol")
 val newInstance = testDefaultReadWrite(instance)
- assert(newInstance.surrogate === instance.surrogate)
+ assert(newInstance.surrogateDF.collect() === instance.surrogateDF.collect())
 }
 }
+
+object ImputerSuite {
+
+ /**
+ * Fits the given Imputer on the DataFrame with each supported strategy ("mean", "median")
+ * and verifies the imputed output column against the corresponding expected column.
+ * @param imputer the Imputer instance under test
+ * @param df DataFrame with columns "id", "value", "expected_mean", "expected_median" + */ + def iterateStrategyTest(imputer: Imputer, df: DataFrame): Unit = { + Seq("mean", "median").foreach { strategy => + imputer.setStrategy(strategy) + val model = imputer.fit(df) + model.transform(df).select("expected_" + strategy, "out").collect().foreach { + case Row(exp: Float, out: Float) => + assert((exp.isNaN && out.isNaN) || (exp == out), + s"Imputed values differ. Expected: $exp, actual: $out") + case Row(exp: Double, out: Double) => + assert((exp.isNaN && out.isNaN) || (exp ~== out absTol 1e-5), + s"Imputed values differ. Expected: $exp, actual: $out") + } + } + } +} From e86d9198c65c3b289b091150b52708deda37f090 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 21 Feb 2017 23:46:17 -0800 Subject: [PATCH 22/25] add multi column support --- .../org/apache/spark/ml/feature/Imputer.scala | 93 +++++++++++++------ .../spark/ml/feature/ImputerSuite.scala | 21 +++-- 2 files changed, 75 insertions(+), 39 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index c39b8b243d0fb..4a9f63810e088 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -23,16 +23,16 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.shared.{HasInputCols, HasOutputCol} import org.apache.spark.ml.util._ -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ /** * Params for [[Imputer]] and [[ImputerModel]]. */ -private[feature] trait ImputerParams extends Params with HasInputCol with HasOutputCol { +private[feature] trait ImputerParams extends Params with HasInputCols with HasOutputCol { /** * The imputation strategy. @@ -63,11 +63,32 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** @group getParam */ def getMissingValue: Double = $(missingValue) + /** + * Param for output column names. + * @group param + */ + final val outputCols: StringArrayParam = new StringArrayParam(this, "outputCols", + "output column names") + + /** @group getParam */ + final def getOutputCols: Array[String] = $(outputCols) + /** Validates and transforms the input schema. 
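   * For example (an illustrative case; "f1"/"f2" are hypothetical column names): with
   * inputCols = Array("f1", "f2") and outputCols = Array("f1_out", "f2_out"), an input schema
   * containing (f1: Double, f2: Float) is checked and extended with (f1_out: Double,
   * f2_out: Float), each output column keeping the type of its input column.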
*/ protected def validateAndTransformSchema(schema: StructType): StructType = { - val inputType = schema($(inputCol)).dataType - SchemaUtils.checkColumnTypes(schema, $(inputCol), Seq(DoubleType, FloatType)) - SchemaUtils.appendColumn(schema, $(outputCol), inputType) + require($(inputCols).length == $(outputCols).length, "inputCols and outputCols should have" + + "the same length") + val localInputCols = $(inputCols) + val localOutputCols = $(outputCols) + var outputSchema = schema + + $(inputCols).indices.foreach { i => + val inputCol = localInputCols(i) + val outputCol = localOutputCols(i) + val inputType = schema(inputCol).dataType + SchemaUtils.checkColumnTypes(schema, inputCol, Seq(DoubleType, FloatType)) + outputSchema = SchemaUtils.appendColumn(outputSchema, outputCol, inputType) + } + outputSchema } } @@ -90,11 +111,11 @@ class Imputer @Since("2.1.0")(override val uid: String) /** @group setParam */ @Since("2.1.0") - def setInputCol(value: String): this.type = set(inputCol, value) + def setInputCols(value: Array[String]): this.type = set(inputCols, value) /** @group setParam */ @Since("2.1.0") - def setOutputCol(value: String): this.type = set(outputCol, value) + def setOutputCols(value: Array[String]): this.type = set(outputCols, value) /** * Imputation strategy. Available options are ["mean", "median"]. @@ -111,20 +132,24 @@ class Imputer @Since("2.1.0")(override val uid: String) override def fit(dataset: Dataset[_]): ImputerModel = { transformSchema(dataset.schema, logging = true) - val ic = col($(inputCol)) - val filtered = dataset.select(ic.cast(DoubleType)) - .filter(ic.isNotNull && ic =!= $(missingValue)) - .filter(!ic.isNaN) - if(filtered.count() == 0) { - throw new SparkException(s"surrogate cannot be computed. " + - s"All the values in ${$(inputCol)} are Null, Nan or missingValue ($missingValue)") - } - val surrogate = $(strategy) match { - case "mean" => filtered.select(avg($(inputCol))).first().getDouble(0) - case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0) + val surrogates = $(inputCols).map { inputCol => + val ic = col(inputCol) + val filtered = dataset.select(ic.cast(DoubleType)) + .filter(ic.isNotNull && ic =!= $(missingValue)) + .filter(!ic.isNaN) + if(filtered.rdd.isEmpty()) { + throw new SparkException(s"surrogate cannot be computed. 
" + + s"All the values in ${inputCol} are Null, Nan or missingValue ($missingValue)") + } + val surrogate = $(strategy) match { + case "mean" => filtered.select(avg(inputCol)).first().getDouble(0) + case "median" => filtered.stat.approxQuantile(inputCol, Array(0.5), 0.001)(0) + } + surrogate.asInstanceOf[Double] } + import dataset.sparkSession.implicits._ - val surrogateDF = Seq(surrogate.asInstanceOf[Double]).toDF($(inputCol)) + val surrogateDF = Seq(surrogates).toDF("surrogates") copyValues(new ImputerModel(uid, surrogateDF).setParent(this)) } @@ -161,20 +186,30 @@ class ImputerModel private[ml]( import ImputerModel._ /** @group setParam */ - def setInputCol(value: String): this.type = set(inputCol, value) + def setInputCols(value: Array[String]): this.type = set(inputCols, value) /** @group setParam */ - def setOutputCol(value: String): this.type = set(outputCol, value) + def setOutputCols(value: Array[String]): this.type = set(outputCols, value) override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) - val inputType = dataset.schema($(inputCol)).dataType - val ic = col($(inputCol)) - val icsurrogate = surrogateDF.head().getDouble(0) - dataset.withColumn($(outputCol), when(ic.isNull, icsurrogate) - .when(ic === $(missingValue), icsurrogate) - .otherwise(ic) - .cast(inputType)) + val localInputCols = $(inputCols) + val localOutputCols = $(outputCols) + var outputDF = dataset + val surrogates = surrogateDF.head().getSeq[Double](0) + + $(inputCols).indices.foreach { i => + val inputCol = localInputCols(i) + val outputCol = localOutputCols(i) + val inputType = dataset.schema(inputCol).dataType + val ic = col(inputCol) + val icSurrogate = surrogates(i) + outputDF = outputDF.withColumn(outputCol, when(ic.isNull, icSurrogate) + .when(ic === $(missingValue), icSurrogate) + .otherwise(ic) + .cast(inputType)) + } + outputDF.toDF() } override def transformSchema(schema: StructType): StructType = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index a53047f33ad7a..5cabd7f204cee 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -30,9 +30,9 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (1, 1.0, 1.0, 1.0), (2, 3.0, 3.0, 3.0), (3, 4.0, 4.0, 4.0), - (4, Double.NaN, 2.25, 3.0) + (4, Double.NaN, 2.25, 1.0) )).toDF("id", "value", "expected_mean", "expected_median") - val imputer = new Imputer().setInputCol("value").setOutputCol("out") + val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) ImputerSuite.iterateStrategyTest(imputer, df) } @@ -43,7 +43,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, Double.NaN, Double.NaN, Double.NaN), (3, -1.0, 2.0, 3.0) )).toDF("id", "value", "expected_mean", "expected_median") - val imputer = new Imputer().setInputCol("value").setOutputCol("out") + val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) .setMissingValue(-1.0) ImputerSuite.iterateStrategyTest(imputer, df) } @@ -56,7 +56,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (3, 10.0F, 10.0F, 10.0F), (4, -1.0F, 6.0F, 3.0F) )).toDF("id", "value", "expected_mean", "expected_median") - val imputer = new Imputer().setInputCol("value").setOutputCol("out") + val imputer = new 
Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) .setMissingValue(-1) ImputerSuite.iterateStrategyTest(imputer, df) } @@ -70,7 +70,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (4, -1.0, 8.0, 10.0) )).toDF("id", "value", "expected_mean", "expected_median") val df2 = df.selectExpr("*", "IF(value=-1.0, null, value) as nullable_value") - val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out") + val imputer = new Imputer().setInputCols(Array("nullable_value")).setOutputCols(Array("out")) ImputerSuite.iterateStrategyTest(imputer, df2) } @@ -82,7 +82,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, Double.NaN, Double.NaN, Double.NaN) )).toDF("id", "value", "expected_mean", "expected_median") Seq("mean", "median").foreach { strategy => - val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) + .setStrategy(strategy) intercept[SparkException] { val model = imputer.fit(df) } @@ -91,8 +92,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default test("Imputer read/write") { val t = new Imputer() - .setInputCol("myInputCol") - .setOutputCol("myOutputCol") + .setInputCols(Array("myInputCol")) + .setOutputCols(Array("myOutputCol")) .setMissingValue(-1.0) testDefaultReadWrite(t) } @@ -104,8 +105,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default val instance = new ImputerModel( "myImputer", surrogateDF) - .setInputCol("myInputCol") - .setOutputCol("myOutputCol") + .setInputCols(Array("myInputCol")) + .setOutputCols(Array("myOutputCol")) val newInstance = testDefaultReadWrite(instance) assert(newInstance.surrogateDF.collect() === instance.surrogateDF.collect()) } From 41d91b9ef855a611016c9a9613942e578ff599dd Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Fri, 3 Mar 2017 14:49:22 -0800 Subject: [PATCH 23/25] change surrogateDF format and add ut for multi-columns --- .../org/apache/spark/ml/feature/Imputer.scala | 118 +++++++++--------- .../spark/ml/feature/ImputerSuite.scala | 76 +++++++---- 2 files changed, 112 insertions(+), 82 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 4a9f63810e088..ec8920993921b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -23,7 +23,7 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared.{HasInputCols, HasOutputCol} +import org.apache.spark.ml.param.shared.HasInputCols import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ @@ -32,20 +32,21 @@ import org.apache.spark.sql.types._ /** * Params for [[Imputer]] and [[ImputerModel]]. */ -private[feature] trait ImputerParams extends Params with HasInputCols with HasOutputCol { +private[feature] trait ImputerParams extends Params with HasInputCols { /** * The imputation strategy. * If "mean", then replace missing values using the mean value of the feature. - * If "median", then replace missing values using the approximate median value of the feature. 
+ * If "median", then replace missing values using the approximate median value of the + * feature (relative error less than 0.001). * Default: mean * * @group param */ - final val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + - "If mean, then replace missing values using the mean value of the feature. " + - "If median, then replace missing values using the median value of the feature.", - ParamValidators.inArray[String](Imputer.supportedStrategyNames.toArray)) + final val strategy: Param[String] = new Param(this, "strategy", s"strategy for imputation. " + + s"If ${Imputer.mean}, then replace missing values using the mean value of the feature. " + + s"If ${Imputer.median}, then replace missing values using the median value of the feature.", + ParamValidators.inArray[String](Array(Imputer.mean, Imputer.median))) /** @group getParam */ def getStrategy: String = $(strategy) @@ -63,7 +64,7 @@ private[feature] trait ImputerParams extends Params with HasInputCols with HasOu /** @group getParam */ def getMissingValue: Double = $(missingValue) - /** + /** * Param for output column names. * @group param */ @@ -75,20 +76,18 @@ private[feature] trait ImputerParams extends Params with HasInputCols with HasOu /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { - require($(inputCols).length == $(outputCols).length, "inputCols and outputCols should have" + - "the same length") - val localInputCols = $(inputCols) - val localOutputCols = $(outputCols) - var outputSchema = schema - - $(inputCols).indices.foreach { i => - val inputCol = localInputCols(i) - val outputCol = localOutputCols(i) - val inputType = schema(inputCol).dataType + require($(inputCols).length == $(inputCols).distinct.length, s"inputCols duplicates:" + + s" (${$(inputCols).mkString(", ")})") + require($(outputCols).length == $(outputCols).distinct.length, s"outputCols duplicates:" + + s" (${$(outputCols).mkString(", ")})") + require($(inputCols).length == $(outputCols).length, s"inputCols(${$(inputCols).length})" + + s" and outputCols(${$(outputCols).length}) should have the same length") + val outputFields = $(inputCols).zip($(outputCols)).map { case (inputCol, outputCol) => + val inputField = schema(inputCol) SchemaUtils.checkColumnTypes(schema, inputCol, Seq(DoubleType, FloatType)) - outputSchema = SchemaUtils.appendColumn(outputSchema, outputCol, inputType) + StructField(outputCol, inputField.dataType, inputField.nullable) } - outputSchema + StructType(schema ++ outputFields) } } @@ -103,53 +102,56 @@ private[feature] trait ImputerParams extends Params with HasInputCols with HasOu * All Null values in the input column are treated as missing, and so are also imputed. */ @Experimental -class Imputer @Since("2.1.0")(override val uid: String) +class Imputer @Since("2.2.0")(override val uid: String) extends Estimator[ImputerModel] with ImputerParams with DefaultParamsWritable { - @Since("2.1.0") + @Since("2.2.0") def this() = this(Identifiable.randomUID("imputer")) /** @group setParam */ - @Since("2.1.0") + @Since("2.2.0") def setInputCols(value: Array[String]): this.type = set(inputCols, value) /** @group setParam */ - @Since("2.1.0") + @Since("2.2.0") def setOutputCols(value: Array[String]): this.type = set(outputCols, value) /** * Imputation strategy. Available options are ["mean", "median"]. 
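   * For example, strategy = "median" replaces each missing entry with the approximate median
   * of its column, computed via approxQuantile with a relative error of 0.001 (see fit below).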
* @group setParam */ - @Since("2.1.0") + @Since("2.2.0") def setStrategy(value: String): this.type = set(strategy, value) /** @group setParam */ - @Since("2.1.0") + @Since("2.2.0") def setMissingValue(value: Double): this.type = set(missingValue, value) - setDefault(strategy -> "mean", missingValue -> Double.NaN) + import org.apache.spark.ml.feature.Imputer._ + setDefault(strategy -> mean, missingValue -> Double.NaN) override def fit(dataset: Dataset[_]): ImputerModel = { transformSchema(dataset.schema, logging = true) + val spark = dataset.sparkSession + import spark.implicits._ val surrogates = $(inputCols).map { inputCol => val ic = col(inputCol) val filtered = dataset.select(ic.cast(DoubleType)) - .filter(ic.isNotNull && ic =!= $(missingValue)) - .filter(!ic.isNaN) + .filter(ic.isNotNull && ic =!= $(missingValue) && !ic.isNaN) if(filtered.rdd.isEmpty()) { throw new SparkException(s"surrogate cannot be computed. " + - s"All the values in ${inputCol} are Null, Nan or missingValue ($missingValue)") + s"All the values in $inputCol are Null, Nan or missingValue ($missingValue)") } val surrogate = $(strategy) match { - case "mean" => filtered.select(avg(inputCol)).first().getDouble(0) - case "median" => filtered.stat.approxQuantile(inputCol, Array(0.5), 0.001)(0) + case Imputer.mean => filtered.select(avg(inputCol)).as[Double].first() + case Imputer.median => filtered.stat.approxQuantile(inputCol, Array(0.5), 0.001).head } - surrogate.asInstanceOf[Double] + surrogate } - import dataset.sparkSession.implicits._ - val surrogateDF = Seq(surrogates).toDF("surrogates") + val rows = spark.sparkContext.parallelize(Seq(Row.fromSeq(surrogates))) + val schema = StructType($(inputCols).map(col => StructField(col, DoubleType, nullable = false))) + val surrogateDF = spark.createDataFrame(rows, schema) copyValues(new ImputerModel(uid, surrogateDF).setParent(this)) } @@ -160,13 +162,14 @@ class Imputer @Since("2.1.0")(override val uid: String) override def copy(extra: ParamMap): Imputer = defaultCopy(extra) } -@Since("2.1.0") +@Since("2.2.0") object Imputer extends DefaultParamsReadable[Imputer] { - /** Set of strategy names that Imputer currently supports. */ - private[ml] val supportedStrategyNames = Set("mean", "median") + /** strategy names that Imputer currently supports. */ + private[ml] val mean = "mean" + private[ml] val median = "median" - @Since("2.1.0") + @Since("2.2.0") override def load(path: String): Imputer = super.load(path) } @@ -174,8 +177,8 @@ object Imputer extends DefaultParamsReadable[Imputer] { * :: Experimental :: * Model fitted by [[Imputer]]. * - * @param surrogateDF Value by which missing values in the input columns will be replaced. This - * is stored using DataFrame with input column names and the corresponding surrogates. + * @param surrogateDF a DataFrame contains inputCols and their corresponding surrogates, which are + * used to replace the missing values in the input DataFrame. 
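+ *                   For example (an illustrative case), with inputCols = Array("f1", "f2")
+ *                   and strategy "mean", surrogateDF is a single-row DataFrame with schema
+ *                   (f1: Double, f2: Double) holding the two column means.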
*/ @Experimental class ImputerModel private[ml]( @@ -193,21 +196,18 @@ class ImputerModel private[ml]( override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) - val localInputCols = $(inputCols) - val localOutputCols = $(outputCols) var outputDF = dataset - val surrogates = surrogateDF.head().getSeq[Double](0) - - $(inputCols).indices.foreach { i => - val inputCol = localInputCols(i) - val outputCol = localOutputCols(i) - val inputType = dataset.schema(inputCol).dataType - val ic = col(inputCol) - val icSurrogate = surrogates(i) - outputDF = outputDF.withColumn(outputCol, when(ic.isNull, icSurrogate) - .when(ic === $(missingValue), icSurrogate) - .otherwise(ic) - .cast(inputType)) + val surrogates = surrogateDF.select($(inputCols).head, $(inputCols).tail: _*).head().toSeq + + $(inputCols).zip($(outputCols)).zip(surrogates).foreach { + case ((inputCol, outputCol), surrogate) => + val inputType = dataset.schema(inputCol).dataType + val ic = col(inputCol) + outputDF = outputDF.withColumn(outputCol, + when(ic.isNull, surrogate) + .when(ic === $(missingValue), surrogate) + .otherwise(ic) + .cast(inputType)) } outputDF.toDF() } @@ -221,12 +221,12 @@ class ImputerModel private[ml]( copyValues(copied, extra).setParent(parent) } - @Since("2.1.0") + @Since("2.2.0") override def write: MLWriter = new ImputerModelWriter(this) } -@Since("2.1.0") +@Since("2.2.0") object ImputerModel extends MLReadable[ImputerModel] { private[ImputerModel] class ImputerModelWriter(instance: ImputerModel) extends MLWriter { @@ -252,9 +252,9 @@ object ImputerModel extends MLReadable[ImputerModel] { } } - @Since("2.1.0") + @Since("2.2.0") override def read: MLReader[ImputerModel] = new ImputerReader - @Since("2.1.0") + @Since("2.2.0") override def load(path: String): ImputerModel = super.load(path) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 5cabd7f204cee..9e6392aa65fac 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -26,13 +26,15 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default test("Imputer for Double with default missing Value NaN") { val df = spark.createDataFrame( Seq( - (0, 1.0, 1.0, 1.0), - (1, 1.0, 1.0, 1.0), - (2, 3.0, 3.0, 3.0), - (3, 4.0, 4.0, 4.0), - (4, Double.NaN, 2.25, 1.0) - )).toDF("id", "value", "expected_mean", "expected_median") - val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) + (0, 1.0, 4.0, 1.0, 1.0, 4.0, 4.0), + (1, 11.0, 12.0, 11.0, 11.0, 12.0, 12.0), + (2, 3.0, Double.NaN, 3.0, 3.0, 10.0, 12.0), + (3, Double.NaN, 14.0, 5.0, 3.0, 14.0, 14.0) + )).toDF("id", "value1", "value2", "expected_mean_value1", "expected_median_value1", + "expected_mean_value2", "expected_median_value2") + val imputer = new Imputer() + .setInputCols(Array("value1", "value2")) + .setOutputCols(Array("out1", "out2")) ImputerSuite.iterateStrategyTest(imputer, df) } @@ -42,7 +44,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (1, 3.0, 3.0, 3.0), (2, Double.NaN, Double.NaN, Double.NaN), (3, -1.0, 2.0, 3.0) - )).toDF("id", "value", "expected_mean", "expected_median") + )).toDF("id", "value", "expected_mean_value", "expected_median_value") val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) .setMissingValue(-1.0) 
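    // Note: with missingValue = -1.0, NaN is excluded from the surrogate computation in fit(),
    // but transform() leaves it in place, which is why row 2 above still expects NaN.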
ImputerSuite.iterateStrategyTest(imputer, df) @@ -55,32 +57,31 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 10.0F, 10.0F, 10.0F), (3, 10.0F, 10.0F, 10.0F), (4, -1.0F, 6.0F, 3.0F) - )).toDF("id", "value", "expected_mean", "expected_median") + )).toDF("id", "value", "expected_mean_value", "expected_median_value") val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) .setMissingValue(-1) ImputerSuite.iterateStrategyTest(imputer, df) } test("Imputer should impute null as well as 'missingValue'") { - val df = spark.createDataFrame( Seq( + val rawDf = spark.createDataFrame( Seq( (0, 4.0, 4.0, 4.0), (1, 10.0, 10.0, 10.0), (2, 10.0, 10.0, 10.0), (3, Double.NaN, 8.0, 10.0), (4, -1.0, 8.0, 10.0) - )).toDF("id", "value", "expected_mean", "expected_median") - val df2 = df.selectExpr("*", "IF(value=-1.0, null, value) as nullable_value") - val imputer = new Imputer().setInputCols(Array("nullable_value")).setOutputCols(Array("out")) - ImputerSuite.iterateStrategyTest(imputer, df2) + )).toDF("id", "rawValue", "expected_mean_value", "expected_median_value") + val df = rawDf.selectExpr("*", "IF(rawValue=-1.0, null, rawValue) as value") + val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) + ImputerSuite.iterateStrategyTest(imputer, df) } - test("Imputer throws exception when surrogate cannot be computed") { val df = spark.createDataFrame( Seq( (0, Double.NaN, 1.0, 1.0), (1, Double.NaN, 3.0, 3.0), (2, Double.NaN, Double.NaN, Double.NaN) - )).toDF("id", "value", "expected_mean", "expected_median") + )).toDF("id", "value", "expected_mean_value", "expected_median_value") Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) .setStrategy(strategy) @@ -90,6 +91,30 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } + test("Imputer throws exception when inputCols does not match outputCols") { + val df = spark.createDataFrame( Seq( + (0, 1.0, 1.0, 1.0), + (1, Double.NaN, 3.0, 3.0), + (2, Double.NaN, Double.NaN, Double.NaN) + )).toDF("id", "value1", "value2", "value3") + Seq("mean", "median").foreach { strategy => + // inputCols and outCols length different + val imputer = new Imputer() + .setInputCols(Array("value1", "value2")) + .setOutputCols(Array("out1")) + .setStrategy(strategy) + intercept[IllegalArgumentException] { + val model = imputer.fit(df) + } + // duplicate name in inputCols + imputer.setInputCols(Array("value1", "value1")).setOutputCols(Array("out1, out2")) + intercept[IllegalArgumentException] { + val model = imputer.fit(df) + } + + } + } + test("Imputer read/write") { val t = new Imputer() .setInputCols(Array("myInputCol")) @@ -120,16 +145,21 @@ object ImputerSuite{ * @param df DataFrame with columns "id", "value", "expected_mean", "expected_median" */ def iterateStrategyTest(imputer: Imputer, df: DataFrame): Unit = { + val inputCols = imputer.getInputCols + Seq("mean", "median").foreach { strategy => imputer.setStrategy(strategy) val model = imputer.fit(df) - model.transform(df).select("expected_" + strategy, "out").collect().foreach { - case Row(exp: Float, out: Float) => - assert((exp.isNaN && out.isNaN) || (exp == out), - s"Imputed values differ. Expected: $exp, actual: $out") - case Row(exp: Double, out: Double) => - assert((exp.isNaN && out.isNaN) || (exp ~== out absTol 1e-5), - s"Imputed values differ. 
Expected: $exp, actual: $out") + val resultDF = model.transform(df) + imputer.getInputCols.zip(imputer.getOutputCols).foreach { case (inputCol, outputCol) => + resultDF.select(s"expected_${strategy}_$inputCol", outputCol).collect().foreach { + case Row(exp: Float, out: Float) => + assert((exp.isNaN && out.isNaN) || (exp == out), + s"Imputed values differ. Expected: $exp, actual: $out") + case Row(exp: Double, out: Double) => + assert((exp.isNaN && out.isNaN) || (exp ~== out absTol 1e-5), + s"Imputed values differ. Expected: $exp, actual: $out") + } } } } From e378db5944d7d8bed0ebadc0573a3ea03fe387f0 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Mon, 6 Mar 2017 13:15:22 -0800 Subject: [PATCH 24/25] unit test refine and comments update --- .../org/apache/spark/ml/feature/Imputer.scala | 19 ++++--- .../spark/ml/feature/ImputerSuite.scala | 49 +++++++++++++------ 2 files changed, 43 insertions(+), 25 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index ec8920993921b..6d3121c870721 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -37,8 +37,7 @@ private[feature] trait ImputerParams extends Params with HasInputCols { /** * The imputation strategy. * If "mean", then replace missing values using the mean value of the feature. - * If "median", then replace missing values using the approximate median value of the - * feature (relative error less than 0.001). + * If "median", then replace missing values using the approximate median value of the feature. * Default: mean * * @group param @@ -76,10 +75,10 @@ private[feature] trait ImputerParams extends Params with HasInputCols { /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { - require($(inputCols).length == $(inputCols).distinct.length, s"inputCols duplicates:" + - s" (${$(inputCols).mkString(", ")})") - require($(outputCols).length == $(outputCols).distinct.length, s"outputCols duplicates:" + - s" (${$(outputCols).mkString(", ")})") + require($(inputCols).length == $(inputCols).distinct.length, s"inputCols contains" + + s" duplicates: (${$(inputCols).mkString(", ")})") + require($(outputCols).length == $(outputCols).distinct.length, s"outputCols contains" + + s" duplicates: (${$(outputCols).mkString(", ")})") require($(inputCols).length == $(outputCols).length, s"inputCols(${$(inputCols).length})" + s" and outputCols(${$(outputCols).length}) should have the same length") val outputFields = $(inputCols).zip($(outputCols)).map { case (inputCol, outputCol) => @@ -99,7 +98,8 @@ private[feature] trait ImputerParams extends Params with HasInputCols { * (SPARK-15041) and possibly creates incorrect values for a categorical feature. * * Note that the mean/median value is computed after filtering out missing values. - * All Null values in the input column are treated as missing, and so are also imputed. + * All Null values in the input column are treated as missing, and so are also imputed. For + * computing median, DataFrameStatFunctions.approxQuantile is used with a relative error of 0.001. 
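+ *
+ * An illustrative usage sketch (assuming a DataFrame df with numeric columns "a" and "b"):
+ * {{{
+ *   val model = new Imputer()
+ *     .setInputCols(Array("a", "b"))
+ *     .setOutputCols(Array("a_out", "b_out"))
+ *     .setStrategy("median")
+ *     .fit(df)
+ *   val imputed = model.transform(df)
+ * }}}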
 */
@Experimental
class Imputer @Since("2.2.0")(override val uid: String)
  extends Estimator[ImputerModel] with ImputerParams with DefaultParamsWritable {
@@ -127,9 +127,9 @@ class Imputer @Since("2.2.0")(override val uid: String)
  @Since("2.2.0")
  def setMissingValue(value: Double): this.type = set(missingValue, value)

-  import org.apache.spark.ml.feature.Imputer._
-  setDefault(strategy -> mean, missingValue -> Double.NaN)
+  setDefault(strategy -> Imputer.mean, missingValue -> Double.NaN)

  override def fit(dataset: Dataset[_]): ImputerModel = {
    transformSchema(dataset.schema, logging = true)
@@ -196,7 +196,7 @@ class ImputerModel private[ml](
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    var outputDF = dataset
-    val surrogates = surrogateDF.select($(inputCols).head, $(inputCols).tail: _*).head().toSeq
+    val surrogates = surrogateDF.select($(inputCols).map(col): _*).head().toSeq

    $(inputCols).zip($(outputCols)).zip(surrogates).foreach {
      case ((inputCol, outputCol), surrogate) =>
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
index 9e6392aa65fac..ee2ba73fa96d5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
@@ -85,33 +85,51 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
    Seq("mean", "median").foreach { strategy =>
      val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out"))
        .setStrategy(strategy)
-      intercept[SparkException] {
-        val model = imputer.fit(df)
+      withClue("Imputer should fail when all the values are invalid") {
+        val e: SparkException = intercept[SparkException] {
+          val model = imputer.fit(df)
+        }
+        assert(e.getMessage.contains("surrogate cannot be computed"))
      }
    }
  }

-  test("Imputer throws exception when inputCols does not match outputCols") {
+  test("Imputer input & output column validation") {
    val df = spark.createDataFrame( Seq(
      (0, 1.0, 1.0, 1.0),
      (1, Double.NaN, 3.0, 3.0),
      (2, Double.NaN, Double.NaN, Double.NaN)
    )).toDF("id", "value1", "value2", "value3")
    Seq("mean", "median").foreach { strategy =>
+      withClue("Imputer should fail if inputCols and outputCols are different length") {
+        val e: IllegalArgumentException = intercept[IllegalArgumentException] {
+          val imputer = new Imputer().setStrategy(strategy)
+            .setInputCols(Array("value1", "value2"))
+            .setOutputCols(Array("out1"))
+          val model = imputer.fit(df)
+        }
+        assert(e.getMessage.contains("should have the same length"))
+      }

+      withClue("Imputer should fail if inputCols contains duplicates") {
+        val e: IllegalArgumentException = intercept[IllegalArgumentException] {
+          val imputer = new Imputer().setStrategy(strategy)
+            .setInputCols(Array("value1", "value1"))
+            .setOutputCols(Array("out1", "out2"))
+          val model = imputer.fit(df)
+        }
+        assert(e.getMessage.contains("inputCols contains duplicates"))
+      }

+      withClue("Imputer should fail if outputCols contains duplicates") {
+        val e: IllegalArgumentException = intercept[IllegalArgumentException] {
+          val imputer = new
Imputer().setStrategy(strategy)
+            .setInputCols(Array("value1", "value2"))
+            .setOutputCols(Array("out1", "out1"))
+          val model = imputer.fit(df)
+        }
+        assert(e.getMessage.contains("outputCols contains duplicates"))
+      }
    }
  }

@@ -133,12 +151,13 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
      .setInputCols(Array("myInputCol"))
      .setOutputCols(Array("myOutputCol"))
    val newInstance = testDefaultReadWrite(instance)
+    assert(newInstance.surrogateDF.columns === instance.surrogateDF.columns)
    assert(newInstance.surrogateDF.collect() === instance.surrogateDF.collect())
  }
}

-object ImputerSuite{
+object ImputerSuite {

  /**
   * Imputation strategy. Available options are ["mean", "median"].

From c67afc11e1fee58b65da67bf3e25e5245f72280d Mon Sep 17 00:00:00 2001
From: Yuhao Yang
Date: Wed, 8 Mar 2017 10:37:12 -0800
Subject: [PATCH 25/25] fix exception message

---
 .../src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
index 6d3121c870721..b1a802ee13fc4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
@@ -137,9 +137,9 @@ class Imputer @Since("2.2.0")(override val uid: String)
      val ic = col(inputCol)
      val filtered = dataset.select(ic.cast(DoubleType))
        .filter(ic.isNotNull && ic =!= $(missingValue) && !ic.isNaN)
-      if(filtered.rdd.isEmpty()) {
+      if(filtered.take(1).length == 0) {
        throw new SparkException(s"surrogate cannot be computed. " +
-          s"All the values in $inputCol are Null, Nan or missingValue ($missingValue)")
+          s"All the values in $inputCol are null, NaN or missingValue(${$(missingValue)})")
      }
      val surrogate = $(strategy) match {
        case Imputer.mean => filtered.select(avg(inputCol)).as[Double].first()
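
To make the behavior the series converges on concrete, here is a minimal end-to-end sketch (illustrative only, not part of any patch; it assumes a SparkSession in scope named spark):

    import org.apache.spark.ml.feature.Imputer

    // One NaN in each column; with the default missingValue (NaN), both get imputed.
    val df = spark.createDataFrame(Seq(
      (1.0, Double.NaN),
      (2.0, 3.0),
      (Double.NaN, 5.0)
    )).toDF("a", "b")

    val model = new Imputer()
      .setInputCols(Array("a", "b"))
      .setOutputCols(Array("a_out", "b_out"))
      .setStrategy("mean") // surrogates: a -> (1.0 + 2.0) / 2 = 1.5, b -> (3.0 + 5.0) / 2 = 4.0
      .fit(df)

    // "a_out" carries 1.5 where "a" was NaN; "b_out" carries 4.0 where "b" was NaN.
    model.transform(df).show()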