Commit 6d7a675

actuaryzhang and Wayne Zhang authored and committed
[SPARK-20604][ML] Allow imputer to handle numeric types
## What changes were proposed in this pull request?

Imputer currently requires the input column to be Double or Float, but the logic should work on any numeric data type. Many practical problems have integer data types, and it can get very tedious to manually cast them to Double before calling the imputer. This transformer could be extended to handle all numeric types.

## How was this patch tested?

new test

Closes #17864 from actuaryzhang/imputer.

Lead-authored-by: actuaryzhang <[email protected]>
Co-authored-by: Wayne Zhang <[email protected]>
Signed-off-by: Sean Owen <[email protected]>
1 parent 660423d commit 6d7a675
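
As a quick illustration of what the change enables (not part of the commit; the column names and values are made up, and a running SparkSession named `spark` is assumed), an Imputer can now be fit directly on an IntegerType column without a manual cast to Double:

import org.apache.spark.ml.feature.Imputer

// Integer column with one missing (null) entry; the second column just documents
// the expected result for the reader.
val df = spark.createDataFrame(Seq[(Integer, Integer)](
  (1, 1),
  (2, 2),
  (4, 4),
  (null, 2)   // mean of (1, 2, 4) is 2.33..., truncated to 2 because the column is IntegerType
)).toDF("age", "expected_age")

// Before this patch, the "age" column had to be cast to DoubleType first.
val model = new Imputer()
  .setInputCols(Array("age"))
  .setOutputCols(Array("age_imputed"))
  .setStrategy("mean")
  .fit(df)

model.transform(df).show()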

2 files changed: +64, -3 lines

mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala

Lines changed: 7 additions & 3 deletions
@@ -73,7 +73,7 @@ private[feature] trait ImputerParams extends Params with HasInputCols with HasOu
       s" and outputCols(${$(outputCols).length}) should have the same length")
     val outputFields = $(inputCols).zip($(outputCols)).map { case (inputCol, outputCol) =>
       val inputField = schema(inputCol)
-      SchemaUtils.checkColumnTypes(schema, inputCol, Seq(DoubleType, FloatType))
+      SchemaUtils.checkNumericType(schema, inputCol)
       StructField(outputCol, inputField.dataType, inputField.nullable)
     }
     StructType(schema ++ outputFields)
@@ -84,9 +84,13 @@ private[feature] trait ImputerParams extends Params with HasInputCols with HasOu
  * :: Experimental ::
  * Imputation estimator for completing missing values, either using the mean or the median
  * of the columns in which the missing values are located. The input columns should be of
- * DoubleType or FloatType. Currently Imputer does not support categorical features
+ * numeric type. Currently Imputer does not support categorical features
  * (SPARK-15041) and possibly creates incorrect values for a categorical feature.
  *
+ * Note when an input column is integer, the imputed value is casted (truncated) to an integer type.
+ * For example, if the input column is IntegerType (1, 2, 4, null),
+ * the output will be IntegerType (1, 2, 4, 2) after mean imputation.
+ *
  * Note that the mean/median value is computed after filtering out missing values.
  * All Null values in the input columns are treated as missing, and so are also imputed. For
  * computing median, DataFrameStatFunctions.approxQuantile is used with a relative error of 0.001.
@@ -218,7 +222,7 @@ class ImputerModel private[ml] (
     val newCols = $(inputCols).zip($(outputCols)).zip(surrogates).map {
       case ((inputCol, outputCol), surrogate) =>
         val inputType = dataset.schema(inputCol).dataType
-        val ic = col(inputCol)
+        val ic = col(inputCol).cast(DoubleType)
         when(ic.isNull, surrogate)
           .when(ic === $(missingValue), surrogate)
           .otherwise(ic)
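
For readers skimming the transform change above, here is a hedged standalone sketch (not the committed code; it assumes `spark.implicits._` is in scope and uses illustrative values) of the column-expression pattern it relies on: the input column is cast to DoubleType so that nulls and the Double-valued missingValue param can be replaced by the surrogate uniformly, whatever the original numeric type was.

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType
import spark.implicits._

val surrogate = 5.0        // e.g. the mean computed over the non-missing values
val missingValue = -1.0    // the Imputer's missingValue param is a Double

val df = Seq(1, 11, 3, -1).toDF("value")   // IntegerType column; -1 marks "missing"

// Cast to Double, replace missing entries with the surrogate, keep the rest.
val ic = col("value").cast(DoubleType)
val imputed = when(ic.isNull, surrogate)
  .when(ic === missingValue, surrogate)
  .otherwise(ic)

df.withColumn("value_imputed", imputed).show()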

mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala

Lines changed: 57 additions & 0 deletions
@@ -20,6 +20,8 @@ import org.apache.spark.SparkException
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest}
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types._

 class ImputerSuite extends MLTest with DefaultReadWriteTest {

@@ -176,6 +178,48 @@ class ImputerSuite extends MLTest with DefaultReadWriteTest {
     assert(newInstance.surrogateDF.collect() === instance.surrogateDF.collect())
   }

+  test("Imputer for IntegerType with default missing value null") {
+
+    val df = spark.createDataFrame(Seq[(Integer, Integer, Integer)](
+      (1, 1, 1),
+      (11, 11, 11),
+      (3, 3, 3),
+      (null, 5, 3)
+    )).toDF("value1", "expected_mean_value1", "expected_median_value1")
+
+    val imputer = new Imputer()
+      .setInputCols(Array("value1"))
+      .setOutputCols(Array("out1"))
+
+    val types = Seq(IntegerType, LongType)
+    for (mType <- types) {
+      // cast all columns to desired data type for testing
+      val df2 = df.select(df.columns.map(c => col(c).cast(mType)): _*)
+      ImputerSuite.iterateStrategyTest(imputer, df2)
+    }
+  }
+
+  test("Imputer for IntegerType with missing value -1") {
+
+    val df = spark.createDataFrame(Seq[(Integer, Integer, Integer)](
+      (1, 1, 1),
+      (11, 11, 11),
+      (3, 3, 3),
+      (-1, 5, 3)
+    )).toDF("value1", "expected_mean_value1", "expected_median_value1")
+
+    val imputer = new Imputer()
+      .setInputCols(Array("value1"))
+      .setOutputCols(Array("out1"))
+      .setMissingValue(-1.0)
+
+    val types = Seq(IntegerType, LongType)
+    for (mType <- types) {
+      // cast all columns to desired data type for testing
+      val df2 = df.select(df.columns.map(c => col(c).cast(mType)): _*)
+      ImputerSuite.iterateStrategyTest(imputer, df2)
+    }
+  }
 }

 object ImputerSuite {
@@ -190,13 +234,26 @@ object ImputerSuite {
     val model = imputer.fit(df)
     val resultDF = model.transform(df)
     imputer.getInputCols.zip(imputer.getOutputCols).foreach { case (inputCol, outputCol) =>
+
+      // check dataType is consistent between input and output
+      val inputType = resultDF.schema(inputCol).dataType
+      val outputType = resultDF.schema(outputCol).dataType
+      assert(inputType == outputType, "Output type is not the same as input type.")
+
+      // check value
       resultDF.select(s"expected_${strategy}_$inputCol", outputCol).collect().foreach {
         case Row(exp: Float, out: Float) =>
           assert((exp.isNaN && out.isNaN) || (exp == out),
             s"Imputed values differ. Expected: $exp, actual: $out")
         case Row(exp: Double, out: Double) =>
           assert((exp.isNaN && out.isNaN) || (exp ~== out absTol 1e-5),
             s"Imputed values differ. Expected: $exp, actual: $out")
+        case Row(exp: Integer, out: Integer) =>
+          assert(exp == out,
+            s"Imputed values differ. Expected: $exp, actual: $out")
+        case Row(exp: Long, out: Long) =>
+          assert(exp == out,
+            s"Imputed values differ. Expected: $exp, actual: $out")
       }
     }
   }
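
One small idiom worth calling out in the new tests is how the whole fixture is retargeted at each numeric type before being handed to ImputerSuite.iterateStrategyTest. A standalone sketch of the same pattern (illustrative names, assuming `spark.implicits._` is in scope):

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{DataType, LongType}
import spark.implicits._

// Cast every column of a DataFrame to the given type in one select.
def castAllColumns(df: DataFrame, dt: DataType): DataFrame =
  df.select(df.columns.map(c => col(c).cast(dt)): _*)

val base = Seq((1, 1, 1), (11, 11, 11), (3, 3, 3))
  .toDF("value1", "expected_mean_value1", "expected_median_value1")

val asLong = castAllColumns(base, LongType)   // same data, every column now LongType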
