From 2999b268192e244bd7a520d62a0914e4742ee45d Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Mon, 29 Feb 2016 09:46:04 -0800 Subject: [PATCH 01/25] initial commit for Imputer --- .../org/apache/spark/ml/feature/Imputer.scala | 145 ++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala new file mode 100644 index 0000000000000..5eb8c49f2d8d6 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import org.apache.hadoop.fs.Path + +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param.{Param, ParamMap} +import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.util._ +import org.apache.spark.mllib.linalg.{Vector, VectorUDT} +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{StructField, StructType} + +/** + * :: Experimental :: + * + */ +@Experimental +class Imputer private[ml]( + override val uid: String) + extends Transformer with HasInputCol with HasOutputCol with MLWritable { + + import Imputer._ + + /** @group setParam */ + def setInputCol(value: String): this.type = set(inputCol, value) + + /** @group setParam */ + def setOutputCol(value: String): this.type = set(outputCol, value) + + val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation") + + /** @group getParam */ + def getStrategy: String = $(strategy) + + /** @group setParam */ + def setStrategy(value: String): this.type = set(strategy, value) + + + override def transform(dataset: DataFrame): DataFrame = { + + val reScale = udf { (vector: Vector) => + if (vector == null) { + val replacement = $(strategy) match { + case "mean" => + val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } + val summary = Statistics.colStats(input) + summary.mean + case "median" => + val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } + Imputer.getMedian(input) + case "most" => + val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } + val most = input.map(v => (v, 1)).reduceByKey(_ + _).sortBy(-_._2).take(1)(0)._1 + most + } + } + } + + dataset.withColumn($(outputCol), reScale(col($(inputCol)))) + } + + override def transformSchema(schema: StructType): StructType = { + validateParams() + val inputType = 
schema($(inputCol)).dataType + require(inputType.isInstanceOf[VectorUDT], + s"Input column ${$(inputCol)} must be a vector column") + require(!schema.fieldNames.contains($(outputCol)), + s"Output column ${$(outputCol)} already exists.") + val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) + StructType(outputFields) + } + + override def copy(extra: ParamMap): Imputer = { + val copied = new Imputer(uid) + copyValues(copied, extra) + } + + @Since("1.6.0") + override def write: MLWriter = new ImputerWriter(this) +} + +@Since("1.6.0") +object Imputer extends MLReadable[Imputer] { + + private def getMedian(input: RDD[Vector]): Vector = { + val summary = Statistics.colStats(input) + summary.mean + } + + private[MinMaxScalerModel] + class ImputerWriter(instance: Imputer) extends MLWriter { + + private case class Data(strategy: String) + + override protected def saveImpl(path: String): Unit = { + DefaultParamsWriter.saveMetadata(instance, path, sc) + val data = new Data(instance.getStrategy) + val dataPath = new Path(path, "data").toString + sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) + } + } + + private class ImputerReader extends MLReader[Imputer] { + + private val className = classOf[Imputer].getName + + override def load(path: String): Imputer = { + val metadata = DefaultParamsReader.loadMetadata(path, sc, className) + val dataPath = new Path(path, "data").toString + val Row(strategy: String) = sqlContext.read.parquet(dataPath) + .select("strategy") + .head() + val model = new Imputer(metadata.uid).setStrategy(strategy) + DefaultParamsReader.getAndSetParams(model, metadata) + model + } + } + + @Since("1.6.0") + override def read: MLReader[Imputer] = new ImputerReader + + @Since("1.6.0") + override def load(path: String): Imputer = super.load(path) +} From 8335cf21ebde164a22f3447000a1c468a69f39fc Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Mon, 29 Feb 2016 10:27:40 -0800 Subject: [PATCH 02/25] adjust mean and most --- .../org/apache/spark/ml/feature/Imputer.scala | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 5eb8c49f2d8d6..db426ca01d732 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -40,6 +40,8 @@ class Imputer private[ml]( override val uid: String) extends Transformer with HasInputCol with HasOutputCol with MLWritable { + def this() = this(Identifiable.randomUID("tokenizer")) + import Imputer._ /** @group setParam */ @@ -56,25 +58,33 @@ class Imputer private[ml]( /** @group setParam */ def setStrategy(value: String): this.type = set(strategy, value) + setDefault(strategy -> "mean") override def transform(dataset: DataFrame): DataFrame = { - val reScale = udf { (vector: Vector) => if (vector == null) { val replacement = $(strategy) match { case "mean" => - val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } + val input = dataset.select($(inputCol)).rdd.filter(r => !r.anyNull) + .map { case Row(v: Vector) => v } val summary = Statistics.colStats(input) summary.mean case "median" => - val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } - Imputer.getMedian(input) + val df = dataset.select($(inputCol)) + df.registerTempTable("medianTable") + val median = df.sqlContext + .sql(s"select percentile(${$(inputCol)}, 
0.5) from medianTable") + .head().getAs[Vector](0) + median case "most" => - val input = dataset.select($(inputCol)).rdd.map { case Row(v: Vector) => v } + val input = dataset.select($(inputCol)).rdd.filter(r => !r.anyNull) + .map { case Row(v: Vector) => v } val most = input.map(v => (v, 1)).reduceByKey(_ + _).sortBy(-_._2).take(1)(0)._1 most } + replacement } + else vector } dataset.withColumn($(outputCol), reScale(col($(inputCol)))) @@ -103,12 +113,7 @@ class Imputer private[ml]( @Since("1.6.0") object Imputer extends MLReadable[Imputer] { - private def getMedian(input: RDD[Vector]): Vector = { - val summary = Statistics.colStats(input) - summary.mean - } - - private[MinMaxScalerModel] + private[Imputer] class ImputerWriter(instance: Imputer) extends MLWriter { private case class Data(strategy: String) From b949be5746608ca3861df672ccd76d9af4257ae2 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 8 Mar 2016 18:19:32 -0800 Subject: [PATCH 03/25] refine code and add ut --- .../org/apache/spark/ml/feature/Imputer.scala | 278 +++++++++++++----- .../spark/ml/feature/ImputerSuite.scala | 101 +++++++ 2 files changed, 306 insertions(+), 73 deletions(-) create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index db426ca01d732..ea4b6995aa890 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -20,29 +20,80 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} -import org.apache.spark.ml.Transformer -import org.apache.spark.ml.param.{Param, ParamMap} +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, Params} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ -import org.apache.spark.mllib.linalg.{Vector, VectorUDT} -import org.apache.spark.mllib.stat.Statistics -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.mllib.linalg._ +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} + +/** + * Params for [[Imputer]] and [[ImputerModel]]. + */ +private[feature] trait ImputerParams extends Params with HasInputCol with HasOutputCol { + + /** + * The imputation strategy. + * If "mean", then replace missing values using the mean along the axis. + * If "median", then replace missing values using the median along the axis. + * If "most", then replace missing using the most frequent value along the axis. + * Default: mean + * + * @group param + */ + val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + + "If mean, then replace missing values using the mean along the axis." + + "If median, then replace missing values using the median along the axis." + + "If most, then replace missing using the most frequent value along the axis.") + + /** @group getParam */ + def getStrategy: String = $(strategy) + + /** + * The placeholder for the missing values. All occurrences of missingvalues will be imputed. 
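+ * e.g. setMissingValue(-1.0) marks every occurrence of -1.0 in the input column as missing.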
+ * Default: Double.NaN + * + * @group param + */ + val missingValue: DoubleParam = new DoubleParam(this, "missingValue", + "The placeholder for the missing values. All occurrences of missingvalues will be imputed") + + /** @group getParam */ + def getMissingValue: Double = $(missingValue) + + /** Validates and transforms the input schema. */ + protected def validateAndTransformSchema(schema: StructType): StructType = { + validateParams() + val inputType = schema($(inputCol)).dataType + require(inputType.isInstanceOf[VectorUDT] || inputType.isInstanceOf[DoubleType], + s"Input column ${$(inputCol)} must of type vector or Double") + require(!schema.fieldNames.contains($(outputCol)), + s"Output column ${$(outputCol)} already exists.") + val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) + StructType(outputFields) + } + + override def validateParams(): Unit = { + require(Seq("mean", "median", "most").contains($(strategy)), + s"${$(strategy)} is not supported. Options are mean, median and most") + } +} /** * :: Experimental :: + * Imputation estimator for completing missing values, either using the mean, the median or + * the most frequent value of the column in which the missing values are located. This class + * also allows for different missing values encodings. * */ @Experimental -class Imputer private[ml]( - override val uid: String) - extends Transformer with HasInputCol with HasOutputCol with MLWritable { - - def this() = this(Identifiable.randomUID("tokenizer")) +class Imputer @Since("2.0.0")(override val uid: String) + extends Estimator[ImputerModel] with ImputerParams with DefaultParamsWritable { - import Imputer._ + @Since("2.0.0") + def this() = this(Identifiable.randomUID("imputer")) /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) @@ -50,101 +101,182 @@ class Imputer private[ml]( /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) - val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation") - - /** @group getParam */ - def getStrategy: String = $(strategy) - /** @group setParam */ def setStrategy(value: String): this.type = set(strategy, value) - setDefault(strategy -> "mean") + /** @group setParam */ + def setMissingValue(value: Double): this.type = set(missingValue, value) - override def transform(dataset: DataFrame): DataFrame = { - val reScale = udf { (vector: Vector) => - if (vector == null) { - val replacement = $(strategy) match { - case "mean" => - val input = dataset.select($(inputCol)).rdd.filter(r => !r.anyNull) - .map { case Row(v: Vector) => v } - val summary = Statistics.colStats(input) - summary.mean - case "median" => - val df = dataset.select($(inputCol)) - df.registerTempTable("medianTable") - val median = df.sqlContext - .sql(s"select percentile(${$(inputCol)}, 0.5) from medianTable") - .head().getAs[Vector](0) - median - case "most" => - val input = dataset.select($(inputCol)).rdd.filter(r => !r.anyNull) - .map { case Row(v: Vector) => v } - val most = input.map(v => (v, 1)).reduceByKey(_ + _).sortBy(-_._2).take(1)(0)._1 - most - } - replacement - } - else vector + setDefault(strategy -> "mean", missingValue -> Double.NaN) + + override def fit(dataset: DataFrame): ImputerModel = { + val alternate = dataset.select($(inputCol)).schema.fields(0).dataType match { + case DoubleType => + val colStatistics = getColStatistics(dataset, $(inputCol)) + Vectors.dense(Array(colStatistics)) + case _: VectorUDT => + val vl = 
dataset.first().getAs[Vector]($(inputCol)).size + val statisticsArray = new Array[Double](vl) + (0 until vl).foreach(i => { + val getI = udf((v: Vector) => v(i)) + val tempColName = $(inputCol) + i + val tempData = dataset.where(s"${$(inputCol)} is not null") + .select($(inputCol)).withColumn(tempColName, getI(col($(inputCol)))) + statisticsArray(i) = getColStatistics(tempData, tempColName) + }) + Vectors.dense(statisticsArray) } + copyValues(new ImputerModel(uid, alternate).setParent(this)) + } - dataset.withColumn($(outputCol), reScale(col($(inputCol)))) + private def getColStatistics(dataset: DataFrame, colName: String): Double = { + val missValue = $(missingValue) match { + case Double.NaN => "NaN" + case _ => $(missingValue).toString + } + val colStatistics = $(strategy) match { + case "mean" => + dataset.where(s"$colName != '$missValue'").selectExpr(s"avg($colName)").first().getDouble(0) + case "median" => + // TODO: optimize the sort with quick-select or Percentile(Hive) if required + val rddDouble = dataset.select(colName).where(s"$colName != $missValue").rdd + .map(_.getDouble(0)) + rddDouble.sortBy(d => d).zipWithIndex().map { + case (v, idx) => (idx, v) + }.lookup(rddDouble.count()/2).head + case "most" => + val input = dataset.where(s"$colName != $missValue").select(colName).rdd + .map(_.getDouble(0)) + val most = input.map(d => (d, 1)).reduceByKey(_ + _).sortBy(-_._2).first()._1 + most + } + colStatistics } override def transformSchema(schema: StructType): StructType = { - validateParams() - val inputType = schema($(inputCol)).dataType - require(inputType.isInstanceOf[VectorUDT], - s"Input column ${$(inputCol)} must be a vector column") - require(!schema.fieldNames.contains($(outputCol)), - s"Output column ${$(outputCol)} already exists.") - val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) - StructType(outputFields) + validateAndTransformSchema(schema) } override def copy(extra: ParamMap): Imputer = { val copied = new Imputer(uid) copyValues(copied, extra) } +} + +/** + * :: Experimental :: + * Model fitted by [[Imputer]]. 
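+ * At transform time, null vectors and entries matching missingValue are replaced using alternate.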
+ * + * @param alternate statistics value for each original column during fitting + */ +@Experimental +class ImputerModel private[ml] ( + override val uid: String, + val alternate: Vector) + extends Model[ImputerModel] with ImputerParams with MLWritable { + + import ImputerModel._ + + /** @group setParam */ + def setInputCol(value: String): this.type = set(inputCol, value) + + /** @group setParam */ + def setOutputCol(value: String): this.type = set(outputCol, value) + + private def matchMissingValue(value: Double): Boolean = { + val miss = $(missingValue) + value == miss || (value.isNaN && miss.isNaN) + } + + override def transform(dataset: DataFrame): DataFrame = { + dataset.select($(inputCol)).schema.fields(0).dataType match { + case DoubleType => + val impute = udf { (d: Double) => + if (matchMissingValue(d)) alternate(0) else d + } + dataset.withColumn($(outputCol), impute(col($(inputCol)))) + case _: VectorUDT => + val impute = udf { (vector: Vector) => + if (vector == null) { + alternate + } + else { + val vCopy = vector.copy + vCopy match { + case d: DenseVector => + var iter = 0 + while(iter < d.size) { + if (matchMissingValue(vCopy(iter))) { + d.values(iter) = alternate(iter) + } + + iter += 1 + } + case s: SparseVector => + var iter = 0 + while(iter < s.values.size) { + if (matchMissingValue(s.values(iter))) { + s.values(iter) = alternate(s.indices(iter)) + } + iter += 1 + } + } + vCopy + } + } + dataset.withColumn($(outputCol), impute(col($(inputCol)))) + } + } + + override def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema) + } - @Since("1.6.0") - override def write: MLWriter = new ImputerWriter(this) + override def copy(extra: ParamMap): ImputerModel = { + val copied = new ImputerModel(uid, alternate) + copyValues(copied, extra).setParent(parent) + } + + @Since("2.0.0") + override def write: MLWriter = new ImputerModelWriter(this) } -@Since("1.6.0") -object Imputer extends MLReadable[Imputer] { - private[Imputer] - class ImputerWriter(instance: Imputer) extends MLWriter { +@Since("2.0.0") +object ImputerModel extends MLReadable[ImputerModel] { - private case class Data(strategy: String) + private[ImputerModel] + class ImputerModelWriter(instance: ImputerModel) extends MLWriter { + + private case class Data(alternate: Vector) override protected def saveImpl(path: String): Unit = { DefaultParamsWriter.saveMetadata(instance, path, sc) - val data = new Data(instance.getStrategy) + val data = new Data(instance.alternate) val dataPath = new Path(path, "data").toString sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) } } - private class ImputerReader extends MLReader[Imputer] { + private class ImputerReader extends MLReader[ImputerModel] { - private val className = classOf[Imputer].getName + private val className = classOf[ImputerModel].getName - override def load(path: String): Imputer = { + override def load(path: String): ImputerModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString - val Row(strategy: String) = sqlContext.read.parquet(dataPath) - .select("strategy") + val Row(alternate: Vector) = sqlContext.read.parquet(dataPath) + .select("alternate") .head() - val model = new Imputer(metadata.uid).setStrategy(strategy) + val model = new ImputerModel(metadata.uid, alternate) DefaultParamsReader.getAndSetParams(model, metadata) model } } - @Since("1.6.0") - override def read: MLReader[Imputer] = new ImputerReader + @Since("2.0.0") + override 
def read: MLReader[ImputerModel] = new ImputerReader - @Since("1.6.0") - override def load(path: String): Imputer = super.load(path) -} + @Since("2.0.0") + override def load(path: String): ImputerModel = super.load(path) +} \ No newline at end of file diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala new file mode 100644 index 0000000000000..52af7b15108e5 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.feature + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.mllib.util.TestingUtils._ +import org.apache.spark.sql.Row + + +class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + test("Imputer for Double column") { + val df = sqlContext.createDataFrame( Seq( + (0, 1.0, 1.0, 1.0, 1.0), + (1, 1.0, 1.0, 1.0, 1.0), + (2, 3.0, 3.0, 3.0, 3.0), + (3, 4.0, 4.0, 4.0, 4.0), + (4, Double.NaN, 2.25, 3.0, 1.0 ) + )).toDF("id", "value", "mean", "median", "most") + Seq("mean", "median", "most").foreach { strategy => + val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + val model = imputer.fit(df) + model.transform(df).select(strategy, "out").collect() + .foreach { case Row(d1: Double, d2: Double) => + assert(d1 ~== d2 absTol 1e-5, s"Imputer ut error: $d2 should be $d1") + } + } + } + + test("Imputer for with missing Value -1.0") { + val df = sqlContext.createDataFrame( Seq( + (0, 1.0, 1.0, 1.0, 1.0), + (1, 1.0, 1.0, 1.0, 1.0), + (2, 3.0, 3.0, 3.0, 3.0), + (3, 4.0, 4.0, 4.0, 4.0), + (4, -1.0, 2.25, 3.0, 1.0 ) + )).toDF("id", "value", "mean", "median", "most") + Seq("mean", "median", "most").foreach { strategy => + val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + .setMissingValue(-1.0) + val model = imputer.fit(df) + model.transform(df).select(strategy, "out").collect() + .foreach { case Row(d1: Double, d2: Double) => + assert(d1 ~== d2 absTol 1e-5, s"Imputer ut error: $d2 should be $d1") + } + } + } + + test("Imputer for Vector column with NaN and null") { + val df = sqlContext.createDataFrame( Seq( + (0, Vector(1, 2), Vector(1, 2), Vector(1, 2), Vector(1, 2)), + (1, Vector(1, 2), Vector(1, 2), Vector(1, 2), Vector(1, 2)), + (2, Vector(3, 2), Vector(3, 2), Vector(3, 2), Vector(3, 2)), + (3, Vector(4, 2), Vector(4, 2), Vector(4, 2), Vector(4, 2)), + (4, Vector(Double.NaN, 2), Vector(2.25, 2), Vector(3.0, 
2), Vector(1.0, 2)), + (4, null, Vector(2.25, 2), Vector(3.0, 2), Vector(1.0, 2)) + )).toDF("id", "value", "mean", "median", "most") + Seq("mean", "median", "most").foreach { strategy => + val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + val model = imputer.fit(df) + model.transform(df).select(strategy, "out").collect() + .foreach { case Row(d1: Double, d2: Double) => + assert(d1 ~== d2 absTol 1e-5, s"Imputer ut error: $d2 should be $d1") + } + } + } + + test("Imputer read/write") { + val t = new Imputer() + .setInputCol("myInputCol") + .setOutputCol("myOutputCol") + testDefaultReadWrite(t) + } + + test("Imputer read/write") { + val instance = new ImputerModel( + "myImputer", Vectors.dense(1.0, 10.0)) + .setInputCol("myInputCol") + .setOutputCol("myOutputCol") + val newInstance = testDefaultReadWrite(instance) + assert(newInstance.alternate === instance.alternate) + } + +} From c3d5d554f5ee90a18d96ff043f03f51f49d2ca7f Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 8 Mar 2016 19:52:04 -0800 Subject: [PATCH 04/25] minor change --- .../src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 4 ++-- .../scala/org/apache/spark/ml/feature/ImputerSuite.scala | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index ea4b6995aa890..9b810f52b65c5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -68,7 +68,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut validateParams() val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[VectorUDT] || inputType.isInstanceOf[DoubleType], - s"Input column ${$(inputCol)} must of type vector or Double") + s"Input column ${$(inputCol)} must of type Vector or Double") require(!schema.fieldNames.contains($(outputCol)), s"Output column ${$(outputCol)} already exists.") val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) @@ -279,4 +279,4 @@ object ImputerModel extends MLReadable[ImputerModel] { @Since("2.0.0") override def load(path: String): ImputerModel = super.load(path) -} \ No newline at end of file +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 52af7b15108e5..a088b3b4b2386 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -18,12 +18,11 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row - class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("Imputer for Double column") { @@ -44,7 +43,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } - test("Imputer for with missing Value -1.0") { + test("Imputer for Double with missing Value -1.0") { val df = sqlContext.createDataFrame( Seq( (0, 1.0, 1.0, 1.0, 1.0), (1, 1.0, 1.0, 1.0, 1.0), From 
1b3966800982fa980307d1b6ded6e28e5f5985e8 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 8 Mar 2016 23:57:38 -0800 Subject: [PATCH 05/25] add object Imputer and ut refine --- .../org/apache/spark/ml/feature/Imputer.scala | 18 ++++++++----- .../spark/ml/feature/ImputerSuite.scala | 25 +++++++++++-------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 9b810f52b65c5..1678850f8fa35 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -134,19 +134,18 @@ class Imputer @Since("2.0.0")(override val uid: String) case Double.NaN => "NaN" case _ => $(missingValue).toString } + val filteredDF = dataset.select(colName).where(s"$colName != '$missValue'") val colStatistics = $(strategy) match { case "mean" => - dataset.where(s"$colName != '$missValue'").selectExpr(s"avg($colName)").first().getDouble(0) + filteredDF.selectExpr(s"avg($colName)").first().getDouble(0) case "median" => // TODO: optimize the sort with quick-select or Percentile(Hive) if required - val rddDouble = dataset.select(colName).where(s"$colName != $missValue").rdd - .map(_.getDouble(0)) + val rddDouble = filteredDF.rdd.map(_.getDouble(0)) rddDouble.sortBy(d => d).zipWithIndex().map { case (v, idx) => (idx, v) }.lookup(rddDouble.count()/2).head case "most" => - val input = dataset.where(s"$colName != $missValue").select(colName).rdd - .map(_.getDouble(0)) + val input = filteredDF.rdd.map(_.getDouble(0)) val most = input.map(d => (d, 1)).reduceByKey(_ + _).sortBy(-_._2).first()._1 most } @@ -163,6 +162,13 @@ class Imputer @Since("2.0.0")(override val uid: String) } } +@Since("1.6.0") +object Imputer extends DefaultParamsReadable[Imputer] { + + @Since("1.6.0") + override def load(path: String): Imputer = super.load(path) +} + /** * :: Experimental :: * Model fitted by [[Imputer]]. 
@@ -214,7 +220,7 @@ class ImputerModel private[ml] ( } case s: SparseVector => var iter = 0 - while(iter < s.values.size) { + while(iter < s.values.length) { if (matchMissingValue(s.values(iter))) { s.values(iter) = alternate(s.indices(iter)) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index a088b3b4b2386..c22adc48a1f0f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row @@ -64,19 +64,23 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default test("Imputer for Vector column with NaN and null") { val df = sqlContext.createDataFrame( Seq( - (0, Vector(1, 2), Vector(1, 2), Vector(1, 2), Vector(1, 2)), - (1, Vector(1, 2), Vector(1, 2), Vector(1, 2), Vector(1, 2)), - (2, Vector(3, 2), Vector(3, 2), Vector(3, 2), Vector(3, 2)), - (3, Vector(4, 2), Vector(4, 2), Vector(4, 2), Vector(4, 2)), - (4, Vector(Double.NaN, 2), Vector(2.25, 2), Vector(3.0, 2), Vector(1.0, 2)), - (4, null, Vector(2.25, 2), Vector(3.0, 2), Vector(1.0, 2)) + (0, Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2)), + (1, Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2)), + (2, Vectors.dense(3, 2), Vectors.dense(3, 2), Vectors.dense(3, 2), Vectors.dense(3, 2)), + (3, Vectors.dense(4, 2), Vectors.dense(4, 2), Vectors.dense(4, 2), Vectors.dense(4, 2)), + (4, Vectors.dense(Double.NaN, 2), Vectors.dense(2.25, 2), Vectors.dense(3.0, 2), + Vectors.dense(1.0, 2)), + (5, Vectors.sparse(2, Array(0, 1), Array(Double.NaN, 2.0)), Vectors.dense(2.25, 2), + Vectors.dense(3.0, 2), Vectors.dense(1.0, 2)), + (6, null.asInstanceOf[Vector], Vectors.dense(2.25, 2), Vectors.dense(3.0, 2), + Vectors.dense(1.0, 2)) )).toDF("id", "value", "mean", "median", "most") Seq("mean", "median", "most").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) val model = imputer.fit(df) model.transform(df).select(strategy, "out").collect() - .foreach { case Row(d1: Double, d2: Double) => - assert(d1 ~== d2 absTol 1e-5, s"Imputer ut error: $d2 should be $d1") + .foreach { case Row(v1: Vector, v2: Vector) => + assert(v1 == v2, s"$strategy Imputer ut error: $v2 should be $v1") } } } @@ -85,10 +89,11 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default val t = new Imputer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") + .setMissingValue(-1.0) testDefaultReadWrite(t) } - test("Imputer read/write") { + test("ImputerModel read/write") { val instance = new ImputerModel( "myImputer", Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") From 4e45f81f89f0b1ad13add524b3dd89fe52126bc0 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Thu, 10 Mar 2016 11:06:30 -0800 Subject: [PATCH 06/25] add options validate and some small changes --- .../org/apache/spark/ml/feature/Imputer.scala | 54 ++++++++++--------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 1678850f8fa35..70938691f536d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -21,7 +21,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, Params} +import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg._ @@ -36,29 +36,30 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * The imputation strategy. - * If "mean", then replace missing values using the mean along the axis. - * If "median", then replace missing values using the median along the axis. - * If "most", then replace missing using the most frequent value along the axis. + * If "mean", then replace missing values using the mean value of the feature. + * If "median", then replace missing values using the median value of the feature. + * If "most", then replace missing using the most frequent value of the feature. * Default: mean * * @group param */ val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + - "If mean, then replace missing values using the mean along the axis." + - "If median, then replace missing values using the median along the axis." + - "If most, then replace missing using the most frequent value along the axis.") + "If mean, then replace missing values using the mean value of the feature." + + "If median, then replace missing values using the median value of the feature." + + "If most, then replace missing using the most frequent value of the feature.", + ParamValidators.inArray[String](Imputer.supportedStrategyNames.toArray)) /** @group getParam */ def getStrategy: String = $(strategy) /** - * The placeholder for the missing values. All occurrences of missingvalues will be imputed. + * The placeholder for the missing values. All occurrences of missingValue will be imputed. * Default: Double.NaN * * @group param */ val missingValue: DoubleParam = new DoubleParam(this, "missingValue", - "The placeholder for the missing values. All occurrences of missingvalues will be imputed") + "The placeholder for the missing values. All occurrences of missingValue will be imputed") /** @group getParam */ def getMissingValue: Double = $(missingValue) @@ -75,18 +76,13 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut StructType(outputFields) } - override def validateParams(): Unit = { - require(Seq("mean", "median", "most").contains($(strategy)), - s"${$(strategy)} is not supported. Options are mean, median and most") - } } /** * :: Experimental :: * Imputation estimator for completing missing values, either using the mean, the median or * the most frequent value of the column in which the missing values are located. This class - * also allows for different missing values encodings. - * + * also allows for different missing values. */ @Experimental class Imputer @Since("2.0.0")(override val uid: String) @@ -101,7 +97,10 @@ class Imputer @Since("2.0.0")(override val uid: String) /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) - /** @group setParam */ + /** + * Imputation strategy. 
Available options are "mean", "median" and "most". + * @group setParam + */ def setStrategy(value: String): this.type = set(strategy, value) /** @group setParam */ @@ -112,15 +111,14 @@ class Imputer @Since("2.0.0")(override val uid: String) override def fit(dataset: DataFrame): ImputerModel = { val alternate = dataset.select($(inputCol)).schema.fields(0).dataType match { case DoubleType => - val colStatistics = getColStatistics(dataset, $(inputCol)) - Vectors.dense(Array(colStatistics)) + Vectors.dense(getColStatistics(dataset, $(inputCol))) case _: VectorUDT => val vl = dataset.first().getAs[Vector]($(inputCol)).size val statisticsArray = new Array[Double](vl) (0 until vl).foreach(i => { val getI = udf((v: Vector) => v(i)) val tempColName = $(inputCol) + i - val tempData = dataset.where(s"${$(inputCol)} is not null") + val tempData = dataset.where(s"${$(inputCol)} IS NOT NULL") .select($(inputCol)).withColumn(tempColName, getI(col($(inputCol)))) statisticsArray(i) = getColStatistics(tempData, tempColName) }) @@ -129,6 +127,7 @@ class Imputer @Since("2.0.0")(override val uid: String) copyValues(new ImputerModel(uid, alternate).setParent(this)) } + /** Extract the statistics info from a Double column according to the strategy */ private def getColStatistics(dataset: DataFrame, colName: String): Double = { val missValue = $(missingValue) match { case Double.NaN => "NaN" @@ -143,7 +142,7 @@ class Imputer @Since("2.0.0")(override val uid: String) val rddDouble = filteredDF.rdd.map(_.getDouble(0)) rddDouble.sortBy(d => d).zipWithIndex().map { case (v, idx) => (idx, v) - }.lookup(rddDouble.count()/2).head + }.lookup(rddDouble.count() / 2).head case "most" => val input = filteredDF.rdd.map(_.getDouble(0)) val most = input.map(d => (d, 1)).reduceByKey(_ + _).sortBy(-_._2).first()._1 @@ -165,6 +164,9 @@ class Imputer @Since("2.0.0")(override val uid: String) @Since("1.6.0") object Imputer extends DefaultParamsReadable[Imputer] { + /** Set of strategy names that Imputer currently supports. */ + private[ml] val supportedStrategyNames = Set("mean", "median", "most") + @Since("1.6.0") override def load(path: String): Imputer = super.load(path) } @@ -173,7 +175,7 @@ object Imputer extends DefaultParamsReadable[Imputer] { * :: Experimental :: * Model fitted by [[Imputer]]. * - * @param alternate statistics value for each original column during fitting + * @param alternate statistics value for each feature during fitting */ @Experimental class ImputerModel private[ml] ( @@ -189,7 +191,7 @@ class ImputerModel private[ml] ( /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) - private def matchMissingValue(value: Double): Boolean = { + private def isMissingValue(value: Double): Boolean = { val miss = $(missingValue) value == miss || (value.isNaN && miss.isNaN) } @@ -198,7 +200,7 @@ class ImputerModel private[ml] ( dataset.select($(inputCol)).schema.fields(0).dataType match { case DoubleType => val impute = udf { (d: Double) => - if (matchMissingValue(d)) alternate(0) else d + if (isMissingValue(d)) alternate(0) else d } dataset.withColumn($(outputCol), impute(col($(inputCol)))) case _: VectorUDT => @@ -208,20 +210,20 @@ class ImputerModel private[ml] ( } else { val vCopy = vector.copy + // TODO replace with update() since this hacks the internal implementation of Vector. 
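+          // Note: DenseVector.values and SparseVector.values expose the backing arrays,
+          // so the writes below mutate the copied vector in place.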
vCopy match { case d: DenseVector => var iter = 0 while(iter < d.size) { - if (matchMissingValue(vCopy(iter))) { + if (isMissingValue(vCopy(iter))) { d.values(iter) = alternate(iter) } - iter += 1 } case s: SparseVector => var iter = 0 while(iter < s.values.length) { - if (matchMissingValue(s.values(iter))) { + if (isMissingValue(s.values(iter))) { s.values(iter) = alternate(s.indices(iter)) } iter += 1 From 1b36deb3eb0391ec7080bafebd2dfb662d09a6e4 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 23 Mar 2016 17:45:35 +0800 Subject: [PATCH 07/25] optimize mean for vectors --- .../org/apache/spark/ml/feature/Imputer.scala | 86 +++++++++++-------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 70938691f536d..994b06a359e65 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path +import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} @@ -25,6 +26,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg._ +import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{DoubleType, StructField, StructType} @@ -64,9 +66,13 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** @group getParam */ def getMissingValue: Double = $(missingValue) + private[feature] def isMissingValue(value: Double): Boolean = { + val miss = $(missingValue) + value == miss || (value.isNaN && miss.isNaN) + } + /** Validates and transforms the input schema. 
*/ protected def validateAndTransformSchema(schema: StructType): StructType = { - validateParams() val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[VectorUDT] || inputType.isInstanceOf[DoubleType], s"Input column ${$(inputCol)} must of type Vector or Double") @@ -111,42 +117,54 @@ class Imputer @Since("2.0.0")(override val uid: String) override def fit(dataset: DataFrame): ImputerModel = { val alternate = dataset.select($(inputCol)).schema.fields(0).dataType match { case DoubleType => - Vectors.dense(getColStatistics(dataset, $(inputCol))) + val doubleRDD = dataset.select($(inputCol)).rdd.map(_.getDouble(0)) + Vectors.dense(getColStatistics(doubleRDD)) case _: VectorUDT => + val filteredDF = dataset.where(s"${$(inputCol)} IS NOT NULL").select($(inputCol)) + val vectorRDD = filteredDF.rdd.map(_.getAs[Vector](0)).cache() val vl = dataset.first().getAs[Vector]($(inputCol)).size - val statisticsArray = new Array[Double](vl) - (0 until vl).foreach(i => { - val getI = udf((v: Vector) => v(i)) - val tempColName = $(inputCol) + i - val tempData = dataset.where(s"${$(inputCol)} IS NOT NULL") - .select($(inputCol)).withColumn(tempColName, getI(col($(inputCol)))) - statisticsArray(i) = getColStatistics(tempData, tempColName) - }) - Vectors.dense(statisticsArray) + $(strategy) match { + case "mean" => + val summary = vectorRDD.treeAggregate((new Array[Double](vl), new Array[Int](vl)))( + (prev, data) => (prev, data) match { case ((mean, count), data) => + var i = 0 + while (i < mean.length) { + if (data(i) != 0 && !data(i).isNaN){ + count(i) += 1 + mean(i) = mean(i) + (data(i) - mean(i)) / count(i) + } + i += 1 + } + (mean, count) + }, (aggregator1, aggregator2) => (aggregator1, aggregator2) match { + case ((mean1, c1), (mean2, c2)) => + (0 until mean1.length).foreach{ i => + mean1(i) = mean1(i) + (mean2(i) - mean1(i)) * c2(i) / (c1(i) + c2(i)) + c1(i) += c2(i) + } + (mean1, c1) + }) + Vectors.dense(summary._1) + case _ => + val statisticsArray = new Array[Double](vl) + (0 until vl).foreach(i => { + statisticsArray(i) = getColStatistics(vectorRDD.map(v => v(i))) + }) + Vectors.dense(statisticsArray) + } } copyValues(new ImputerModel(uid, alternate).setParent(this)) } /** Extract the statistics info from a Double column according to the strategy */ - private def getColStatistics(dataset: DataFrame, colName: String): Double = { - val missValue = $(missingValue) match { - case Double.NaN => "NaN" - case _ => $(missingValue).toString - } - val filteredDF = dataset.select(colName).where(s"$colName != '$missValue'") + private def getColStatistics(data: RDD[Double]): Double = { + val filteredRDD = data.filter(!isMissingValue(_)) val colStatistics = $(strategy) match { - case "mean" => - filteredDF.selectExpr(s"avg($colName)").first().getDouble(0) - case "median" => - // TODO: optimize the sort with quick-select or Percentile(Hive) if required - val rddDouble = filteredDF.rdd.map(_.getDouble(0)) - rddDouble.sortBy(d => d).zipWithIndex().map { - case (v, idx) => (idx, v) - }.lookup(rddDouble.count() / 2).head - case "most" => - val input = filteredDF.rdd.map(_.getDouble(0)) - val most = input.map(d => (d, 1)).reduceByKey(_ + _).sortBy(-_._2).first()._1 - most + case "mean" => filteredRDD.mean() + case "median" => filteredRDD.sortBy(d => d).zipWithIndex() + .map(p => (p._2, p._1)).lookup(filteredRDD.count() / 2).head + case "most" => filteredRDD.map(d => (d, 1)).reduceByKey(_ + _).sortBy(-_._2).first()._1 + case _ => throw new SparkException(s"unsupported impute strategy: 
${$(strategy)}") } colStatistics } @@ -191,11 +209,6 @@ class ImputerModel private[ml] ( /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) - private def isMissingValue(value: Double): Boolean = { - val miss = $(missingValue) - value == miss || (value.isNaN && miss.isNaN) - } - override def transform(dataset: DataFrame): DataFrame = { dataset.select($(inputCol)).schema.fields(0).dataType match { case DoubleType => @@ -210,14 +223,11 @@ class ImputerModel private[ml] ( } else { val vCopy = vector.copy - // TODO replace with update() since this hacks the internal implementation of Vector. vCopy match { case d: DenseVector => var iter = 0 while(iter < d.size) { - if (isMissingValue(vCopy(iter))) { - d.values(iter) = alternate(iter) - } + if (isMissingValue(vCopy(iter))) { d.values(iter) = alternate(iter) } iter += 1 } case s: SparseVector => From 72d104d92a96ba03d60a72a5fa0b06e583a28bdc Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 23 Mar 2016 08:07:34 -0400 Subject: [PATCH 08/25] style fix --- .../src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 994b06a359e65..8f6d4605fbe55 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -18,8 +18,8 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path -import org.apache.spark.SparkException +import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ @@ -129,7 +129,7 @@ class Imputer @Since("2.0.0")(override val uid: String) (prev, data) => (prev, data) match { case ((mean, count), data) => var i = 0 while (i < mean.length) { - if (data(i) != 0 && !data(i).isNaN){ + if (data(i) != 0 && !data(i).isNaN) { count(i) += 1 mean(i) = mean(i) + (data(i) - mean(i)) / count(i) } From fdd6f943da2123aebaca4fe9d48ce6b6356bfa42 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 12 Apr 2016 13:53:07 +0800 Subject: [PATCH 09/25] refactor to support numeric only --- .../org/apache/spark/ml/feature/Imputer.scala | 196 ++++++------------ .../spark/ml/feature/ImputerSuite.scala | 56 +++-- 2 files changed, 99 insertions(+), 153 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 8f6d4605fbe55..4ba03d94d2a45 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -25,26 +25,25 @@ import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ -import org.apache.spark.mllib.linalg._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.sql.functions.{col, udf} -import org.apache.spark.sql.types.{DoubleType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.types.NumericType /** - * Params for [[Imputer]] and [[ImputerModel]]. - */ + * Params for [[Imputer]] and [[ImputerModel]]. 
+ */ private[feature] trait ImputerParams extends Params with HasInputCol with HasOutputCol { /** - * The imputation strategy. - * If "mean", then replace missing values using the mean value of the feature. - * If "median", then replace missing values using the median value of the feature. - * If "most", then replace missing using the most frequent value of the feature. - * Default: mean - * - * @group param - */ + * The imputation strategy. + * If "mean", then replace missing values using the mean value of the feature. + * If "median", then replace missing values using the median value of the feature. + * If "most", then replace missing using the most frequent value of the feature. + * Default: mean + * + * @group param + */ val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + "If mean, then replace missing values using the mean value of the feature." + "If median, then replace missing values using the median value of the feature." + @@ -55,30 +54,26 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut def getStrategy: String = $(strategy) /** - * The placeholder for the missing values. All occurrences of missingValue will be imputed. - * Default: Double.NaN - * - * @group param - */ + * The placeholder for the missing values. All occurrences of missingValue will be imputed. + * Default: Double.NaN + * + * @group param + */ val missingValue: DoubleParam = new DoubleParam(this, "missingValue", "The placeholder for the missing values. All occurrences of missingValue will be imputed") /** @group getParam */ def getMissingValue: Double = $(missingValue) - private[feature] def isMissingValue(value: Double): Boolean = { - val miss = $(missingValue) - value == miss || (value.isNaN && miss.isNaN) - } - /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType - require(inputType.isInstanceOf[VectorUDT] || inputType.isInstanceOf[DoubleType], - s"Input column ${$(inputCol)} must of type Vector or Double") + require(inputType.isInstanceOf[NumericType], + s"Input column ${$(inputCol)} must be of NumericType") require(!schema.fieldNames.contains($(outputCol)), s"Output column ${$(outputCol)} already exists.") - val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) + val outputFields = schema.fields :+ + StructField($(outputCol), inputType, schema($(inputCol)).nullable) StructType(outputFields) } @@ -86,9 +81,9 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * :: Experimental :: - * Imputation estimator for completing missing values, either using the mean, the median or - * the most frequent value of the column in which the missing values are located. This class - * also allows for different missing values. + * Imputation estimator for completing missing values, either using the mean("mean"), the + * median("median") or the most frequent value("most") of the column in which the missing + * values are located. */ @Experimental class Imputer @Since("2.0.0")(override val uid: String) @@ -104,7 +99,7 @@ class Imputer @Since("2.0.0")(override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * Imputation strategy. Available options are "mean", "median" and "most". + * Imputation strategy. Available options are ["mean", "median" and "most"]. 
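+ * With "median", fit() computes the surrogate via DataFrame.stat.approxQuantile with relative error 0.001.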
* @group setParam */ def setStrategy(value: String): this.type = set(strategy, value) @@ -114,59 +109,17 @@ class Imputer @Since("2.0.0")(override val uid: String) setDefault(strategy -> "mean", missingValue -> Double.NaN) - override def fit(dataset: DataFrame): ImputerModel = { - val alternate = dataset.select($(inputCol)).schema.fields(0).dataType match { - case DoubleType => - val doubleRDD = dataset.select($(inputCol)).rdd.map(_.getDouble(0)) - Vectors.dense(getColStatistics(doubleRDD)) - case _: VectorUDT => - val filteredDF = dataset.where(s"${$(inputCol)} IS NOT NULL").select($(inputCol)) - val vectorRDD = filteredDF.rdd.map(_.getAs[Vector](0)).cache() - val vl = dataset.first().getAs[Vector]($(inputCol)).size - $(strategy) match { - case "mean" => - val summary = vectorRDD.treeAggregate((new Array[Double](vl), new Array[Int](vl)))( - (prev, data) => (prev, data) match { case ((mean, count), data) => - var i = 0 - while (i < mean.length) { - if (data(i) != 0 && !data(i).isNaN) { - count(i) += 1 - mean(i) = mean(i) + (data(i) - mean(i)) / count(i) - } - i += 1 - } - (mean, count) - }, (aggregator1, aggregator2) => (aggregator1, aggregator2) match { - case ((mean1, c1), (mean2, c2)) => - (0 until mean1.length).foreach{ i => - mean1(i) = mean1(i) + (mean2(i) - mean1(i)) * c2(i) / (c1(i) + c2(i)) - c1(i) += c2(i) - } - (mean1, c1) - }) - Vectors.dense(summary._1) - case _ => - val statisticsArray = new Array[Double](vl) - (0 until vl).foreach(i => { - statisticsArray(i) = getColStatistics(vectorRDD.map(v => v(i))) - }) - Vectors.dense(statisticsArray) - } + override def fit(dataset: Dataset[_]): ImputerModel = { + val ic = col($(inputCol)) + val filtered = dataset.select(ic.cast(DoubleType)) + .filter(ic.isNotNull && !ic.isNaN && ic =!= $(missingValue)) + val surrogate = $(strategy) match { + case "mean" => filtered.select(avg($(inputCol))).first().getDouble(0) + case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0) + case "most" => filtered.rdd.map(r => r.getDouble(0)).map(d => (d, 1)).reduceByKey(_ + _) + .sortBy(-_._2).first()._1 } - copyValues(new ImputerModel(uid, alternate).setParent(this)) - } - - /** Extract the statistics info from a Double column according to the strategy */ - private def getColStatistics(data: RDD[Double]): Double = { - val filteredRDD = data.filter(!isMissingValue(_)) - val colStatistics = $(strategy) match { - case "mean" => filteredRDD.mean() - case "median" => filteredRDD.sortBy(d => d).zipWithIndex() - .map(p => (p._2, p._1)).lookup(filteredRDD.count() / 2).head - case "most" => filteredRDD.map(d => (d, 1)).reduceByKey(_ + _).sortBy(-_._2).first()._1 - case _ => throw new SparkException(s"unsupported impute strategy: ${$(strategy)}") - } - colStatistics + copyValues(new ImputerModel(uid, surrogate).setParent(this)) } override def transformSchema(schema: StructType): StructType = { @@ -179,26 +132,26 @@ class Imputer @Since("2.0.0")(override val uid: String) } } -@Since("1.6.0") +@Since("2.0.0") object Imputer extends DefaultParamsReadable[Imputer] { /** Set of strategy names that Imputer currently supports. */ private[ml] val supportedStrategyNames = Set("mean", "median", "most") - @Since("1.6.0") + @Since("2.0.0") override def load(path: String): Imputer = super.load(path) } /** - * :: Experimental :: - * Model fitted by [[Imputer]]. - * - * @param alternate statistics value for each feature during fitting - */ + * :: Experimental :: + * Model fitted by [[Imputer]]. 
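+ * transform() replaces null, NaN and occurrences of missingValue in the input column with the surrogate.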
+ * + * @param surrogate statistics value for each feature during fitting + */ @Experimental -class ImputerModel private[ml] ( +class ImputerModel private[ml]( override val uid: String, - val alternate: Vector) + val surrogate: Double) extends Model[ImputerModel] with ImputerParams with MLWritable { import ImputerModel._ @@ -209,40 +162,17 @@ class ImputerModel private[ml] ( /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) - override def transform(dataset: DataFrame): DataFrame = { - dataset.select($(inputCol)).schema.fields(0).dataType match { - case DoubleType => - val impute = udf { (d: Double) => - if (isMissingValue(d)) alternate(0) else d - } - dataset.withColumn($(outputCol), impute(col($(inputCol)))) - case _: VectorUDT => - val impute = udf { (vector: Vector) => - if (vector == null) { - alternate - } - else { - val vCopy = vector.copy - vCopy match { - case d: DenseVector => - var iter = 0 - while(iter < d.size) { - if (isMissingValue(vCopy(iter))) { d.values(iter) = alternate(iter) } - iter += 1 - } - case s: SparseVector => - var iter = 0 - while(iter < s.values.length) { - if (isMissingValue(s.values(iter))) { - s.values(iter) = alternate(s.indices(iter)) - } - iter += 1 - } - } - vCopy - } - } - dataset.withColumn($(outputCol), impute(col($(inputCol)))) + override def transform(dataset: Dataset[_]): DataFrame = { + val inputType = dataset.select($(inputCol)).schema.fields(0).dataType + inputType match { + case _: NumericType => + val ic = col($(inputCol)).cast(DoubleType) + dataset.withColumn($(outputCol), when(ic.isNull, surrogate) + .when(ic.isNaN, surrogate) + .when(ic === $(missingValue), surrogate) + .otherwise(ic) + .cast(inputType)) + case _ => throw new SparkException("imputer supports numeric type only") } } @@ -251,7 +181,7 @@ class ImputerModel private[ml] ( } override def copy(extra: ParamMap): ImputerModel = { - val copied = new ImputerModel(uid, alternate) + val copied = new ImputerModel(uid, surrogate) copyValues(copied, extra).setParent(parent) } @@ -266,11 +196,11 @@ object ImputerModel extends MLReadable[ImputerModel] { private[ImputerModel] class ImputerModelWriter(instance: ImputerModel) extends MLWriter { - private case class Data(alternate: Vector) + private case class Data(surrogate: Double) override protected def saveImpl(path: String): Unit = { DefaultParamsWriter.saveMetadata(instance, path, sc) - val data = new Data(instance.alternate) + val data = new Data(instance.surrogate) val dataPath = new Path(path, "data").toString sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) } @@ -283,10 +213,10 @@ object ImputerModel extends MLReadable[ImputerModel] { override def load(path: String): ImputerModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString - val Row(alternate: Vector) = sqlContext.read.parquet(dataPath) - .select("alternate") + val Row(surrogate: Double) = sqlContext.read.parquet(dataPath) + .select("surrogate") .head() - val model = new ImputerModel(metadata.uid, alternate) + val model = new ImputerModel(metadata.uid, surrogate) DefaultParamsReader.getAndSetParams(model, metadata) model } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index c22adc48a1f0f..00589b17a17c6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -17,21 +17,20 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.ml.util.{DefaultReadWriteTest} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.Row class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { - test("Imputer for Double column") { + test("Imputer for Double with default missing Value NaN") { val df = sqlContext.createDataFrame( Seq( (0, 1.0, 1.0, 1.0, 1.0), (1, 1.0, 1.0, 1.0, 1.0), (2, 3.0, 3.0, 3.0, 3.0), (3, 4.0, 4.0, 4.0, 4.0), - (4, Double.NaN, 2.25, 3.0, 1.0 ) + (4, Double.NaN, 2.25, 1.0, 1.0 ) )).toDF("id", "value", "mean", "median", "most") Seq("mean", "median", "most").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) @@ -49,7 +48,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (1, 1.0, 1.0, 1.0, 1.0), (2, 3.0, 3.0, 3.0, 3.0), (3, 4.0, 4.0, 4.0, 4.0), - (4, -1.0, 2.25, 3.0, 1.0 ) + (4, -1.0, 2.25, 1.0, 1.0 ) )).toDF("id", "value", "mean", "median", "most") Seq("mean", "median", "most").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) @@ -62,29 +61,46 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } - test("Imputer for Vector column with NaN and null") { + test("Imputer for Int with missing Value -1") { val df = sqlContext.createDataFrame( Seq( - (0, Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2)), - (1, Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2), Vectors.dense(1, 2)), - (2, Vectors.dense(3, 2), Vectors.dense(3, 2), Vectors.dense(3, 2), Vectors.dense(3, 2)), - (3, Vectors.dense(4, 2), Vectors.dense(4, 2), Vectors.dense(4, 2), Vectors.dense(4, 2)), - (4, Vectors.dense(Double.NaN, 2), Vectors.dense(2.25, 2), Vectors.dense(3.0, 2), - Vectors.dense(1.0, 2)), - (5, Vectors.sparse(2, Array(0, 1), Array(Double.NaN, 2.0)), Vectors.dense(2.25, 2), - Vectors.dense(3.0, 2), Vectors.dense(1.0, 2)), - (6, null.asInstanceOf[Vector], Vectors.dense(2.25, 2), Vectors.dense(3.0, 2), - Vectors.dense(1.0, 2)) + (0, 1, 1, 1, 1), + (1, 3, 3, 3, 3), + (2, 10, 10, 10, 10), + (3, 10, 10, 10, 10), + (4, -1, 6, 3, 10) )).toDF("id", "value", "mean", "median", "most") + Seq("mean", "median", "most").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + .setMissingValue(-1) val model = imputer.fit(df) model.transform(df).select(strategy, "out").collect() - .foreach { case Row(v1: Vector, v2: Vector) => - assert(v1 == v2, s"$strategy Imputer ut error: $v2 should be $v1") + .foreach { case Row(d1: Int, d2: Int) => + assert(d1 === d2, s"Imputer ut error: $d2 should be $d1") } } } + test("Imputer should impute null") { + val df = sqlContext.createDataFrame( Seq( + (0, 1, 1, 1, 1), + (1, 3, 3, 3, 3), + (2, 10, 10, 10, 10), + (3, 10, 10, 10, 10), + (4, -1, 6, 3, 10) + )).toDF("id", "value", "mean", "median", "most") + val df2 = df.selectExpr("*", "IF(value=-1, null, value) as nullable_value") + Seq("mean", "median", "most").foreach { strategy => + val imputer = new 
Imputer().setInputCol("nullable_value").setOutputCol("out") + .setStrategy(strategy) + val model = imputer.fit(df2) + model.transform(df2).select(strategy, "out").collect() + .foreach { case Row(d1: Int, d2: Int) => + assert(d1 == d2, s"Imputer ut error: $d2 should be $d1") + } + } + } + test("Imputer read/write") { val t = new Imputer() .setInputCol("myInputCol") @@ -95,11 +111,11 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default test("ImputerModel read/write") { val instance = new ImputerModel( - "myImputer", Vectors.dense(1.0, 10.0)) + "myImputer", 1.234) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) - assert(newInstance.alternate === instance.alternate) + assert(newInstance.surrogate === instance.surrogate) } } From 4bdf595f576fae76f710bfb21e1e3f71571c55c8 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 12 Apr 2016 10:12:08 -0400 Subject: [PATCH 10/25] change most to mode --- .../org/apache/spark/ml/feature/Imputer.scala | 12 ++++++------ .../apache/spark/ml/feature/ImputerSuite.scala | 16 ++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 4ba03d94d2a45..a6496d06f6799 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -39,7 +39,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut * The imputation strategy. * If "mean", then replace missing values using the mean value of the feature. * If "median", then replace missing values using the median value of the feature. - * If "most", then replace missing using the most frequent value of the feature. + * If "mode", then replace missing using the most frequent value of the feature. * Default: mean * * @group param @@ -47,7 +47,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + "If mean, then replace missing values using the mean value of the feature." + "If median, then replace missing values using the median value of the feature." + - "If most, then replace missing using the most frequent value of the feature.", + "If mode, then replace missing using the most frequent value of the feature.", ParamValidators.inArray[String](Imputer.supportedStrategyNames.toArray)) /** @group getParam */ @@ -82,7 +82,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * :: Experimental :: * Imputation estimator for completing missing values, either using the mean("mean"), the - * median("median") or the most frequent value("most") of the column in which the missing + * median("median") or the most frequent value("mode") of the column in which the missing * values are located. */ @Experimental @@ -99,7 +99,7 @@ class Imputer @Since("2.0.0")(override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * Imputation strategy. Available options are ["mean", "median" and "most"]. + * Imputation strategy. Available options are ["mean", "median" and "mode"]. 
* @group setParam */ def setStrategy(value: String): this.type = set(strategy, value) @@ -116,7 +116,7 @@ class Imputer @Since("2.0.0")(override val uid: String) val surrogate = $(strategy) match { case "mean" => filtered.select(avg($(inputCol))).first().getDouble(0) case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0) - case "most" => filtered.rdd.map(r => r.getDouble(0)).map(d => (d, 1)).reduceByKey(_ + _) + case "mode" => filtered.rdd.map(r => r.getDouble(0)).map(d => (d, 1)).reduceByKey(_ + _) .sortBy(-_._2).first()._1 } copyValues(new ImputerModel(uid, surrogate).setParent(this)) @@ -136,7 +136,7 @@ class Imputer @Since("2.0.0")(override val uid: String) object Imputer extends DefaultParamsReadable[Imputer] { /** Set of strategy names that Imputer currently supports. */ - private[ml] val supportedStrategyNames = Set("mean", "median", "most") + private[ml] val supportedStrategyNames = Set("mean", "median", "mode") @Since("2.0.0") override def load(path: String): Imputer = super.load(path) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 00589b17a17c6..c29614531ea8b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -31,8 +31,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 3.0, 3.0, 3.0, 3.0), (3, 4.0, 4.0, 4.0, 4.0), (4, Double.NaN, 2.25, 1.0, 1.0 ) - )).toDF("id", "value", "mean", "median", "most") - Seq("mean", "median", "most").foreach { strategy => + )).toDF("id", "value", "mean", "median", "mode") + Seq("mean", "median", "mode").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) val model = imputer.fit(df) model.transform(df).select(strategy, "out").collect() @@ -49,8 +49,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 3.0, 3.0, 3.0, 3.0), (3, 4.0, 4.0, 4.0, 4.0), (4, -1.0, 2.25, 1.0, 1.0 ) - )).toDF("id", "value", "mean", "median", "most") - Seq("mean", "median", "most").foreach { strategy => + )).toDF("id", "value", "mean", "median", "mode") + Seq("mean", "median", "mode").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) .setMissingValue(-1.0) val model = imputer.fit(df) @@ -68,9 +68,9 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 10, 10, 10, 10), (3, 10, 10, 10, 10), (4, -1, 6, 3, 10) - )).toDF("id", "value", "mean", "median", "most") + )).toDF("id", "value", "mean", "median", "mode") - Seq("mean", "median", "most").foreach { strategy => + Seq("mean", "median", "mode").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) .setMissingValue(-1) val model = imputer.fit(df) @@ -88,9 +88,9 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 10, 10, 10, 10), (3, 10, 10, 10, 10), (4, -1, 6, 3, 10) - )).toDF("id", "value", "mean", "median", "most") + )).toDF("id", "value", "mean", "median", "mode") val df2 = df.selectExpr("*", "IF(value=-1, null, value) as nullable_value") - Seq("mean", "median", "most").foreach { strategy => + Seq("mean", "median", "mode").foreach { strategy => val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out") .setStrategy(strategy) val model = imputer.fit(df2) 
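At this point in the series the single-column API is stable: setInputCol/setOutputCol, setStrategy with "mean", "median" or "mode", and setMissingValue (default Double.NaN). As a rough usage sketch of this revision — the sqlContext handle and the column names "value"/"imputed" are illustrative, not taken from the patches:

    import org.apache.spark.ml.feature.Imputer

    // Toy input: the NaN row is the one to be imputed.
    val df = sqlContext.createDataFrame(Seq(
      (0, 1.0),
      (1, 3.0),
      (2, Double.NaN)
    )).toDF("id", "value")

    val imputer = new Imputer()
      .setInputCol("value")
      .setOutputCol("imputed")
      .setStrategy("mean") // "median" and "mode" are the alternatives at this revision

    // fit() computes the surrogate over the non-missing rows (mean of 1.0 and 3.0 = 2.0);
    // transform() substitutes it wherever the input is null, NaN or equal to missingValue.
    val model = imputer.fit(df)
    model.transform(df).show()

Later patches in the series narrow this surface (dropping "mode", restricting input to Double/Float columns, and eventually moving to multiple input columns), but the fit/transform flow stays the same.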
From 171842210d3ea2e3c97fe803f0a8bb3831063f3f Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Sat, 16 Apr 2016 22:56:03 -0400 Subject: [PATCH 11/25] move filter to NaN --- .../src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index a6496d06f6799..494a2fbae161f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -112,9 +112,9 @@ class Imputer @Since("2.0.0")(override val uid: String) override def fit(dataset: Dataset[_]): ImputerModel = { val ic = col($(inputCol)) val filtered = dataset.select(ic.cast(DoubleType)) - .filter(ic.isNotNull && !ic.isNaN && ic =!= $(missingValue)) + .filter(ic.isNotNull && ic =!= $(missingValue)) val surrogate = $(strategy) match { - case "mean" => filtered.select(avg($(inputCol))).first().getDouble(0) + case "mean" => filtered.filter(!ic.isNaN).select(avg($(inputCol))).first().getDouble(0) case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0) case "mode" => filtered.rdd.map(r => r.getDouble(0)).map(d => (d, 1)).reduceByKey(_ + _) .sortBy(-_._2).first()._1 @@ -168,7 +168,6 @@ class ImputerModel private[ml]( case _: NumericType => val ic = col($(inputCol)).cast(DoubleType) dataset.withColumn($(outputCol), when(ic.isNull, surrogate) - .when(ic.isNaN, surrogate) .when(ic === $(missingValue), surrogate) .otherwise(ic) .cast(inputType)) From 594c501f85cad2a278caee5f08b85deb61272e5d Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 20 Apr 2016 11:55:25 -0400 Subject: [PATCH 12/25] add transformSchema --- mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 494a2fbae161f..57f8abcbcaeb0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -110,6 +110,7 @@ class Imputer @Since("2.0.0")(override val uid: String) setDefault(strategy -> "mean", missingValue -> Double.NaN) override def fit(dataset: Dataset[_]): ImputerModel = { + transformSchema(dataset.schema, logging = true) val ic = col($(inputCol)) val filtered = dataset.select(ic.cast(DoubleType)) .filter(ic.isNotNull && ic =!= $(missingValue)) @@ -163,6 +164,7 @@ class ImputerModel private[ml]( def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: Dataset[_]): DataFrame = { + transformSchema(dataset.schema, logging = true) val inputType = dataset.select($(inputCol)).schema.fields(0).dataType inputType match { case _: NumericType => From b3633e8dd0edf47a684aa344ba6a3c43ac0d91fe Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 27 Apr 2016 12:36:05 -0400 Subject: [PATCH 13/25] remove mode and change input type --- .../org/apache/spark/ml/feature/Imputer.scala | 46 +++----- .../spark/ml/feature/ImputerSuite.scala | 111 +++++++++++------- 2 files changed, 83 insertions(+), 74 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 57f8abcbcaeb0..68e0764e6225e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -28,7 +28,6 @@ import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.types.NumericType /** * Params for [[Imputer]] and [[ImputerModel]]. @@ -38,16 +37,14 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * The imputation strategy. * If "mean", then replace missing values using the mean value of the feature. - * If "median", then replace missing values using the median value of the feature. - * If "mode", then replace missing using the most frequent value of the feature. + * If "median", then replace missing values using the approximate median value of the feature. * Default: mean * * @group param */ - val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + + final val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + "If mean, then replace missing values using the mean value of the feature." + - "If median, then replace missing values using the median value of the feature." + - "If mode, then replace missing using the most frequent value of the feature.", + "If median, then replace missing values using the median value of the feature.", ParamValidators.inArray[String](Imputer.supportedStrategyNames.toArray)) /** @group getParam */ @@ -59,7 +56,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut * * @group param */ - val missingValue: DoubleParam = new DoubleParam(this, "missingValue", + final val missingValue: DoubleParam = new DoubleParam(this, "missingValue", "The placeholder for the missing values. All occurrences of missingValue will be imputed") /** @group getParam */ @@ -68,22 +65,19 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType - require(inputType.isInstanceOf[NumericType], - s"Input column ${$(inputCol)} must be of NumericType") + SchemaUtils.checkColumnTypes(schema, $(inputCol), Seq(DoubleType, FloatType)) require(!schema.fieldNames.contains($(outputCol)), s"Output column ${$(outputCol)} already exists.") - val outputFields = schema.fields :+ - StructField($(outputCol), inputType, schema($(inputCol)).nullable) - StructType(outputFields) + SchemaUtils.appendColumn(schema, $(outputCol), inputType) } - } /** * :: Experimental :: - * Imputation estimator for completing missing values, either using the mean("mean"), the - * median("median") or the most frequent value("mode") of the column in which the missing - * values are located. + * Imputation estimator for completing missing values, either using the mean("mean") or the + * median("median") of the column in which the missing values are located. + * + * Note that all the null values will be imputed as well. */ @Experimental class Imputer @Since("2.0.0")(override val uid: String) @@ -99,7 +93,7 @@ class Imputer @Since("2.0.0")(override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * Imputation strategy. Available options are ["mean", "median" and "mode"]. + * Imputation strategy. Available options are ["mean", "median"]. 
* @group setParam */ def setStrategy(value: String): this.type = set(strategy, value) @@ -117,8 +111,6 @@ class Imputer @Since("2.0.0")(override val uid: String) val surrogate = $(strategy) match { case "mean" => filtered.filter(!ic.isNaN).select(avg($(inputCol))).first().getDouble(0) case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0) - case "mode" => filtered.rdd.map(r => r.getDouble(0)).map(d => (d, 1)).reduceByKey(_ + _) - .sortBy(-_._2).first()._1 } copyValues(new ImputerModel(uid, surrogate).setParent(this)) } @@ -137,7 +129,7 @@ class Imputer @Since("2.0.0")(override val uid: String) object Imputer extends DefaultParamsReadable[Imputer] { /** Set of strategy names that Imputer currently supports. */ - private[ml] val supportedStrategyNames = Set("mean", "median", "mode") + private[ml] val supportedStrategyNames = Set("mean", "median") @Since("2.0.0") override def load(path: String): Imputer = super.load(path) @@ -166,15 +158,11 @@ class ImputerModel private[ml]( override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val inputType = dataset.select($(inputCol)).schema.fields(0).dataType - inputType match { - case _: NumericType => - val ic = col($(inputCol)).cast(DoubleType) - dataset.withColumn($(outputCol), when(ic.isNull, surrogate) - .when(ic === $(missingValue), surrogate) - .otherwise(ic) - .cast(inputType)) - case _ => throw new SparkException("imputer supports numeric type only") - } + val ic = col($(inputCol)) + dataset.withColumn($(outputCol), when(ic.isNull, surrogate) + .when(ic === $(missingValue), surrogate) + .otherwise(ic) + .cast(inputType)) } override def transformSchema(schema: StructType): StructType = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index c29614531ea8b..61e216998aad2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -26,77 +26,98 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default test("Imputer for Double with default missing Value NaN") { val df = sqlContext.createDataFrame( Seq( - (0, 1.0, 1.0, 1.0, 1.0), - (1, 1.0, 1.0, 1.0, 1.0), - (2, 3.0, 3.0, 3.0, 3.0), - (3, 4.0, 4.0, 4.0, 4.0), - (4, Double.NaN, 2.25, 1.0, 1.0 ) - )).toDF("id", "value", "mean", "median", "mode") - Seq("mean", "median", "mode").foreach { strategy => + (0, 1.0, 1.0, 1.0), + (1, 1.0, 1.0, 1.0), + (2, 3.0, 3.0, 3.0), + (3, 4.0, 4.0, 4.0), + (4, Double.NaN, 2.25, 1.0) + )).toDF("id", "value", "exp_mean", "exp_median") + Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) val model = imputer.fit(df) - model.transform(df).select(strategy, "out").collect() - .foreach { case Row(d1: Double, d2: Double) => - assert(d1 ~== d2 absTol 1e-5, s"Imputer ut error: $d2 should be $d1") - } + model.transform(df).select("exp_" + strategy, "out").collect().foreach { + case Row(exp: Double, out: Double) => + assert(exp ~== out absTol 1e-5, s"Imputed values differ. 
Expected: $exp, actual: $out") + } } } test("Imputer for Double with missing Value -1.0") { val df = sqlContext.createDataFrame( Seq( - (0, 1.0, 1.0, 1.0, 1.0), - (1, 1.0, 1.0, 1.0, 1.0), - (2, 3.0, 3.0, 3.0, 3.0), - (3, 4.0, 4.0, 4.0, 4.0), - (4, -1.0, 2.25, 1.0, 1.0 ) - )).toDF("id", "value", "mean", "median", "mode") - Seq("mean", "median", "mode").foreach { strategy => + (0, 1.0, 1.0, 1.0), + (1, 1.0, 1.0, 1.0), + (2, 3.0, 3.0, 3.0), + (3, 4.0, 4.0, 4.0), + (4, -1.0, 2.25, 1.0) + )).toDF("id", "value", "exp_mean", "exp_median") + Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) .setMissingValue(-1.0) val model = imputer.fit(df) - model.transform(df).select(strategy, "out").collect() - .foreach { case Row(d1: Double, d2: Double) => - assert(d1 ~== d2 absTol 1e-5, s"Imputer ut error: $d2 should be $d1") - } + model.transform(df).select("exp_" + strategy, "out").collect().foreach { + case Row(exp: Double, out: Double) => + assert(exp ~== out absTol 1e-5, s"Impute($strategy) error. Expected: $exp, actual: $out") + } } } - test("Imputer for Int with missing Value -1") { + test("Imputer for Double with missing Value -1.0 and contains NaN") { val df = sqlContext.createDataFrame( Seq( - (0, 1, 1, 1, 1), - (1, 3, 3, 3, 3), - (2, 10, 10, 10, 10), - (3, 10, 10, 10, 10), - (4, -1, 6, 3, 10) - )).toDF("id", "value", "mean", "median", "mode") + (0, 1.0, 1.0, 1.0), + (1, 3.0, 3.0, 3.0), + (2, Double.NaN, Double.NaN, Double.NaN), + (3, -1.0, 2.0, 3.0) + )).toDF("id", "value", "exp_mean", "exp_median") + Seq("mean", "median").foreach { strategy => + val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + .setMissingValue(-1.0) + val model = imputer.fit(df) + model.transform(df).select("exp_" + strategy, "out").collect().foreach { + case Row(exp: Double, out: Double) => + assert((exp.isNaN && out.isNaN) || (exp ~== out absTol 1e-5), + s"Imputed values differ. Expected: $exp, actual: $out") + } + } + } - Seq("mean", "median", "mode").foreach { strategy => + test("Imputer for Float with missing Value -1.0") { + val df = sqlContext.createDataFrame( Seq( + (0, 1.0F, 1.0F, 1.0F), + (1, 3.0F, 3.0F, 3.0F), + (2, 10.0F, 10.0F, 10.0F), + (3, 10.0F, 10.0F, 10.0F), + (4, -1.0F, 6.0F, 3.0F) + )).toDF("id", "value", "exp_mean", "exp_median") + + Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) .setMissingValue(-1) val model = imputer.fit(df) - model.transform(df).select(strategy, "out").collect() - .foreach { case Row(d1: Int, d2: Int) => - assert(d1 === d2, s"Imputer ut error: $d2 should be $d1") - } + val result = model.transform(df) + result.printSchema() + model.transform(df).select("exp_" + strategy, "out").collect().foreach { + case Row(exp: Float, out: Float) => + assert(exp == out, s"Imputed values differ. 
Expected: $exp, actual: $out") + } } } test("Imputer should impute null") { val df = sqlContext.createDataFrame( Seq( - (0, 1, 1, 1, 1), - (1, 3, 3, 3, 3), - (2, 10, 10, 10, 10), - (3, 10, 10, 10, 10), - (4, -1, 6, 3, 10) - )).toDF("id", "value", "mean", "median", "mode") - val df2 = df.selectExpr("*", "IF(value=-1, null, value) as nullable_value") - Seq("mean", "median", "mode").foreach { strategy => + (0, 4.0, 4.0, 4.0), + (1, 10.0, 10.0, 10.0), + (2, 10.0, 10.0, 10.0), + (3, Double.NaN, 8.0, 10.0), + (4, -1.0, 8.0, 10.0) + )).toDF("id", "value", "exp_mean", "exp_median") + val df2 = df.selectExpr("*", "IF(value=-1.0, null, value) as nullable_value") + Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out") .setStrategy(strategy) val model = imputer.fit(df2) - model.transform(df2).select(strategy, "out").collect() - .foreach { case Row(d1: Int, d2: Int) => - assert(d1 == d2, s"Imputer ut error: $d2 should be $d1") + model.transform(df2).select("exp_" + strategy, "out").collect().foreach { + case Row(exp: Double, out: Double) => + assert(exp ~== out absTol 1e-5, s"Imputed values differ. Expected: $exp, actual: $out") } } } From 053d489a70a28674029ee51a69f529e851261c96 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 27 Apr 2016 12:41:01 -0400 Subject: [PATCH 14/25] remove print --- .../test/scala/org/apache/spark/ml/feature/ImputerSuite.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 61e216998aad2..ebc17415e3eaf 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -94,7 +94,6 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default .setMissingValue(-1) val model = imputer.fit(df) val result = model.transform(df) - result.printSchema() model.transform(df).select("exp_" + strategy, "out").collect().foreach { case Row(exp: Float, out: Float) => assert(exp == out, s"Imputed values differ. Expected: $exp, actual: $out") From 4e1c34a77b8e4382c00a0438e00e34f544b591a3 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Thu, 28 Apr 2016 14:58:09 +0800 Subject: [PATCH 15/25] update document and remove a ut --- .../org/apache/spark/ml/feature/Imputer.scala | 9 ++++--- .../spark/ml/feature/ImputerSuite.scala | 27 +++---------------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 68e0764e6225e..40ecfe51bd9fe 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -19,7 +19,6 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path -import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ @@ -74,10 +73,12 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * :: Experimental :: - * Imputation estimator for completing missing values, either using the mean("mean") or the - * median("median") of the column in which the missing values are located. 
+ * Imputation estimator for completing missing values, either using the mean or the + * median of the column in which the missing values are located. InputCol should be + * of DoubleType or FloatType. * - * Note that all the null values will be imputed as well. + * Note that the mean/median value is computed after filtering out missing values. + * All Null values in the input column are treated as missing, and so are also imputed. */ @Experimental class Imputer @Since("2.0.0")(override val uid: String) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index ebc17415e3eaf..959e8583070ef 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -31,7 +31,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 3.0, 3.0, 3.0), (3, 4.0, 4.0, 4.0), (4, Double.NaN, 2.25, 1.0) - )).toDF("id", "value", "exp_mean", "exp_median") + )).toDF("id", "value", "expected_mean", "expected_median") Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) val model = imputer.fit(df) @@ -42,32 +42,13 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } - test("Imputer for Double with missing Value -1.0") { - val df = sqlContext.createDataFrame( Seq( - (0, 1.0, 1.0, 1.0), - (1, 1.0, 1.0, 1.0), - (2, 3.0, 3.0, 3.0), - (3, 4.0, 4.0, 4.0), - (4, -1.0, 2.25, 1.0) - )).toDF("id", "value", "exp_mean", "exp_median") - Seq("mean", "median").foreach { strategy => - val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) - .setMissingValue(-1.0) - val model = imputer.fit(df) - model.transform(df).select("exp_" + strategy, "out").collect().foreach { - case Row(exp: Double, out: Double) => - assert(exp ~== out absTol 1e-5, s"Impute($strategy) error. 
Expected: $exp, actual: $out") - } - } - } - test("Imputer for Double with missing Value -1.0 and contains NaN") { val df = sqlContext.createDataFrame( Seq( (0, 1.0, 1.0, 1.0), (1, 3.0, 3.0, 3.0), (2, Double.NaN, Double.NaN, Double.NaN), (3, -1.0, 2.0, 3.0) - )).toDF("id", "value", "exp_mean", "exp_median") + )).toDF("id", "value", "expected_mean", "expected_median") Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) .setMissingValue(-1.0) @@ -87,7 +68,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 10.0F, 10.0F, 10.0F), (3, 10.0F, 10.0F, 10.0F), (4, -1.0F, 6.0F, 3.0F) - )).toDF("id", "value", "exp_mean", "exp_median") + )).toDF("id", "value", "expected_mean", "expected_median") Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) @@ -108,7 +89,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 10.0, 10.0, 10.0), (3, Double.NaN, 8.0, 10.0), (4, -1.0, 8.0, 10.0) - )).toDF("id", "value", "exp_mean", "exp_median") + )).toDF("id", "value", "expected_mean", "expected_median") val df2 = df.selectExpr("*", "IF(value=-1.0, null, value) as nullable_value") Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out") From aef094bc7b7a00c0ded1b2998b7f98d2bc42c666 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Fri, 29 Apr 2016 10:15:21 +0800 Subject: [PATCH 16/25] fix ut --- .../main/scala/org/apache/spark/ml/feature/Imputer.scala | 5 ++--- .../scala/org/apache/spark/ml/feature/ImputerSuite.scala | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 40ecfe51bd9fe..0ca4b524184ea 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -42,7 +42,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut * @group param */ final val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + - "If mean, then replace missing values using the mean value of the feature." + + "If mean, then replace missing values using the mean value of the feature. 
" + "If median, then replace missing values using the median value of the feature.", ParamValidators.inArray[String](Imputer.supportedStrategyNames.toArray)) @@ -183,8 +183,7 @@ class ImputerModel private[ml]( @Since("2.0.0") object ImputerModel extends MLReadable[ImputerModel] { - private[ImputerModel] - class ImputerModelWriter(instance: ImputerModel) extends MLWriter { + private[ImputerModel] class ImputerModelWriter(instance: ImputerModel) extends MLWriter { private case class Data(surrogate: Double) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 959e8583070ef..06e754b21c6eb 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -35,7 +35,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) val model = imputer.fit(df) - model.transform(df).select("exp_" + strategy, "out").collect().foreach { + model.transform(df).select("expected_" + strategy, "out").collect().foreach { case Row(exp: Double, out: Double) => assert(exp ~== out absTol 1e-5, s"Imputed values differ. Expected: $exp, actual: $out") } @@ -53,7 +53,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) .setMissingValue(-1.0) val model = imputer.fit(df) - model.transform(df).select("exp_" + strategy, "out").collect().foreach { + model.transform(df).select("expected_" + strategy, "out").collect().foreach { case Row(exp: Double, out: Double) => assert((exp.isNaN && out.isNaN) || (exp ~== out absTol 1e-5), s"Imputed values differ. Expected: $exp, actual: $out") @@ -75,7 +75,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default .setMissingValue(-1) val model = imputer.fit(df) val result = model.transform(df) - model.transform(df).select("exp_" + strategy, "out").collect().foreach { + model.transform(df).select("expected_" + strategy, "out").collect().foreach { case Row(exp: Float, out: Float) => assert(exp == out, s"Imputed values differ. Expected: $exp, actual: $out") } @@ -95,7 +95,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out") .setStrategy(strategy) val model = imputer.fit(df2) - model.transform(df2).select("exp_" + strategy, "out").collect().foreach { + model.transform(df2).select("expected_" + strategy, "out").collect().foreach { case Row(exp: Double, out: Double) => assert(exp ~== out absTol 1e-5, s"Imputed values differ. 
Expected: $exp, actual: $out") } From cca8dd41714d79476c2bf23f706012a282c53bcb Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Sat, 30 Apr 2016 22:01:21 -0400 Subject: [PATCH 17/25] rename ut --- .../main/scala/org/apache/spark/ml/feature/Imputer.scala | 6 +++--- .../scala/org/apache/spark/ml/feature/ImputerSuite.scala | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 0ca4b524184ea..efb86ddcecfa0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -73,9 +73,9 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * :: Experimental :: - * Imputation estimator for completing missing values, either using the mean or the - * median of the column in which the missing values are located. InputCol should be - * of DoubleType or FloatType. + * Imputation estimator for completing missing values, either using the mean or the median + * of the column in which the missing values are located. The input column should be of + * DoubleType or FloatType. * * Note that the mean/median value is computed after filtering out missing values. * All Null values in the input column are treated as missing, and so are also imputed. diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 06e754b21c6eb..a2f4664e1a6b3 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -42,7 +42,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } - test("Imputer for Double with missing Value -1.0 and contains NaN") { + test("Imputer should handle NaNs when computing surrogate value, if missingValue is not NaN") { val df = sqlContext.createDataFrame( Seq( (0, 1.0, 1.0, 1.0), (1, 3.0, 3.0, 3.0), @@ -82,7 +82,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } - test("Imputer should impute null") { + test("Imputer should impute null as well as 'missingValue'") { val df = sqlContext.createDataFrame( Seq( (0, 4.0, 4.0, 4.0), (1, 10.0, 10.0, 10.0), From 4e0743139796ac53df2554cfa53736b8035bae15 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 3 May 2016 17:09:31 +0800 Subject: [PATCH 18/25] update parameter doc --- mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index efb86ddcecfa0..9030c87666a4e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -140,7 +140,7 @@ object Imputer extends DefaultParamsReadable[Imputer] { * :: Experimental :: * Model fitted by [[Imputer]]. * - * @param surrogate statistics value for each feature during fitting + * @param surrogate Value by which missing values in the input column will be replaced. 
*/ @Experimental class ImputerModel private[ml]( From 544a65c82a7d921bdff73998e8b350e11b51dcbe Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 7 Sep 2016 12:42:45 -0700 Subject: [PATCH 19/25] update version --- .../org/apache/spark/ml/feature/Imputer.scala | 16 ++++++++-------- .../apache/spark/ml/feature/ImputerSuite.scala | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 9030c87666a4e..c6ab55c9bfba7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -81,10 +81,10 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut * All Null values in the input column are treated as missing, and so are also imputed. */ @Experimental -class Imputer @Since("2.0.0")(override val uid: String) +class Imputer @Since("2.1.0")(override val uid: String) extends Estimator[ImputerModel] with ImputerParams with DefaultParamsWritable { - @Since("2.0.0") + @Since("2.1.0") def this() = this(Identifiable.randomUID("imputer")) /** @group setParam */ @@ -126,13 +126,13 @@ class Imputer @Since("2.0.0")(override val uid: String) } } -@Since("2.0.0") +@Since("2.1.0") object Imputer extends DefaultParamsReadable[Imputer] { /** Set of strategy names that Imputer currently supports. */ private[ml] val supportedStrategyNames = Set("mean", "median") - @Since("2.0.0") + @Since("2.1.0") override def load(path: String): Imputer = super.load(path) } @@ -175,12 +175,12 @@ class ImputerModel private[ml]( copyValues(copied, extra).setParent(parent) } - @Since("2.0.0") + @Since("2.1.0") override def write: MLWriter = new ImputerModelWriter(this) } -@Since("2.0.0") +@Since("2.1.0") object ImputerModel extends MLReadable[ImputerModel] { private[ImputerModel] class ImputerModelWriter(instance: ImputerModel) extends MLWriter { @@ -211,9 +211,9 @@ object ImputerModel extends MLReadable[ImputerModel] { } } - @Since("2.0.0") + @Since("2.1.0") override def read: MLReader[ImputerModel] = new ImputerReader - @Since("2.0.0") + @Since("2.1.0") override def load(path: String): ImputerModel = super.load(path) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index a2f4664e1a6b3..c61200f2224e6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.Row class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("Imputer for Double with default missing Value NaN") { - val df = sqlContext.createDataFrame( Seq( + val df = spark.createDataFrame( Seq( (0, 1.0, 1.0, 1.0), (1, 1.0, 1.0, 1.0), (2, 3.0, 3.0, 3.0), @@ -43,7 +43,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } test("Imputer should handle NaNs when computing surrogate value, if missingValue is not NaN") { - val df = sqlContext.createDataFrame( Seq( + val df = spark.createDataFrame( Seq( (0, 1.0, 1.0, 1.0), (1, 3.0, 3.0, 3.0), (2, Double.NaN, Double.NaN, Double.NaN), @@ -62,7 +62,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } test("Imputer for Float with missing Value -1.0") { - val df = sqlContext.createDataFrame( Seq( + val df = 
spark.createDataFrame( Seq( (0, 1.0F, 1.0F, 1.0F), (1, 3.0F, 3.0F, 3.0F), (2, 10.0F, 10.0F, 10.0F), @@ -83,7 +83,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } test("Imputer should impute null as well as 'missingValue'") { - val df = sqlContext.createDataFrame( Seq( + val df = spark.createDataFrame( Seq( (0, 4.0, 4.0, 4.0), (1, 10.0, 10.0, 10.0), (2, 10.0, 10.0, 10.0), From 91d4cee75a150ad2335dba0838c47cb4f0505ad8 Mon Sep 17 00:00:00 2001 From: Yuhao Date: Thu, 6 Oct 2016 17:39:51 -0700 Subject: [PATCH 20/25] throw exception --- .../org/apache/spark/ml/feature/Imputer.scala | 25 ++++++++++++------- .../spark/ml/feature/ImputerSuite.scala | 22 +++++++++++++--- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index c6ab55c9bfba7..e7477c616e6ee 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path +import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ @@ -51,6 +52,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** * The placeholder for the missing values. All occurrences of missingValue will be imputed. + * Note that null values are always treated as missing. * Default: Double.NaN * * @group param @@ -65,8 +67,6 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut protected def validateAndTransformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType SchemaUtils.checkColumnTypes(schema, $(inputCol), Seq(DoubleType, FloatType)) - require(!schema.fieldNames.contains($(outputCol)), - s"Output column ${$(outputCol)} already exists.") SchemaUtils.appendColumn(schema, $(outputCol), inputType) } } @@ -75,7 +75,8 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut * :: Experimental :: * Imputation estimator for completing missing values, either using the mean or the median * of the column in which the missing values are located. The input column should be of - * DoubleType or FloatType. + * DoubleType or FloatType. Currently Imputer does not support categorical features yet + * and possibly creates incorrect values for a categorical feature. * * Note that the mean/median value is computed after filtering out missing values. * All Null values in the input column are treated as missing, and so are also imputed. @@ -88,18 +89,22 @@ class Imputer @Since("2.1.0")(override val uid: String) def this() = this(Identifiable.randomUID("imputer")) /** @group setParam */ + @Since("2.1.0") def setInputCol(value: String): this.type = set(inputCol, value) /** @group setParam */ + @Since("2.1.0") def setOutputCol(value: String): this.type = set(outputCol, value) /** * Imputation strategy. Available options are ["mean", "median"]. 
* @group setParam */
+ @Since("2.1.0")
 def setStrategy(value: String): this.type = set(strategy, value)
 /** @group setParam */
+ @Since("2.1.0")
 def setMissingValue(value: Double): this.type = set(missingValue, value)
 setDefault(strategy -> "mean", missingValue -> Double.NaN)
 override def fit(dataset: Dataset[_]): ImputerModel = {
 transformSchema(dataset.schema, logging = true)
 val ic = col($(inputCol))
 val filtered = dataset.select(ic.cast(DoubleType))
 .filter(ic.isNotNull && ic =!= $(missingValue))
+ .filter(!ic.isNaN)
+ if (filtered.count() == 0) {
+ throw new SparkException(s"surrogate cannot be computed. " +
+ s"All the values in ${$(inputCol)} are Null, NaN or missingValue (${$(missingValue)})")
+ }
 val surrogate = $(strategy) match {
- case "mean" => filtered.filter(!ic.isNaN).select(avg($(inputCol))).first().getDouble(0)
+ case "mean" => filtered.select(avg($(inputCol))).first().getDouble(0)
 case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0)
 }
 copyValues(new ImputerModel(uid, surrogate).setParent(this))
 @@ -120,10 +130,7 @@ class Imputer @Since("2.1.0")(override val uid: String)
 validateAndTransformSchema(schema)
 }
- override def copy(extra: ParamMap): Imputer = {
- val copied = new Imputer(uid)
- copyValues(copied, extra)
- }
+ override def copy(extra: ParamMap): Imputer = defaultCopy(extra)
 }
 @Since("2.1.0")
 @@ -158,7 +165,7 @@ class ImputerModel private[ml](
 override def transform(dataset: Dataset[_]): DataFrame = {
 transformSchema(dataset.schema, logging = true)
- val inputType = dataset.select($(inputCol)).schema.fields(0).dataType
+ val inputType = dataset.schema($(inputCol)).dataType
 val ic = col($(inputCol))
 dataset.withColumn($(outputCol), when(ic.isNull, surrogate)
 .when(ic === $(missingValue), surrogate)
 diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
 index c61200f2224e6..292595d29fb34 100644
 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
 +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
 @@ -16,8 +16,8 @@
 */
 package org.apache.spark.ml.feature
-import org.apache.spark.SparkFunSuite
-import org.apache.spark.ml.util.DefaultReadWriteTest
+import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.ml.util.DefaultReadWriteTest
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 import org.apache.spark.sql.Row
 @@ -30,7 +30,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
 (1, 1.0, 1.0, 1.0),
 (2, 3.0, 3.0, 3.0),
 (3, 4.0, 4.0, 4.0),
- (4, Double.NaN, 2.25, 1.0)
+ (4, Double.NaN, 2.25, 3.0)
 )).toDF("id", "value", "expected_mean", "expected_median")
 Seq("mean", "median").foreach { strategy =>
 val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy)
 val model = imputer.fit(df)
 @@ -74,7 +74,6 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
 val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy)
 .setMissingValue(-1)
 val model = imputer.fit(df)
- val result = model.transform(df)
 model.transform(df).select("expected_" + strategy, "out").collect().foreach {
 case Row(exp: Float, out: Float) =>
 assert(exp == out, s"Imputed values differ. 
Expected: $exp, actual: $out") @@ -102,6 +101,21 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } + + test("Imputer throws exception when surrogate cannot be computed") { + val df = spark.createDataFrame( Seq( + (0, Double.NaN, 1.0, 1.0), + (1, Double.NaN, 3.0, 3.0), + (2, Double.NaN, Double.NaN, Double.NaN) + )).toDF("id", "value", "expected_mean", "expected_median") + Seq("mean", "median").foreach { strategy => + val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + intercept[SparkException] { + val model = imputer.fit(df) + } + } + } + test("Imputer read/write") { val t = new Imputer() .setInputCol("myInputCol") From 8744524e8da174316207cb4c33b425cbbd78f68e Mon Sep 17 00:00:00 2001 From: Yuhao Date: Fri, 7 Oct 2016 11:34:42 -0700 Subject: [PATCH 21/25] change data format --- .../org/apache/spark/ml/feature/Imputer.scala | 31 ++++---- .../spark/ml/feature/ImputerSuite.scala | 79 +++++++++---------- 2 files changed, 54 insertions(+), 56 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index e7477c616e6ee..c39b8b243d0fb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -25,7 +25,7 @@ import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ @@ -76,7 +76,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut * Imputation estimator for completing missing values, either using the mean or the median * of the column in which the missing values are located. The input column should be of * DoubleType or FloatType. Currently Imputer does not support categorical features yet - * and possibly creates incorrect values for a categorical feature. + * (SPARK-15041) and possibly creates incorrect values for a categorical feature. * * Note that the mean/median value is computed after filtering out missing values. * All Null values in the input column are treated as missing, and so are also imputed. @@ -123,7 +123,9 @@ class Imputer @Since("2.1.0")(override val uid: String) case "mean" => filtered.select(avg($(inputCol))).first().getDouble(0) case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0) } - copyValues(new ImputerModel(uid, surrogate).setParent(this)) + import dataset.sparkSession.implicits._ + val surrogateDF = Seq(surrogate.asInstanceOf[Double]).toDF($(inputCol)) + copyValues(new ImputerModel(uid, surrogateDF).setParent(this)) } override def transformSchema(schema: StructType): StructType = { @@ -147,12 +149,13 @@ object Imputer extends DefaultParamsReadable[Imputer] { * :: Experimental :: * Model fitted by [[Imputer]]. * - * @param surrogate Value by which missing values in the input column will be replaced. + * @param surrogateDF Value by which missing values in the input columns will be replaced. This + * is stored using DataFrame with input column names and the corresponding surrogates. 
*/ @Experimental class ImputerModel private[ml]( override val uid: String, - val surrogate: Double) + val surrogateDF: DataFrame) extends Model[ImputerModel] with ImputerParams with MLWritable { import ImputerModel._ @@ -167,8 +170,9 @@ class ImputerModel private[ml]( transformSchema(dataset.schema, logging = true) val inputType = dataset.schema($(inputCol)).dataType val ic = col($(inputCol)) - dataset.withColumn($(outputCol), when(ic.isNull, surrogate) - .when(ic === $(missingValue), surrogate) + val icsurrogate = surrogateDF.head().getDouble(0) + dataset.withColumn($(outputCol), when(ic.isNull, icsurrogate) + .when(ic === $(missingValue), icsurrogate) .otherwise(ic) .cast(inputType)) } @@ -178,7 +182,7 @@ class ImputerModel private[ml]( } override def copy(extra: ParamMap): ImputerModel = { - val copied = new ImputerModel(uid, surrogate) + val copied = new ImputerModel(uid, surrogateDF) copyValues(copied, extra).setParent(parent) } @@ -192,13 +196,10 @@ object ImputerModel extends MLReadable[ImputerModel] { private[ImputerModel] class ImputerModelWriter(instance: ImputerModel) extends MLWriter { - private case class Data(surrogate: Double) - override protected def saveImpl(path: String): Unit = { DefaultParamsWriter.saveMetadata(instance, path, sc) - val data = new Data(instance.surrogate) val dataPath = new Path(path, "data").toString - sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) + instance.surrogateDF.repartition(1).write.parquet(dataPath) } } @@ -209,10 +210,8 @@ object ImputerModel extends MLReadable[ImputerModel] { override def load(path: String): ImputerModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString - val Row(surrogate: Double) = sqlContext.read.parquet(dataPath) - .select("surrogate") - .head() - val model = new ImputerModel(metadata.uid, surrogate) + val surrogateDF = sqlContext.read.parquet(dataPath) + val model = new ImputerModel(metadata.uid, surrogateDF) DefaultParamsReader.getAndSetParams(model, metadata) model } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 292595d29fb34..a53047f33ad7a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -20,7 +20,7 @@ import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ -import org.apache.spark.sql.Row +import org.apache.spark.sql.{DataFrame, Row} class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { @@ -32,14 +32,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (3, 4.0, 4.0, 4.0), (4, Double.NaN, 2.25, 3.0) )).toDF("id", "value", "expected_mean", "expected_median") - Seq("mean", "median").foreach { strategy => - val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) - val model = imputer.fit(df) - model.transform(df).select("expected_" + strategy, "out").collect().foreach { - case Row(exp: Double, out: Double) => - assert(exp ~== out absTol 1e-5, s"Imputed values differ. 
Expected: $exp, actual: $out")
- }
- }
+ val imputer = new Imputer().setInputCol("value").setOutputCol("out")
+ ImputerSuite.iterateStrategyTest(imputer, df)
 }
 test("Imputer should handle NaNs when computing surrogate value, if missingValue is not NaN") {
 @@ -49,16 +43,9 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
 (2, Double.NaN, Double.NaN, Double.NaN),
 (3, -1.0, 2.0, 3.0)
 )).toDF("id", "value", "expected_mean", "expected_median")
- Seq("mean", "median").foreach { strategy =>
- val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy)
- .setMissingValue(-1.0)
- val model = imputer.fit(df)
- model.transform(df).select("expected_" + strategy, "out").collect().foreach {
- case Row(exp: Double, out: Double) =>
- assert((exp.isNaN && out.isNaN) || (exp ~== out absTol 1e-5),
- s"Imputed values differ. Expected: $exp, actual: $out")
- }
- }
+ val imputer = new Imputer().setInputCol("value").setOutputCol("out")
+ .setMissingValue(-1.0)
+ ImputerSuite.iterateStrategyTest(imputer, df)
 }
 test("Imputer for Float with missing Value -1.0") {
 @@ -69,16 +56,9 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
 (3, 10.0F, 10.0F, 10.0F),
 (4, -1.0F, 6.0F, 3.0F)
 )).toDF("id", "value", "expected_mean", "expected_median")
-
- Seq("mean", "median").foreach { strategy =>
- val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy)
- .setMissingValue(-1)
- val model = imputer.fit(df)
- model.transform(df).select("expected_" + strategy, "out").collect().foreach {
- case Row(exp: Float, out: Float) =>
- assert(exp == out, s"Imputed values differ. Expected: $exp, actual: $out")
- }
- }
+ val imputer = new Imputer().setInputCol("value").setOutputCol("out")
+ .setMissingValue(-1)
+ ImputerSuite.iterateStrategyTest(imputer, df)
 }
 test("Imputer should impute null as well as 'missingValue'") {
 @@ -90,15 +70,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
 (4, -1.0, 8.0, 10.0)
 )).toDF("id", "value", "expected_mean", "expected_median")
 val df2 = df.selectExpr("*", "IF(value=-1.0, null, value) as nullable_value")
- Seq("mean", "median").foreach { strategy =>
- val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out")
- .setStrategy(strategy)
- val model = imputer.fit(df2)
- model.transform(df2).select("expected_" + strategy, "out").collect().foreach {
- case Row(exp: Double, out: Double) =>
- assert(exp ~== out absTol 1e-5, s"Imputed values differ. Expected: $exp, actual: $out")
+ val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out")
+ ImputerSuite.iterateStrategyTest(imputer, df2)
 }
 @@ -125,12 +98,38 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
 }
 test("ImputerModel read/write") {
+ val spark = this.spark
+ import spark.implicits._
+ val surrogateDF = Seq(1.234).toDF("myInputCol")
+
 val instance = new ImputerModel(
- "myImputer", 1.234)
+ "myImputer", surrogateDF)
 .setInputCol("myInputCol")
 .setOutputCol("myOutputCol")
 val newInstance = testDefaultReadWrite(instance)
- assert(newInstance.surrogate === instance.surrogate)
+ assert(newInstance.surrogateDF.collect() === instance.surrogateDF.collect())
 }
 }
+
+object ImputerSuite {
+
+ /**
+ * Fits the given Imputer on the DataFrame with each supported strategy ("mean", "median")
+ * and verifies the imputed output column against the corresponding expected column.
+ * @param imputer the Imputer instance under test
+ * @param df DataFrame with columns "id", "value", "expected_mean", "expected_median" + */ + def iterateStrategyTest(imputer: Imputer, df: DataFrame): Unit = { + Seq("mean", "median").foreach { strategy => + imputer.setStrategy(strategy) + val model = imputer.fit(df) + model.transform(df).select("expected_" + strategy, "out").collect().foreach { + case Row(exp: Float, out: Float) => + assert((exp.isNaN && out.isNaN) || (exp == out), + s"Imputed values differ. Expected: $exp, actual: $out") + case Row(exp: Double, out: Double) => + assert((exp.isNaN && out.isNaN) || (exp ~== out absTol 1e-5), + s"Imputed values differ. Expected: $exp, actual: $out") + } + } + } +} From e86d9198c65c3b289b091150b52708deda37f090 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 21 Feb 2017 23:46:17 -0800 Subject: [PATCH 22/25] add multi column support --- .../org/apache/spark/ml/feature/Imputer.scala | 93 +++++++++++++------ .../spark/ml/feature/ImputerSuite.scala | 21 +++-- 2 files changed, 75 insertions(+), 39 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index c39b8b243d0fb..4a9f63810e088 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -23,16 +23,16 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.shared.{HasInputCols, HasOutputCol} import org.apache.spark.ml.util._ -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ /** * Params for [[Imputer]] and [[ImputerModel]]. */ -private[feature] trait ImputerParams extends Params with HasInputCol with HasOutputCol { +private[feature] trait ImputerParams extends Params with HasInputCols with HasOutputCol { /** * The imputation strategy. @@ -63,11 +63,32 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasOut /** @group getParam */ def getMissingValue: Double = $(missingValue) + /** + * Param for output column names. + * @group param + */ + final val outputCols: StringArrayParam = new StringArrayParam(this, "outputCols", + "output column names") + + /** @group getParam */ + final def getOutputCols: Array[String] = $(outputCols) + /** Validates and transforms the input schema. 
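   * For example (an illustrative case; "f1"/"f2" are hypothetical column names): with
   * inputCols = Array("f1", "f2") and outputCols = Array("f1_out", "f2_out"), an input schema
   * containing (f1: Double, f2: Float) is checked and extended with (f1_out: Double,
   * f2_out: Float), each output column keeping the type of its input column.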
*/ protected def validateAndTransformSchema(schema: StructType): StructType = { - val inputType = schema($(inputCol)).dataType - SchemaUtils.checkColumnTypes(schema, $(inputCol), Seq(DoubleType, FloatType)) - SchemaUtils.appendColumn(schema, $(outputCol), inputType) + require($(inputCols).length == $(outputCols).length, "inputCols and outputCols should have" + + "the same length") + val localInputCols = $(inputCols) + val localOutputCols = $(outputCols) + var outputSchema = schema + + $(inputCols).indices.foreach { i => + val inputCol = localInputCols(i) + val outputCol = localOutputCols(i) + val inputType = schema(inputCol).dataType + SchemaUtils.checkColumnTypes(schema, inputCol, Seq(DoubleType, FloatType)) + outputSchema = SchemaUtils.appendColumn(outputSchema, outputCol, inputType) + } + outputSchema } } @@ -90,11 +111,11 @@ class Imputer @Since("2.1.0")(override val uid: String) /** @group setParam */ @Since("2.1.0") - def setInputCol(value: String): this.type = set(inputCol, value) + def setInputCols(value: Array[String]): this.type = set(inputCols, value) /** @group setParam */ @Since("2.1.0") - def setOutputCol(value: String): this.type = set(outputCol, value) + def setOutputCols(value: Array[String]): this.type = set(outputCols, value) /** * Imputation strategy. Available options are ["mean", "median"]. @@ -111,20 +132,24 @@ class Imputer @Since("2.1.0")(override val uid: String) override def fit(dataset: Dataset[_]): ImputerModel = { transformSchema(dataset.schema, logging = true) - val ic = col($(inputCol)) - val filtered = dataset.select(ic.cast(DoubleType)) - .filter(ic.isNotNull && ic =!= $(missingValue)) - .filter(!ic.isNaN) - if(filtered.count() == 0) { - throw new SparkException(s"surrogate cannot be computed. " + - s"All the values in ${$(inputCol)} are Null, Nan or missingValue ($missingValue)") - } - val surrogate = $(strategy) match { - case "mean" => filtered.select(avg($(inputCol))).first().getDouble(0) - case "median" => filtered.stat.approxQuantile($(inputCol), Array(0.5), 0.001)(0) + val surrogates = $(inputCols).map { inputCol => + val ic = col(inputCol) + val filtered = dataset.select(ic.cast(DoubleType)) + .filter(ic.isNotNull && ic =!= $(missingValue)) + .filter(!ic.isNaN) + if(filtered.rdd.isEmpty()) { + throw new SparkException(s"surrogate cannot be computed. 
" + + s"All the values in ${inputCol} are Null, Nan or missingValue ($missingValue)") + } + val surrogate = $(strategy) match { + case "mean" => filtered.select(avg(inputCol)).first().getDouble(0) + case "median" => filtered.stat.approxQuantile(inputCol, Array(0.5), 0.001)(0) + } + surrogate.asInstanceOf[Double] } + import dataset.sparkSession.implicits._ - val surrogateDF = Seq(surrogate.asInstanceOf[Double]).toDF($(inputCol)) + val surrogateDF = Seq(surrogates).toDF("surrogates") copyValues(new ImputerModel(uid, surrogateDF).setParent(this)) } @@ -161,20 +186,30 @@ class ImputerModel private[ml]( import ImputerModel._ /** @group setParam */ - def setInputCol(value: String): this.type = set(inputCol, value) + def setInputCols(value: Array[String]): this.type = set(inputCols, value) /** @group setParam */ - def setOutputCol(value: String): this.type = set(outputCol, value) + def setOutputCols(value: Array[String]): this.type = set(outputCols, value) override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) - val inputType = dataset.schema($(inputCol)).dataType - val ic = col($(inputCol)) - val icsurrogate = surrogateDF.head().getDouble(0) - dataset.withColumn($(outputCol), when(ic.isNull, icsurrogate) - .when(ic === $(missingValue), icsurrogate) - .otherwise(ic) - .cast(inputType)) + val localInputCols = $(inputCols) + val localOutputCols = $(outputCols) + var outputDF = dataset + val surrogates = surrogateDF.head().getSeq[Double](0) + + $(inputCols).indices.foreach { i => + val inputCol = localInputCols(i) + val outputCol = localOutputCols(i) + val inputType = dataset.schema(inputCol).dataType + val ic = col(inputCol) + val icSurrogate = surrogates(i) + outputDF = outputDF.withColumn(outputCol, when(ic.isNull, icSurrogate) + .when(ic === $(missingValue), icSurrogate) + .otherwise(ic) + .cast(inputType)) + } + outputDF.toDF() } override def transformSchema(schema: StructType): StructType = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index a53047f33ad7a..5cabd7f204cee 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -30,9 +30,9 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (1, 1.0, 1.0, 1.0), (2, 3.0, 3.0, 3.0), (3, 4.0, 4.0, 4.0), - (4, Double.NaN, 2.25, 3.0) + (4, Double.NaN, 2.25, 1.0) )).toDF("id", "value", "expected_mean", "expected_median") - val imputer = new Imputer().setInputCol("value").setOutputCol("out") + val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) ImputerSuite.iterateStrategyTest(imputer, df) } @@ -43,7 +43,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, Double.NaN, Double.NaN, Double.NaN), (3, -1.0, 2.0, 3.0) )).toDF("id", "value", "expected_mean", "expected_median") - val imputer = new Imputer().setInputCol("value").setOutputCol("out") + val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) .setMissingValue(-1.0) ImputerSuite.iterateStrategyTest(imputer, df) } @@ -56,7 +56,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (3, 10.0F, 10.0F, 10.0F), (4, -1.0F, 6.0F, 3.0F) )).toDF("id", "value", "expected_mean", "expected_median") - val imputer = new Imputer().setInputCol("value").setOutputCol("out") + val imputer = new 
Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) .setMissingValue(-1) ImputerSuite.iterateStrategyTest(imputer, df) } @@ -70,7 +70,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (4, -1.0, 8.0, 10.0) )).toDF("id", "value", "expected_mean", "expected_median") val df2 = df.selectExpr("*", "IF(value=-1.0, null, value) as nullable_value") - val imputer = new Imputer().setInputCol("nullable_value").setOutputCol("out") + val imputer = new Imputer().setInputCols(Array("nullable_value")).setOutputCols(Array("out")) ImputerSuite.iterateStrategyTest(imputer, df2) } @@ -82,7 +82,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, Double.NaN, Double.NaN, Double.NaN) )).toDF("id", "value", "expected_mean", "expected_median") Seq("mean", "median").foreach { strategy => - val imputer = new Imputer().setInputCol("value").setOutputCol("out").setStrategy(strategy) + val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) + .setStrategy(strategy) intercept[SparkException] { val model = imputer.fit(df) } @@ -91,8 +92,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default test("Imputer read/write") { val t = new Imputer() - .setInputCol("myInputCol") - .setOutputCol("myOutputCol") + .setInputCols(Array("myInputCol")) + .setOutputCols(Array("myOutputCol")) .setMissingValue(-1.0) testDefaultReadWrite(t) } @@ -104,8 +105,8 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default val instance = new ImputerModel( "myImputer", surrogateDF) - .setInputCol("myInputCol") - .setOutputCol("myOutputCol") + .setInputCols(Array("myInputCol")) + .setOutputCols(Array("myOutputCol")) val newInstance = testDefaultReadWrite(instance) assert(newInstance.surrogateDF.collect() === instance.surrogateDF.collect()) } From 41d91b9ef855a611016c9a9613942e578ff599dd Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Fri, 3 Mar 2017 14:49:22 -0800 Subject: [PATCH 23/25] change surrogateDF format and add ut for multi-columns --- .../org/apache/spark/ml/feature/Imputer.scala | 118 +++++++++--------- .../spark/ml/feature/ImputerSuite.scala | 76 +++++++---- 2 files changed, 112 insertions(+), 82 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 4a9f63810e088..ec8920993921b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -23,7 +23,7 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared.{HasInputCols, HasOutputCol} +import org.apache.spark.ml.param.shared.HasInputCols import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ @@ -32,20 +32,21 @@ import org.apache.spark.sql.types._ /** * Params for [[Imputer]] and [[ImputerModel]]. */ -private[feature] trait ImputerParams extends Params with HasInputCols with HasOutputCol { +private[feature] trait ImputerParams extends Params with HasInputCols { /** * The imputation strategy. * If "mean", then replace missing values using the mean value of the feature. - * If "median", then replace missing values using the approximate median value of the feature. 
+ * If "median", then replace missing values using the approximate median value of the + * feature (relative error less than 0.001). * Default: mean * * @group param */ - final val strategy: Param[String] = new Param(this, "strategy", "strategy for imputation. " + - "If mean, then replace missing values using the mean value of the feature. " + - "If median, then replace missing values using the median value of the feature.", - ParamValidators.inArray[String](Imputer.supportedStrategyNames.toArray)) + final val strategy: Param[String] = new Param(this, "strategy", s"strategy for imputation. " + + s"If ${Imputer.mean}, then replace missing values using the mean value of the feature. " + + s"If ${Imputer.median}, then replace missing values using the median value of the feature.", + ParamValidators.inArray[String](Array(Imputer.mean, Imputer.median))) /** @group getParam */ def getStrategy: String = $(strategy) @@ -63,7 +64,7 @@ private[feature] trait ImputerParams extends Params with HasInputCols with HasOu /** @group getParam */ def getMissingValue: Double = $(missingValue) - /** + /** * Param for output column names. * @group param */ @@ -75,20 +76,18 @@ private[feature] trait ImputerParams extends Params with HasInputCols with HasOu /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { - require($(inputCols).length == $(outputCols).length, "inputCols and outputCols should have" + - "the same length") - val localInputCols = $(inputCols) - val localOutputCols = $(outputCols) - var outputSchema = schema - - $(inputCols).indices.foreach { i => - val inputCol = localInputCols(i) - val outputCol = localOutputCols(i) - val inputType = schema(inputCol).dataType + require($(inputCols).length == $(inputCols).distinct.length, s"inputCols duplicates:" + + s" (${$(inputCols).mkString(", ")})") + require($(outputCols).length == $(outputCols).distinct.length, s"outputCols duplicates:" + + s" (${$(outputCols).mkString(", ")})") + require($(inputCols).length == $(outputCols).length, s"inputCols(${$(inputCols).length})" + + s" and outputCols(${$(outputCols).length}) should have the same length") + val outputFields = $(inputCols).zip($(outputCols)).map { case (inputCol, outputCol) => + val inputField = schema(inputCol) SchemaUtils.checkColumnTypes(schema, inputCol, Seq(DoubleType, FloatType)) - outputSchema = SchemaUtils.appendColumn(outputSchema, outputCol, inputType) + StructField(outputCol, inputField.dataType, inputField.nullable) } - outputSchema + StructType(schema ++ outputFields) } } @@ -103,53 +102,56 @@ private[feature] trait ImputerParams extends Params with HasInputCols with HasOu * All Null values in the input column are treated as missing, and so are also imputed. */ @Experimental -class Imputer @Since("2.1.0")(override val uid: String) +class Imputer @Since("2.2.0")(override val uid: String) extends Estimator[ImputerModel] with ImputerParams with DefaultParamsWritable { - @Since("2.1.0") + @Since("2.2.0") def this() = this(Identifiable.randomUID("imputer")) /** @group setParam */ - @Since("2.1.0") + @Since("2.2.0") def setInputCols(value: Array[String]): this.type = set(inputCols, value) /** @group setParam */ - @Since("2.1.0") + @Since("2.2.0") def setOutputCols(value: Array[String]): this.type = set(outputCols, value) /** * Imputation strategy. Available options are ["mean", "median"]. 
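   * For example, strategy = "median" replaces each missing entry with the approximate median
   * of its column, computed via approxQuantile with a relative error of 0.001 (see fit below).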
* @group setParam */ - @Since("2.1.0") + @Since("2.2.0") def setStrategy(value: String): this.type = set(strategy, value) /** @group setParam */ - @Since("2.1.0") + @Since("2.2.0") def setMissingValue(value: Double): this.type = set(missingValue, value) - setDefault(strategy -> "mean", missingValue -> Double.NaN) + import org.apache.spark.ml.feature.Imputer._ + setDefault(strategy -> mean, missingValue -> Double.NaN) override def fit(dataset: Dataset[_]): ImputerModel = { transformSchema(dataset.schema, logging = true) + val spark = dataset.sparkSession + import spark.implicits._ val surrogates = $(inputCols).map { inputCol => val ic = col(inputCol) val filtered = dataset.select(ic.cast(DoubleType)) - .filter(ic.isNotNull && ic =!= $(missingValue)) - .filter(!ic.isNaN) + .filter(ic.isNotNull && ic =!= $(missingValue) && !ic.isNaN) if(filtered.rdd.isEmpty()) { throw new SparkException(s"surrogate cannot be computed. " + - s"All the values in ${inputCol} are Null, Nan or missingValue ($missingValue)") + s"All the values in $inputCol are Null, Nan or missingValue ($missingValue)") } val surrogate = $(strategy) match { - case "mean" => filtered.select(avg(inputCol)).first().getDouble(0) - case "median" => filtered.stat.approxQuantile(inputCol, Array(0.5), 0.001)(0) + case Imputer.mean => filtered.select(avg(inputCol)).as[Double].first() + case Imputer.median => filtered.stat.approxQuantile(inputCol, Array(0.5), 0.001).head } - surrogate.asInstanceOf[Double] + surrogate } - import dataset.sparkSession.implicits._ - val surrogateDF = Seq(surrogates).toDF("surrogates") + val rows = spark.sparkContext.parallelize(Seq(Row.fromSeq(surrogates))) + val schema = StructType($(inputCols).map(col => StructField(col, DoubleType, nullable = false))) + val surrogateDF = spark.createDataFrame(rows, schema) copyValues(new ImputerModel(uid, surrogateDF).setParent(this)) } @@ -160,13 +162,14 @@ class Imputer @Since("2.1.0")(override val uid: String) override def copy(extra: ParamMap): Imputer = defaultCopy(extra) } -@Since("2.1.0") +@Since("2.2.0") object Imputer extends DefaultParamsReadable[Imputer] { - /** Set of strategy names that Imputer currently supports. */ - private[ml] val supportedStrategyNames = Set("mean", "median") + /** strategy names that Imputer currently supports. */ + private[ml] val mean = "mean" + private[ml] val median = "median" - @Since("2.1.0") + @Since("2.2.0") override def load(path: String): Imputer = super.load(path) } @@ -174,8 +177,8 @@ object Imputer extends DefaultParamsReadable[Imputer] { * :: Experimental :: * Model fitted by [[Imputer]]. * - * @param surrogateDF Value by which missing values in the input columns will be replaced. This - * is stored using DataFrame with input column names and the corresponding surrogates. + * @param surrogateDF a DataFrame contains inputCols and their corresponding surrogates, which are + * used to replace the missing values in the input DataFrame. 
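+ *                   For example (an illustrative case), with inputCols = Array("f1", "f2")
+ *                   and strategy "mean", surrogateDF is a single-row DataFrame with schema
+ *                   (f1: Double, f2: Double) holding the two column means.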
*/ @Experimental class ImputerModel private[ml]( @@ -193,21 +196,18 @@ class ImputerModel private[ml]( override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) - val localInputCols = $(inputCols) - val localOutputCols = $(outputCols) var outputDF = dataset - val surrogates = surrogateDF.head().getSeq[Double](0) - - $(inputCols).indices.foreach { i => - val inputCol = localInputCols(i) - val outputCol = localOutputCols(i) - val inputType = dataset.schema(inputCol).dataType - val ic = col(inputCol) - val icSurrogate = surrogates(i) - outputDF = outputDF.withColumn(outputCol, when(ic.isNull, icSurrogate) - .when(ic === $(missingValue), icSurrogate) - .otherwise(ic) - .cast(inputType)) + val surrogates = surrogateDF.select($(inputCols).head, $(inputCols).tail: _*).head().toSeq + + $(inputCols).zip($(outputCols)).zip(surrogates).foreach { + case ((inputCol, outputCol), surrogate) => + val inputType = dataset.schema(inputCol).dataType + val ic = col(inputCol) + outputDF = outputDF.withColumn(outputCol, + when(ic.isNull, surrogate) + .when(ic === $(missingValue), surrogate) + .otherwise(ic) + .cast(inputType)) } outputDF.toDF() } @@ -221,12 +221,12 @@ class ImputerModel private[ml]( copyValues(copied, extra).setParent(parent) } - @Since("2.1.0") + @Since("2.2.0") override def write: MLWriter = new ImputerModelWriter(this) } -@Since("2.1.0") +@Since("2.2.0") object ImputerModel extends MLReadable[ImputerModel] { private[ImputerModel] class ImputerModelWriter(instance: ImputerModel) extends MLWriter { @@ -252,9 +252,9 @@ object ImputerModel extends MLReadable[ImputerModel] { } } - @Since("2.1.0") + @Since("2.2.0") override def read: MLReader[ImputerModel] = new ImputerReader - @Since("2.1.0") + @Since("2.2.0") override def load(path: String): ImputerModel = super.load(path) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala index 5cabd7f204cee..9e6392aa65fac 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala @@ -26,13 +26,15 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default test("Imputer for Double with default missing Value NaN") { val df = spark.createDataFrame( Seq( - (0, 1.0, 1.0, 1.0), - (1, 1.0, 1.0, 1.0), - (2, 3.0, 3.0, 3.0), - (3, 4.0, 4.0, 4.0), - (4, Double.NaN, 2.25, 1.0) - )).toDF("id", "value", "expected_mean", "expected_median") - val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) + (0, 1.0, 4.0, 1.0, 1.0, 4.0, 4.0), + (1, 11.0, 12.0, 11.0, 11.0, 12.0, 12.0), + (2, 3.0, Double.NaN, 3.0, 3.0, 10.0, 12.0), + (3, Double.NaN, 14.0, 5.0, 3.0, 14.0, 14.0) + )).toDF("id", "value1", "value2", "expected_mean_value1", "expected_median_value1", + "expected_mean_value2", "expected_median_value2") + val imputer = new Imputer() + .setInputCols(Array("value1", "value2")) + .setOutputCols(Array("out1", "out2")) ImputerSuite.iterateStrategyTest(imputer, df) } @@ -42,7 +44,7 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (1, 3.0, 3.0, 3.0), (2, Double.NaN, Double.NaN, Double.NaN), (3, -1.0, 2.0, 3.0) - )).toDF("id", "value", "expected_mean", "expected_median") + )).toDF("id", "value", "expected_mean_value", "expected_median_value") val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) .setMissingValue(-1.0) 
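    // Note: with missingValue = -1.0, NaN is excluded from the surrogate computation in fit(),
    // but transform() leaves it in place, which is why row 2 above still expects NaN.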
ImputerSuite.iterateStrategyTest(imputer, df) @@ -55,32 +57,31 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default (2, 10.0F, 10.0F, 10.0F), (3, 10.0F, 10.0F, 10.0F), (4, -1.0F, 6.0F, 3.0F) - )).toDF("id", "value", "expected_mean", "expected_median") + )).toDF("id", "value", "expected_mean_value", "expected_median_value") val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) .setMissingValue(-1) ImputerSuite.iterateStrategyTest(imputer, df) } test("Imputer should impute null as well as 'missingValue'") { - val df = spark.createDataFrame( Seq( + val rawDf = spark.createDataFrame( Seq( (0, 4.0, 4.0, 4.0), (1, 10.0, 10.0, 10.0), (2, 10.0, 10.0, 10.0), (3, Double.NaN, 8.0, 10.0), (4, -1.0, 8.0, 10.0) - )).toDF("id", "value", "expected_mean", "expected_median") - val df2 = df.selectExpr("*", "IF(value=-1.0, null, value) as nullable_value") - val imputer = new Imputer().setInputCols(Array("nullable_value")).setOutputCols(Array("out")) - ImputerSuite.iterateStrategyTest(imputer, df2) + )).toDF("id", "rawValue", "expected_mean_value", "expected_median_value") + val df = rawDf.selectExpr("*", "IF(rawValue=-1.0, null, rawValue) as value") + val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) + ImputerSuite.iterateStrategyTest(imputer, df) } - test("Imputer throws exception when surrogate cannot be computed") { val df = spark.createDataFrame( Seq( (0, Double.NaN, 1.0, 1.0), (1, Double.NaN, 3.0, 3.0), (2, Double.NaN, Double.NaN, Double.NaN) - )).toDF("id", "value", "expected_mean", "expected_median") + )).toDF("id", "value", "expected_mean_value", "expected_median_value") Seq("mean", "median").foreach { strategy => val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out")) .setStrategy(strategy) @@ -90,6 +91,30 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default } } + test("Imputer throws exception when inputCols does not match outputCols") { + val df = spark.createDataFrame( Seq( + (0, 1.0, 1.0, 1.0), + (1, Double.NaN, 3.0, 3.0), + (2, Double.NaN, Double.NaN, Double.NaN) + )).toDF("id", "value1", "value2", "value3") + Seq("mean", "median").foreach { strategy => + // inputCols and outCols length different + val imputer = new Imputer() + .setInputCols(Array("value1", "value2")) + .setOutputCols(Array("out1")) + .setStrategy(strategy) + intercept[IllegalArgumentException] { + val model = imputer.fit(df) + } + // duplicate name in inputCols + imputer.setInputCols(Array("value1", "value1")).setOutputCols(Array("out1, out2")) + intercept[IllegalArgumentException] { + val model = imputer.fit(df) + } + + } + } + test("Imputer read/write") { val t = new Imputer() .setInputCols(Array("myInputCol")) @@ -120,16 +145,21 @@ object ImputerSuite{ * @param df DataFrame with columns "id", "value", "expected_mean", "expected_median" */ def iterateStrategyTest(imputer: Imputer, df: DataFrame): Unit = { + val inputCols = imputer.getInputCols + Seq("mean", "median").foreach { strategy => imputer.setStrategy(strategy) val model = imputer.fit(df) - model.transform(df).select("expected_" + strategy, "out").collect().foreach { - case Row(exp: Float, out: Float) => - assert((exp.isNaN && out.isNaN) || (exp == out), - s"Imputed values differ. Expected: $exp, actual: $out") - case Row(exp: Double, out: Double) => - assert((exp.isNaN && out.isNaN) || (exp ~== out absTol 1e-5), - s"Imputed values differ. 
Expected: $exp, actual: $out") + val resultDF = model.transform(df) + imputer.getInputCols.zip(imputer.getOutputCols).foreach { case (inputCol, outputCol) => + resultDF.select(s"expected_${strategy}_$inputCol", outputCol).collect().foreach { + case Row(exp: Float, out: Float) => + assert((exp.isNaN && out.isNaN) || (exp == out), + s"Imputed values differ. Expected: $exp, actual: $out") + case Row(exp: Double, out: Double) => + assert((exp.isNaN && out.isNaN) || (exp ~== out absTol 1e-5), + s"Imputed values differ. Expected: $exp, actual: $out") + } } } } From e378db5944d7d8bed0ebadc0573a3ea03fe387f0 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Mon, 6 Mar 2017 13:15:22 -0800 Subject: [PATCH 24/25] unit test refine and comments update --- .../org/apache/spark/ml/feature/Imputer.scala | 19 ++++--- .../spark/ml/feature/ImputerSuite.scala | 49 +++++++++++++------ 2 files changed, 43 insertions(+), 25 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index ec8920993921b..6d3121c870721 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -37,8 +37,7 @@ private[feature] trait ImputerParams extends Params with HasInputCols { /** * The imputation strategy. * If "mean", then replace missing values using the mean value of the feature. - * If "median", then replace missing values using the approximate median value of the - * feature (relative error less than 0.001). + * If "median", then replace missing values using the approximate median value of the feature. * Default: mean * * @group param @@ -76,10 +75,10 @@ private[feature] trait ImputerParams extends Params with HasInputCols { /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { - require($(inputCols).length == $(inputCols).distinct.length, s"inputCols duplicates:" + - s" (${$(inputCols).mkString(", ")})") - require($(outputCols).length == $(outputCols).distinct.length, s"outputCols duplicates:" + - s" (${$(outputCols).mkString(", ")})") + require($(inputCols).length == $(inputCols).distinct.length, s"inputCols contains" + + s" duplicates: (${$(inputCols).mkString(", ")})") + require($(outputCols).length == $(outputCols).distinct.length, s"outputCols contains" + + s" duplicates: (${$(outputCols).mkString(", ")})") require($(inputCols).length == $(outputCols).length, s"inputCols(${$(inputCols).length})" + s" and outputCols(${$(outputCols).length}) should have the same length") val outputFields = $(inputCols).zip($(outputCols)).map { case (inputCol, outputCol) => @@ -99,7 +98,8 @@ private[feature] trait ImputerParams extends Params with HasInputCols { * (SPARK-15041) and possibly creates incorrect values for a categorical feature. * * Note that the mean/median value is computed after filtering out missing values. - * All Null values in the input column are treated as missing, and so are also imputed. + * All Null values in the input column are treated as missing, and so are also imputed. For + * computing median, DataFrameStatFunctions.approxQuantile is used with a relative error of 0.001. 
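+ *
+ * An illustrative usage sketch (assuming a DataFrame df with numeric columns "a" and "b"):
+ * {{{
+ *   val model = new Imputer()
+ *     .setInputCols(Array("a", "b"))
+ *     .setOutputCols(Array("a_out", "b_out"))
+ *     .setStrategy("median")
+ *     .fit(df)
+ *   val imputed = model.transform(df)
+ * }}}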
 */
@Experimental
class Imputer @Since("2.2.0")(override val uid: String)
  extends Estimator[ImputerModel] with ImputerParams with DefaultParamsWritable {
@@ -127,9 +127,9 @@ class Imputer @Since("2.2.0")(override val uid: String)
  @Since("2.2.0")
  def setMissingValue(value: Double): this.type = set(missingValue, value)

-  import org.apache.spark.ml.feature.Imputer._
-  setDefault(strategy -> mean, missingValue -> Double.NaN)
+  setDefault(strategy -> Imputer.mean, missingValue -> Double.NaN)

  override def fit(dataset: Dataset[_]): ImputerModel = {
    transformSchema(dataset.schema, logging = true)
@@ -196,7 +196,7 @@ class ImputerModel private[ml](
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    var outputDF = dataset
-    val surrogates = surrogateDF.select($(inputCols).head, $(inputCols).tail: _*).head().toSeq
+    val surrogates = surrogateDF.select($(inputCols).map(col): _*).head().toSeq

    $(inputCols).zip($(outputCols)).zip(surrogates).foreach {
      case ((inputCol, outputCol), surrogate) =>
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
index 9e6392aa65fac..ee2ba73fa96d5 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
@@ -85,33 +85,51 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
    Seq("mean", "median").foreach { strategy =>
      val imputer = new Imputer().setInputCols(Array("value")).setOutputCols(Array("out"))
        .setStrategy(strategy)
-      intercept[SparkException] {
-        val model = imputer.fit(df)
+      withClue("Imputer should fail when all the values are invalid") {
+        val e: SparkException = intercept[SparkException] {
+          val model = imputer.fit(df)
+        }
+        assert(e.getMessage.contains("surrogate cannot be computed"))
      }
    }
  }

-  test("Imputer throws exception when inputCols does not match outputCols") {
+  test("Imputer input & output column validation") {
    val df = spark.createDataFrame( Seq(
      (0, 1.0, 1.0, 1.0),
      (1, Double.NaN, 3.0, 3.0),
      (2, Double.NaN, Double.NaN, Double.NaN)
    )).toDF("id", "value1", "value2", "value3")
    Seq("mean", "median").foreach { strategy =>
+      withClue("Imputer should fail if inputCols and outputCols are different length") {
+        val e: IllegalArgumentException = intercept[IllegalArgumentException] {
+          val imputer = new Imputer().setStrategy(strategy)
+            .setInputCols(Array("value1", "value2"))
+            .setOutputCols(Array("out1"))
+          val model = imputer.fit(df)
+        }
+        assert(e.getMessage.contains("should have the same length"))
+      }

+      withClue("Imputer should fail if inputCols contains duplicates") {
+        val e: IllegalArgumentException = intercept[IllegalArgumentException] {
+          val imputer = new Imputer().setStrategy(strategy)
+            .setInputCols(Array("value1", "value1"))
+            .setOutputCols(Array("out1", "out2"))
+          val model = imputer.fit(df)
+        }
+        assert(e.getMessage.contains("inputCols contains duplicates"))
+      }

+      withClue("Imputer should fail if outputCols contains duplicates") {
+        val e: IllegalArgumentException = intercept[IllegalArgumentException] {
+          val imputer = new
Imputer().setStrategy(strategy)
+            .setInputCols(Array("value1", "value2"))
+            .setOutputCols(Array("out1", "out1"))
+          val model = imputer.fit(df)
+        }
+        assert(e.getMessage.contains("outputCols contains duplicates"))
+      }
    }
  }

@@ -133,12 +151,13 @@ class ImputerSuite extends SparkFunSuite with MLlibTestSparkContext with Default
      .setInputCols(Array("myInputCol"))
      .setOutputCols(Array("myOutputCol"))
    val newInstance = testDefaultReadWrite(instance)
+    assert(newInstance.surrogateDF.columns === instance.surrogateDF.columns)
    assert(newInstance.surrogateDF.collect() === instance.surrogateDF.collect())
  }
}

-object ImputerSuite{
+object ImputerSuite {

  /**
   * Imputation strategy. Available options are ["mean", "median"].

From c67afc11e1fee58b65da67bf3e25e5245f72280d Mon Sep 17 00:00:00 2001
From: Yuhao Yang
Date: Wed, 8 Mar 2017 10:37:12 -0800
Subject: [PATCH 25/25] fix exception message

---
 .../src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
index 6d3121c870721..b1a802ee13fc4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
@@ -137,9 +137,9 @@ class Imputer @Since("2.2.0")(override val uid: String)
      val ic = col(inputCol)
      val filtered = dataset.select(ic.cast(DoubleType))
        .filter(ic.isNotNull && ic =!= $(missingValue) && !ic.isNaN)
-      if(filtered.rdd.isEmpty()) {
+      if(filtered.take(1).length == 0) {
        throw new SparkException(s"surrogate cannot be computed. " +
-          s"All the values in $inputCol are Null, Nan or missingValue ($missingValue)")
+          s"All the values in $inputCol are null, NaN or missingValue(${$(missingValue)})")
      }
      val surrogate = $(strategy) match {
        case Imputer.mean => filtered.select(avg(inputCol)).as[Double].first()
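
To make the behavior the series converges on concrete, here is a minimal end-to-end sketch (illustrative only, not part of any patch; it assumes a SparkSession in scope named spark):

    import org.apache.spark.ml.feature.Imputer

    // One NaN in each column; with the default missingValue (NaN), both get imputed.
    val df = spark.createDataFrame(Seq(
      (1.0, Double.NaN),
      (2.0, 3.0),
      (Double.NaN, 5.0)
    )).toDF("a", "b")

    val model = new Imputer()
      .setInputCols(Array("a", "b"))
      .setOutputCols(Array("a_out", "b_out"))
      .setStrategy("mean") // surrogates: a -> (1.0 + 2.0) / 2 = 1.5, b -> (3.0 + 5.0) / 2 = 4.0
      .fit(df)

    // "a_out" carries 1.5 where "a" was NaN; "b_out" carries 4.0 where "b" was NaN.
    model.transform(df).show()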