
Commit f2e7a69

imputer doc and example
1 parent ee91a0d commit f2e7a69

File tree

4 files changed: +178, -2 lines changed

docs/ml-features.md

Lines changed: 56 additions & 1 deletion
@@ -1284,6 +1284,61 @@ for more details on the API.

</div>

## Imputer

Imputation transformer for completing missing values in the dataset, using either the mean or the
median of the columns in which the missing values are located. The input columns should be of
`DoubleType` or `FloatType`. Currently `Imputer` does not support categorical features, and may
produce incorrect values for columns containing categorical features. All null values in the input
columns are treated as missing, and so are also imputed.

**Examples**

Suppose that we have a DataFrame with the columns `a` and `b`:

~~~
     a      |      b
------------|-----------
     1.0    | Double.NaN
     2.0    | Double.NaN
 Double.NaN |     3.0
     4.0    |     4.0
     5.0    |     5.0
~~~

By default, `Imputer` replaces every `Double.NaN` (the missing value) with the mean (the default
strategy) of the other values in the corresponding column. In our example, the surrogates for `a`
and `b` are 3.0 and 4.0 respectively. After transformation, the output columns no longer contain
any missing values.

~~~
     a      |      b     | out_a | out_b
------------|------------|-------|-------
     1.0    | Double.NaN |  1.0  |  4.0
     2.0    | Double.NaN |  2.0  |  4.0
 Double.NaN |     3.0    |  3.0  |  3.0
     4.0    |     4.0    |  4.0  |  4.0
     5.0    |     5.0    |  5.0  |  5.0
~~~

<div class="codetabs">
<div data-lang="scala" markdown="1">

Refer to the [Imputer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Imputer)
for more details on the API.

{% include_example scala/org/apache/spark/examples/ml/ImputerExample.scala %}
</div>

<div data-lang="java" markdown="1">

Refer to the [Imputer Java docs](api/java/org/apache/spark/ml/feature/Imputer.html)
for more details on the API.

{% include_example java/org/apache/spark/examples/ml/JavaImputerExample.java %}
</div>
</div>

# Feature Selectors

## VectorSlicer
@@ -1625,4 +1680,4 @@ for more details on the API.
 
 {% include_example python/ml/min_hash_lsh_example.py %}
 </div>
-</div>
+</div>
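
Aside (not part of this commit): the surrogate values quoted in the new doc text can be sanity-checked by hand, since the "mean" strategy simply averages the non-missing entries of each column: (1.0 + 2.0 + 4.0 + 5.0) / 4 = 3.0 for `a` and (3.0 + 4.0 + 5.0) / 3 = 4.0 for `b`. A minimal, self-contained Scala sketch of that arithmetic (plain Scala, no Spark; names are illustrative only):

~~~
// Not part of the commit: verifies the surrogate values used in the ml-features.md example.
// NaN entries are excluded before taking the mean, matching the doc's description of "mean".
object SurrogateCheck {
  def meanIgnoringNaN(xs: Seq[Double]): Double = {
    val present = xs.filterNot(_.isNaN)
    present.sum / present.size
  }

  def main(args: Array[String]): Unit = {
    println(meanIgnoringNaN(Seq(1.0, 2.0, Double.NaN, 4.0, 5.0)))        // 3.0 -> surrogate for "a"
    println(meanIgnoringNaN(Seq(Double.NaN, Double.NaN, 3.0, 4.0, 5.0))) // 4.0 -> surrogate for "b"
  }
}
~~~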
examples/src/main/java/org/apache/spark/examples/ml/JavaImputerExample.java

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.ml;

// $example on$
import java.util.Arrays;
import java.util.List;

import org.apache.spark.ml.feature.Imputer;
import org.apache.spark.ml.feature.ImputerModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.*;
// $example off$

import static org.apache.spark.sql.types.DataTypes.*;

public class JavaImputerExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession
      .builder()
      .appName("JavaImputerExample")
      .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
      RowFactory.create(1.0, Double.NaN),
      RowFactory.create(2.0, Double.NaN),
      RowFactory.create(Double.NaN, 3.0),
      RowFactory.create(4.0, 4.0),
      RowFactory.create(5.0, 5.0)
    );
    StructType schema = new StructType(new StructField[]{
      createStructField("a", DoubleType, false),
      createStructField("b", DoubleType, false)
    });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    // Replace missing values (Double.NaN) in columns "a" and "b" with the column mean
    Imputer imputer = new Imputer()
      .setStrategy("mean")
      .setInputCols(new String[]{"a", "b"})
      .setOutputCols(new String[]{"out_a", "out_b"});

    ImputerModel model = imputer.fit(df);
    model.transform(df).show();
    // $example off$

    spark.stop();
  }
}
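
Aside (not part of this commit): the doc section above also mentions the "median" strategy. Below is a hedged Scala sketch of how the same toy data might be imputed with it, additionally assuming the `setMissingValue` setter on `Imputer` for treating a sentinel such as -1.0 (rather than `Double.NaN`) as missing; object and app names are illustrative only.

~~~
// A sketch only, not part of this commit. Assumes Imputer's "median" strategy and a
// setMissingValue parameter; here -1.0 is treated as the missing-value sentinel.
import org.apache.spark.ml.feature.Imputer
import org.apache.spark.sql.SparkSession

object MedianImputerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("MedianImputerSketch").getOrCreate()

    val df = spark.createDataFrame(Seq(
      (1.0, -1.0),
      (2.0, -1.0),
      (-1.0, 3.0),
      (4.0, 4.0),
      (5.0, 5.0)
    )).toDF("a", "b")

    val imputer = new Imputer()
      .setStrategy("median")         // use the (approximate) median instead of the mean
      .setMissingValue(-1.0)         // treat -1.0 as missing rather than Double.NaN (assumed setter)
      .setInputCols(Array("a", "b"))
      .setOutputCols(Array("out_a", "out_b"))

    imputer.fit(df).transform(df).show()

    spark.stop()
  }
}
~~~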
examples/src/main/scala/org/apache/spark/examples/ml/ImputerExample.scala

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Imputer
// $example off$
import org.apache.spark.sql.SparkSession

object ImputerExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("ImputerExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (1.0, Double.NaN),
      (2.0, Double.NaN),
      (Double.NaN, 3.0),
      (4.0, 4.0),
      (5.0, 5.0)
    )).toDF("a", "b")

    // Replace missing values (Double.NaN) in columns "a" and "b" with the column mean
    val imputer = new Imputer()
      .setStrategy("mean")
      .setInputCols(Array("a", "b"))
      .setOutputCols(Array("out_a", "out_b"))

    val model = imputer.fit(df)
    model.transform(df).show()
    // $example off$

    spark.stop()
  }
}
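
A possible follow-up to the Scala example (not part of this commit): after fitting, the computed surrogates can be inspected directly, assuming `ImputerModel` exposes them via a `surrogateDF` member. The snippet continues from the `imputer` and `df` values defined in the example above.

~~~
// Continuation of ImputerExample above (a sketch, not part of the commit).
// Assumes ImputerModel exposes the fitted per-column surrogates as a small DataFrame.
val model = imputer.fit(df)
model.surrogateDF.show()   // expected per the doc example: 3.0 for "a", 4.0 for "b"
model.transform(df).select("a", "b", "out_a", "out_b").show()
~~~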

mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala

Lines changed: 3 additions & 1 deletion
@@ -35,7 +35,7 @@ import org.apache.spark.sql.types._
 private[feature] trait ImputerParams extends Params with HasInputCols {
 
   /**
-   * The imputation strategy.
+   * The imputation strategy. Currently only "mean" and "median" are supported.
    * If "mean", then replace missing values using the mean value of the feature.
    * If "median", then replace missing values using the approximate median value of the feature.
    * Default: mean
@@ -75,6 +75,8 @@ private[feature] trait ImputerParams extends Params with HasInputCols {
 
   /** Validates and transforms the input schema. */
   protected def validateAndTransformSchema(schema: StructType): StructType = {
+    require(get(inputCols).isDefined, "Input cols must be defined first.")
+    require(get(outputCols).isDefined, "Output cols must be defined first.")
     require($(inputCols).length == $(inputCols).distinct.length, s"inputCols contains" +
       s" duplicates: (${$(inputCols).mkString(", ")})")
     require($(outputCols).length == $(outputCols).distinct.length, s"outputCols contains" +
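
The two new `require` calls make the failure mode explicit when the columns have not been set. A short Scala sketch of the behavior they introduce (assumption: the `require` failure surfaces as an `IllegalArgumentException` during `fit`; object and app names are illustrative):

~~~
// A sketch of the behavior added by the new require checks (not part of the commit).
// Fitting an Imputer whose input/output columns were never set should now fail fast
// with the new message instead of a generic missing-param error (assumed behavior).
import org.apache.spark.ml.feature.Imputer
import org.apache.spark.sql.SparkSession
import scala.util.Try

object ImputerValidationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("ImputerValidationSketch").getOrCreate()
    val df = spark.createDataFrame(Seq((1.0, 2.0), (3.0, 4.0))).toDF("a", "b")

    val unconfigured = new Imputer()   // neither setInputCols nor setOutputCols called
    val result = Try(unconfigured.fit(df))
    println(result)   // expected: Failure(...) carrying "Input cols must be defined first."

    spark.stop()
  }
}
~~~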
