docs/ml-features.md
@@ -1284,6 +1284,72 @@ for more details on the API.

</div>


## Imputer

The `Imputer` transformer completes missing values in a dataset, using either the mean or the
median of the columns in which the missing values are located. The input columns should be of
`DoubleType` or `FloatType`. `Imputer` does not currently support categorical features, and may
produce incorrect values for columns containing categorical features.

**Note** that all `null` values in the input columns are treated as missing, and so are also imputed.

**Examples**

Suppose that we have a DataFrame with the columns `a` and `b`:

~~~
a | b
------------|-----------
1.0 | Double.NaN
2.0 | Double.NaN
Double.NaN | 3.0
4.0 | 4.0
5.0 | 5.0
~~~

In this example, `Imputer` replaces all occurrences of `Double.NaN` (the default missing value)
with the mean (the default imputation strategy) computed from the other values in the
corresponding column. Here, the surrogate values for columns `a` and `b` are 3.0 and 4.0
respectively. After transformation, the missing values in the output columns are replaced by the
surrogate value of the corresponding column.

~~~
a | b | out_a | out_b
------------|------------|-------|-------
1.0 | Double.NaN | 1.0 | 4.0
2.0 | Double.NaN | 2.0 | 4.0
Double.NaN | 3.0 | 3.0 | 3.0
4.0 | 4.0 | 4.0 | 4.0
5.0 | 5.0 | 5.0 | 5.0
~~~
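
Both the strategy and the missing-value placeholder can be overridden via the `strategy` and
`missingValue` params. Below is a minimal Scala sketch (assuming the `df` from the Scala example
that follows) that imputes with the per-column median instead of the mean:

~~~
import org.apache.spark.ml.feature.Imputer

// Use the approximate per-column median, rather than the mean, as the surrogate.
// Note: `null` values are always treated as missing, regardless of the missingValue setting.
val medianImputer = new Imputer()
  .setInputCols(Array("a", "b"))
  .setOutputCols(Array("out_a", "out_b"))
  .setStrategy("median")
  // .setMissingValue(-1.0) would treat -1.0, instead of Double.NaN, as missing

val medianModel = medianImputer.fit(df)
medianModel.transform(df).show()
~~~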

<div class="codetabs">
<div data-lang="scala" markdown="1">

Refer to the [Imputer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Imputer)
for more details on the API.

{% include_example scala/org/apache/spark/examples/ml/ImputerExample.scala %}
</div>

<div data-lang="java" markdown="1">

Refer to the [Imputer Java docs](api/java/org/apache/spark/ml/feature/Imputer.html)
for more details on the API.

{% include_example java/org/apache/spark/examples/ml/JavaImputerExample.java %}
</div>

<div data-lang="python" markdown="1">

Refer to the [Imputer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Imputer)
for more details on the API.

{% include_example python/ml/imputer_example.py %}
</div>
</div>

# Feature Selectors

## VectorSlicer
examples/src/main/java/org/apache/spark/examples/ml/JavaImputerExample.java
@@ -0,0 +1,71 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.ml;

// $example on$
import java.util.Arrays;
import java.util.List;

import org.apache.spark.ml.feature.Imputer;
import org.apache.spark.ml.feature.ImputerModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.*;
// $example off$

import static org.apache.spark.sql.types.DataTypes.*;

/**
 * An example demonstrating Imputer.
 * Run with:
 *   bin/run-example ml.JavaImputerExample
 */
public class JavaImputerExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession
      .builder()
      .appName("JavaImputerExample")
      .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
      RowFactory.create(1.0, Double.NaN),
      RowFactory.create(2.0, Double.NaN),
      RowFactory.create(Double.NaN, 3.0),
      RowFactory.create(4.0, 4.0),
      RowFactory.create(5.0, 5.0)
    );
    StructType schema = new StructType(new StructField[]{
      createStructField("a", DoubleType, false),
      createStructField("b", DoubleType, false)
    });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    Imputer imputer = new Imputer()
      .setInputCols(new String[]{"a", "b"})
      .setOutputCols(new String[]{"out_a", "out_b"});

    ImputerModel model = imputer.fit(df);
    model.transform(df).show();
    // $example off$

    spark.stop();
  }
}
examples/src/main/python/ml/imputer_example.py
@@ -0,0 +1,50 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# $example on$
from pyspark.ml.feature import Imputer
# $example off$
from pyspark.sql import SparkSession

"""
An example demonstrating Imputer.
Run with:
bin/spark-submit examples/src/main/python/ml/imputer_example.py
"""

if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("ImputerExample")\
.getOrCreate()

# $example on$
df = spark.createDataFrame([
(1.0, float("nan")),
(2.0, float("nan")),
(float("nan"), 3.0),
(4.0, 4.0),
(5.0, 5.0)
], ["a", "b"])

imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
model = imputer.fit(df)

model.transform(df).show()
# $example off$

spark.stop()
examples/src/main/scala/org/apache/spark/examples/ml/ImputerExample.scala
@@ -0,0 +1,56 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Imputer
// $example off$
import org.apache.spark.sql.SparkSession

**@MLnick** (Contributor) commented on Mar 21, 2017:

> Most examples have a small doc string that includes a "Run with:" part - see e.g. the recent MinHashLSHExample (this should also be added for the Java example)

/**
 * An example demonstrating Imputer.
 * Run with:
 *   bin/run-example ml.ImputerExample
 */
object ImputerExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("ImputerExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (1.0, Double.NaN),
      (2.0, Double.NaN),
      (Double.NaN, 3.0),
      (4.0, 4.0),
      (5.0, 5.0)
    )).toDF("a", "b")

    val imputer = new Imputer()
      .setInputCols(Array("a", "b"))
      .setOutputCols(Array("out_a", "out_b"))

    val model = imputer.fit(df)
    model.transform(df).show()
    // $example off$

    spark.stop()
  }
}
mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
@@ -35,7 +35,7 @@ import org.apache.spark.sql.types._
private[feature] trait ImputerParams extends Params with HasInputCols {

  /**
-  * The imputation strategy.
+  * The imputation strategy. Currently only "mean" and "median" are supported.
   * If "mean", then replace missing values using the mean value of the feature.
   * If "median", then replace missing values using the approximate median value of the feature.
   * Default: mean
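
The "median" strategy is approximate: the surrogate is computed with Spark's approximate-quantile
machinery rather than an exact sort over each column. A rough Scala sketch of the same idea using
the public DataFrame API (the DataFrame `df` and the relative-error value are illustrative;
`Imputer`'s internals may differ):

~~~
// Approximate median of column "a": the 0.5 quantile with a small relative
// error bound. A smaller bound is more accurate but more expensive to compute.
// null and NaN entries are removed from the column before the calculation.
val Array(approxMedian) = df.stat.approxQuantile("a", Array(0.5), 0.001)
println(s"surrogate for column 'a' under the median strategy: $approxMedian")
~~~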