From a0725a5a30b47345de93111cb437bf35c9c1a64b Mon Sep 17 00:00:00 2001
From: Yuhao Yang
Date: Fri, 3 Jul 2015 16:54:54 +0800
Subject: [PATCH 1/4] doc for MinMaxScaler

---
 docs/ml-features.md | 69 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index f88c0248c1a8..4941312aecf7 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -865,6 +865,7 @@ val scaledData = scalerModel.transform(dataFrame)
 {% highlight java %}
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.ml.feature.StandardScaler;
+import org.apache.spark.ml.feature.StandardScalerModel;
 import org.apache.spark.mllib.regression.LabeledPoint;
 import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.sql.DataFrame;
@@ -905,6 +906,74 @@ scaledData = scalerModel.transform(dataFrame)
 
 
 
+## MinMaxScaler
+
+`MinMaxScaler` transforms a dataset of `Vector` rows, rescaling each feature to a specific range (often [0, 1]). It takes two parameters:
+
+* `min`: 0.0 by default. Lower bound after transformation, shared by all features.
+* `max`: 1.0 by default. Upper bound after transformation, shared by all features.
+
+`MinMaxScaler` is a `Model` which can be `fit` on a dataset to produce a `MinMaxScalerModel`; this amounts to computing summary statistics. The model can then transform each feature individually such that it is in the given range.
+
+The rescaled value for a feature E is calculated as,
+
+Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min
+
+For the case E_{max} == E_{min}, Rescaled(e_i) = 0.5 * (max + min)
+
+Note that since zero values will probably be transformed to non-zero values, the output of the transformer will be a `DenseVector`, even for sparse input.
+
+More details can be found in the API docs for
+[MinMaxScaler](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler) and
+[MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel).
+
+The following example demonstrates how to load a dataset in libsvm format and then rescale each feature to [0, 1].
+
+<div class="codetabs">
+<div data-lang="scala">
+{% highlight scala %} +import org.apache.spark.ml.feature.MinMaxScaler +import org.apache.spark.mllib.util.MLUtils + +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") +val dataFrame = sqlContext.createDataFrame(data) +val scaler = new MinMaxScaler() + .setInputCol("features") + .setOutputCol("scaledFeatures") + +// Compute summary statistics by fitting the StandardScaler +val scalerModel = scaler.fit(dataFrame) + +// Normalize each feature to have unit standard deviation. +val scaledData = scalerModel.transform(dataFrame) +{% endhighlight %} +
+</div>
+
+<div data-lang="java">
+{% highlight java %}
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.MinMaxScaler;
+import org.apache.spark.ml.feature.MinMaxScalerModel;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.sql.DataFrame;
+
+JavaRDD<LabeledPoint> data =
+  MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
+DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+MinMaxScaler scaler = new MinMaxScaler()
+  .setInputCol("features")
+  .setOutputCol("scaledFeatures");
+
+// Compute summary statistics by fitting the StandardScaler
+MinMaxScalerModel scalerModel = scaler.fit(dataFrame);
+
+// Normalize each feature to have unit standard deviation.
+DataFrame scaledData = scalerModel.transform(dataFrame);
+{% endhighlight %}
+</div>
+</div>
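To make the rescaling formula above concrete: a feature column holding the three values 1.0, 2.0, and 3.0 has `E_{min} = 1.0` and `E_{max} = 3.0`, so with the default `min = 0.0` and `max = 1.0` the rescaled values come out as 0.0, 0.5, and 1.0. The sketch below works that arithmetic through the same shell environment (`sc`, `sqlContext`) that the snippets in this patch assume; the tiny inline dataset is purely illustrative.

{% highlight scala %}
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.mllib.linalg.Vectors

// One feature, three rows: E_min = 1.0, E_max = 3.0.
val df = sqlContext.createDataFrame(Seq(
  (0, Vectors.dense(1.0)),
  (1, Vectors.dense(2.0)),
  (2, Vectors.dense(3.0))
)).toDF("id", "features")

val scaler = new MinMaxScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")

// Rescaled(e_i) = (e_i - 1.0) / (3.0 - 1.0) * (1.0 - 0.0) + 0.0,
// giving 0.0, 0.5, and 1.0 for the three rows.
scaler.fit(df).transform(df).show()
{% endhighlight %}

A constant feature (`E_{max} == E_{min}`) leaves the fraction undefined, which is why the text pins that case to the midpoint `0.5 * (max + min)` instead of dividing by zero.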
+
 ## Bucketizer
 
 `Bucketizer` transforms a column of continuous features to a column of feature buckets, where the buckets are specified by users. It takes a parameter:

From 40942a7d8a9913cb4406a7661393bdd160b84690 Mon Sep 17 00:00:00 2001
From: Yuhao Yang
Date: Mon, 3 Aug 2015 14:34:29 +0800
Subject: [PATCH 2/4] small update

---
 docs/ml-features.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 4941312aecf7..aa2f835cff07 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -913,7 +913,7 @@ scaledData = scalerModel.transform(dataFrame)
 * `min`: 0.0 by default. Lower bound after transformation, shared by all features.
 * `max`: 1.0 by default. Upper bound after transformation, shared by all features.
 
-`MinMaxScaler` is a `Model` which can be `fit` on a dataset to produce a `MinMaxScalerModel`; this amounts to computing summary statistics. The model can then transform each feature individually such that it is in the given range.
+`MinMaxScaler` computes summary statistics on a dataset and produces a `MinMaxScalerModel`. The model can then transform each feature individually such that it is in the given range.
 
 The rescaled value for a feature E is calculated as,
 

From b6ac0fc6eae4a03e3498891e9ee4ebfde418af8f Mon Sep 17 00:00:00 2001
From: Yuhao Yang
Date: Thu, 13 Aug 2015 13:48:22 +0800
Subject: [PATCH 3/4] fix comments

---
 docs/ml-features.md | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index e58be64ac12b..a2faa06dd788 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1109,21 +1109,20 @@ scaledData = scalerModel.transform(dataFrame)
 `MinMaxScaler` computes summary statistics on a dataset and produces a `MinMaxScalerModel`. The model can then transform each feature individually such that it is in the given range.
 
 The rescaled value for a feature E is calculated as,
-
+`\begin{equation}
 Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min
-
-For the case E_{max} == E_{min}, Rescaled(e_i) = 0.5 * (max + min)
+\end{equation}`
+For the case `E_{max} == E_{min}`, `Rescaled(e_i) = 0.5 * (max + min)`
 
 Note that since zero values will probably be transformed to non-zero values, the output of the transformer will be a `DenseVector`, even for sparse input.
 
-More details can be found in the API docs for
-[MinMaxScaler](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler) and
-[MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel).
-
 The following example demonstrates how to load a dataset in libsvm format and then rescale each feature to [0, 1].
 
 <div class="codetabs">
 <div data-lang="scala">
+More details can be found in the API docs for
+[MinMaxScaler](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler) and
+[MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel).
 {% highlight scala %}
 import org.apache.spark.ml.feature.MinMaxScaler
 import org.apache.spark.mllib.util.MLUtils
 
 val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
 val dataFrame = sqlContext.createDataFrame(data)
 val scaler = new MinMaxScaler()
   .setInputCol("features")
   .setOutputCol("scaledFeatures")
 
-// Compute summary statistics by fitting the StandardScaler
+// Compute summary statistics and generate a MinMaxScalerModel
 val scalerModel = scaler.fit(dataFrame)
 
-// Normalize each feature to have unit standard deviation.
+// Rescale each feature to the range [min, max].
 val scaledData = scalerModel.transform(dataFrame)
 {% endhighlight %}
 </div>
 
 <div data-lang="java">
+More details can be found in the API docs for
+[MinMaxScaler](api/java/index.html#org.apache.spark.ml.feature.MinMaxScaler) and
+[MinMaxScalerModel](api/java/index.html#org.apache.spark.ml.feature.MinMaxScalerModel).
 {% highlight java %}
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.ml.feature.MinMaxScaler;
 import org.apache.spark.ml.feature.MinMaxScalerModel;
 import org.apache.spark.mllib.regression.LabeledPoint;
 import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.sql.DataFrame;
 
 JavaRDD<LabeledPoint> data =
   MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
 DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
 MinMaxScaler scaler = new MinMaxScaler()
   .setInputCol("features")
   .setOutputCol("scaledFeatures");
 
-// Compute summary statistics by fitting the StandardScaler
+// Compute summary statistics and generate a MinMaxScalerModel
 MinMaxScalerModel scalerModel = scaler.fit(dataFrame);
 
-// Normalize each feature to have unit standard deviation.
+// Rescale each feature to the range [min, max].
 DataFrame scaledData = scalerModel.transform(dataFrame);
 {% endhighlight %}
 </div>
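Neither example in this patch exercises the `min` and `max` parameters listed at the top of the section. They are set through the `setMin` and `setMax` setters; the sketch below rescales to [-1, 1], an arbitrary illustrative choice, under the same assumed shell environment as the surrounding snippets.

{% highlight scala %}
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.mllib.util.MLUtils

val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
val dataFrame = sqlContext.createDataFrame(data)

// Shared bounds for every feature: rescale to [-1, 1] rather than the default [0, 1].
val scaler = new MinMaxScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .setMin(-1.0)
  .setMax(1.0)

val scaledData = scaler.fit(dataFrame).transform(dataFrame)
{% endhighlight %}

Since `E_{min}` and `E_{max}` are the summary statistics captured at `fit` time, transforming a different dataset with the same model reuses those bounds, and feature values outside the fitted range will land outside [min, max].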
From 1622f854244f35157dcdf5964e6c477dabf58f6c Mon Sep 17 00:00:00 2001
From: unknown
Date: Tue, 25 Aug 2015 07:35:14 +0800
Subject: [PATCH 4/4] update document

---
 docs/ml-features.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index a2faa06dd788..fb600f048888 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1119,7 +1119,7 @@ Note that since zero values will probably be transformed to non-zero values, the
 
 The following example demonstrates how to load a dataset in libsvm format and then rescale each feature to [0, 1].
 
 <div class="codetabs">
-<div data-lang="scala">
+<div data-lang="scala" markdown="1">
 More details can be found in the API docs for
 [MinMaxScaler](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler) and
 [MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel).
@@ -1141,10 +1141,10 @@ val scaledData = scalerModel.transform(dataFrame)
 {% endhighlight %}
 </div>
 
-<div data-lang="java">
+<div data-lang="java" markdown="1">
More details can be found in the API docs for -[MinMaxScaler](api/java/index.html#org.apache.spark.ml.feature.MinMaxScaler) and -[MinMaxScalerModel](api/java/index.html#org.apache.spark.ml.feature.MinMaxScalerModel). +[MinMaxScaler](api/java/org/apache/spark/ml/feature/MinMaxScaler.html) and +[MinMaxScalerModel](api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html). {% highlight java %} import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.MinMaxScaler;
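As a closing illustration of the caveat that survives all four patches, that rescaled output is dense even for sparse input: a zero generally falls strictly between `E_{min}` and `E_{max}`, so it maps to a non-zero value and the sparse encoding cannot be kept. A minimal sketch under the same assumed shell environment:

{% highlight scala %}
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.mllib.linalg.Vectors

// Row 0 is sparse: of its three features, only the middle one is non-zero.
val df = sqlContext.createDataFrame(Seq(
  (0, Vectors.sparse(3, Array(1), Array(5.0))),
  (1, Vectors.dense(-1.0, 10.0, 2.0))
)).toDF("id", "features")

val model = new MinMaxScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .fit(df)

// For the first feature, E_min = -1.0 and E_max = 0.0, so row 0's implicit
// zero rescales to 1.0 -- and the scaled column holds dense vectors throughout.
model.transform(df).select("scaledFeatures").show(false)
{% endhighlight %}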