11 changes: 8 additions & 3 deletions docs/mllib-feature-extraction.md
@@ -240,11 +240,11 @@ following parameters in the constructor:

* `withMean` False by default. Centers the data with mean before scaling. It will build a dense
output, so this does not work on sparse input and will raise an exception.
* `withStd` True by default. Scales the data to unit variance.
* `withStd` True by default. Scales the data to unit standard deviation.

We provide a [`fit`](api/scala/index.html#org.apache.spark.mllib.feature.StandardScaler) method in
`StandardScaler` which can take an input of `RDD[Vector]`, learn the summary statistics, and then
return a model which can transform the input dataset into unit variance and/or zero mean features
return a model which can transform the input dataset into unit standard deviation and/or zero mean features
depending on how we configure the `StandardScaler`.

This model implements [`VectorTransformer`](api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer)
@@ -257,7 +257,7 @@ for that feature.
### Example

The example below demonstrates how to load a dataset in libsvm format, and standardize the features
so that the new features have unit variance and/or zero mean.
so that the new features have unit standard deviation and/or zero mean.

<div class="codetabs">
<div data-lang="scala">
@@ -271,6 +271,8 @@ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

val scaler1 = new StandardScaler().fit(data.map(x => x.features))
val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features))
// scaler3 is an identical model to scaler2, and will produce identical transformations
val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean)

// data1 will be unit variance.
val data1 = data.map(x => (x.label, scaler1.transform(x.features)))
@@ -294,6 +296,9 @@ features = data.map(lambda x: x.features)

scaler1 = StandardScaler().fit(features)
scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)
# scaler3 is an identical model to scaler2, and will produce identical transformations
scaler3 = StandardScalerModel(scaler2.std, scaler2.mean)


# data1 will be unit variance.
data1 = label.zip(scaler1.transform(features))
mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala
@@ -18,15 +18,15 @@
package org.apache.spark.mllib.feature

import org.apache.spark.Logging
import org.apache.spark.annotation.Experimental
import org.apache.spark.annotation.{DeveloperApi, Experimental}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.rdd.RDDFunctions._
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.rdd.RDD

/**
* :: Experimental ::
* Standardizes features by removing the mean and scaling to unit variance using column summary
* Standardizes features by removing the mean and scaling to unit std using column summary
* statistics on the samples in the training set.
*
* @param withMean False by default. Centers the data with mean before scaling. It will build a
@@ -53,36 +53,55 @@ class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging {
val summary = data.treeAggregate(new MultivariateOnlineSummarizer)(
(aggregator, data) => aggregator.add(data),
(aggregator1, aggregator2) => aggregator1.merge(aggregator2))
new StandardScalerModel(withMean, withStd, summary.mean, summary.variance)
new StandardScalerModel(
Vectors.dense(summary.variance.toArray.map(v => math.sqrt(v))),
summary.mean,
withStd,
withMean)
}
}

/**
* :: Experimental ::
* Represents a StandardScaler model that can transform vectors.
*
* @param withMean whether to center the data before scaling
* @param withStd whether to scale the data to have unit standard deviation
* @param std column standard deviation values
* @param mean column mean values
* @param variance column variance values
* @param withStd whether to scale the data to have unit standard deviation
* @param withMean whether to center the data before scaling
*/
@Experimental
class StandardScalerModel private[mllib] (
val withMean: Boolean,
val withStd: Boolean,
class StandardScalerModel (
val std: Vector,
val mean: Vector,
val variance: Vector) extends VectorTransformer {

require(mean.size == variance.size)
var withStd: Boolean,
var withMean: Boolean) extends VectorTransformer {

private lazy val factor: Array[Double] = {
val f = Array.ofDim[Double](variance.size)
var i = 0
while (i < f.size) {
f(i) = if (variance(i) != 0.0) 1.0 / math.sqrt(variance(i)) else 0.0
i += 1
def this(std: Vector, mean: Vector) {
this(std, mean, withStd = std != null, withMean = mean != null)
require(this.withStd || this.withMean,
"at least one of std or mean vectors must be provided")
if (this.withStd && this.withMean) {
require(mean.size == std.size,
"mean and std vectors must have equal size if both are provided")
}
f
}

Member: The default argument is not friendly for Java though; why don't we add another constructor which takes only mean and variance?

Member: Also, users will want to know if withMean or withStd is used; do we really need to have them as private variables?

Member: Moving require to the bottom of this constructor. Also add the @DeveloperApi annotation to both the setWithMean and setWithStd APIs.

Contributor: I have a question about this API. If the default withMean is false, why do we require mean in the constructor? If the feature dimension is really large, this puts some extra cost that cannot be ignored. Similarly, should we take std directly instead of variance in the constructor? My proposal is the following (a rough sketch of this layout follows the list):

  • StandardScalerModel(std: Vector, mean: Vector, withStd: Boolean, withMean: Boolean). I put std in front of mean because scaling is used more frequently than shifting.
  • this(std: Vector, mean: Vector): enable withMean and withStd based on whether the input arguments are null or not. Throw an exception if both are null.
  • this(std: Vector) = this(std, null).
  • setWithMean and setWithStd check whether the corresponding mean/std is null or not and throw exceptions if a user wants to set it to true while the value is null.
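For concreteness, a minimal sketch of the constructor layout proposed above, using a placeholder class name and omitting the transform logic (this is not the merged implementation):

```scala
import org.apache.spark.mllib.linalg.Vector

// Sketch of the proposed constructor layout only; ProposedScalerModel is a hypothetical name.
class ProposedScalerModel(
    val std: Vector,
    val mean: Vector,
    var withStd: Boolean,
    var withMean: Boolean) {

  // Flags are inferred from which statistics are supplied; null means "not used".
  def this(std: Vector, mean: Vector) =
    this(std, mean, withStd = std != null, withMean = mean != null)

  // Scaling-only constructor, since scaling is used more often than shifting.
  def this(std: Vector) = this(std, null)
}
```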

Member: Sounds reasonable to me. Although the changes will be larger, this will be more handy and save extra space if withMean is not used.

Contributor Author: @mengxr Just to make sure I'm clear, are you suggesting changing the StandardScalerModel to take the standard deviation vector (instead of variance)? Or are you just calling it 'std' for short?

Member: In my opinion, taking variance will be ideal since it's the output of MultivariateOnlineSummarizer.
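For context on the tradeoff being discussed: with a std-based constructor, a caller who builds a model straight from a summarizer has to convert variance to standard deviation, much as the updated fit method above does. A minimal sketch, assuming `summary` is an already-populated MultivariateOnlineSummarizer:

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

val summary: MultivariateOnlineSummarizer = ??? // assumed to be already fed with the feature vectors

// The summarizer exposes variance, so a std-based constructor needs an explicit sqrt per column.
val std = Vectors.dense(summary.variance.toArray.map(math.sqrt))
val model = new StandardScalerModel(std, summary.mean)
```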

def this(std: Vector) = this(std, null)

@DeveloperApi
def setWithMean(withMean: Boolean): this.type = {
require(!(withMean && this.mean == null),"cannot set withMean to true while mean is null")
this.withMean = withMean
this
}

@DeveloperApi
def setWithStd(withStd: Boolean): this.type = {
require(!(withStd && this.std == null),
"cannot set withStd to true while std is null")
this.withStd = withStd
this
}

// Since `shift` will be only used in `withMean` branch, we have it as
@@ -94,8 +113,8 @@ class StandardScalerModel private[mllib] (
* Applies standardization transformation on a vector.
*
* @param vector Vector to be standardized.
* @return Standardized vector. If the variance of a column is zero, it will return default `0.0`
* for the column with zero variance.
* @return Standardized vector. If the std of a column is zero, it will return default `0.0`
* for the column with zero std.
*/
override def transform(vector: Vector): Vector = {
require(mean.size == vector.size)
@@ -109,11 +128,9 @@ class StandardScalerModel private[mllib] (
val values = vs.clone()
val size = values.size
if (withStd) {
// Having a local reference of `factor` to avoid overhead as the comment before.
val localFactor = factor
var i = 0
while (i < size) {
values(i) = (values(i) - localShift(i)) * localFactor(i)
values(i) = if (std(i) != 0.0) (values(i) - localShift(i)) * (1.0 / std(i)) else 0.0
i += 1
}
} else {
@@ -127,15 +144,13 @@ class StandardScalerModel private[mllib] (
case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
}
} else if (withStd) {
// Having a local reference of `factor` to avoid overhead as the comment before.
val localFactor = factor
vector match {
case DenseVector(vs) =>
val values = vs.clone()
val size = values.size
var i = 0
while(i < size) {
values(i) *= localFactor(i)
values(i) *= (if (std(i) != 0.0) 1.0 / std(i) else 0.0)
i += 1
}
Vectors.dense(values)
@@ -146,7 +161,7 @@ class StandardScalerModel private[mllib] (
val nnz = values.size
var i = 0
while (i < nnz) {
values(i) *= localFactor(indices(i))
values(i) *= (if (std(indices(i)) != 0.0) 1.0 / std(indices(i)) else 0.0)
i += 1
}
Vectors.sparse(size, indices, values)