
Commit fa64dfa

[SPARK-5207] [MLLIB] [WIP] change StandardScalerModel to take stddev instead of variance
1 parent: 9078fe0

3 files changed: +44 -49 lines

docs/mllib-feature-extraction.md

Lines changed: 5 additions & 5 deletions
@@ -240,11 +240,11 @@ following parameters in the constructor:
 
 * `withMean` False by default. Centers the data with mean before scaling. It will build a dense
 output, so this does not work on sparse input and will raise an exception.
-* `withStd` True by default. Scales the data to unit variance.
+* `withStd` True by default. Scales the data to unit standard deviation.
 
 We provide a [`fit`](api/scala/index.html#org.apache.spark.mllib.feature.StandardScaler) method in
 `StandardScaler` which can take an input of `RDD[Vector]`, learn the summary statistics, and then
-return a model which can transform the input dataset into unit variance and/or zero mean features
+return a model which can transform the input dataset into unit standard deviation and/or zero mean features
 depending how we configure the `StandardScaler`.
 
 This model implements [`VectorTransformer`](api/scala/index.html#org.apache.spark.mllib.feature.VectorTransformer)
@@ -257,7 +257,7 @@ for that feature.
 ### Example
 
 The example below demonstrates how to load a dataset in libsvm format, and standardize the features
-so that the new features have unit variance and/or zero mean.
+so that the new features have unit standard deviation and/or zero mean.
 
 <div class="codetabs">
 <div data-lang="scala">
@@ -272,7 +272,7 @@ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
 val scaler1 = new StandardScaler().fit(data.map(x => x.features))
 val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features))
 // scaler3 is an identical model to scaler2, and will produce identical transformations
-val scaler3 = new StandardScalerModel(scaler2.variance, scaler2.mean)
+val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean)
 
 // data1 will be unit variance.
 val data1 = data.map(x => (x.label, scaler1.transform(x.features)))
@@ -297,7 +297,7 @@ features = data.map(lambda x: x.features)
 scaler1 = StandardScaler().fit(features)
 scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)
 # scaler3 is an identical model to scaler2, and will produce identical transformations
-scaler3 = StandardScalerModel(scaler2.variance, scaler2.mean)
+scaler3 = StandardScalerModel(scaler2.std, scaler2.mean)
 
 
 # data1 will be unit variance.
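
With this change, `scaler3` in the example is built from `scaler2.std` rather than `scaler2.variance`. A minimal migration sketch for callers that still hold a per-column variance vector (the `oldVariance` and `mean` values below are illustrative, not from the commit): convert variances to standard deviations with an element-wise square root, exactly as `StandardScaler.fit` now does internally.

import org.apache.spark.mllib.feature.StandardScalerModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

// Illustrative caller-side values; real code would take these from a fitted
// model or a MultivariateOnlineSummarizer.
val oldVariance: Vector = Vectors.dense(4.0, 9.0, 0.25)
val mean: Vector = Vectors.dense(1.0, 2.0, 3.0)

// The model now expects standard deviations, so take the element-wise
// square root of the variances before constructing it.
val std: Vector = Vectors.dense(oldVariance.toArray.map(math.sqrt))
val scaler = new StandardScalerModel(std, mean)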

mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala

Lines changed: 24 additions & 29 deletions
@@ -26,7 +26,7 @@ import org.apache.spark.rdd.RDD
 
 /**
  * :: Experimental ::
- * Standardizes features by removing the mean and scaling to unit standard deviation using column summary
+ * Standardizes features by removing the mean and scaling to unit std using column summary
  * statistics on the samples in the training set.
  *
  * @param withMean False by default. Centers the data with mean before scaling. It will build a
@@ -53,33 +53,41 @@ class StandardScaler(withMean: Boolean, withStd: Boolean) extends Logging {
     val summary = data.treeAggregate(new MultivariateOnlineSummarizer)(
       (aggregator, data) => aggregator.add(data),
       (aggregator1, aggregator2) => aggregator1.merge(aggregator2))
-    new StandardScalerModel(summary.variance, summary.mean, withStd, withMean)
+    new StandardScalerModel(
+      Vectors.dense(summary.variance.toArray.map(v => math.sqrt(v))),
+      summary.mean,
+      withStd,
+      withMean)
   }
 }
 
 /**
  * :: Experimental ::
  * Represents a StandardScaler model that can transform vectors.
  *
- * @param variance column variance values
+ * @param std column standard deviation values
  * @param mean column mean values
  * @param withStd whether to scale the data to have unit standard deviation
  * @param withMean whether to center the data before scaling
  */
 @Experimental
 class StandardScalerModel (
-    val variance: Vector,
+    val std: Vector,
     val mean: Vector,
     var withStd: Boolean,
     var withMean: Boolean) extends VectorTransformer {
 
-  def this(variance: Vector, mean: Vector) {
-    this(variance, mean, withStd = variance != null, withMean = mean != null)
-    require(this.withStd || this.withMean, "at least one of variance or mean vectors must be provided")
-    if (this.withStd && this.withMean) require(mean.size == variance.size, "mean and variance vectors must have equal size if both are provided")
+  def this(std: Vector, mean: Vector) {
+    this(std, mean, withStd = std != null, withMean = mean != null)
+    require(this.withStd || this.withMean,
+      "at least one of std or mean vectors must be provided")
+    if (this.withStd && this.withMean) {
+      require(mean.size == std.size,
+        "mean and std vectors must have equal size if both are provided")
+    }
   }
 
-  def this(variance: Vector) = this(variance, null)
+  def this(std: Vector) = this(std, null)
 
   @DeveloperApi
   def setWithMean(withMean: Boolean): this.type = {
@@ -90,21 +98,12 @@ class StandardScalerModel (
 
   @DeveloperApi
   def setWithStd(withStd: Boolean): this.type = {
-    require(!(withStd && this.variance == null), "cannot set withStd to true while variance is null")
+    require(!(withStd && this.std == null),
+      "cannot set withStd to true while std is null")
     this.withStd = withStd
     this
   }
 
-  private lazy val factor: Array[Double] = {
-    val f = Array.ofDim[Double](variance.size)
-    var i = 0
-    while (i < f.size) {
-      f(i) = if (variance(i) != 0.0) 1.0 / math.sqrt(variance(i)) else 0.0
-      i += 1
-    }
-    f
-  }
-
   // Since `shift` will be only used in `withMean` branch, we have it as
   // `lazy val` so it will be evaluated in that branch. Note that we don't
   // want to create this array multiple times in `transform` function.
@@ -114,8 +113,8 @@ class StandardScalerModel (
    * Applies standardization transformation on a vector.
    *
    * @param vector Vector to be standardized.
-   * @return Standardized vector. If the standard deviation of a column is zero, it will return default `0.0`
-   *         for the column with zero standard deviation.
+   * @return Standardized vector. If the std of a column is zero, it will return default `0.0`
+   *         for the column with zero std.
    */
   override def transform(vector: Vector): Vector = {
     require(mean.size == vector.size)
@@ -129,11 +128,9 @@ class StandardScalerModel (
           val values = vs.clone()
           val size = values.size
           if (withStd) {
-            // Having a local reference of `factor` to avoid overhead as the comment before.
-            val localFactor = factor
            var i = 0
            while (i < size) {
-              values(i) = (values(i) - localShift(i)) * localFactor(i)
+              values(i) = if (std(i) != 0.0) (values(i) - localShift(i)) * (1.0 / std(i)) else 0.0
              i += 1
            }
          } else {
@@ -147,15 +144,13 @@ class StandardScalerModel (
         case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
       }
     } else if (withStd) {
-      // Having a local reference of `factor` to avoid overhead as the comment before.
-      val localFactor = factor
       vector match {
         case DenseVector(vs) =>
           val values = vs.clone()
           val size = values.size
           var i = 0
           while(i < size) {
-            values(i) *= localFactor(i)
+            values(i) *= (if (std(i) != 0.0) 1.0 / std(i) else 0.0)
             i += 1
           }
           Vectors.dense(values)
@@ -166,7 +161,7 @@ class StandardScalerModel (
           val nnz = values.size
           var i = 0
           while (i < nnz) {
-            values(i) *= localFactor(indices(i))
+            values(i) *= (if (std(indices(i)) != 0.0) 1.0 / std(indices(i)) else 0.0)
             i += 1
           }
           Vectors.sparse(size, indices, values)
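
The rewritten transform drops the cached `factor` array and divides by `std(i)` directly, returning 0.0 for zero-std columns. As a standalone illustration of that per-element rule (a sketch, not code from the commit; plain Scala arrays stand in for MLlib vectors):

// Subtract the column mean, then scale by 1 / std, emitting 0.0 for columns
// whose standard deviation is zero — the same rule as the dense branch above.
def standardize(values: Array[Double], mean: Array[Double], std: Array[Double]): Array[Double] = {
  require(values.length == mean.length && mean.length == std.length)
  Array.tabulate(values.length) { i =>
    if (std(i) != 0.0) (values(i) - mean(i)) * (1.0 / std(i)) else 0.0
  }
}

// Example: the zero-std column (index 2) maps to 0.0 rather than dividing by zero.
// standardize(Array(3.0, 5.0, 7.0), Array(1.0, 2.0, 7.0), Array(2.0, 3.0, 0.0))
//   == Array(1.0, 1.0, 0.0)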

mllib/src/test/scala/org/apache/spark/mllib/feature/StandardScalerSuite.scala

Lines changed: 15 additions & 15 deletions
@@ -60,7 +60,7 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext {
       (aggregator1, aggregator2) => aggregator1.merge(aggregator2))
   }
 
-  test("Standardization with dense input when means and variances are provided") {
+  test("Standardization with dense input when means and stds are provided") {
 
     val dataRDD = sc.parallelize(denseData, 3)
 
@@ -72,9 +72,9 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext {
     val model2 = standardizer2.fit(dataRDD)
     val model3 = standardizer3.fit(dataRDD)
 
-    val equivalentModel1 = new StandardScalerModel(model1.variance, model1.mean)
-    val equivalentModel2 = new StandardScalerModel(model2.variance, model2.mean, true, false)
-    val equivalentModel3 = new StandardScalerModel(model3.variance, model3.mean, false, true)
+    val equivalentModel1 = new StandardScalerModel(model1.std, model1.mean)
+    val equivalentModel2 = new StandardScalerModel(model2.std, model2.mean, true, false)
+    val equivalentModel3 = new StandardScalerModel(model3.std, model3.mean, false, true)
 
     val data1 = denseData.map(equivalentModel1.transform)
     val data2 = denseData.map(equivalentModel2.transform)
@@ -193,7 +193,7 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext {
   }
 
 
-  test("Standardization with sparse input when means and variances are provided") {
+  test("Standardization with sparse input when means and stds are provided") {
 
     val dataRDD = sc.parallelize(sparseData, 3)
 
@@ -205,9 +205,9 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext {
     val model2 = standardizer2.fit(dataRDD)
     val model3 = standardizer3.fit(dataRDD)
 
-    val equivalentModel1 = new StandardScalerModel(model1.variance, model1.mean)
-    val equivalentModel2 = new StandardScalerModel(model2.variance, model2.mean, true, false)
-    val equivalentModel3 = new StandardScalerModel(model3.variance, model3.mean, false, true)
+    val equivalentModel1 = new StandardScalerModel(model1.std, model1.mean)
+    val equivalentModel2 = new StandardScalerModel(model2.std, model2.mean, true, false)
+    val equivalentModel3 = new StandardScalerModel(model3.std, model3.mean, false, true)
 
     val data2 = sparseData.map(equivalentModel2.transform)
 
@@ -288,7 +288,7 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext {
     assert(data2(5) ~== Vectors.sparse(3, Seq((1, 0.71580142))) absTol 1E-5)
   }
 
-  test("Standardization with constant input when means and variances are provided") {
+  test("Standardization with constant input when means and stds are provided") {
 
     val dataRDD = sc.parallelize(constantData, 2)
 
@@ -300,9 +300,9 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext {
     val model2 = standardizer2.fit(dataRDD)
     val model3 = standardizer3.fit(dataRDD)
 
-    val equivalentModel1 = new StandardScalerModel(model1.variance, model1.mean)
-    val equivalentModel2 = new StandardScalerModel(model2.variance, model2.mean, true, false)
-    val equivalentModel3 = new StandardScalerModel(model3.variance, model3.mean, false, true)
+    val equivalentModel1 = new StandardScalerModel(model1.std, model1.mean)
+    val equivalentModel2 = new StandardScalerModel(model2.std, model2.mean, true, false)
+    val equivalentModel3 = new StandardScalerModel(model3.std, model3.mean, false, true)
 
     val data1 = constantData.map(equivalentModel1.transform)
     val data2 = constantData.map(equivalentModel2.transform)
@@ -342,12 +342,12 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext {
 
   test("StandardScalerModel argument nulls are properly handled") {
 
-    withClue("model needs at least one of variance or mean vectors") {
+    withClue("model needs at least one of std or mean vectors") {
       intercept[IllegalArgumentException] {
         val model = new StandardScalerModel(null, null)
       }
     }
-    withClue("model needs variance to set withStd to true") {
+    withClue("model needs std to set withStd to true") {
       intercept[IllegalArgumentException] {
        val model = new StandardScalerModel(null, Vectors.dense(0.0))
        model.setWithStd(true)
@@ -359,7 +359,7 @@ class StandardScalerSuite extends FunSuite with MLlibTestSparkContext {
         model.setWithMean(true)
       }
     }
-    withClue("model needs variance and mean vectors to be equal size when both are provided") {
+    withClue("model needs std and mean vectors to be equal size when both are provided") {
       intercept[IllegalArgumentException] {
         val model = new StandardScalerModel(Vectors.dense(0.0), Vectors.dense(0.0,1.0))
       }
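
The renamed `withClue` messages mirror the constructor checks added in `StandardScalerModel`. A short usage sketch of that validation (assuming a build with this commit applied; not part of the test file):

import org.apache.spark.mllib.feature.StandardScalerModel
import org.apache.spark.mllib.linalg.Vectors

// Valid: std only — withStd is inferred as true, withMean as false.
val stdOnly = new StandardScalerModel(Vectors.dense(1.0, 2.0))

// Valid: std and mean of equal size.
val both = new StandardScalerModel(Vectors.dense(1.0, 2.0), Vectors.dense(0.0, 0.5))

// Throws IllegalArgumentException: "at least one of std or mean vectors must be provided".
// new StandardScalerModel(null, null)

// Throws IllegalArgumentException: "mean and std vectors must have equal size if both are provided".
// new StandardScalerModel(Vectors.dense(0.0), Vectors.dense(0.0, 1.0))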
