From 203a29540841504324e11c350d5b92eb84bdb099 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Tue, 28 Apr 2015 17:53:43 -0700 Subject: [PATCH 1/7] Add more documentation to LinearRegression in new ML framework. --- .../ml/regression/LinearRegression.scala | 176 ++++++++++++------ .../GeneralizedLinearAlgorithm.scala | 2 +- 2 files changed, 121 insertions(+), 57 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index f92c6816eb54..7fc97f7b84a0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -110,59 +110,66 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress val yMean = statCounter.mean val yStd = math.sqrt(statCounter.variance) - val featuresMean = summarizer.mean.toArray - val featuresStd = summarizer.variance.toArray.map(math.sqrt) + val model = if (yStd != 0.0) { + val featuresMean = summarizer.mean.toArray + val featuresStd = summarizer.variance.toArray.map(math.sqrt) - // Since we implicitly do the feature scaling when we compute the cost function - // to improve the convergence, the effective regParam will be changed. - val effectiveRegParam = paramMap(regParam) / yStd - val effectiveL1RegParam = paramMap(elasticNetParam) * effectiveRegParam - val effectiveL2RegParam = (1.0 - paramMap(elasticNetParam)) * effectiveRegParam + // Since we implicitly do the feature scaling when we compute the cost function + // to improve the convergence, the effective regParam will be changed. + val effectiveRegParam = paramMap(regParam) / yStd + val effectiveL1RegParam = paramMap(elasticNetParam) * effectiveRegParam + val effectiveL2RegParam = (1.0 - paramMap(elasticNetParam)) * effectiveRegParam - val costFun = new LeastSquaresCostFun(instances, yStd, yMean, - featuresStd, featuresMean, effectiveL2RegParam) + val costFun = new LeastSquaresCostFun(instances, yStd, yMean, + featuresStd, featuresMean, effectiveL2RegParam) - val optimizer = if (paramMap(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) { - new BreezeLBFGS[BDV[Double]](paramMap(maxIter), 10, paramMap(tol)) - } else { - new BreezeOWLQN[Int, BDV[Double]](paramMap(maxIter), 10, effectiveL1RegParam, paramMap(tol)) - } + val optimizer = if (paramMap(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) { + new BreezeLBFGS[BDV[Double]](paramMap(maxIter), 10, paramMap(tol)) + } else { + new BreezeOWLQN[Int, BDV[Double]](paramMap(maxIter), 10, + effectiveL1RegParam, paramMap(tol)) + } - val initialWeights = Vectors.zeros(numFeatures) - val states = - optimizer.iterations(new CachedDiffFunction(costFun), initialWeights.toBreeze.toDenseVector) + val initialWeights = Vectors.zeros(numFeatures) + val states = optimizer.iterations(new CachedDiffFunction(costFun), + initialWeights.toBreeze.toDenseVector) - var state = states.next() - val lossHistory = mutable.ArrayBuilder.make[Double] + var state = states.next() + val lossHistory = mutable.ArrayBuilder.make[Double] - while (states.hasNext) { + while (states.hasNext) { + lossHistory += state.value + state = states.next() + } lossHistory += state.value - state = states.next() - } - lossHistory += state.value - // TODO: Based on the sparsity of weights, we may convert the weights to the sparse vector. - // The weights are trained in the scaled space; we're converting them back to - // the original space. 
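
The rescaling in the hunk below and the closed-form intercept follow directly from the scaled-space model. As a sketch, using the notation of the LeastSquaresAggregator scaladoc added later in this patch (the w_i here are the weights trained in the scaled space):

{{{
  (y - \bar{y}) / \hat{y} \approx \sum_i w_i (x_i - \bar{x_i}) / \hat{x_i}
  y \approx \sum_i (w_i \hat{y} / \hat{x_i}) x_i + (\bar{y} - \sum_i (w_i \hat{y} / \hat{x_i}) \bar{x_i})
}}}

so the weights in the original space are w_i \hat{y} / \hat{x_i} (taken as zero when \hat{x_i} = 0), and the intercept is \bar{y} - \sum_i (w_i \hat{y} / \hat{x_i}) \bar{x_i}, i.e. `yMean - dot(weights, Vectors.dense(featuresMean))` with the rescaled weights.
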
- val weights = { - val rawWeights = state.x.toArray.clone() - var i = 0 - while (i < rawWeights.length) { - rawWeights(i) *= { if (featuresStd(i) != 0.0) yStd / featuresStd(i) else 0.0 } - i += 1 + // TODO: Based on the sparsity of weights, we may convert the weights to the sparse vector. + // The weights are trained in the scaled space; we're converting them back to + // the original space. + val weights = { + val rawWeights = state.x.toArray.clone() + var i = 0 + while (i < rawWeights.length) { + rawWeights(i) *= { + if (featuresStd(i) != 0.0) yStd / featuresStd(i) else 0.0 + } + i += 1 + } + Vectors.dense(rawWeights) } - Vectors.dense(rawWeights) - } - - // The intercept in R's GLMNET is computed using closed form after the coefficients are - // converged. See the following discussion for detail. - // http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet - val intercept = yMean - dot(weights, Vectors.dense(featuresMean)) + // The intercept in R's GLMNET is computed using closed form after the coefficients are + // converged. See the following discussion for detail. + // http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet + val intercept = yMean - dot(weights, Vectors.dense(featuresMean)) + new LinearRegressionModel(this, paramMap, weights, intercept) + } else { + new LinearRegressionModel(this, paramMap, Vectors.zeros(numFeatures), yMean) + } if (handlePersistence) { instances.unpersist() } - new LinearRegressionModel(this, paramMap, weights, intercept) + model } } @@ -198,15 +205,74 @@ class LinearRegressionModel private[ml] ( * Two LeastSquaresAggregator can be merged together to have a summary of loss and gradient of * the corresponding joint dataset. * - - * * Compute gradient and loss for a Least-squared loss function, as used in linear regression. - * This is correct for the averaged least squares loss function (mean squared error) - * L = 1/2n ||A weights-y||^2 - * See also the documentation for the precise formulation. + * For improving the convergence rate during the optimization process, and also preventing against + * features with very large variances exerting an overly large influence during model training, + * package like R's GLMNET performs the scaling to unit variance and removing the mean to reduce + * the condition number, and then trains the model in scaled space but returns the weights in + * the original scale. See page 9 in http://cran.r-project.org/web/packages/glmnet/glmnet.pdf + * + * However, we don't want to apply the `StandardScaler` on the training dataset, and then cache + * the standardized dataset since it will create a lot of overhead. As a result, we perform the + * scaling implicitly when we compute the objective function. The following is the mathematical + * derivation. + * + * Note that we don't deal with intercept by adding bias here, because the intercept + * can be computed using closed form after the coefficients are converged. + * See this discussion for detail. + * http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet + * + * The objective function in the scaled space is given by + * {{{ + * L = 1/2n ||\sum_i w_i(x_i - \bar{x_i}) / \hat{x_i} - (y - \bar{y}) / \hat{y}||^2, + * }}} + * where \bar{x_i} is the mean of x_i, \hat{x_i} is the standard deviation of x_i, + * \bar{y} is the mean of label, and \hat{y} is the standard deviation of label. 
+ * + * This can be rewritten as + * {{{ + * L = 1/2n ||\sum_i (w_i/\hat{x_i})x_i - \sum_i (w_i/\hat{x_i})\bar{x_i} - y / \hat{y} + * + \bar{y} / \hat{y}||^2 + * = 1/2n ||\sum_i w_i^\prime x_i - y / \hat{y} + offset||^2 = 1/2n diff^2 + * }}} + * where w_i^\prime is the effective weights defined by w_i/\hat{x_i}, offset is + * {{{ + * - \sum_i (w_i/\hat{x_i})\bar{x_i} + \bar{y} / \hat{y}. + * }}}, and diff is + * {{{ + * \sum_i w_i^\prime x_i - y / \hat{y} + offset + * }}} * - * @param weights weights/coefficients corresponding to features + * Note that the effective weights and offset don't depend on training dataset, + * so they can be precomputed. * - * @param updater Updater to be used to update weights after every iteration. + * Now, the first derivative of the objective function in scaled space is + * {{{ + * \frac{\partial L}{\partial\w_i} = diff/N (x_i - \bar{x_i}) / \hat{x_i} + * }}} + * However, ($x_i - \bar{x_i}$) will densify the computation, so it's not + * an ideal formula when the training dataset is sparse format. + * + * This can be addressed by adding the dense \bar{x_i} / \har{x_i} terms + * in the end by keeping the sum of diff. The first derivative of total + * objective function from all the samples is + * {{{ + * \frac{\partial L}{\partial\w_i} = + * 1/N \sum_j diff_j (x_{ij} - \bar{x_i}) / \hat{x_i} + * = 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) - diffSum \bar{x_i}) / \hat{x_i}) + * = 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) + correction_i) + * }}}, + * where correction_i = - diffSum \bar{x_i}) / \hat{x_i} + * + * As a result, the first term in the first derivative of the total objective function + * depends on training dataset, which can be computed in distributed fashion, and is sparse + * format friendly. We only have to loop through the whole gradientSum vector one time + * in the end for adding the correction terms back (in the driver, not in the executors). + * + * @param weights The weights/coefficients corresponding to the features. + * @param labelStd The standard deviation value of the label. + * @param labelMean The mean value of the label. + * @param featuresStd The standard deviation values of the features. + * @param featuresMean The mean values of the features. */ private class LeastSquaresAggregator( weights: Vector, @@ -301,19 +367,17 @@ private class LeastSquaresAggregator( def loss: Double = lossSum / totalCnt def gradient: Vector = { - val result = Vectors.dense(gradientSumArray.clone()) + val resultArray = gradientSumArray.clone() - val correction = { - val temp = effectiveWeightsArray.clone() - var i = 0 - while (i < temp.length) { - temp(i) *= featuresMean(i) - i += 1 + // Adding the correction terms back to gradientSum; + // see the mathematical derivation for detail. 
+ Vectors.dense(featuresMean).foreachActive { (index, value) => + if (featuresStd(index) != 0.0 && value != 0.0) { + resultArray(index) -= diffSum * value / featuresStd(index) } - Vectors.dense(temp) } - axpy(-diffSum, correction, result) + val result = Vectors.dense(resultArray) scal(1.0 / totalCnt, result) result } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index 9fd60ff7a0c7..26be30ff9d6f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -225,7 +225,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] throw new SparkException("Input validation failed.") } - /* + /** * Scaling columns to unit variance as a heuristic to reduce the condition number: * * During the optimization process, the convergence (rate) depends on the condition number of From 63f7d1e2bdcff08600593311cd32390905e9f9d1 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Tue, 28 Apr 2015 23:14:44 -0700 Subject: [PATCH 2/7] Added compression to the model based on storage --- .../apache/spark/ml/regression/LinearRegression.scala | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 7fc97f7b84a0..794c4f2753d8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -110,6 +110,8 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress val yMean = statCounter.mean val yStd = math.sqrt(statCounter.variance) + // If the yStd is zero, then the intercept is yStd with zero weights. + // As a result, training is not required. val model = if (yStd != 0.0) { val featuresMean = summarizer.mean.toArray val featuresStd = summarizer.variance.toArray.map(math.sqrt) @@ -143,7 +145,6 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress } lossHistory += state.value - // TODO: Based on the sparsity of weights, we may convert the weights to the sparse vector. // The weights are trained in the scaled space; we're converting them back to // the original space. val weights = { @@ -162,9 +163,12 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress // converged. See the following discussion for detail. // http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet val intercept = yMean - dot(weights, Vectors.dense(featuresMean)) - new LinearRegressionModel(this, paramMap, weights, intercept) + + // TODO: We convert to sparse format based on the storage. + // But we may base on the prediction speed. 
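
`Vector.compressed` returns whichever of the dense or sparse representations uses less storage, so mostly-zero weight vectors come back sparse automatically. A small illustration (a sketch, assuming the mllib `Vectors` factory; not part of the patch):

{{{
  import org.apache.spark.mllib.linalg.Vectors

  // Mostly zeros: the sparse form is smaller, so compressed likely returns a SparseVector.
  Vectors.dense(0.0, 0.0, 0.0, 0.0, 3.0).compressed

  // Fully dense: the dense form is smaller, so compressed keeps it dense.
  Vectors.dense(1.0, 2.0, 3.0).compressed
}}}
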
+ new LinearRegressionModel(this, paramMap, weights.compressed, intercept) } else { - new LinearRegressionModel(this, paramMap, Vectors.zeros(numFeatures), yMean) + new LinearRegressionModel(this, paramMap, Vectors.sparse(numFeatures, Seq()), yMean) } if (handlePersistence) { instances.unpersist() From 5929e492f615afcbaf3442bae5fe901f3cbdcf19 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 29 Apr 2015 09:51:01 -0700 Subject: [PATCH 3/7] typo Signed-off-by: DB Tsai --- .../org/apache/spark/ml/regression/LinearRegression.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 794c4f2753d8..bb921d87bb8e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -110,8 +110,8 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress val yMean = statCounter.mean val yStd = math.sqrt(statCounter.variance) - // If the yStd is zero, then the intercept is yStd with zero weights. - // As a result, training is not required. + // If the yStd is zero, then the intercept is yMean with zero weights; + // as a result, training is not needed. val model = if (yStd != 0.0) { val featuresMean = summarizer.mean.toArray val featuresStd = summarizer.variance.toArray.map(math.sqrt) From 69757b8adf76227d1afacb0a3b02a850d378fd8b Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 29 Apr 2015 13:10:33 -0700 Subject: [PATCH 4/7] actually diffSum is mathematically zero! No correction is needed. --- .../ml/regression/LinearRegression.scala | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index bb921d87bb8e..67221bbf23f9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -267,10 +267,20 @@ class LinearRegressionModel private[ml] ( * }}}, * where correction_i = - diffSum \bar{x_i}) / \hat{x_i} * - * As a result, the first term in the first derivative of the total objective function - * depends on training dataset, which can be computed in distributed fashion, and is sparse - * format friendly. We only have to loop through the whole gradientSum vector one time - * in the end for adding the correction terms back (in the driver, not in the executors). + * A simple math can show that diffSum is actually zero, so we don't even + * need to add the correction terms in the end. From the definition of diff, + * {{{ + * diffSum = \sum_j (\sum_i w_i(x_{ij} - \bar{x_i}) / \hat{x_i} - (y_j - \bar{y}) / \hat{y}) + * = N * (\sum_i w_i(\bar{x_i} - \bar{x_i}) / \hat{x_i} - (\bar{y_j} - \bar{y}) / \hat{y}) + * = 0 + * }}} + * + * As a result, the first derivative of the total objective function only depends on + * the training dataset, which can be easily computed in distributed fashion, and is + * sparse format friendly. + * {{{ + * \frac{\partial L}{\partial\w_i} = 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) + * }}}, * * @param weights The weights/coefficients corresponding to the features. * @param labelStd The standard deviation value of the label. 
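
Spelling out the diffSum argument above a bit more explicitly (a restatement in the same notation; \sum_j runs over the N training samples, so \sum_j x_{ij} = N \bar{x_i} and \sum_j y_j = N \bar{y}):

{{{
  diffSum = \sum_j (\sum_i w_i (x_{ij} - \bar{x_i}) / \hat{x_i} - (y_j - \bar{y}) / \hat{y})
          = \sum_i w_i (N \bar{x_i} - N \bar{x_i}) / \hat{x_i} - (N \bar{y} - N \bar{y}) / \hat{y}
          = 0
}}}

so the correction term - diffSum \bar{x_i} / \hat{x_i} vanishes, which is why the explicit correction pass over gradientSum is dropped in this patch.
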
@@ -371,17 +381,7 @@ private class LeastSquaresAggregator( def loss: Double = lossSum / totalCnt def gradient: Vector = { - val resultArray = gradientSumArray.clone() - - // Adding the correction terms back to gradientSum; - // see the mathematical derivation for detail. - Vectors.dense(featuresMean).foreachActive { (index, value) => - if (featuresStd(index) != 0.0 && value != 0.0) { - resultArray(index) -= diffSum * value / featuresStd(index) - } - } - - val result = Vectors.dense(resultArray) + val result = Vectors.dense(gradientSumArray.clone()) scal(1.0 / totalCnt, result) result } From 58456d8fa46a40442d7b4369551dde0c975d0a92 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 29 Apr 2015 13:32:31 -0700 Subject: [PATCH 5/7] address feedback --- .../ml/regression/LinearRegression.scala | 101 +++++++++--------- 1 file changed, 49 insertions(+), 52 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 67221bbf23f9..0ac22fd7ec6b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -34,6 +34,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel import org.apache.spark.util.StatCounter +import org.apache.spark.Logging /** * Params for linear regression. @@ -48,7 +49,7 @@ private[regression] trait LinearRegressionParams extends RegressorParams */ @AlphaComponent class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegressionModel] - with LinearRegressionParams { + with LinearRegressionParams with Logging { /** * Set the regularization parameter. @@ -112,68 +113,64 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress // If the yStd is zero, then the intercept is yMean with zero weights; // as a result, training is not needed. - val model = if (yStd != 0.0) { - val featuresMean = summarizer.mean.toArray - val featuresStd = summarizer.variance.toArray.map(math.sqrt) + if (yStd == 0.0) { + logWarning(s"The standard deviation of the label is zero, " + + s"so the weights will be zeros and intercept will be the mean of the label.") + if (handlePersistence) instances.unpersist() + return new LinearRegressionModel(this, paramMap, Vectors.sparse(numFeatures, Seq()), yMean) + } - // Since we implicitly do the feature scaling when we compute the cost function - // to improve the convergence, the effective regParam will be changed. - val effectiveRegParam = paramMap(regParam) / yStd - val effectiveL1RegParam = paramMap(elasticNetParam) * effectiveRegParam - val effectiveL2RegParam = (1.0 - paramMap(elasticNetParam)) * effectiveRegParam + val featuresMean = summarizer.mean.toArray + val featuresStd = summarizer.variance.toArray.map(math.sqrt) - val costFun = new LeastSquaresCostFun(instances, yStd, yMean, - featuresStd, featuresMean, effectiveL2RegParam) + // Since we implicitly do the feature scaling when we compute the cost function + // to improve the convergence, the effective regParam will be changed. 
+ val effectiveRegParam = paramMap(regParam) / yStd + val effectiveL1RegParam = paramMap(elasticNetParam) * effectiveRegParam + val effectiveL2RegParam = (1.0 - paramMap(elasticNetParam)) * effectiveRegParam - val optimizer = if (paramMap(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) { - new BreezeLBFGS[BDV[Double]](paramMap(maxIter), 10, paramMap(tol)) - } else { - new BreezeOWLQN[Int, BDV[Double]](paramMap(maxIter), 10, - effectiveL1RegParam, paramMap(tol)) - } + val costFun = new LeastSquaresCostFun(instances, yStd, yMean, + featuresStd, featuresMean, effectiveL2RegParam) - val initialWeights = Vectors.zeros(numFeatures) - val states = optimizer.iterations(new CachedDiffFunction(costFun), - initialWeights.toBreeze.toDenseVector) + val optimizer = if (paramMap(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) { + new BreezeLBFGS[BDV[Double]](paramMap(maxIter), 10, paramMap(tol)) + } else { + new BreezeOWLQN[Int, BDV[Double]](paramMap(maxIter), 10, effectiveL1RegParam, paramMap(tol)) + } - var state = states.next() - val lossHistory = mutable.ArrayBuilder.make[Double] + val initialWeights = Vectors.zeros(numFeatures) + val states = + optimizer.iterations(new CachedDiffFunction(costFun), initialWeights.toBreeze.toDenseVector) - while (states.hasNext) { - lossHistory += state.value - state = states.next() - } + var state = states.next() + val lossHistory = mutable.ArrayBuilder.make[Double] + + while (states.hasNext) { lossHistory += state.value + state = states.next() + } + lossHistory += state.value - // The weights are trained in the scaled space; we're converting them back to - // the original space. - val weights = { - val rawWeights = state.x.toArray.clone() - var i = 0 - while (i < rawWeights.length) { - rawWeights(i) *= { - if (featuresStd(i) != 0.0) yStd / featuresStd(i) else 0.0 - } - i += 1 - } - Vectors.dense(rawWeights) + // The weights are trained in the scaled space; we're converting them back to + // the original space. + val weights = { + val rawWeights = state.x.toArray.clone() + var i = 0 + while (i < rawWeights.length) { + rawWeights(i) *= { if (featuresStd(i) != 0.0) yStd / featuresStd(i) else 0.0 } + i += 1 } + Vectors.dense(rawWeights) + } - // The intercept in R's GLMNET is computed using closed form after the coefficients are - // converged. See the following discussion for detail. - // http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet - val intercept = yMean - dot(weights, Vectors.dense(featuresMean)) + // The intercept in R's GLMNET is computed using closed form after the coefficients are + // converged. See the following discussion for detail. + // http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet + val intercept = yMean - dot(weights, Vectors.dense(featuresMean)) + if (handlePersistence) instances.unpersist() - // TODO: We convert to sparse format based on the storage. - // But we may base on the prediction speed. - new LinearRegressionModel(this, paramMap, weights.compressed, intercept) - } else { - new LinearRegressionModel(this, paramMap, Vectors.sparse(numFeatures, Seq()), yMean) - } - if (handlePersistence) { - instances.unpersist() - } - model + // TODO: Converts to sparse format based on the storage, but may base on the scoring speed. 
+ new LinearRegressionModel(this, paramMap, weights.compressed, intercept) } } From fc9f582061155789a561e8fdc23fbc168877363d Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 29 Apr 2015 13:36:16 -0700 Subject: [PATCH 6/7] doc --- .../org/apache/spark/ml/regression/LinearRegression.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 0ac22fd7ec6b..cc9ad22cb860 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -114,8 +114,8 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress // If the yStd is zero, then the intercept is yMean with zero weights; // as a result, training is not needed. if (yStd == 0.0) { - logWarning(s"The standard deviation of the label is zero, " + - s"so the weights will be zeros and intercept will be the mean of the label.") + logWarning(s"The standard deviation of the label is zero, so the weights will be zeros " + + s"and the intercept will be the mean of the label; as a result, training is not needed.") if (handlePersistence) instances.unpersist() return new LinearRegressionModel(this, paramMap, Vectors.sparse(numFeatures, Seq()), yMean) } From 5e346c9cd61e729a9bbe015bff886ff45ab3654a Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 29 Apr 2015 14:11:04 -0700 Subject: [PATCH 7/7] refactoring --- .../mllib/util/LinearDataGenerator.scala | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index d7bb943e84f5..91c2f4ccd3d1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -103,17 +103,16 @@ object LinearDataGenerator { val rnd = new Random(seed) val x = Array.fill[Array[Double]](nPoints)( - Array.fill[Double](weights.length)(rnd.nextDouble)) - - x.map(vector => { - // This doesn't work if `vector` is a sparse vector. - val vectorArray = vector.toArray - var i = 0 - while (i < vectorArray.size) { - vectorArray(i) = (vectorArray(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) - i += 1 - } - }) + Array.fill[Double](weights.length)(rnd.nextDouble())) + + x.foreach { + case v => + var i = 0 + while (i < v.length) { + v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) + i += 1 + } + } val y = x.map { xi => blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian()
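
The `(v - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)` transform in the refactored generator works because Uniform(0, 1) has mean 0.5 and variance 1/12, so centering and scaling by sqrt(12 * variance) yields the requested per-feature mean and variance. A quick standalone check (a sketch, not part of the patch; plain Scala with no Spark dependencies):

{{{
  val rnd = new scala.util.Random(42)
  val (targetMean, targetVariance) = (3.0, 4.0)
  val samples = Array.fill(1000000)(
    (rnd.nextDouble() - 0.5) * math.sqrt(12.0 * targetVariance) + targetMean)
  val mean = samples.sum / samples.length
  val variance = samples.map(v => (v - mean) * (v - mean)).sum / samples.length
  // mean is approximately 3.0 and variance approximately 4.0
}}}
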