From 203a29540841504324e11c350d5b92eb84bdb099 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Tue, 28 Apr 2015 17:53:43 -0700 Subject: [PATCH 1/7] Add more documentation to LinearRegression in new ML framework. --- .../ml/regression/LinearRegression.scala | 176 ++++++++++++------ .../GeneralizedLinearAlgorithm.scala | 2 +- 2 files changed, 121 insertions(+), 57 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index f92c6816eb54..7fc97f7b84a0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -110,59 +110,66 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress val yMean = statCounter.mean val yStd = math.sqrt(statCounter.variance) - val featuresMean = summarizer.mean.toArray - val featuresStd = summarizer.variance.toArray.map(math.sqrt) + val model = if (yStd != 0.0) { + val featuresMean = summarizer.mean.toArray + val featuresStd = summarizer.variance.toArray.map(math.sqrt) - // Since we implicitly do the feature scaling when we compute the cost function - // to improve the convergence, the effective regParam will be changed. - val effectiveRegParam = paramMap(regParam) / yStd - val effectiveL1RegParam = paramMap(elasticNetParam) * effectiveRegParam - val effectiveL2RegParam = (1.0 - paramMap(elasticNetParam)) * effectiveRegParam + // Since we implicitly do the feature scaling when we compute the cost function + // to improve the convergence, the effective regParam will be changed. + val effectiveRegParam = paramMap(regParam) / yStd + val effectiveL1RegParam = paramMap(elasticNetParam) * effectiveRegParam + val effectiveL2RegParam = (1.0 - paramMap(elasticNetParam)) * effectiveRegParam - val costFun = new LeastSquaresCostFun(instances, yStd, yMean, - featuresStd, featuresMean, effectiveL2RegParam) + val costFun = new LeastSquaresCostFun(instances, yStd, yMean, + featuresStd, featuresMean, effectiveL2RegParam) - val optimizer = if (paramMap(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) { - new BreezeLBFGS[BDV[Double]](paramMap(maxIter), 10, paramMap(tol)) - } else { - new BreezeOWLQN[Int, BDV[Double]](paramMap(maxIter), 10, effectiveL1RegParam, paramMap(tol)) - } + val optimizer = if (paramMap(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) { + new BreezeLBFGS[BDV[Double]](paramMap(maxIter), 10, paramMap(tol)) + } else { + new BreezeOWLQN[Int, BDV[Double]](paramMap(maxIter), 10, + effectiveL1RegParam, paramMap(tol)) + } - val initialWeights = Vectors.zeros(numFeatures) - val states = - optimizer.iterations(new CachedDiffFunction(costFun), initialWeights.toBreeze.toDenseVector) + val initialWeights = Vectors.zeros(numFeatures) + val states = optimizer.iterations(new CachedDiffFunction(costFun), + initialWeights.toBreeze.toDenseVector) - var state = states.next() - val lossHistory = mutable.ArrayBuilder.make[Double] + var state = states.next() + val lossHistory = mutable.ArrayBuilder.make[Double] - while (states.hasNext) { + while (states.hasNext) { + lossHistory += state.value + state = states.next() + } lossHistory += state.value - state = states.next() - } - lossHistory += state.value - // TODO: Based on the sparsity of weights, we may convert the weights to the sparse vector. - // The weights are trained in the scaled space; we're converting them back to - // the original space. 
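
The rescaling in the hunk below and the closed-form intercept follow directly from the scaled-space model. As a sketch, using the notation of the LeastSquaresAggregator scaladoc added later in this patch (the w_i here are the weights trained in the scaled space):

{{{
  (y - \bar{y}) / \hat{y} \approx \sum_i w_i (x_i - \bar{x_i}) / \hat{x_i}
  y \approx \sum_i (w_i \hat{y} / \hat{x_i}) x_i + (\bar{y} - \sum_i (w_i \hat{y} / \hat{x_i}) \bar{x_i})
}}}

so the weights in the original space are w_i \hat{y} / \hat{x_i} (taken as zero when \hat{x_i} = 0), and the intercept is \bar{y} - \sum_i (w_i \hat{y} / \hat{x_i}) \bar{x_i}, i.e. `yMean - dot(weights, Vectors.dense(featuresMean))` with the rescaled weights.
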
- val weights = { - val rawWeights = state.x.toArray.clone() - var i = 0 - while (i < rawWeights.length) { - rawWeights(i) *= { if (featuresStd(i) != 0.0) yStd / featuresStd(i) else 0.0 } - i += 1 + // TODO: Based on the sparsity of weights, we may convert the weights to the sparse vector. + // The weights are trained in the scaled space; we're converting them back to + // the original space. + val weights = { + val rawWeights = state.x.toArray.clone() + var i = 0 + while (i < rawWeights.length) { + rawWeights(i) *= { + if (featuresStd(i) != 0.0) yStd / featuresStd(i) else 0.0 + } + i += 1 + } + Vectors.dense(rawWeights) } - Vectors.dense(rawWeights) - } - - // The intercept in R's GLMNET is computed using closed form after the coefficients are - // converged. See the following discussion for detail. - // http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet - val intercept = yMean - dot(weights, Vectors.dense(featuresMean)) + // The intercept in R's GLMNET is computed using closed form after the coefficients are + // converged. See the following discussion for detail. + // http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet + val intercept = yMean - dot(weights, Vectors.dense(featuresMean)) + new LinearRegressionModel(this, paramMap, weights, intercept) + } else { + new LinearRegressionModel(this, paramMap, Vectors.zeros(numFeatures), yMean) + } if (handlePersistence) { instances.unpersist() } - new LinearRegressionModel(this, paramMap, weights, intercept) + model } } @@ -198,15 +205,74 @@ class LinearRegressionModel private[ml] ( * Two LeastSquaresAggregator can be merged together to have a summary of loss and gradient of * the corresponding joint dataset. * - - * * Compute gradient and loss for a Least-squared loss function, as used in linear regression. - * This is correct for the averaged least squares loss function (mean squared error) - * L = 1/2n ||A weights-y||^2 - * See also the documentation for the precise formulation. + * For improving the convergence rate during the optimization process, and also preventing against + * features with very large variances exerting an overly large influence during model training, + * package like R's GLMNET performs the scaling to unit variance and removing the mean to reduce + * the condition number, and then trains the model in scaled space but returns the weights in + * the original scale. See page 9 in http://cran.r-project.org/web/packages/glmnet/glmnet.pdf + * + * However, we don't want to apply the `StandardScaler` on the training dataset, and then cache + * the standardized dataset since it will create a lot of overhead. As a result, we perform the + * scaling implicitly when we compute the objective function. The following is the mathematical + * derivation. + * + * Note that we don't deal with intercept by adding bias here, because the intercept + * can be computed using closed form after the coefficients are converged. + * See this discussion for detail. + * http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet + * + * The objective function in the scaled space is given by + * {{{ + * L = 1/2n ||\sum_i w_i(x_i - \bar{x_i}) / \hat{x_i} - (y - \bar{y}) / \hat{y}||^2, + * }}} + * where \bar{x_i} is the mean of x_i, \hat{x_i} is the standard deviation of x_i, + * \bar{y} is the mean of label, and \hat{y} is the standard deviation of label. 
+ * + * This can be rewritten as + * {{{ + * L = 1/2n ||\sum_i (w_i/\hat{x_i})x_i - \sum_i (w_i/\hat{x_i})\bar{x_i} - y / \hat{y} + * + \bar{y} / \hat{y}||^2 + * = 1/2n ||\sum_i w_i^\prime x_i - y / \hat{y} + offset||^2 = 1/2n diff^2 + * }}} + * where w_i^\prime is the effective weights defined by w_i/\hat{x_i}, offset is + * {{{ + * - \sum_i (w_i/\hat{x_i})\bar{x_i} + \bar{y} / \hat{y}. + * }}}, and diff is + * {{{ + * \sum_i w_i^\prime x_i - y / \hat{y} + offset + * }}} * - * @param weights weights/coefficients corresponding to features + * Note that the effective weights and offset don't depend on training dataset, + * so they can be precomputed. * - * @param updater Updater to be used to update weights after every iteration. + * Now, the first derivative of the objective function in scaled space is + * {{{ + * \frac{\partial L}{\partial\w_i} = diff/N (x_i - \bar{x_i}) / \hat{x_i} + * }}} + * However, ($x_i - \bar{x_i}$) will densify the computation, so it's not + * an ideal formula when the training dataset is sparse format. + * + * This can be addressed by adding the dense \bar{x_i} / \har{x_i} terms + * in the end by keeping the sum of diff. The first derivative of total + * objective function from all the samples is + * {{{ + * \frac{\partial L}{\partial\w_i} = + * 1/N \sum_j diff_j (x_{ij} - \bar{x_i}) / \hat{x_i} + * = 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) - diffSum \bar{x_i}) / \hat{x_i}) + * = 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) + correction_i) + * }}}, + * where correction_i = - diffSum \bar{x_i}) / \hat{x_i} + * + * As a result, the first term in the first derivative of the total objective function + * depends on training dataset, which can be computed in distributed fashion, and is sparse + * format friendly. We only have to loop through the whole gradientSum vector one time + * in the end for adding the correction terms back (in the driver, not in the executors). + * + * @param weights The weights/coefficients corresponding to the features. + * @param labelStd The standard deviation value of the label. + * @param labelMean The mean value of the label. + * @param featuresStd The standard deviation values of the features. + * @param featuresMean The mean values of the features. */ private class LeastSquaresAggregator( weights: Vector, @@ -301,19 +367,17 @@ private class LeastSquaresAggregator( def loss: Double = lossSum / totalCnt def gradient: Vector = { - val result = Vectors.dense(gradientSumArray.clone()) + val resultArray = gradientSumArray.clone() - val correction = { - val temp = effectiveWeightsArray.clone() - var i = 0 - while (i < temp.length) { - temp(i) *= featuresMean(i) - i += 1 + // Adding the correction terms back to gradientSum; + // see the mathematical derivation for detail. 
+ Vectors.dense(featuresMean).foreachActive { (index, value) => + if (featuresStd(index) != 0.0 && value != 0.0) { + resultArray(index) -= diffSum * value / featuresStd(index) } - Vectors.dense(temp) } - axpy(-diffSum, correction, result) + val result = Vectors.dense(resultArray) scal(1.0 / totalCnt, result) result } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index 9fd60ff7a0c7..26be30ff9d6f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -225,7 +225,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel] throw new SparkException("Input validation failed.") } - /* + /** * Scaling columns to unit variance as a heuristic to reduce the condition number: * * During the optimization process, the convergence (rate) depends on the condition number of From 63f7d1e2bdcff08600593311cd32390905e9f9d1 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Tue, 28 Apr 2015 23:14:44 -0700 Subject: [PATCH 2/7] Added compression to the model based on storage --- .../apache/spark/ml/regression/LinearRegression.scala | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 7fc97f7b84a0..794c4f2753d8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -110,6 +110,8 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress val yMean = statCounter.mean val yStd = math.sqrt(statCounter.variance) + // If the yStd is zero, then the intercept is yStd with zero weights. + // As a result, training is not required. val model = if (yStd != 0.0) { val featuresMean = summarizer.mean.toArray val featuresStd = summarizer.variance.toArray.map(math.sqrt) @@ -143,7 +145,6 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress } lossHistory += state.value - // TODO: Based on the sparsity of weights, we may convert the weights to the sparse vector. // The weights are trained in the scaled space; we're converting them back to // the original space. val weights = { @@ -162,9 +163,12 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress // converged. See the following discussion for detail. // http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet val intercept = yMean - dot(weights, Vectors.dense(featuresMean)) - new LinearRegressionModel(this, paramMap, weights, intercept) + + // TODO: We convert to sparse format based on the storage. + // But we may base on the prediction speed. 
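
`Vector.compressed` returns whichever of the dense or sparse representations uses less storage, so mostly-zero weight vectors come back sparse automatically. A small illustration (a sketch, assuming the mllib `Vectors` factory; not part of the patch):

{{{
  import org.apache.spark.mllib.linalg.Vectors

  // Mostly zeros: the sparse form is smaller, so compressed likely returns a SparseVector.
  Vectors.dense(0.0, 0.0, 0.0, 0.0, 3.0).compressed

  // Fully dense: the dense form is smaller, so compressed keeps it dense.
  Vectors.dense(1.0, 2.0, 3.0).compressed
}}}
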
+ new LinearRegressionModel(this, paramMap, weights.compressed, intercept) } else { - new LinearRegressionModel(this, paramMap, Vectors.zeros(numFeatures), yMean) + new LinearRegressionModel(this, paramMap, Vectors.sparse(numFeatures, Seq()), yMean) } if (handlePersistence) { instances.unpersist() From 5929e492f615afcbaf3442bae5fe901f3cbdcf19 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 29 Apr 2015 09:51:01 -0700 Subject: [PATCH 3/7] typo Signed-off-by: DB Tsai --- .../org/apache/spark/ml/regression/LinearRegression.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 794c4f2753d8..bb921d87bb8e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -110,8 +110,8 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress val yMean = statCounter.mean val yStd = math.sqrt(statCounter.variance) - // If the yStd is zero, then the intercept is yStd with zero weights. - // As a result, training is not required. + // If the yStd is zero, then the intercept is yMean with zero weights; + // as a result, training is not needed. val model = if (yStd != 0.0) { val featuresMean = summarizer.mean.toArray val featuresStd = summarizer.variance.toArray.map(math.sqrt) From 69757b8adf76227d1afacb0a3b02a850d378fd8b Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 29 Apr 2015 13:10:33 -0700 Subject: [PATCH 4/7] actually diffSum is mathematically zero! No correction is needed. --- .../ml/regression/LinearRegression.scala | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index bb921d87bb8e..67221bbf23f9 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -267,10 +267,20 @@ class LinearRegressionModel private[ml] ( * }}}, * where correction_i = - diffSum \bar{x_i}) / \hat{x_i} * - * As a result, the first term in the first derivative of the total objective function - * depends on training dataset, which can be computed in distributed fashion, and is sparse - * format friendly. We only have to loop through the whole gradientSum vector one time - * in the end for adding the correction terms back (in the driver, not in the executors). + * A simple math can show that diffSum is actually zero, so we don't even + * need to add the correction terms in the end. From the definition of diff, + * {{{ + * diffSum = \sum_j (\sum_i w_i(x_{ij} - \bar{x_i}) / \hat{x_i} - (y_j - \bar{y}) / \hat{y}) + * = N * (\sum_i w_i(\bar{x_i} - \bar{x_i}) / \hat{x_i} - (\bar{y_j} - \bar{y}) / \hat{y}) + * = 0 + * }}} + * + * As a result, the first derivative of the total objective function only depends on + * the training dataset, which can be easily computed in distributed fashion, and is + * sparse format friendly. + * {{{ + * \frac{\partial L}{\partial\w_i} = 1/N ((\sum_j diff_j x_{ij} / \hat{x_i}) + * }}}, * * @param weights The weights/coefficients corresponding to the features. * @param labelStd The standard deviation value of the label. 
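
Spelling out the diffSum argument above a bit more explicitly (a restatement in the same notation; \sum_j runs over the N training samples, so \sum_j x_{ij} = N \bar{x_i} and \sum_j y_j = N \bar{y}):

{{{
  diffSum = \sum_j (\sum_i w_i (x_{ij} - \bar{x_i}) / \hat{x_i} - (y_j - \bar{y}) / \hat{y})
          = \sum_i w_i (N \bar{x_i} - N \bar{x_i}) / \hat{x_i} - (N \bar{y} - N \bar{y}) / \hat{y}
          = 0
}}}

so the correction term - diffSum \bar{x_i} / \hat{x_i} vanishes, which is why the explicit correction pass over gradientSum is dropped in this patch.
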
@@ -371,17 +381,7 @@ private class LeastSquaresAggregator( def loss: Double = lossSum / totalCnt def gradient: Vector = { - val resultArray = gradientSumArray.clone() - - // Adding the correction terms back to gradientSum; - // see the mathematical derivation for detail. - Vectors.dense(featuresMean).foreachActive { (index, value) => - if (featuresStd(index) != 0.0 && value != 0.0) { - resultArray(index) -= diffSum * value / featuresStd(index) - } - } - - val result = Vectors.dense(resultArray) + val result = Vectors.dense(gradientSumArray.clone()) scal(1.0 / totalCnt, result) result } From 58456d8fa46a40442d7b4369551dde0c975d0a92 Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 29 Apr 2015 13:32:31 -0700 Subject: [PATCH 5/7] address feedback --- .../ml/regression/LinearRegression.scala | 101 +++++++++--------- 1 file changed, 49 insertions(+), 52 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 67221bbf23f9..0ac22fd7ec6b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -34,6 +34,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel import org.apache.spark.util.StatCounter +import org.apache.spark.Logging /** * Params for linear regression. @@ -48,7 +49,7 @@ private[regression] trait LinearRegressionParams extends RegressorParams */ @AlphaComponent class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegressionModel] - with LinearRegressionParams { + with LinearRegressionParams with Logging { /** * Set the regularization parameter. @@ -112,68 +113,64 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress // If the yStd is zero, then the intercept is yMean with zero weights; // as a result, training is not needed. - val model = if (yStd != 0.0) { - val featuresMean = summarizer.mean.toArray - val featuresStd = summarizer.variance.toArray.map(math.sqrt) + if (yStd == 0.0) { + logWarning(s"The standard deviation of the label is zero, " + + s"so the weights will be zeros and intercept will be the mean of the label.") + if (handlePersistence) instances.unpersist() + return new LinearRegressionModel(this, paramMap, Vectors.sparse(numFeatures, Seq()), yMean) + } - // Since we implicitly do the feature scaling when we compute the cost function - // to improve the convergence, the effective regParam will be changed. - val effectiveRegParam = paramMap(regParam) / yStd - val effectiveL1RegParam = paramMap(elasticNetParam) * effectiveRegParam - val effectiveL2RegParam = (1.0 - paramMap(elasticNetParam)) * effectiveRegParam + val featuresMean = summarizer.mean.toArray + val featuresStd = summarizer.variance.toArray.map(math.sqrt) - val costFun = new LeastSquaresCostFun(instances, yStd, yMean, - featuresStd, featuresMean, effectiveL2RegParam) + // Since we implicitly do the feature scaling when we compute the cost function + // to improve the convergence, the effective regParam will be changed. 
+ val effectiveRegParam = paramMap(regParam) / yStd + val effectiveL1RegParam = paramMap(elasticNetParam) * effectiveRegParam + val effectiveL2RegParam = (1.0 - paramMap(elasticNetParam)) * effectiveRegParam - val optimizer = if (paramMap(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) { - new BreezeLBFGS[BDV[Double]](paramMap(maxIter), 10, paramMap(tol)) - } else { - new BreezeOWLQN[Int, BDV[Double]](paramMap(maxIter), 10, - effectiveL1RegParam, paramMap(tol)) - } + val costFun = new LeastSquaresCostFun(instances, yStd, yMean, + featuresStd, featuresMean, effectiveL2RegParam) - val initialWeights = Vectors.zeros(numFeatures) - val states = optimizer.iterations(new CachedDiffFunction(costFun), - initialWeights.toBreeze.toDenseVector) + val optimizer = if (paramMap(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) { + new BreezeLBFGS[BDV[Double]](paramMap(maxIter), 10, paramMap(tol)) + } else { + new BreezeOWLQN[Int, BDV[Double]](paramMap(maxIter), 10, effectiveL1RegParam, paramMap(tol)) + } - var state = states.next() - val lossHistory = mutable.ArrayBuilder.make[Double] + val initialWeights = Vectors.zeros(numFeatures) + val states = + optimizer.iterations(new CachedDiffFunction(costFun), initialWeights.toBreeze.toDenseVector) - while (states.hasNext) { - lossHistory += state.value - state = states.next() - } + var state = states.next() + val lossHistory = mutable.ArrayBuilder.make[Double] + + while (states.hasNext) { lossHistory += state.value + state = states.next() + } + lossHistory += state.value - // The weights are trained in the scaled space; we're converting them back to - // the original space. - val weights = { - val rawWeights = state.x.toArray.clone() - var i = 0 - while (i < rawWeights.length) { - rawWeights(i) *= { - if (featuresStd(i) != 0.0) yStd / featuresStd(i) else 0.0 - } - i += 1 - } - Vectors.dense(rawWeights) + // The weights are trained in the scaled space; we're converting them back to + // the original space. + val weights = { + val rawWeights = state.x.toArray.clone() + var i = 0 + while (i < rawWeights.length) { + rawWeights(i) *= { if (featuresStd(i) != 0.0) yStd / featuresStd(i) else 0.0 } + i += 1 } + Vectors.dense(rawWeights) + } - // The intercept in R's GLMNET is computed using closed form after the coefficients are - // converged. See the following discussion for detail. - // http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet - val intercept = yMean - dot(weights, Vectors.dense(featuresMean)) + // The intercept in R's GLMNET is computed using closed form after the coefficients are + // converged. See the following discussion for detail. + // http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet + val intercept = yMean - dot(weights, Vectors.dense(featuresMean)) + if (handlePersistence) instances.unpersist() - // TODO: We convert to sparse format based on the storage. - // But we may base on the prediction speed. - new LinearRegressionModel(this, paramMap, weights.compressed, intercept) - } else { - new LinearRegressionModel(this, paramMap, Vectors.sparse(numFeatures, Seq()), yMean) - } - if (handlePersistence) { - instances.unpersist() - } - model + // TODO: Converts to sparse format based on the storage, but may base on the scoring speed. 
+ new LinearRegressionModel(this, paramMap, weights.compressed, intercept) } } From fc9f582061155789a561e8fdc23fbc168877363d Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 29 Apr 2015 13:36:16 -0700 Subject: [PATCH 6/7] doc --- .../org/apache/spark/ml/regression/LinearRegression.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 0ac22fd7ec6b..cc9ad22cb860 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -114,8 +114,8 @@ class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegress // If the yStd is zero, then the intercept is yMean with zero weights; // as a result, training is not needed. if (yStd == 0.0) { - logWarning(s"The standard deviation of the label is zero, " + - s"so the weights will be zeros and intercept will be the mean of the label.") + logWarning(s"The standard deviation of the label is zero, so the weights will be zeros " + + s"and the intercept will be the mean of the label; as a result, training is not needed.") if (handlePersistence) instances.unpersist() return new LinearRegressionModel(this, paramMap, Vectors.sparse(numFeatures, Seq()), yMean) } From 5e346c9cd61e729a9bbe015bff886ff45ab3654a Mon Sep 17 00:00:00 2001 From: DB Tsai Date: Wed, 29 Apr 2015 14:11:04 -0700 Subject: [PATCH 7/7] refactoring --- .../mllib/util/LinearDataGenerator.scala | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index d7bb943e84f5..91c2f4ccd3d1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -103,17 +103,16 @@ object LinearDataGenerator { val rnd = new Random(seed) val x = Array.fill[Array[Double]](nPoints)( - Array.fill[Double](weights.length)(rnd.nextDouble)) - - x.map(vector => { - // This doesn't work if `vector` is a sparse vector. - val vectorArray = vector.toArray - var i = 0 - while (i < vectorArray.size) { - vectorArray(i) = (vectorArray(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) - i += 1 - } - }) + Array.fill[Double](weights.length)(rnd.nextDouble())) + + x.foreach { + case v => + var i = 0 + while (i < v.length) { + v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) + i += 1 + } + } val y = x.map { xi => blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian()
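
The `(v - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)` transform in the refactored generator works because Uniform(0, 1) has mean 0.5 and variance 1/12, so centering and scaling by sqrt(12 * variance) yields the requested per-feature mean and variance. A quick standalone check (a sketch, not part of the patch; plain Scala with no Spark dependencies):

{{{
  val rnd = new scala.util.Random(42)
  val (targetMean, targetVariance) = (3.0, 4.0)
  val samples = Array.fill(1000000)(
    (rnd.nextDouble() - 0.5) * math.sqrt(12.0 * targetVariance) + targetMean)
  val mean = samples.sum / samples.length
  val variance = samples.map(v => (v - mean) * (v - mean)).sum / samples.length
  // mean is approximately 3.0 and variance approximately 4.0
}}}
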