
Commit ae5baa8

CR feedback: move the fitIntercept handling down rather than changing yMean etc. above
1 parent f34971c commit ae5baa8

File tree

2 files changed: +21 −26 lines

mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala

Lines changed: 8 additions & 11 deletions
@@ -81,7 +81,6 @@ class LinearRegression(override val uid: String)
   def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
   setDefault(fitIntercept -> true)
 
-
   /**
    * Set the ElasticNet mixing parameter.
    * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
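
For context, a minimal caller-side sketch (not part of this commit; `training` is an assumed DataFrame with the usual "label" and "features" columns): fitIntercept defaults to true via the setDefault above, so it only needs to be set when forcing the fit through the origin.

    import org.apache.spark.ml.regression.LinearRegression

    // Hypothetical usage; `training` is assumed to be in scope.
    val lr = new LinearRegression().setFitIntercept(false)
    val model = lr.fit(training)
    println(s"weights = ${model.weights}, intercept = ${model.intercept}")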
@@ -131,7 +130,7 @@ class LinearRegression(override val uid: String)
     })
 
     val numFeatures = summarizer.mean.size
-    val yMean = if ($(fitIntercept)) statCounter.mean else 0.0
+    val yMean = statCounter.mean
     val yStd = math.sqrt(statCounter.variance)
     // look at glmnet5.m L761 maaaybe that has info
 
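With this change, yMean is always the true label mean; the fitIntercept special-casing now lives where the intercept and the aggregator offset are computed (see the hunks below). A toy re-computation (plain Scala, made-up labels) of the two statistics read off statCounter, noting that StatCounter.variance is the population variance:

    val labels = Seq(5.8, 6.1, 6.7, 6.4)  // assumed toy labels
    val yMean = labels.sum / labels.size  // statCounter.mean
    val yStd = math.sqrt(
      labels.map(y => math.pow(y - yMean, 2)).sum / labels.size)  // sqrt(variance)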

@@ -144,11 +143,7 @@ class LinearRegression(override val uid: String)
       return new LinearRegressionModel(uid, Vectors.sparse(numFeatures, Seq()), yMean)
     }
 
-    val featuresMean = if ($(fitIntercept)) {
-      summarizer.mean.toArray
-    } else {
-      new Array[Double](numFeatures)
-    }
+    val featuresMean = summarizer.mean.toArray
     val featuresStd = summarizer.variance.toArray.map(math.sqrt)
 
     // Since we implicitly do the feature scaling when we compute the cost function
@@ -157,7 +152,7 @@ class LinearRegression(override val uid: String)
     val effectiveL1RegParam = $(elasticNetParam) * effectiveRegParam
     val effectiveL2RegParam = (1.0 - $(elasticNetParam)) * effectiveRegParam
 
-    val costFun = new LeastSquaresCostFun(instances, yStd, yMean,
+    val costFun = new LeastSquaresCostFun(instances, yStd, yMean, $(fitIntercept),
       featuresStd, featuresMean, effectiveL2RegParam)
 
     val optimizer = if ($(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) {
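
The two effective penalties above are the elastic-net split of the regularization strength, which is why the optimizer can fall back to plain L-BFGS whenever the L1 part vanishes. A worked sketch with assumed numbers (the real code first divides regParam by yStd):

    val effectiveRegParam = 1.6  // assumed, after the yStd scaling
    val elasticNetParam = 0.3    // assumed alpha
    val effectiveL1RegParam = elasticNetParam * effectiveRegParam          // 0.48 -> OWLQN needed
    val effectiveL2RegParam = (1.0 - elasticNetParam) * effectiveRegParam  // 1.12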
@@ -195,7 +190,7 @@ class LinearRegression(override val uid: String)
     // The intercept in R's GLMNET is computed using closed form after the coefficients are
     // converged. See the following discussion for detail.
     // http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet
-    val intercept = yMean - dot(weights, Vectors.dense(featuresMean))
+    val intercept = if ($(fitIntercept)) yMean - dot(weights, Vectors.dense(featuresMean)) else 0.0
     if (handlePersistence) instances.unpersist()
 
     // TODO: Converts to sparse format based on the storage, but may base on the scoring speed.
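
A minimal sketch (plain Scala, made-up values) of that closed-form step: once the coefficients have converged on the centered data, the intercept is yMean minus the dot product of the weights with the feature means, and with this commit it is pinned to 0.0 when fitIntercept is false:

    val weights = Array(4.7, 7.2)        // converged coefficients (assumed)
    val featuresMean = Array(0.9, -1.3)  // feature column means (assumed)
    val yMean = 6.3                      // label mean (assumed)
    val fitIntercept = true
    val dotWM = weights.zip(featuresMean).map { case (w, m) => w * m }.sum
    val intercept = if (fitIntercept) yMean - dotWM else 0.0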
@@ -320,6 +315,7 @@ private class LeastSquaresAggregator(
     weights: Vector,
     labelStd: Double,
     labelMean: Double,
+    fitIntercept: Boolean,
     featuresStd: Array[Double],
     featuresMean: Array[Double]) extends Serializable {
 

@@ -340,7 +336,7 @@ private class LeastSquaresAggregator(
       }
       i += 1
     }
-    (weightsArray, -sum + labelMean / labelStd, weightsArray.length)
+    (weightsArray, if (fitIntercept) labelMean / labelStd - sum else 0.0, weightsArray.length)
   }
 
   private val effectiveWeightsVector = Vectors.dense(effectiveWeightsArray)
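
A minimal sketch (plain Scala, made-up values) of the tuple's middle element: with each feature standardized to (x_i - m_i) / s_i, the constant part of the residual collapses to labelMean / labelStd - sum_i (w_i / s_i) * m_i, which this commit forces to 0.0 when no intercept is fit:

    val rawWeights = Array(4.7, 7.2)        // assumed
    val featuresStd = Array(0.84, 1.10)     // assumed
    val featuresMean = Array(0.9, -1.3)     // assumed
    val (labelMean, labelStd) = (6.3, 2.0)  // assumed
    val fitIntercept = true
    // Effective weights, as in the aggregator: w_i / s_i.
    val weightsArray = rawWeights.zip(featuresStd).map { case (w, s) => w / s }
    val sum = weightsArray.zip(featuresMean).map { case (w, m) => w * m }.sum
    val offset = if (fitIntercept) labelMean / labelStd - sum else 0.0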
@@ -423,6 +419,7 @@ private class LeastSquaresCostFun(
     data: RDD[(Double, Vector)],
     labelStd: Double,
     labelMean: Double,
+    fitIntercept: Boolean,
     featuresStd: Array[Double],
     featuresMean: Array[Double],
     effectiveL2regParam: Double) extends DiffFunction[BDV[Double]] {
@@ -431,7 +428,7 @@ private class LeastSquaresCostFun(
     val w = Vectors.fromBreeze(weights)
 
     val leastSquaresAggregator = data.treeAggregate(new LeastSquaresAggregator(w, labelStd,
-      labelMean, featuresStd, featuresMean))(
+      labelMean, fitIntercept, featuresStd, featuresMean))(
       seqOp = (c, v) => (c, v) match {
         case (aggregator, (label, features)) => aggregator.add(label, features)
       },
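
A minimal sketch (plain Scala, no Spark) of the seqOp/combOp contract treeAggregate leans on here: seqOp folds one (label, features) record into a partition-local aggregator, combOp merges two aggregators, so only small aggregator state crosses partitions. A toy squared-loss aggregator stands in for LeastSquaresAggregator:

    final case class Agg(count: Long, lossSum: Double) {
      def add(label: Double, prediction: Double): Agg =  // seqOp's per-record step
        Agg(count + 1, lossSum + math.pow(label - prediction, 2) / 2.0)
      def merge(other: Agg): Agg =                       // combOp
        Agg(count + other.count, lossSum + other.lossSum)
    }
    val data = Seq((1.0, 0.9), (2.0, 2.2), (3.0, 2.8))   // (label, prediction), assumed
    val agg = data
      .grouped(2).toSeq                                  // stand-in for partitions
      .map(_.foldLeft(Agg(0L, 0.0)) { case (a, (y, p)) => a.add(y, p) })
      .reduce(_ merge _)
    println(s"mean loss = ${agg.lossSum / agg.count}")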

mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala

Lines changed: 13 additions & 15 deletions
@@ -26,7 +26,7 @@ import org.apache.spark.sql.{DataFrame, Row}
 class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   @transient var dataset: DataFrame = _
-  @transient var datasetNR: DataFrame = _
+  @transient var datasetWithoutIntercept: DataFrame = _
 
   /**
    * In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
@@ -45,7 +45,11 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     dataset = sqlContext.createDataFrame(
       sc.parallelize(LinearDataGenerator.generateLinearInput(
         6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2))
-    datasetNR = sqlContext.createDataFrame(
+    /**
+     * datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating
+     * training model without intercept
+     */
+    datasetWithoutIntercept = sqlContext.createDataFrame(
       sc.parallelize(LinearDataGenerator.generateLinearInput(
         0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2))
 
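Reading the generator arguments (intercept, weights, feature means, feature variances, number of points, seed, noise level): the first dataset follows label = 6.3 + 4.7 * x1 + 7.2 * x2 + noise, and the second is the same model with a true intercept of 0.0, which is what makes it a fair fixture for setFitIntercept(false). A toy analogue (plain Scala, Gaussian features assumed rather than the generator's exact sampling scheme):

    import scala.util.Random
    val rnd = new Random(42)
    // One hypothetical sample from the zero-intercept model above.
    val x1 = 0.9 + math.sqrt(0.7) * rnd.nextGaussian()   // mean 0.9, variance 0.7
    val x2 = -1.3 + math.sqrt(1.2) * rnd.nextGaussian()  // mean -1.3, variance 1.2
    val label = 0.0 + 4.7 * x1 + 7.2 * x2 + 0.1 * rnd.nextGaussian()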

@@ -88,15 +92,9 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
   test("linear regression without intercept without regularization") {
     val trainer = (new LinearRegression).setFitIntercept(false)
     val model = trainer.fit(dataset)
-    val modelNR = trainer.fit(datasetNR)
+    val modelWithoutIntercept = trainer.fit(datasetWithoutIntercept)
 
     /**
-     * Using the following R code to load the data and train the model using glmnet package.
-     *
-     * library("glmnet")
-     * data <- read.csv("path", header=FALSE, stringsAsFactors=FALSE)
-     * features <- as.matrix(data.frame(as.numeric(data$V2), as.numeric(data$V3)))
-     * label <- as.numeric(data$V1)
      * weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0,
      *   intercept = FALSE))
      * > weights
@@ -113,18 +111,18 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(model.weights(1) ~== weightsR(1) relTol 1E-3)
     /**
      * Then again with the data with no intercept:
-     * > weightsNR
+     * > weightsWithoutIntercept
      * 3 x 1 sparse Matrix of class "dgCMatrix"
      *                           s0
      * (Intercept)               .
      * as.numeric.data3.V2.      4.70011
      * as.numeric.data3.V3.      7.19943
      */
-    val weightsRNR = Array(4.70011, 7.19943)
+    val weightsWithoutInterceptR = Array(4.70011, 7.19943)
 
-    assert(modelNR.intercept ~== 0 relTol 1E-3)
-    assert(modelNR.weights(0) ~== weightsRNR(0) relTol 1E-3)
-    assert(modelNR.weights(1) ~== weightsRNR(1) relTol 1E-3)
+    assert(modelWithoutIntercept.intercept ~== 0 relTol 1E-3)
+    assert(modelWithoutIntercept.weights(0) ~== weightsWithoutInterceptR(0) relTol 1E-3)
+    assert(modelWithoutIntercept.weights(1) ~== weightsWithoutInterceptR(1) relTol 1E-3)
   }
 
 

@@ -186,7 +184,6 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
     }
   }
 
-
   test("linear regression with intercept with L2 regularization") {
     val trainer = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3)
     val model = trainer.fit(dataset)
@@ -272,6 +269,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
       assert(prediction1 ~== prediction2 relTol 1E-5)
     }
   }
+
   test("linear regression without intercept with ElasticNet regularization") {
     val trainer = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6)
       .setFitIntercept(false)

0 commit comments