From 11cd9c13b78a7c1d9ecfb2950242e0525c3bf303 Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Tue, 20 Oct 2015 22:50:23 +0900 Subject: [PATCH 01/11] [SPARK-11207][ML] Add test cases for solver selection of LinearRegression as followup. --- .../ml/regression/LinearRegressionSuite.scala | 57 ++++++++++++------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index a6e0c72ba9030..c3df4c5b11c90 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -34,6 +34,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { private val seed: Int = 42 @transient var dataset: DataFrame = _ @transient var datasetWithoutIntercept: DataFrame = _ + @transient var datasetWithBigFeature: DataFrame = _ /* In `LinearRegressionSuite`, we will make sure that the model trained by SparkML @@ -59,6 +60,14 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { datasetWithoutIntercept = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( 0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2)) + + val r = new Random(seed) + val featureSize = 4100 + datasetWithBigFeature = sqlContext.createDataFrame( + sc.parallelize(LinearDataGenerator.generateLinearInput( + 0.0, Seq.fill(featureSize)(r.nextDouble).toArray, Seq.fill(featureSize)(r.nextDouble).toArray, + Seq.fill(featureSize)(r.nextDouble).toArray, 200, seed, 0.1 + ), 2)) } test("params") { @@ -186,9 +195,6 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) .setSolver(solver).setStandardization(false) - var model1: LinearRegressionModel = null - var model2: LinearRegressionModel = null - // Normal optimizer is not supported with only L1 regularization case. if (solver == "normal") { intercept[IllegalArgumentException] { @@ -196,9 +202,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { trainer2.fit(dataset) } } else { - model1 = trainer1.fit(dataset) - model2 = trainer2.fit(dataset) - + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57)) @@ -247,9 +252,6 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) .setFitIntercept(false).setStandardization(false).setSolver(solver) - var model1: LinearRegressionModel = null - var model2: LinearRegressionModel = null - // Normal optimizer is not supported with only L1 regularization case. 
if (solver == "normal") { intercept[IllegalArgumentException] { @@ -257,8 +259,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { trainer2.fit(dataset) } } else { - model1 = trainer1.fit(dataset) - model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, @@ -408,9 +410,6 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) .setStandardization(false).setSolver(solver) - var model1: LinearRegressionModel = null - var model2: LinearRegressionModel = null - // Normal optimizer is not supported with non-zero elasticnet parameter. if (solver == "normal") { intercept[IllegalArgumentException] { @@ -418,8 +417,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { trainer2.fit(dataset) } } else { - model1 = trainer1.fit(dataset) - model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6)) @@ -469,9 +468,6 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) .setFitIntercept(false).setStandardization(false).setSolver(solver) - var model1: LinearRegressionModel = null - var model2: LinearRegressionModel = null - // Normal optimizer is not supported with non-zero elasticnet parameter. if (solver == "normal") { intercept[IllegalArgumentException] { @@ -479,8 +475,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { trainer2.fit(dataset) } } else { - model1 = trainer1.fit(dataset) - model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6, @@ -531,7 +527,6 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainerNoPredictionCol = trainer.setPredictionCol("") val modelNoPredictionCol = trainerNoPredictionCol.fit(dataset) - // Training results for the model should be available assert(model.hasSummary) assert(modelNoPredictionCol.hasSummary) @@ -585,6 +580,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { .objectiveHistory .sliding(2) .forall(x => x(0) >= x(1))) + } else { + // To verify that the normal solver is used here. + assert(model.summary.objectiveHistory.length == 1) + assert(model.summary.objectiveHistory(0) == 0.0) } } } @@ -693,4 +692,18 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model4a0.weights ~== model4b.weights absTol 1E-3) } } + + test("linear regression model with l-bfgs with big feature datasets") { + val trainer = new LinearRegression().setSolver("auto") + val model = trainer.fit(datasetWithBigFeature) + + // Training results for the model should be available + assert(model.hasSummary) + // When LBFGS is used as the optimizer, the objective history can be restored.
+ assert( + model.summary + .objectiveHistory + .sliding(2) + .forall(x => x(0) >= x(1))) + } } From 28427d29e8c398f25f9aac10f86074da084a933f Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Tue, 20 Oct 2015 23:05:36 +0900 Subject: [PATCH 02/11] Fix scalastyle --- .../org/apache/spark/ml/regression/LinearRegressionSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index c3df4c5b11c90..8464148668deb 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -65,7 +65,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val featureSize = 4100 datasetWithBigFeature = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( - 0.0, Seq.fill(featureSize)(r.nextDouble).toArray, Seq.fill(featureSize)(r.nextDouble).toArray, + 0.0, Seq.fill(featureSize)(r.nextDouble).toArray, + Seq.fill(featureSize)(r.nextDouble).toArray, Seq.fill(featureSize)(r.nextDouble).toArray, 200, seed, 0.1 ), 2)) } From f85bca6667dcebbfccbd50cde46b11f6855d1974 Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Wed, 21 Oct 2015 23:59:09 +0900 Subject: [PATCH 03/11] [SPARK-11207] Improve test case with many feature datasets --- .../mllib/util/LinearDataGenerator.scala | 53 +++++++++++++++++++ .../ml/regression/LinearRegressionSuite.scala | 23 ++++---- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index d0ba454f379a9..e84b6708bab32 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -124,6 +124,59 @@ object LinearDataGenerator { y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2))) } + /** + * + * @param intercept Data intercept + * @param weights Weights to be applied. + * @param xMean the mean of the generated features. Lots of time, if the features are not properly + * standardized, the algorithm with poor implementation will have difficulty + * to converge. + * @param xVariance the variance of the generated features. + * @param nPoints Number of points in sample. + * @param seed Random seed + * @param eps Epsilon scaling factor. + * @return Seq of LabeledPoint includes sparse vectors.. 
+ */ + @Since("1.6.0") + def generateLinearSparseInput( + intercept: Double, + weights: Array[Double], + xMean: Array[Double], + xVariance: Array[Double], + nPoints: Int, + seed: Int, + eps: Double): Seq[LabeledPoint] = { + val rnd = new Random(seed) + val x = Array.fill[Array[Double]](nPoints)( + Array.fill[Double](weights.length)(rnd.nextDouble())) + + x.foreach { v => + var i = 0 + val len = v.length + while (i < len) { + if (rnd.nextDouble() < 0.7) { + v(i) = 0.0 + } else { + v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) + } + i += 1 + } + } + + val y = x.map { xi => + blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian() + } + + val sparseX = x.map { (v: Array[Double]) => + v.zipWithIndex.filter{ + case (d: Double, i: Int) => d != 0.0 + }.map { + case (d: Double, i: Int) => (i, d) + } + } + y.zip(sparseX).map(p => LabeledPoint(p._1, Vectors.sparse(weights.length, p._2))) + } + /** * Generate an RDD containing sample data for Linear Regression models - including Ridge, Lasso, * and unregularized variants. diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index 8464148668deb..a14ec6307d863 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -34,7 +34,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { private val seed: Int = 42 @transient var dataset: DataFrame = _ @transient var datasetWithoutIntercept: DataFrame = _ - @transient var datasetWithBigFeature: DataFrame = _ + @transient var datasetWithManyFeature: DataFrame = _ /* In `LinearRegressionSuite`, we will make sure that the model trained by SparkML @@ -52,22 +52,27 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { super.beforeAll() dataset = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( - 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2)) + intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), + xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2)) /* datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating training model without intercept */ datasetWithoutIntercept = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( - 0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2)) + intercept = 0.0, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), + xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2)) val r = new Random(seed) + // When feature size is larger than 4096, normal optimizer cannot be chosen + // as the solver of linear regression in the case of "auto" mode.
val featureSize = 4100 - datasetWithBigFeature = sqlContext.createDataFrame( - sc.parallelize(LinearDataGenerator.generateLinearInput( - 0.0, Seq.fill(featureSize)(r.nextDouble).toArray, - Seq.fill(featureSize)(r.nextDouble).toArray, - Seq.fill(featureSize)(r.nextDouble).toArray, 200, seed, 0.1 + datasetWithManyFeature = sqlContext.createDataFrame( + sc.parallelize(LinearDataGenerator.generateLinearSparseInput( + intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray, + xMean = Seq.fill(featureSize)(r.nextDouble).toArray, + xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200, + seed = seed, eps = 0.1 ), 2)) } @@ -696,7 +701,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { test("linear regression model with l-bfgs with big feature datasets") { val trainer = new LinearRegression().setSolver("auto") - val model = trainer.fit(datasetWithBigFeature) + val model = trainer.fit(datasetWithManyFeature) // Training results for the model should be available assert(model.hasSummary) From f6b2256fd669585f7e3b082730a63d0dbda631aa Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Thu, 22 Oct 2015 10:21:09 +0900 Subject: [PATCH 04/11] [SPARK-11207] Remove extra lines --- .../scala/org/apache/spark/mllib/util/LinearDataGenerator.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index e84b6708bab32..d382f34e61357 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -83,7 +83,6 @@ object LinearDataGenerator { nPoints, seed, eps)} /** - * * @param intercept Data intercept * @param weights Weights to be applied. * @param xMean the mean of the generated features. Lots of time, if the features are not properly @@ -125,7 +124,6 @@ object LinearDataGenerator { } /** - * * @param intercept Data intercept * @param weights Weights to be applied. * @param xMean the mean of the generated features. 
Lots of time, if the features are not properly From 2082d4781eeb009c3a0c45d4e92b546960b5a7ff Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Sat, 24 Oct 2015 10:28:27 +0900 Subject: [PATCH 05/11] [SPARK-11207] Pass sparcity to generateLinearSparseInput --- .../org/apache/spark/mllib/util/LinearDataGenerator.scala | 6 ++++-- .../apache/spark/ml/regression/LinearRegressionSuite.scala | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index d382f34e61357..219ecc709d6cc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -143,7 +143,9 @@ object LinearDataGenerator { xVariance: Array[Double], nPoints: Int, seed: Int, - eps: Double): Seq[LabeledPoint] = { + eps: Double, + sparcity: Double): Seq[LabeledPoint] = { + require(sparcity <= 1.0) val rnd = new Random(seed) val x = Array.fill[Array[Double]](nPoints)( Array.fill[Double](weights.length)(rnd.nextDouble())) @@ -152,7 +154,7 @@ object LinearDataGenerator { var i = 0 val len = v.length while (i < len) { - if (rnd.nextDouble() < 0.7) { + if (rnd.nextDouble() <= sparcity) { v(i) = 0.0 } else { v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index a14ec6307d863..a9c07225747ea 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -72,8 +72,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray, xMean = Seq.fill(featureSize)(r.nextDouble).toArray, xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200, - seed = seed, eps = 0.1 - ), 2)) + seed = seed, eps = 0.1, sparcity = 0.7), 2)) } test("params") { From 003d3bd87f3936c4fd6ee0dc77ca81f3811bcbd7 Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Sun, 25 Oct 2015 08:52:59 +0900 Subject: [PATCH 06/11] [SPARK-11207] Add new API for generateLinearInput --- .../mllib/util/LinearDataGenerator.scala | 36 ++++++++----------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index 219ecc709d6cc..36c92bd3ad730 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -103,26 +103,10 @@ object LinearDataGenerator { nPoints: Int, seed: Int, eps: Double): Seq[LabeledPoint] = { - - val rnd = new Random(seed) - val x = Array.fill[Array[Double]](nPoints)( - Array.fill[Double](weights.length)(rnd.nextDouble())) - - x.foreach { v => - var i = 0 - val len = v.length - while (i < len) { - v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) - i += 1 - } - } - - val y = x.map { xi => - blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian() - } - y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2))) + generateLinearInputInternal(intercept, weights, xMean, xVariance, nPoints, seed, eps, 0.0) } + 
/** * @param intercept Data intercept * @param weights Weights to be applied. @@ -133,10 +117,12 @@ object LinearDataGenerator { * @param nPoints Number of points in sample. * @param seed Random seed * @param eps Epsilon scaling factor. - * @return Seq of LabeledPoint includes sparse vectors.. + * @param sparcity The ratio of zero elements. If it is 0.0, LabeledPoints with + * DenseVector is returned. + * @return Seq of input. */ @Since("1.6.0") - def generateLinearSparseInput( + def generateLinearInputInternal( intercept: Double, weights: Array[Double], xMean: Array[Double], @@ -168,13 +154,19 @@ object LinearDataGenerator { } val sparseX = x.map { (v: Array[Double]) => - v.zipWithIndex.filter{ + v.zipWithIndex.filter { case (d: Double, i: Int) => d != 0.0 }.map { case (d: Double, i: Int) => (i, d) } } - y.zip(sparseX).map(p => LabeledPoint(p._1, Vectors.sparse(weights.length, p._2))) + if (sparcity == 0.0) { + // Return LabeledPoints with DenseVector + y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2))) + } else { + // Return LabeledPoints with SparseVector + y.zip(sparseX).map(p => LabeledPoint(p._1, Vectors.sparse(weights.length, p._2))) + } } /** From 0a4303356455f28ca3b87ffd446cb5ef5f25d0e2 Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Sun, 25 Oct 2015 15:59:52 +0900 Subject: [PATCH 07/11] [SPARK-11207] Fix tests --- .../org/apache/spark/ml/regression/LinearRegressionSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index a9c07225747ea..dd055878155f6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -68,7 +68,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { // as the solver of linear regression in the case of "auto" mode. 
val featureSize = 4100 datasetWithManyFeature = sqlContext.createDataFrame( - sc.parallelize(LinearDataGenerator.generateLinearSparseInput( + sc.parallelize(LinearDataGenerator.generateLinearInputInternal( intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray, xMean = Seq.fill(featureSize)(r.nextDouble).toArray, xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200, From 59383fd41f1d6b96274c564eb2fb7c96f5ab07e0 Mon Sep 17 00:00:00 2001 From: lewuathe Date: Sun, 25 Oct 2015 18:24:01 +0900 Subject: [PATCH 08/11] [SPARK-11207] Fix random values used by unit tests --- .../org/apache/spark/mllib/util/LinearDataGenerator.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index 36c92bd3ad730..95100e7ad31d7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -139,8 +139,9 @@ object LinearDataGenerator { x.foreach { v => var i = 0 val len = v.length + val sparceRnd = new Random(seed) while (i < len) { - if (rnd.nextDouble() <= sparcity) { + if (sparceRnd.nextDouble() < sparcity) { v(i) = 0.0 } else { v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) From 888b2168c86ac8e9302c900f9ace10fd6cc69d14 Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Fri, 30 Oct 2015 15:39:05 +0900 Subject: [PATCH 09/11] Update Fri Oct 30 15:39:05 JST 2015 --- .../mllib/util/LinearDataGenerator.scala | 34 +++--- .../ml/regression/LinearRegressionSuite.scala | 104 +++++++++--------- 2 files changed, 70 insertions(+), 68 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index 36c92bd3ad730..af67f1fdf5392 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -77,10 +77,9 @@ object LinearDataGenerator { nPoints: Int, seed: Int, eps: Double = 0.1): Seq[LabeledPoint] = { - generateLinearInput(intercept, weights, - Array.fill[Double](weights.length)(0.0), - Array.fill[Double](weights.length)(1.0 / 3.0), - nPoints, seed, eps)} + generateLinearInput(intercept, weights, Array.fill[Double](weights.length)(0.0), + Array.fill[Double](weights.length)(1.0 / 3.0), nPoints, seed, eps) + } /** * @param intercept Data intercept @@ -103,7 +102,7 @@ object LinearDataGenerator { nPoints: Int, seed: Int, eps: Double): Seq[LabeledPoint] = { - generateLinearInputInternal(intercept, weights, xMean, xVariance, nPoints, seed, eps, 0.0) + generateLinearInput(intercept, weights, xMean, xVariance, nPoints, seed, eps, 0.0) } @@ -117,12 +116,12 @@ object LinearDataGenerator { * @param nPoints Number of points in sample. * @param seed Random seed * @param eps Epsilon scaling factor. - * @param sparcity The ratio of zero elements. If it is 0.0, LabeledPoints with + * @param sparsity The ratio of zero elements. If it is 0.0, LabeledPoints with * DenseVector is returned. * @return Seq of input. 
*/ @Since("1.6.0") - def generateLinearInputInternal( + def generateLinearInput( intercept: Double, weights: Array[Double], xMean: Array[Double], @@ -130,8 +129,8 @@ object LinearDataGenerator { nPoints: Int, seed: Int, eps: Double, - sparcity: Double): Seq[LabeledPoint] = { - require(sparcity <= 1.0) + sparsity: Double): Seq[LabeledPoint] = { + require(0.0 <= sparsity && sparsity <= 1.0) val rnd = new Random(seed) val x = Array.fill[Array[Double]](nPoints)( Array.fill[Double](weights.length)(rnd.nextDouble())) @@ -140,7 +139,7 @@ object LinearDataGenerator { var i = 0 val len = v.length while (i < len) { - if (rnd.nextDouble() <= sparcity) { + if (rnd.nextDouble() <= sparsity) { v(i) = 0.0 } else { v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) @@ -160,12 +159,15 @@ object LinearDataGenerator { case (d: Double, i: Int) => (i, d) } } - if (sparcity == 0.0) { - // Return LabeledPoints with DenseVector - y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2))) - } else { - // Return LabeledPoints with SparseVector - y.zip(sparseX).map(p => LabeledPoint(p._1, Vectors.sparse(weights.length, p._2))) + + y.zip(x).map { p => + if (sparsity == 0.0) { + // Return LabeledPoints with DenseVector + LabeledPoint(p._1, Vectors.dense(p._2)) + } else { + // Return LabeledPoints with SparseVector + LabeledPoint(p._1, Vectors.dense(p._2).toSparse) + } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index dd055878155f6..b1696dba6d534 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -32,9 +32,9 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { private val seed: Int = 42 - @transient var dataset: DataFrame = _ - @transient var datasetWithoutIntercept: DataFrame = _ - @transient var datasetWithManyFeature: DataFrame = _ + @transient var datasetWithDenseFeature: DataFrame = _ + @transient var datasetWithDenseFeatureWithoutIntercept: DataFrame = _ + @transient var datasetWithSparseFeature: DataFrame = _ /* In `LinearRegressionSuite`, we will make sure that the model trained by SparkML @@ -50,7 +50,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { */ override def beforeAll(): Unit = { super.beforeAll() - dataset = sqlContext.createDataFrame( + datasetWithDenseFeature = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2)) /* datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating training model without intercept */ - datasetWithoutIntercept = sqlContext.createDataFrame( + datasetWithDenseFeatureWithoutIntercept = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 0.0, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2)) val r = new Random(seed) // When feature size is larger than 4096, normal optimizer cannot be chosen // as the solver of
linear regression in the case of "auto" mode. val featureSize = 4100 - datasetWithManyFeature = sqlContext.createDataFrame( - sc.parallelize(LinearDataGenerator.generateLinearInputInternal( + datasetWithSparseFeature = sqlContext.createDataFrame( + sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray, xMean = Seq.fill(featureSize)(r.nextDouble).toArray, xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200, - seed = seed, eps = 0.1, sparcity = 0.7), 2)) + seed = seed, eps = 0.1, sparsity = 0.7), 2)) } test("params") { @@ -91,19 +91,19 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(lir.getFitIntercept) assert(lir.getStandardization) assert(lir.getSolver == "auto") - val model = lir.fit(dataset) + val model = lir.fit(datasetWithDenseFeature) // copied model must have the same parent. MLTestingUtils.checkCopy(model) - model.transform(dataset) + model.transform(datasetWithDenseFeature) .select("label", "prediction") .collect() assert(model.getFeaturesCol === "features") assert(model.getPredictionCol === "prediction") assert(model.intercept !== 0.0) assert(model.hasParent) - val numFeatures = dataset.select("features").first().getAs[Vector](0).size + val numFeatures = datasetWithDenseFeature.select("features").first().getAs[Vector](0).size assert(model.numFeatures === numFeatures) } @@ -112,8 +112,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer1 = new LinearRegression().setSolver(solver) // The result should be the same regardless of standardization without regularization val trainer2 = (new LinearRegression).setStandardization(false).setSolver(solver) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* Using the following R code to load the data and train the model using glmnet package. 
@@ -138,7 +138,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR relTol 1E-3) assert(model2.weights ~= weightsR relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -153,10 +153,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { // Without regularization the results should be the same val trainer2 = (new LinearRegression).setFitIntercept(false).setStandardization(false) .setSolver(solver) - val model1 = trainer1.fit(dataset) - val modelWithoutIntercept1 = trainer1.fit(datasetWithoutIntercept) - val model2 = trainer2.fit(dataset) - val modelWithoutIntercept2 = trainer2.fit(datasetWithoutIntercept) + val model1 = trainer1.fit(datasetWithDenseFeature) + val modelWithoutIntercept1 = trainer1.fit(datasetWithDenseFeatureWithoutIntercept) + val model2 = trainer2.fit(datasetWithDenseFeature) + val modelWithoutIntercept2 = trainer2.fit(datasetWithDenseFeatureWithoutIntercept) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0, @@ -203,12 +203,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { // Normal optimizer is not supported with only L1 regularization case. if (solver == "normal") { intercept[IllegalArgumentException] { - trainer1.fit(dataset) - trainer2.fit(dataset) + trainer1.fit(datasetWithDenseFeature) + trainer2.fit(datasetWithDenseFeature) } } else { - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57)) @@ -240,7 +240,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 relTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -260,12 +260,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { // Normal optimizer is not supported with only L1 regularization case. 
if (solver == "normal") { intercept[IllegalArgumentException] { - trainer1.fit(dataset) - trainer2.fit(dataset) + trainer1.fit(datasetWithDenseFeature) + trainer2.fit(datasetWithDenseFeature) } } else { - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, @@ -299,7 +299,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 absTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -315,8 +315,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { .setSolver(solver) val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) .setStandardization(false).setSolver(solver) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3)) @@ -349,7 +349,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 relTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -364,8 +364,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { .setFitIntercept(false).setSolver(solver) val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) .setFitIntercept(false).setStandardization(false).setSolver(solver) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, @@ -399,7 +399,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 absTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -418,12 +418,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { // Normal optimizer is not supported with non-zero elasticnet parameter. 
if (solver == "normal") { intercept[IllegalArgumentException] { - trainer1.fit(dataset) - trainer2.fit(dataset) + trainer1.fit(datasetWithDenseFeature) + trainer2.fit(datasetWithDenseFeature) } } else { - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6)) @@ -456,7 +456,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 relTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -476,12 +476,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { // Normal optimizer is not supported with non-zero elasticnet parameter. if (solver == "normal") { intercept[IllegalArgumentException] { - trainer1.fit(dataset) - trainer2.fit(dataset) + trainer1.fit(datasetWithDenseFeature) + trainer2.fit(datasetWithDenseFeature) } } else { - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6, @@ -515,7 +515,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 absTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -528,26 +528,26 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { test("linear regression model training summary") { Seq("auto", "l-bfgs", "normal").foreach { solver => val trainer = new LinearRegression().setSolver(solver) - val model = trainer.fit(dataset) + val model = trainer.fit(datasetWithDenseFeature) val trainerNoPredictionCol = trainer.setPredictionCol("") - val modelNoPredictionCol = trainerNoPredictionCol.fit(dataset) + val modelNoPredictionCol = trainerNoPredictionCol.fit(datasetWithDenseFeature) // Training results for the model should be available assert(model.hasSummary) assert(modelNoPredictionCol.hasSummary) // Schema should be a superset of the input dataset - assert((dataset.schema.fieldNames.toSet + "prediction").subsetOf( + assert((datasetWithDenseFeature.schema.fieldNames.toSet + "prediction").subsetOf( model.summary.predictions.schema.fieldNames.toSet)) // Validate that we re-insert a prediction column for evaluation val modelNoPredictionColFieldNames = modelNoPredictionCol.summary.predictions.schema.fieldNames - assert((dataset.schema.fieldNames.toSet).subsetOf( + assert((datasetWithDenseFeature.schema.fieldNames.toSet).subsetOf( modelNoPredictionColFieldNames.toSet)) assert(modelNoPredictionColFieldNames.exists(s => s.startsWith("prediction_"))) // Residuals in 
[[LinearRegressionResults]] should equal those manually computed - val expectedResiduals = dataset.select("features", "label") + val expectedResiduals = datasetWithDenseFeature.select("features", "label") .map { case Row(features: DenseVector, label: Double) => val prediction = features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept @@ -596,10 +596,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { test("linear regression model testset evaluation summary") { Seq("auto", "l-bfgs", "normal").foreach { solver => val trainer = new LinearRegression().setSolver(solver) - val model = trainer.fit(dataset) + val model = trainer.fit(datasetWithDenseFeature) // Evaluating on training dataset should yield results summary equal to training summary - val testSummary = model.evaluate(dataset) + val testSummary = model.evaluate(datasetWithDenseFeature) assert(model.summary.meanSquaredError ~== testSummary.meanSquaredError relTol 1E-5) assert(model.summary.r2 ~== testSummary.r2 relTol 1E-5) model.summary.residuals.select("residuals").collect() @@ -700,7 +700,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { test("linear regression model with l-bfgs with big feature datasets") { val trainer = new LinearRegression().setSolver("auto") - val model = trainer.fit(datasetWithManyFeature) + val model = trainer.fit(datasetWithSparseFeature) // Training results for the model should be available assert(model.hasSummary) From 74de81ee4439c121437510b9b8e176a4e7df0724 Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Fri, 30 Oct 2015 16:26:53 +0900 Subject: [PATCH 10/11] Fix style --- .../ml/regression/LinearRegressionSuite.scala | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index b1696dba6d534..b0f66e3223f43 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -240,11 +240,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 relTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) } } } @@ -299,11 +300,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 absTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) + 
model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) } } } @@ -456,10 +458,11 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 relTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept + val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + + model1.intercept assert(prediction1 ~== prediction2 relTol 1E-5) } } @@ -515,10 +518,11 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 absTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept + val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + + model1.intercept assert(prediction1 ~== prediction2 relTol 1E-5) } } From 241ec7293607d670c93293b5872b21ed0c9f411a Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Fri, 30 Oct 2015 17:57:25 +0900 Subject: [PATCH 11/11] Minor fixes --- .../apache/spark/ml/regression/LinearRegressionSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index b0f66e3223f43..a2a5c0bbdcb90 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -53,7 +53,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { datasetWithDenseFeature = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), - xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2)) + xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2)) /* datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating training model without intercept */ datasetWithDenseFeatureWithoutIntercept = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 0.0, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), - xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2)) + xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2)) val r = new Random(seed) // When feature size is larger than 4096, normal optimizer cannot be chosen @@ -72,7 +72,7 @@ class
LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray, xMean = Seq.fill(featureSize)(r.nextDouble).toArray, xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200, - seed = seed, eps = 0.1, sparsity = 0.7), 2)) + seed, eps = 0.1, sparsity = 0.7), 2)) } test("params") {
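Taken together, the series pins down the solver-selection contract that the new tests exercise: with setSolver("auto"), LinearRegression takes the normal-equation path only while the feature dimension stays at or below 4096 (and only without L1/elastic-net regularization); above that it falls back to L-BFGS, whose per-iteration objective history is exposed through the model summary. What follows is a minimal sketch of that fallback, for illustration only — it is not part of the patches. It assumes the final patch-09 API (generateLinearInput with a sparsity parameter) and a harness that supplies sc and sqlContext the way MLlibTestSparkContext does; all values are illustrative.

import scala.util.Random
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.util.LinearDataGenerator

val r = new Random(42)
val featureSize = 4100 // just above the 4096 limit of the normal solver
val points = LinearDataGenerator.generateLinearInput(
  intercept = 0.0,
  weights = Seq.fill(featureSize)(r.nextDouble).toArray,
  xMean = Seq.fill(featureSize)(r.nextDouble).toArray,
  xVariance = Seq.fill(featureSize)(r.nextDouble).toArray,
  nPoints = 200, seed = 42, eps = 0.1, sparsity = 0.7)
val df = sqlContext.createDataFrame(sc.parallelize(points, 2))

// "auto" cannot use the normal equation here, so L-BFGS is selected.
val model = new LinearRegression().setSolver("auto").fit(df)
// L-BFGS records a non-increasing objective value per iteration; the
// normal solver would instead report a single 0.0 entry.
assert(model.summary.objectiveHistory.length > 1)
assert(model.summary.objectiveHistory.sliding(2).forall(x => x(0) >= x(1)))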