From 11cd9c13b78a7c1d9ecfb2950242e0525c3bf303 Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Tue, 20 Oct 2015 22:50:23 +0900 Subject: [PATCH 01/11] [SPARK-11207][ML] Add test cases for solver selection of LinearRegression as followup. --- .../ml/regression/LinearRegressionSuite.scala | 57 ++++++++++++------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index a6e0c72ba9030..c3df4c5b11c90 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -34,6 +34,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { private val seed: Int = 42 @transient var dataset: DataFrame = _ @transient var datasetWithoutIntercept: DataFrame = _ + @transient var datasetWithBigFeature: DataFrame = _ /* In `LinearRegressionSuite`, we will make sure that the model trained by SparkML @@ -59,6 +60,14 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { datasetWithoutIntercept = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( 0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2)) + + val r = new Random(seed) + val featureSize = 4100 + datasetWithBigFeature = sqlContext.createDataFrame( + sc.parallelize(LinearDataGenerator.generateLinearInput( + 0.0, Seq.fill(featureSize)(r.nextDouble).toArray, Seq.fill(featureSize)(r.nextDouble).toArray, + Seq.fill(featureSize)(r.nextDouble).toArray, 200, seed, 0.1 + ), 2)) } test("params") { @@ -186,9 +195,6 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) .setSolver(solver).setStandardization(false) - var model1: LinearRegressionModel = null - var model2: LinearRegressionModel = null - // Normal optimizer is not supported with only L1 regularization case. if (solver == "normal") { intercept[IllegalArgumentException] { @@ -196,9 +202,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { trainer2.fit(dataset) } } else { - model1 = trainer1.fit(dataset) - model2 = trainer2.fit(dataset) - + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57)) @@ -247,9 +252,6 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) .setFitIntercept(false).setStandardization(false).setSolver(solver) - var model1: LinearRegressionModel = null - var model2: LinearRegressionModel = null - // Normal optimizer is not supported with only L1 regularization case. 
if (solver == "normal") { intercept[IllegalArgumentException] { @@ -257,8 +259,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { trainer2.fit(dataset) } } else { - model1 = trainer1.fit(dataset) - model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, @@ -408,9 +410,6 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) .setStandardization(false).setSolver(solver) - var model1: LinearRegressionModel = null - var model2: LinearRegressionModel = null - // Normal optimizer is not supported with non-zero elasticnet parameter. if (solver == "normal") { intercept[IllegalArgumentException] { @@ -418,8 +417,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { trainer2.fit(dataset) } } else { - model1 = trainer1.fit(dataset) - model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6)) @@ -469,9 +468,6 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) .setFitIntercept(false).setStandardization(false).setSolver(solver) - var model1: LinearRegressionModel = null - var model2: LinearRegressionModel = null - // Normal optimizer is not supported with non-zero elasticnet parameter. if (solver == "normal") { intercept[IllegalArgumentException] { @@ -479,8 +475,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { trainer2.fit(dataset) } } else { - model1 = trainer1.fit(dataset) - model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6, @@ -531,7 +527,6 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainerNoPredictionCol = trainer.setPredictionCol("") val modelNoPredictionCol = trainerNoPredictionCol.fit(dataset) - // Training results for the model should be available assert(model.hasSummary) assert(modelNoPredictionCol.hasSummary) @@ -585,6 +580,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { .objectiveHistory .sliding(2) .forall(x => x(0) >= x(1))) + } else { + // To verify that the normal solver is used here. + assert(model.summary.objectiveHistory.length == 1) + assert(model.summary.objectiveHistory(0) == 0.0) } } } @@ -693,4 +692,18 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model4a0.weights ~== model4b.weights absTol 1E-3) } } + + test("linear regression model with l-bfgs with big feature datasets") { + val trainer = new LinearRegression().setSolver("auto") + val model = trainer.fit(datasetWithBigFeature) + + // Training results for the model should be available + assert(model.hasSummary) + // When LBFGS is used as the optimizer, the objective history can be restored.
+ assert( + model.summary + .objectiveHistory + .sliding(2) + .forall(x => x(0) >= x(1))) + } } From 28427d29e8c398f25f9aac10f86074da084a933f Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Tue, 20 Oct 2015 23:05:36 +0900 Subject: [PATCH 02/11] Fix scalastyle --- .../org/apache/spark/ml/regression/LinearRegressionSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index c3df4c5b11c90..8464148668deb 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -65,7 +65,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val featureSize = 4100 datasetWithBigFeature = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( - 0.0, Seq.fill(featureSize)(r.nextDouble).toArray, Seq.fill(featureSize)(r.nextDouble).toArray, + 0.0, Seq.fill(featureSize)(r.nextDouble).toArray, + Seq.fill(featureSize)(r.nextDouble).toArray, Seq.fill(featureSize)(r.nextDouble).toArray, 200, seed, 0.1 ), 2)) } From f85bca6667dcebbfccbd50cde46b11f6855d1974 Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Wed, 21 Oct 2015 23:59:09 +0900 Subject: [PATCH 03/11] [SPARK-11207] Improve test case with many feature datasets --- .../mllib/util/LinearDataGenerator.scala | 53 +++++++++++++++++++ .../ml/regression/LinearRegressionSuite.scala | 23 ++++---- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index d0ba454f379a9..e84b6708bab32 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -124,6 +124,59 @@ object LinearDataGenerator { y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2))) } + /** + * + * @param intercept Data intercept + * @param weights Weights to be applied. + * @param xMean the mean of the generated features. Lots of time, if the features are not properly + * standardized, the algorithm with poor implementation will have difficulty + * to converge. + * @param xVariance the variance of the generated features. + * @param nPoints Number of points in sample. + * @param seed Random seed + * @param eps Epsilon scaling factor. + * @return Seq of LabeledPoint includes sparse vectors.. 
+ */ + @Since("1.6.0") + def generateLinearSparseInput( + intercept: Double, + weights: Array[Double], + xMean: Array[Double], + xVariance: Array[Double], + nPoints: Int, + seed: Int, + eps: Double): Seq[LabeledPoint] = { + val rnd = new Random(seed) + val x = Array.fill[Array[Double]](nPoints)( + Array.fill[Double](weights.length)(rnd.nextDouble())) + + x.foreach { v => + var i = 0 + val len = v.length + while (i < len) { + if (rnd.nextDouble() < 0.7) { + v(i) = 0.0 + } else { + v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) + } + i += 1 + } + } + + val y = x.map { xi => + blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian() + } + + val sparseX = x.map { (v: Array[Double]) => + v.zipWithIndex.filter{ + case (d: Double, i: Int) => d != 0.0 + }.map { + case (d: Double, i: Int) => (i, d) + } + } + y.zip(sparseX).map(p => LabeledPoint(p._1, Vectors.sparse(weights.length, p._2))) + } + /** * Generate an RDD containing sample data for Linear Regression models - including Ridge, Lasso, * and unregularized variants. diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index 8464148668deb..a14ec6307d863 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -34,7 +34,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { private val seed: Int = 42 @transient var dataset: DataFrame = _ @transient var datasetWithoutIntercept: DataFrame = _ - @transient var datasetWithBigFeature: DataFrame = _ + @transient var datasetWithManyFeature: DataFrame = _ /* In `LinearRegressionSuite`, we will make sure that the model trained by SparkML @@ -52,22 +52,27 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { super.beforeAll() dataset = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( - 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2)) + intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), + xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2)) /* datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating training model without intercept */ datasetWithoutIntercept = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( - 0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2)) + intercept = 0.0, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), + xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2)) val r = new Random(seed) + // When feature size is larger than 4096, normal optimizer cannot be chosen + // as the solver of linear regression in the case of "auto" mode.
val featureSize = 4100 - datasetWithBigFeature = sqlContext.createDataFrame( - sc.parallelize(LinearDataGenerator.generateLinearInput( - 0.0, Seq.fill(featureSize)(r.nextDouble).toArray, - Seq.fill(featureSize)(r.nextDouble).toArray, - Seq.fill(featureSize)(r.nextDouble).toArray, 200, seed, 0.1 + datasetWithManyFeature = sqlContext.createDataFrame( + sc.parallelize(LinearDataGenerator.generateLinearSparseInput( + intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray, + xMean = Seq.fill(featureSize)(r.nextDouble).toArray, + xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200, + seed = seed, eps = 0.1 ), 2)) } @@ -696,7 +701,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { test("linear regression model with l-bfgs with big feature datasets") { val trainer = new LinearRegression().setSolver("auto") - val model = trainer.fit(datasetWithBigFeature) + val model = trainer.fit(datasetWithManyFeature) // Training results for the model should be available assert(model.hasSummary) From f6b2256fd669585f7e3b082730a63d0dbda631aa Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Thu, 22 Oct 2015 10:21:09 +0900 Subject: [PATCH 04/11] [SPARK-11207] Remove extra lines --- .../scala/org/apache/spark/mllib/util/LinearDataGenerator.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index e84b6708bab32..d382f34e61357 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -83,7 +83,6 @@ object LinearDataGenerator { nPoints, seed, eps)} /** - * * @param intercept Data intercept * @param weights Weights to be applied. * @param xMean the mean of the generated features. Lots of time, if the features are not properly @@ -125,7 +124,6 @@ object LinearDataGenerator { } /** - * * @param intercept Data intercept * @param weights Weights to be applied. * @param xMean the mean of the generated features. 
Lots of time, if the features are not properly From 2082d4781eeb009c3a0c45d4e92b546960b5a7ff Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Sat, 24 Oct 2015 10:28:27 +0900 Subject: [PATCH 05/11] [SPARK-11207] Pass sparcity to generateLinearSparseInput --- .../org/apache/spark/mllib/util/LinearDataGenerator.scala | 6 ++++-- .../apache/spark/ml/regression/LinearRegressionSuite.scala | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index d382f34e61357..219ecc709d6cc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -143,7 +143,9 @@ object LinearDataGenerator { xVariance: Array[Double], nPoints: Int, seed: Int, - eps: Double): Seq[LabeledPoint] = { + eps: Double, + sparcity: Double): Seq[LabeledPoint] = { + require(sparcity <= 1.0) val rnd = new Random(seed) val x = Array.fill[Array[Double]](nPoints)( Array.fill[Double](weights.length)(rnd.nextDouble())) @@ -152,7 +154,7 @@ object LinearDataGenerator { var i = 0 val len = v.length while (i < len) { - if (rnd.nextDouble() < 0.7) { + if (rnd.nextDouble() <= sparcity) { v(i) = 0.0 } else { v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index a14ec6307d863..a9c07225747ea 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -72,8 +72,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray, xMean = Seq.fill(featureSize)(r.nextDouble).toArray, xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200, - seed = seed, eps = 0.1 - ), 2)) + seed = seed, eps = 0.1, sparcity = 0.7), 2)) } test("params") { From 003d3bd87f3936c4fd6ee0dc77ca81f3811bcbd7 Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Sun, 25 Oct 2015 08:52:59 +0900 Subject: [PATCH 06/11] [SPARK-11207] Add new API for generateLinearInput --- .../mllib/util/LinearDataGenerator.scala | 36 ++++++++----------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index 219ecc709d6cc..36c92bd3ad730 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -103,26 +103,10 @@ object LinearDataGenerator { nPoints: Int, seed: Int, eps: Double): Seq[LabeledPoint] = { - - val rnd = new Random(seed) - val x = Array.fill[Array[Double]](nPoints)( - Array.fill[Double](weights.length)(rnd.nextDouble())) - - x.foreach { v => - var i = 0 - val len = v.length - while (i < len) { - v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) - i += 1 - } - } - - val y = x.map { xi => - blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian() - } - y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2))) + generateLinearInputInternal(intercept, weights, xMean, xVariance, nPoints, seed, eps, 0.0) } + 
/** * @param intercept Data intercept * @param weights Weights to be applied. @@ -133,10 +117,12 @@ object LinearDataGenerator { * @param nPoints Number of points in sample. * @param seed Random seed * @param eps Epsilon scaling factor. - * @return Seq of LabeledPoint includes sparse vectors.. + * @param sparcity The ratio of zero elements. If it is 0.0, LabeledPoints with + * DenseVector is returned. + * @return Seq of input. */ @Since("1.6.0") - def generateLinearSparseInput( + def generateLinearInputInternal( intercept: Double, weights: Array[Double], xMean: Array[Double], @@ -168,13 +154,19 @@ object LinearDataGenerator { } val sparseX = x.map { (v: Array[Double]) => - v.zipWithIndex.filter{ + v.zipWithIndex.filter { case (d: Double, i: Int) => d != 0.0 }.map { case (d: Double, i: Int) => (i, d) } } - y.zip(sparseX).map(p => LabeledPoint(p._1, Vectors.sparse(weights.length, p._2))) + if (sparcity == 0.0) { + // Return LabeledPoints with DenseVector + y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2))) + } else { + // Return LabeledPoints with SparseVector + y.zip(sparseX).map(p => LabeledPoint(p._1, Vectors.sparse(weights.length, p._2))) + } } /** From 0a4303356455f28ca3b87ffd446cb5ef5f25d0e2 Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Sun, 25 Oct 2015 15:59:52 +0900 Subject: [PATCH 07/11] [SPARK-11207] Fix tests --- .../org/apache/spark/ml/regression/LinearRegressionSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index a9c07225747ea..dd055878155f6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -68,7 +68,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { // as the solver of linear regression in the case of "auto" mode. 
val featureSize = 4100 datasetWithManyFeature = sqlContext.createDataFrame( - sc.parallelize(LinearDataGenerator.generateLinearSparseInput( + sc.parallelize(LinearDataGenerator.generateLinearInputInternal( intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray, xMean = Seq.fill(featureSize)(r.nextDouble).toArray, xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200, From 59383fd41f1d6b96274c564eb2fb7c96f5ab07e0 Mon Sep 17 00:00:00 2001 From: lewuathe Date: Sun, 25 Oct 2015 18:24:01 +0900 Subject: [PATCH 08/11] [SPARK-11207] Fix random values used by unit tests --- .../org/apache/spark/mllib/util/LinearDataGenerator.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index 36c92bd3ad730..95100e7ad31d7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -139,8 +139,9 @@ object LinearDataGenerator { x.foreach { v => var i = 0 val len = v.length + val sparceRnd = new Random(seed) while (i < len) { - if (rnd.nextDouble() <= sparcity) { + if (sparceRnd.nextDouble() < sparcity) { v(i) = 0.0 } else { v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) From 888b2168c86ac8e9302c900f9ace10fd6cc69d14 Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Fri, 30 Oct 2015 15:39:05 +0900 Subject: [PATCH 09/11] Update Fri Oct 30 15:39:05 JST 2015 --- .../mllib/util/LinearDataGenerator.scala | 34 +++--- .../ml/regression/LinearRegressionSuite.scala | 104 +++++++++--------- 2 files changed, 70 insertions(+), 68 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index 36c92bd3ad730..af67f1fdf5392 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -77,10 +77,9 @@ object LinearDataGenerator { nPoints: Int, seed: Int, eps: Double = 0.1): Seq[LabeledPoint] = { - generateLinearInput(intercept, weights, - Array.fill[Double](weights.length)(0.0), - Array.fill[Double](weights.length)(1.0 / 3.0), - nPoints, seed, eps)} + generateLinearInput(intercept, weights, Array.fill[Double](weights.length)(0.0), + Array.fill[Double](weights.length)(1.0 / 3.0), nPoints, seed, eps) + } /** * @param intercept Data intercept @@ -103,7 +102,7 @@ object LinearDataGenerator { nPoints: Int, seed: Int, eps: Double): Seq[LabeledPoint] = { - generateLinearInputInternal(intercept, weights, xMean, xVariance, nPoints, seed, eps, 0.0) + generateLinearInput(intercept, weights, xMean, xVariance, nPoints, seed, eps, 0.0) } @@ -117,12 +116,12 @@ object LinearDataGenerator { * @param nPoints Number of points in sample. * @param seed Random seed * @param eps Epsilon scaling factor. - * @param sparcity The ratio of zero elements. If it is 0.0, LabeledPoints with + * @param sparsity The ratio of zero elements. If it is 0.0, LabeledPoints with * DenseVector is returned. * @return Seq of input. 
*/ @Since("1.6.0") - def generateLinearInputInternal( + def generateLinearInput( intercept: Double, weights: Array[Double], xMean: Array[Double], @@ -130,8 +129,8 @@ object LinearDataGenerator { nPoints: Int, seed: Int, eps: Double, - sparcity: Double): Seq[LabeledPoint] = { - require(sparcity <= 1.0) + sparsity: Double): Seq[LabeledPoint] = { + require(0.0 <= sparsity && sparsity <= 1.0) val rnd = new Random(seed) val x = Array.fill[Array[Double]](nPoints)( Array.fill[Double](weights.length)(rnd.nextDouble())) @@ -140,7 +139,7 @@ object LinearDataGenerator { var i = 0 val len = v.length while (i < len) { - if (rnd.nextDouble() <= sparcity) { + if (rnd.nextDouble() <= sparsity) { v(i) = 0.0 } else { v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) @@ -160,12 +159,15 @@ object LinearDataGenerator { case (d: Double, i: Int) => (i, d) } } - if (sparcity == 0.0) { - // Return LabeledPoints with DenseVector - y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2))) - } else { - // Return LabeledPoints with SparseVector - y.zip(sparseX).map(p => LabeledPoint(p._1, Vectors.sparse(weights.length, p._2))) + + y.zip(x).map { p => + if (sparsity == 0.0) { + // Return LabeledPoints with DenseVector + LabeledPoint(p._1, Vectors.dense(p._2)) + } else { + // Return LabeledPoints with SparseVector + LabeledPoint(p._1, Vectors.dense(p._2).toSparse) + } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index dd055878155f6..b1696dba6d534 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -32,9 +32,9 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { private val seed: Int = 42 - @transient var dataset: DataFrame = _ - @transient var datasetWithoutIntercept: DataFrame = _ - @transient var datasetWithManyFeature: DataFrame = _ + @transient var datasetWithDenseFeature: DataFrame = _ + @transient var datasetWithDenseFeatureWithoutIntercept: DataFrame = _ + @transient var datasetWithSparseFeature: DataFrame = _ /* In `LinearRegressionSuite`, we will make sure that the model trained by SparkML @@ -50,7 +50,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { */ override def beforeAll(): Unit = { super.beforeAll() - dataset = sqlContext.createDataFrame( + datasetWithDenseFeature = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2)) /* datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating training model without intercept */ - datasetWithoutIntercept = sqlContext.createDataFrame( + datasetWithDenseFeatureWithoutIntercept = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 0.0, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2)) val r = new Random(seed) // When feature size is larger than 4096, normal optimizer cannot be chosen // as the solver of
linear regression in the case of "auto" mode. val featureSize = 4100 - datasetWithManyFeature = sqlContext.createDataFrame( - sc.parallelize(LinearDataGenerator.generateLinearInputInternal( + datasetWithSparseFeature = sqlContext.createDataFrame( + sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray, xMean = Seq.fill(featureSize)(r.nextDouble).toArray, xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200, - seed = seed, eps = 0.1, sparcity = 0.7), 2)) + seed = seed, eps = 0.1, sparsity = 0.7), 2)) } test("params") { @@ -91,19 +91,19 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(lir.getFitIntercept) assert(lir.getStandardization) assert(lir.getSolver == "auto") - val model = lir.fit(dataset) + val model = lir.fit(datasetWithDenseFeature) // copied model must have the same parent. MLTestingUtils.checkCopy(model) - model.transform(dataset) + model.transform(datasetWithDenseFeature) .select("label", "prediction") .collect() assert(model.getFeaturesCol === "features") assert(model.getPredictionCol === "prediction") assert(model.intercept !== 0.0) assert(model.hasParent) - val numFeatures = dataset.select("features").first().getAs[Vector](0).size + val numFeatures = datasetWithDenseFeature.select("features").first().getAs[Vector](0).size assert(model.numFeatures === numFeatures) } @@ -112,8 +112,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { val trainer1 = new LinearRegression().setSolver(solver) // The result should be the same regardless of standardization without regularization val trainer2 = (new LinearRegression).setStandardization(false).setSolver(solver) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* Using the following R code to load the data and train the model using glmnet package. 
@@ -138,7 +138,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR relTol 1E-3) assert(model2.weights ~= weightsR relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -153,10 +153,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { // Without regularization the results should be the same val trainer2 = (new LinearRegression).setFitIntercept(false).setStandardization(false) .setSolver(solver) - val model1 = trainer1.fit(dataset) - val modelWithoutIntercept1 = trainer1.fit(datasetWithoutIntercept) - val model2 = trainer2.fit(dataset) - val modelWithoutIntercept2 = trainer2.fit(datasetWithoutIntercept) + val model1 = trainer1.fit(datasetWithDenseFeature) + val modelWithoutIntercept1 = trainer1.fit(datasetWithDenseFeatureWithoutIntercept) + val model2 = trainer2.fit(datasetWithDenseFeature) + val modelWithoutIntercept2 = trainer2.fit(datasetWithDenseFeatureWithoutIntercept) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0, @@ -203,12 +203,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { // Normal optimizer is not supported with only L1 regularization case. if (solver == "normal") { intercept[IllegalArgumentException] { - trainer1.fit(dataset) - trainer2.fit(dataset) + trainer1.fit(datasetWithDenseFeature) + trainer2.fit(datasetWithDenseFeature) } } else { - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57)) @@ -240,7 +240,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 relTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -260,12 +260,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { // Normal optimizer is not supported with only L1 regularization case. 
if (solver == "normal") { intercept[IllegalArgumentException] { - trainer1.fit(dataset) - trainer2.fit(dataset) + trainer1.fit(datasetWithDenseFeature) + trainer2.fit(datasetWithDenseFeature) } } else { - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, @@ -299,7 +299,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 absTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -315,8 +315,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { .setSolver(solver) val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) .setStandardization(false).setSolver(solver) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3)) @@ -349,7 +349,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 relTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -364,8 +364,8 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { .setFitIntercept(false).setSolver(solver) val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) .setFitIntercept(false).setStandardization(false).setSolver(solver) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, @@ -399,7 +399,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 absTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -418,12 +418,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { // Normal optimizer is not supported with non-zero elasticnet parameter. 
if (solver == "normal") { intercept[IllegalArgumentException] { - trainer1.fit(dataset) - trainer2.fit(dataset) + trainer1.fit(datasetWithDenseFeature) + trainer2.fit(datasetWithDenseFeature) } } else { - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6)) @@ -456,7 +456,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 relTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -476,12 +476,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { // Normal optimizer is not supported with non-zero elasticnet parameter. if (solver == "normal") { intercept[IllegalArgumentException] { - trainer1.fit(dataset) - trainer2.fit(dataset) + trainer1.fit(datasetWithDenseFeature) + trainer2.fit(datasetWithDenseFeature) } } else { - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) + val model1 = trainer1.fit(datasetWithDenseFeature) + val model2 = trainer2.fit(datasetWithDenseFeature) /* weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6, @@ -515,7 +515,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 absTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(dataset).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept @@ -528,26 +528,26 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { test("linear regression model training summary") { Seq("auto", "l-bfgs", "normal").foreach { solver => val trainer = new LinearRegression().setSolver(solver) - val model = trainer.fit(dataset) + val model = trainer.fit(datasetWithDenseFeature) val trainerNoPredictionCol = trainer.setPredictionCol("") - val modelNoPredictionCol = trainerNoPredictionCol.fit(dataset) + val modelNoPredictionCol = trainerNoPredictionCol.fit(datasetWithDenseFeature) // Training results for the model should be available assert(model.hasSummary) assert(modelNoPredictionCol.hasSummary) // Schema should be a superset of the input dataset - assert((dataset.schema.fieldNames.toSet + "prediction").subsetOf( + assert((datasetWithDenseFeature.schema.fieldNames.toSet + "prediction").subsetOf( model.summary.predictions.schema.fieldNames.toSet)) // Validate that we re-insert a prediction column for evaluation val modelNoPredictionColFieldNames = modelNoPredictionCol.summary.predictions.schema.fieldNames - assert((dataset.schema.fieldNames.toSet).subsetOf( + assert((datasetWithDenseFeature.schema.fieldNames.toSet).subsetOf( modelNoPredictionColFieldNames.toSet)) assert(modelNoPredictionColFieldNames.exists(s => s.startsWith("prediction_"))) // Residuals in 
[[LinearRegressionResults]] should equal those manually computed - val expectedResiduals = dataset.select("features", "label") + val expectedResiduals = datasetWithDenseFeature.select("features", "label") .map { case Row(features: DenseVector, label: Double) => val prediction = features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept @@ -596,10 +596,10 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { test("linear regression model testset evaluation summary") { Seq("auto", "l-bfgs", "normal").foreach { solver => val trainer = new LinearRegression().setSolver(solver) - val model = trainer.fit(dataset) + val model = trainer.fit(datasetWithDenseFeature) // Evaluating on training dataset should yield results summary equal to training summary - val testSummary = model.evaluate(dataset) + val testSummary = model.evaluate(datasetWithDenseFeature) assert(model.summary.meanSquaredError ~== testSummary.meanSquaredError relTol 1E-5) assert(model.summary.r2 ~== testSummary.r2 relTol 1E-5) model.summary.residuals.select("residuals").collect() @@ -700,7 +700,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { test("linear regression model with l-bfgs with big feature datasets") { val trainer = new LinearRegression().setSolver("auto") - val model = trainer.fit(datasetWithManyFeature) + val model = trainer.fit(datasetWithSparseFeature) // Training results for the model should be available assert(model.hasSummary) From 74de81ee4439c121437510b9b8e176a4e7df0724 Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Fri, 30 Oct 2015 16:26:53 +0900 Subject: [PATCH 10/11] Fix style --- .../ml/regression/LinearRegressionSuite.scala | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index b1696dba6d534..b0f66e3223f43 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -240,11 +240,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 relTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) } } } @@ -299,11 +300,12 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 absTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) + 
model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) } } } @@ -456,10 +458,11 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 relTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept + val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + + model1.intercept assert(prediction1 ~== prediction2 relTol 1E-5) } } @@ -515,10 +518,11 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model2.intercept ~== interceptR2 absTol 1E-3) assert(model2.weights ~= weightsR2 relTol 1E-3) - model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { + model1.transform(datasetWithDenseFeature).select("features", "prediction") + .collect().foreach { case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept + val prediction2 = features(0) * model1.weights(0) + features(1) * model1.weights(1) + + model1.intercept assert(prediction1 ~== prediction2 relTol 1E-5) } } From 241ec7293607d670c93293b5872b21ed0c9f411a Mon Sep 17 00:00:00 2001 From: Lewuathe Date: Fri, 30 Oct 2015 17:57:25 +0900 Subject: [PATCH 11/11] Minor fixes --- .../apache/spark/ml/regression/LinearRegressionSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index b0f66e3223f43..a2a5c0bbdcb90 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -53,7 +53,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { datasetWithDenseFeature = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), - xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2)) + xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2)) /* datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating training model without intercept */ datasetWithDenseFeatureWithoutIntercept = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 0.0, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), - xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2)) + xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2)) val r = new Random(seed) // When feature size is larger than 4096, normal optimizer cannot be chosen @@ -72,7 +72,7 @@ class
LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray, xMean = Seq.fill(featureSize)(r.nextDouble).toArray, xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200, - seed = seed, eps = 0.1, sparsity = 0.7), 2)) + seed, eps = 0.1, sparsity = 0.7), 2)) } test("params") {
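Taken together, the series pins down the solver-selection contract that the new tests exercise: with setSolver("auto"), LinearRegression takes the normal-equation path only while the feature dimension stays at or below 4096 (and only without L1/elastic-net regularization); above that it falls back to L-BFGS, whose per-iteration objective history is exposed through the model summary. What follows is a minimal sketch of that fallback, for illustration only — it is not part of the patches. It assumes the final patch-09 API (generateLinearInput with a sparsity parameter) and a harness that supplies sc and sqlContext the way MLlibTestSparkContext does; all values are illustrative.

import scala.util.Random
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.util.LinearDataGenerator

val r = new Random(42)
val featureSize = 4100 // just above the 4096 limit of the normal solver
val points = LinearDataGenerator.generateLinearInput(
  intercept = 0.0,
  weights = Seq.fill(featureSize)(r.nextDouble).toArray,
  xMean = Seq.fill(featureSize)(r.nextDouble).toArray,
  xVariance = Seq.fill(featureSize)(r.nextDouble).toArray,
  nPoints = 200, seed = 42, eps = 0.1, sparsity = 0.7)
val df = sqlContext.createDataFrame(sc.parallelize(points, 2))

// "auto" cannot use the normal equation here, so L-BFGS is selected.
val model = new LinearRegression().setSolver("auto").fit(df)
// L-BFGS records a non-increasing objective value per iteration; the
// normal solver would instead report a single 0.0 entry.
assert(model.summary.objectiveHistory.length > 1)
assert(model.summary.objectiveHistory.sliding(2).forall(x => x(0) >= x(1)))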