From 338dad2ac341d40dbf802baba61d1d719079b47f Mon Sep 17 00:00:00 2001
From: Imran Younus
Date: Fri, 11 Dec 2015 19:11:41 -0800
Subject: [PATCH 1/4] Fix fit in weighted least squares when std of label is
 zero.

---
 .../spark/ml/optim/WeightedLeastSquares.scala | 18 +++++++-
 .../ml/optim/WeightedLeastSquaresSuite.scala  | 42 +++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala
index 8617722ae542..73a3db6fee0f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala
@@ -86,6 +86,22 @@ private[ml] class WeightedLeastSquares(
     val aaBar = summary.aaBar
     val aaValues = aaBar.values
 
+    if (bStd == 0) {
+      if (fitIntercept) {
+        logWarning(s"The standard deviation of the label is zero, so the coefficients will be " +
+          s"zeros and the intercept will be the mean of the label; as a result, " +
+          s"training is not needed.")
+        val coefficients = new DenseVector(Array.ofDim(k-1))
+        val intercept = bBar
+        val diagInvAtWA = new DenseVector(Array.ofDim(k))
+        return new WeightedLeastSquaresModel(coefficients, intercept, diagInvAtWA)
+      }
+      else {
+        logWarning(s"The standard deviation of the label is zero. " +
+          "Consider setting FitIntercept=true.")
+      }
+    }
+
     // add regularization to diagonals
     var i = 0
     var j = 2
@@ -94,7 +110,7 @@ private[ml] class WeightedLeastSquares(
       if (standardizeFeatures) {
         lambda *= aVar(j - 2)
       }
-      if (standardizeLabel) {
+      if (standardizeLabel && bStd != 0) {
         // TODO: handle the case when bStd = 0
         lambda /= bStd
       }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala
index b542ba3dc54d..e70d39b0fd0c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala
@@ -27,6 +27,7 @@ import org.apache.spark.rdd.RDD
 class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   private var instances: RDD[Instance] = _
+  private var instancesConstLabel: RDD[Instance] = _
 
   override def beforeAll(): Unit = {
     super.beforeAll()
@@ -43,6 +44,18 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext
       Instance(23.0, 3.0, Vectors.dense(2.0, 11.0)),
       Instance(29.0, 4.0, Vectors.dense(3.0, 13.0))
     ), 2)
+
+    /*
+       R code:
+       same as above except make the label constant
+       b <- c(17, 17, 17, 17)
+     */
+    instancesConstLabel = sc.parallelize(Seq(
+      Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse),
+      Instance(17.0, 2.0, Vectors.dense(1.0, 7.0)),
+      Instance(17.0, 3.0, Vectors.dense(2.0, 11.0)),
+      Instance(17.0, 4.0, Vectors.dense(3.0, 13.0))
+    ), 2)
   }
 
   test("WLS against lm") {
@@ -74,6 +87,35 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext
     }
   }
 
+  test("WLS against lm when label is constant") {
+    /*
+       R code:
+       # here b is constant
+       df <- as.data.frame(cbind(A, b))
+       for (formula in c(b ~ . -1, b ~ .)) {
+         model <- lm(formula, data=df, weights=w)
+         print(as.vector(coef(model)))
+       }
+
+       [1] -9.221298 3.394343
+       [1] 17 0 0
+     */
+
+    val expected = Seq(
+      Vectors.dense(0.0, -9.221298, 3.394343),
+      Vectors.dense(17.0, 0.0, 0.0))
+
+    var idx = 0
+    for (fitIntercept <- Seq(false, true)) {
+      val wls = new WeightedLeastSquares(
+        fitIntercept, regParam = 0.0, standardizeFeatures = false, standardizeLabel = true)
+        .fit(instancesConstLabel)
+      val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1))
+      assert(actual ~== expected(idx) absTol 1e-4)
+      idx += 1
+    }
+  }
+
   test("WLS against glmnet") {
     /*
       R code:
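Aside: the early return in patch 1 is sound because, for a constant label, the weighted loss sum_i w_i * (b_i - a_i^T beta - beta_0)^2 is minimized, at exactly zero, by beta = 0 and beta_0 = bBar, the weighted mean of the label. A minimal, dependency-free Scala sketch of that claim, using the same numbers as instancesConstLabel (the object name is illustrative, not Spark API):

    object ConstantLabelCheck extends App {
      // Constant label 17 with weights 1..4, as in the test data.
      val labels = Array(17.0, 17.0, 17.0, 17.0)
      val weights = Array(1.0, 2.0, 3.0, 4.0)

      // Weighted mean of the label: sum(w_i * b_i) / sum(w_i).
      val bBar = labels.zip(weights).map { case (b, w) => b * w }.sum / weights.sum

      // With zero coefficients and intercept = bBar, every residual is zero,
      // so the weighted squared loss is exactly zero and training is unnecessary.
      val loss = labels.zip(weights).map { case (b, w) => w * (b - bBar) * (b - bBar) }.sum

      println(s"intercept = $bBar, loss = $loss") // intercept = 17.0, loss = 0.0
    }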
From a3102322ae33ee3a4e03b2dbcedc864f9da97d67 Mon Sep 17 00:00:00 2001
From: Imran Younus
Date: Mon, 14 Dec 2015 12:22:42 -0800
Subject: [PATCH 2/4] Modifications as suggested by sethah

---
 .../org/apache/spark/ml/optim/WeightedLeastSquares.scala  | 5 ++---
 .../apache/spark/ml/optim/WeightedLeastSquaresSuite.scala | 4 +++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala
index 73a3db6fee0f..a1658cf5bbac 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala
@@ -93,12 +93,12 @@ private[ml] class WeightedLeastSquares(
           s"training is not needed.")
         val coefficients = new DenseVector(Array.ofDim(k-1))
         val intercept = bBar
-        val diagInvAtWA = new DenseVector(Array.ofDim(k))
+        val diagInvAtWA = new DenseVector(Array(0D))
         return new WeightedLeastSquaresModel(coefficients, intercept, diagInvAtWA)
       }
       else {
         logWarning(s"The standard deviation of the label is zero. " +
-          "Consider setting FitIntercept=true.")
+          "Consider setting fitIntercept=true.")
       }
     }
 
@@ -111,7 +111,6 @@ private[ml] class WeightedLeastSquares(
         lambda *= aVar(j - 2)
       }
       if (standardizeLabel && bStd != 0) {
-        // TODO: handle the case when bStd = 0
         lambda /= bStd
       }
       aaValues(i) += lambda
diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala
index e70d39b0fd0c..64fdd8a30eb7 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala
@@ -47,8 +47,10 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext
 
     /*
        R code:
-       same as above except make the label constant
+
+       A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2)
        b <- c(17, 17, 17, 17)
+       w <- c(1, 2, 3, 4)
      */
     instancesConstLabel = sc.parallelize(Seq(
       Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse),
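Aside: the diagInvAtWA change in patch 2 leans on the fact that Scala's Array.ofDim[Double](n) default-initializes every slot to 0.0, so the k - 1 coefficients come out as zeros either way; a length-1 zero placeholder is presumably all downstream consumers need once training is skipped. A small sketch of the distinction (names illustrative, not the Spark source):

    object ZeroInitSketch extends App {
      // Array.ofDim[Double](n) allocates n slots, all default-initialized to 0.0,
      // which is why Array.ofDim(k - 1) yields an all-zero coefficient vector.
      val coefficients = Array.ofDim[Double](3) // one zero per feature column
      val diagInvAtWA = Array(0d)               // a single zero placeholder, length 1

      assert(coefficients.forall(_ == 0.0))
      assert(diagInvAtWA.length == 1)
      println(coefficients.mkString(", ")) // 0.0, 0.0, 0.0
    }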
" + - "Consider setting FitIntercept=true.") + "Consider setting fitIntercept=true.") } } @@ -111,7 +111,6 @@ private[ml] class WeightedLeastSquares( lambda *= aVar(j - 2) } if (standardizeLabel && bStd != 0) { - // TODO: handle the case when bStd = 0 lambda /= bStd } aaValues(i) += lambda diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala index e70d39b0fd0c..64fdd8a30eb7 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala @@ -47,8 +47,10 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext /* R code: - same as above except make the label constant + + A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2) b <- c(17, 17, 17, 17) + w <- c(1, 2, 3, 4) */ instancesConstLabel = sc.parallelize(Seq( Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), From e920c29d70a609499c6cfbdd01874b8010556483 Mon Sep 17 00:00:00 2001 From: Imran Younus Date: Fri, 8 Jan 2016 10:03:34 -0800 Subject: [PATCH 3/4] minor fix --- .../scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala index a1658cf5bbac..409fbf28317b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala @@ -95,8 +95,7 @@ private[ml] class WeightedLeastSquares( val intercept = bBar val diagInvAtWA = new DenseVector(Array(0D)) return new WeightedLeastSquaresModel(coefficients, intercept, diagInvAtWA) - } - else { + } else { logWarning(s"The standard deviation of the label is zero. " + "Consider setting fitIntercept=true.") } From d591989f7383b713110750f80b2720bcf24814b5 Mon Sep 17 00:00:00 2001 From: Imran Younus Date: Thu, 14 Jan 2016 18:28:39 -0800 Subject: [PATCH 4/4] added exception for the case when regParam>0 and standardLabel=true, and modified test accordingly. --- .../spark/ml/optim/WeightedLeastSquares.scala | 7 ++- .../ml/optim/WeightedLeastSquaresSuite.scala | 47 ++++++++++++------- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala index 409fbf28317b..797870eb8ce8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala @@ -96,8 +96,11 @@ private[ml] class WeightedLeastSquares( val diagInvAtWA = new DenseVector(Array(0D)) return new WeightedLeastSquaresModel(coefficients, intercept, diagInvAtWA) } else { - logWarning(s"The standard deviation of the label is zero. " + - "Consider setting fitIntercept=true.") + require(!(regParam > 0.0 && standardizeLabel), + "The standard deviation of the label is zero. " + + "Model cannot be regularized with standardization=true") + logWarning(s"The standard deviation of the label is zero. 
" + + "Consider setting fitIntercept=true.") } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala index 64fdd8a30eb7..0b58a9821f57 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/optim/WeightedLeastSquaresSuite.scala @@ -49,7 +49,7 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext R code: A <- matrix(c(0, 1, 2, 3, 5, 7, 11, 13), 4, 2) - b <- c(17, 17, 17, 17) + b.const <- c(17, 17, 17, 17) w <- c(1, 2, 3, 4) */ instancesConstLabel = sc.parallelize(Seq( @@ -80,22 +80,24 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext var idx = 0 for (fitIntercept <- Seq(false, true)) { - val wls = new WeightedLeastSquares( - fitIntercept, regParam = 0.0, standardizeFeatures = false, standardizeLabel = false) - .fit(instances) - val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) - assert(actual ~== expected(idx) absTol 1e-4) + for (standardization <- Seq(false, true)) { + val wls = new WeightedLeastSquares( + fitIntercept, regParam = 0.0, standardizeFeatures = standardization, + standardizeLabel = standardization).fit(instances) + val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) + assert(actual ~== expected(idx) absTol 1e-4) + } idx += 1 } } - test("WLS against lm when label is constant") { + test("WLS against lm when label is constant and no regularization") { /* R code: - # here b is constant - df <- as.data.frame(cbind(A, b)) - for (formula in c(b ~ . -1, b ~ .)) { - model <- lm(formula, data=df, weights=w) + + df.const.label <- as.data.frame(cbind(A, b.const)) + for (formula in c(b.const ~ . -1, b.const ~ .)) { + model <- lm(formula, data=df.const.label, weights=w) print(as.vector(coef(model))) } @@ -109,15 +111,28 @@ class WeightedLeastSquaresSuite extends SparkFunSuite with MLlibTestSparkContext var idx = 0 for (fitIntercept <- Seq(false, true)) { - val wls = new WeightedLeastSquares( - fitIntercept, regParam = 0.0, standardizeFeatures = false, standardizeLabel = true) - .fit(instancesConstLabel) - val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) - assert(actual ~== expected(idx) absTol 1e-4) + for (standardization <- Seq(false, true)) { + val wls = new WeightedLeastSquares( + fitIntercept, regParam = 0.0, standardizeFeatures = standardization, + standardizeLabel = standardization).fit(instancesConstLabel) + val actual = Vectors.dense(wls.intercept, wls.coefficients(0), wls.coefficients(1)) + assert(actual ~== expected(idx) absTol 1e-4) + } idx += 1 } } + test("WLS with regularization when label is constant") { + // if regParam is non-zero and standardization is true, the problem is ill-defined and + // an exception is thrown. + val wls = new WeightedLeastSquares( + fitIntercept = false, regParam = 0.1, standardizeFeatures = true, + standardizeLabel = true) + intercept[IllegalArgumentException]{ + wls.fit(instancesConstLabel) + } + } + test("WLS against glmnet") { /* R code: