From 784cb09343bb1bae50c23dd943acf11a4baded03 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sat, 3 Dec 2016 16:41:29 -0800 Subject: [PATCH 1/5] Change initial value in Poisson GLM to avoid numerical issue --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 770a2571bb9c2..aca39473a1f27 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -505,7 +505,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def initialize(y: Double, weight: Double): Double = { require(y >= 0.0, "The response variable of Poisson family " + s"should be non-negative, but got $y") - y + y + 0.1 } override def variance(mu: Double): Double = mu From 56c4779a5e0f7de902aa22c943192500b6c85c37 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sat, 3 Dec 2016 19:06:53 -0800 Subject: [PATCH 2/5] Update Poisson GLM test (for incorrect initialization) --- .../GeneralizedLinearRegressionSuite.scala | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 4fab2160339c6..b77ed273fd1ae 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -89,11 +89,23 @@ class GeneralizedLinearRegressionSuite xVariance = Array(0.7, 1.2), nPoints = 10000, seed, noiseLevel = 0.01, family = "poisson", link = "log").toDF() - datasetPoissonLogWithZero = generateGeneralizedLinearRegressionInput( - intercept = -1.5, coefficients = Array(0.22, 0.06), xMean = Array(2.9, 10.5), - xVariance = Array(0.7, 1.2), nPoints = 100, seed, noiseLevel = 0.01, - family = "poisson", link = "log") - .map{x => LabeledPoint(if (x.label < 0.7) 0.0 else x.label, x.features)}.toDF() + datasetPoissonLogWithZero = Seq( + LabeledPoint(0.0, Vectors.dense(18, 1.0)), + LabeledPoint(1.0, Vectors.dense(12, 0.0)), + LabeledPoint(0.0, Vectors.dense(15, 0.0)), + LabeledPoint(0.0, Vectors.dense(13, 2.0)), + LabeledPoint(0.0, Vectors.dense(15, 1.0)), + LabeledPoint(1.0, Vectors.dense(16, 1.0)), + LabeledPoint(0.0, Vectors.dense(10, 0.0)), + LabeledPoint(0.0, Vectors.dense(15, 0.0)), + LabeledPoint(0.0, Vectors.dense(12, 2.0)), + LabeledPoint(0.0, Vectors.dense(13, 0.0)), + LabeledPoint(1.0, Vectors.dense(15, 0.0)), + LabeledPoint(1.0, Vectors.dense(15, 0.0)), + LabeledPoint(0.0, Vectors.dense(15, 0.0)), + LabeledPoint(0.0, Vectors.dense(12, 2.0)), + LabeledPoint(1.0, Vectors.dense(12, 2.0)) + ).toDF() datasetPoissonIdentity = generateGeneralizedLinearRegressionInput( intercept = 2.5, coefficients = Array(2.2, 0.6), xMean = Array(2.9, 10.5), @@ -480,12 +492,12 @@ class GeneralizedLinearRegressionSuite model <- glm(formula, family="poisson", data=data) print(as.vector(coef(model))) } - [1] 0.4272661 -0.1565423 - [1] -3.6911354 0.6214301 0.1295814 + [1] -0.0666978 -0.2369600 + [1] -1.2464752 0.0194581 -0.1853699 */ val expected = Seq( - Vectors.dense(0.0, 0.4272661, -0.1565423), - Vectors.dense(-3.6911354, 0.6214301, 0.1295814)) + Vectors.dense(0.0, -0.0666978, -0.2369600), + Vectors.dense(-1.2464752, 0.0194581, -0.1853699)) import GeneralizedLinearRegression._ From a24966a0746a8c1165195d956963fbaa14fdbb37 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 5 Dec 2016 14:41:32 -0800 Subject: [PATCH 3/5] Set lower bound in Poisson GLM initialization --- .../GeneralizedLinearRegression.scala | 3 ++- .../GeneralizedLinearRegressionSuite.scala | 20 ++++++------------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index aca39473a1f27..255e66926fab0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -505,7 +505,8 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def initialize(y: Double, weight: Double): Double = { require(y >= 0.0, "The response variable of Poisson family " + s"should be non-negative, but got $y") - y + 0.1 + // Set lower bound for mean in the FIRST step in IWLS + math.max(y, 0.1) } override def variance(mu: Double): Double = mu diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index b77ed273fd1ae..702ef2ab87ee6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -95,16 +95,7 @@ class GeneralizedLinearRegressionSuite LabeledPoint(0.0, Vectors.dense(15, 0.0)), LabeledPoint(0.0, Vectors.dense(13, 2.0)), LabeledPoint(0.0, Vectors.dense(15, 1.0)), - LabeledPoint(1.0, Vectors.dense(16, 1.0)), - LabeledPoint(0.0, Vectors.dense(10, 0.0)), - LabeledPoint(0.0, Vectors.dense(15, 0.0)), - LabeledPoint(0.0, Vectors.dense(12, 2.0)), - LabeledPoint(0.0, Vectors.dense(13, 0.0)), - LabeledPoint(1.0, Vectors.dense(15, 0.0)), - LabeledPoint(1.0, Vectors.dense(15, 0.0)), - LabeledPoint(0.0, Vectors.dense(15, 0.0)), - LabeledPoint(0.0, Vectors.dense(12, 2.0)), - LabeledPoint(1.0, Vectors.dense(12, 2.0)) + LabeledPoint(1.0, Vectors.dense(16, 1.0)) ).toDF() datasetPoissonIdentity = generateGeneralizedLinearRegressionInput( @@ -492,12 +483,12 @@ class GeneralizedLinearRegressionSuite model <- glm(formula, family="poisson", data=data) print(as.vector(coef(model))) } - [1] -0.0666978 -0.2369600 - [1] -1.2464752 0.0194581 -0.1853699 + [1] -0.0457441 -0.6833928 + [1] 1.8121235 -0.1747493 -0.5815417 */ val expected = Seq( - Vectors.dense(0.0, -0.0666978, -0.2369600), - Vectors.dense(-1.2464752, 0.0194581, -0.1853699)) + Vectors.dense(0.0, -0.0457441, -0.6833928), + Vectors.dense(1.8121235, -0.1747493, -0.5815417)) import GeneralizedLinearRegression._ @@ -509,6 +500,7 @@ class GeneralizedLinearRegressionSuite .setFitIntercept(fitIntercept).setLinkPredictionCol("linkPrediction") val model = trainer.fit(dataset) val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) + println("coeff is " + actual) assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with poisson family, " + s"$link link and fitIntercept = $fitIntercept (with zero values).") idx += 1 From b162970dd2b22c877e2b1e8db548822ae02a70d2 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 5 Dec 2016 16:28:43 -0800 Subject: [PATCH 4/5] Update comment and remove unnecessary printing in test --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 2 +- .../spark/ml/regression/GeneralizedLinearRegressionSuite.scala | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 255e66926fab0..82f54c9a846be 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -505,7 +505,7 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def initialize(y: Double, weight: Double): Double = { require(y >= 0.0, "The response variable of Poisson family " + s"should be non-negative, but got $y") - // Set lower bound for mean in the FIRST step in IWLS + // force Poisson mean > 0 to avoid numerical instability in IRLS math.max(y, 0.1) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 702ef2ab87ee6..3e9e1fced8ec4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -500,7 +500,6 @@ class GeneralizedLinearRegressionSuite .setFitIntercept(fitIntercept).setLinkPredictionCol("linkPrediction") val model = trainer.fit(dataset) val actual = Vectors.dense(model.intercept, model.coefficients(0), model.coefficients(1)) - println("coeff is " + actual) assert(actual ~= expected(idx) absTol 1e-4, "Model mismatch: GLM with poisson family, " + s"$link link and fitIntercept = $fitIntercept (with zero values).") idx += 1 From 271d315259c356bd0f7787ae94a3acc34cf1e484 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Mon, 5 Dec 2016 21:57:43 -0800 Subject: [PATCH 5/5] Add reference to R's Poisson GLM initialization --- .../spark/ml/regression/GeneralizedLinearRegression.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 82f54c9a846be..f137c8cb41894 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -505,7 +505,10 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine override def initialize(y: Double, weight: Double): Double = { require(y >= 0.0, "The response variable of Poisson family " + s"should be non-negative, but got $y") - // force Poisson mean > 0 to avoid numerical instability in IRLS + /* + Force Poisson mean > 0 to avoid numerical instability in IRLS. + R uses y + 0.1 for initialization. See poisson()$initialize. + */ math.max(y, 0.1) }