From 3c535e42a9db55649be02e4f3ca1225d38fa61c4 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Wed, 9 Aug 2017 10:17:43 -0700 Subject: [PATCH 1/5] init pr --- .../ml/optim/aggregator/LogisticAggregator.scala | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregator.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregator.scala index 66a52942e668..272d36dd94ae 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregator.scala @@ -270,11 +270,13 @@ private[ml] class LogisticAggregator( val margins = new Array[Double](numClasses) features.foreachActive { (index, value) => - val stdValue = value / localFeaturesStd(index) - var j = 0 - while (j < numClasses) { - margins(j) += localCoefficients(index * numClasses + j) * stdValue - j += 1 + if (localFeaturesStd(index) != 0.0 && value != 0.0) { + val stdValue = value / localFeaturesStd(index) + var j = 0 + while (j < numClasses) { + margins(j) += localCoefficients(index * numClasses + j) * stdValue + j += 1 + } } } var i = 0 From 48ea363de055e74be8ed8391ee0279ff974eb095 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Tue, 15 Aug 2017 18:22:51 +0800 Subject: [PATCH 2/5] add test --- .../LogisticRegressionSuite.scala | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 0570499e7451..73330f87a315 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -46,6 +46,7 @@ class LogisticRegressionSuite @transient var smallMultinomialDataset: Dataset[_] = _ @transient var binaryDataset: Dataset[_] = _ @transient var multinomialDataset: Dataset[_] = _ + @transient var multinomialDatasetWithZeroVar: Dataset[_] = _ private val eps: Double = 1e-5 override def beforeAll(): Unit = { @@ -99,6 +100,23 @@ class LogisticRegressionSuite df.cache() df } + + multinomialDatasetWithZeroVar = { + val nPoints = 100 + val coefficients = Array( + -0.57997, 0.912083, -0.371077, + -0.16624, -0.84355, -0.048509) + + val xMean = Array(5.843, 3.0) + val xVariance = Array(0.6856, 0.0) + + val testData = generateMultinomialLogisticInput( + coefficients, xMean, xVariance, addIntercept = true, nPoints, seed) + + val df = sc.parallelize(testData, 4).toDF().withColumn("weight", lit(1.0)) + df.cache() + df + } } /** @@ -112,6 +130,11 @@ class LogisticRegressionSuite multinomialDataset.rdd.map { case Row(label: Double, features: Vector, weight: Double) => label + "," + weight + "," + features.toArray.mkString(",") }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDataset") + multinomialDatasetWithZeroVar.rdd.map { + case Row(label: Double, features: Vector, weight: Double) => + label + "," + weight + "," + features.toArray.mkString(",") + }.repartition(1) + .saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDatasetWithZeroVar") } test("params") { @@ -1392,6 +1415,61 @@ class LogisticRegressionSuite assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) } + test("test SPARK-21681") { + val sqlContext = multinomialDatasetWithZeroVar.sqlContext + import sqlContext.implicits._ + val mlr = new LogisticRegression().setFamily("multinomial").setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setWeightCol("weight") + + val model = mlr.fit(multinomialDatasetWithZeroVar) + + /* + Use the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + w = data$V2 + features = as.matrix(data.frame(data$V3, data$V4)) + coefficients = coef(glmnet(features, label, weights=w, family="multinomial", + alpha = 0, lambda = 0)) + coefficients + $`0` + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.2658824 + data.V3 0.1881871 + data.V4 . + + $`1` + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.53604701 + data.V3 -0.02412645 + data.V4 . + + $`2` + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.8019294 + data.V3 -0.1640607 + data.V4 . + */ + + val coefficientsR = new DenseMatrix(3, 2, Array( + 0.1881871, -0.0, + -0.02412645, 0.0, + -0.1640607, -0.0), isTransposed = true) + val interceptsR = Vectors.dense(0.2658824, 0.53604701, -0.8019294) + + model.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps)) + + assert(model.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) + assert(model.interceptVector ~== interceptsR relTol 0.05) + assert(model.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + test("multinomial logistic regression with intercept without regularization with bound") { // Bound constrained optimization with bound on one side. val lowerBoundsOnCoefficients = Matrices.dense(3, 4, Array.fill(12)(1.0)) From 8515b208c96787b88d45f6ee929bc644022fa720 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Wed, 16 Aug 2017 10:32:00 +0800 Subject: [PATCH 3/5] update testcase description --- .../spark/ml/classification/LogisticRegressionSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 73330f87a315..07d395e78ee1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -1415,7 +1415,7 @@ class LogisticRegressionSuite assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) } - test("test SPARK-21681") { + test("multinomial logistic regression with zero variance (SPARK-21681)") { val sqlContext = multinomialDatasetWithZeroVar.sqlContext import sqlContext.implicits._ val mlr = new LogisticRegression().setFamily("multinomial").setFitIntercept(true) From 0f28e5ea8fb3380d4be8b7771ebc69dd820419bd Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Fri, 18 Aug 2017 10:51:38 +0800 Subject: [PATCH 4/5] update logisticAggregatorSuite --- .../optim/aggregator/LogisticAggregatorSuite.scala | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala index 2b29c67d859d..bcbcb0e8efab 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala @@ -238,8 +238,17 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext { val aggConstantFeature = getNewAggregator(instancesConstantFeature, Vectors.dense(coefArray ++ interceptArray), fitIntercept = true, isMultinomial = true) instances.foreach(aggConstantFeature.add) + // constant features should not affect gradient - assert(aggConstantFeature.gradient(0) === 0.0) + def validateGradient(grad: Vector): Unit = { + assert(grad(0) === 0.0) + grad.toArray.foreach { gradientValue => + assert(!gradientValue.isNaN && + gradientValue > Double.NegativeInfinity && gradientValue < Double.PositiveInfinity) + } + } + + validateGradient(aggConstantFeature.gradient) val binaryCoefArray = Array(1.0, 2.0) val intercept = 1.0 @@ -248,6 +257,6 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext { isMultinomial = false) instances.foreach(aggConstantFeatureBinary.add) // constant features should not affect gradient - assert(aggConstantFeatureBinary.gradient(0) === 0.0) + validateGradient(aggConstantFeatureBinary.gradient) } } From 1f4ba14bf41f1cf05749bfc0951cd4844b00b861 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Mon, 21 Aug 2017 21:14:24 +0800 Subject: [PATCH 5/5] update testcase --- .../LogisticRegressionSuite.scala | 4 +- .../aggregator/LogisticAggregatorSuite.scala | 38 ++++++++++++++----- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 07d395e78ee1..542977a48f0a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -1457,9 +1457,9 @@ class LogisticRegressionSuite */ val coefficientsR = new DenseMatrix(3, 2, Array( - 0.1881871, -0.0, + 0.1881871, 0.0, -0.02412645, 0.0, - -0.1640607, -0.0), isTransposed = true) + -0.1640607, 0.0), isTransposed = true) val interceptsR = Vectors.dense(0.2658824, 0.53604701, -0.8019294) model.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps)) diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala index bcbcb0e8efab..16ef4af4f94e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala @@ -28,6 +28,7 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext { @transient var instances: Array[Instance] = _ @transient var instancesConstantFeature: Array[Instance] = _ + @transient var instancesConstantFeatureFiltered: Array[Instance] = _ override def beforeAll(): Unit = { super.beforeAll() @@ -41,6 +42,11 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext { Instance(1.0, 0.5, Vectors.dense(1.0, 1.0)), Instance(2.0, 0.3, Vectors.dense(1.0, 0.5)) ) + instancesConstantFeatureFiltered = Array( + Instance(0.0, 0.1, Vectors.dense(2.0)), + Instance(1.0, 0.5, Vectors.dense(1.0)), + Instance(2.0, 0.3, Vectors.dense(0.5)) + ) } /** Get summary statistics for some data and create a new LogisticAggregator. */ @@ -233,30 +239,44 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext { val binaryInstances = instancesConstantFeature.map { instance => if (instance.label <= 1.0) instance else Instance(0.0, instance.weight, instance.features) } + val binaryInstancesFiltered = instancesConstantFeatureFiltered.map { instance => + if (instance.label <= 1.0) instance else Instance(0.0, instance.weight, instance.features) + } val coefArray = Array(1.0, 2.0, -2.0, 3.0, 0.0, -1.0) + val coefArrayFiltered = Array(3.0, 0.0, -1.0) val interceptArray = Array(4.0, 2.0, -3.0) val aggConstantFeature = getNewAggregator(instancesConstantFeature, Vectors.dense(coefArray ++ interceptArray), fitIntercept = true, isMultinomial = true) - instances.foreach(aggConstantFeature.add) + val aggConstantFeatureFiltered = getNewAggregator(instancesConstantFeatureFiltered, + Vectors.dense(coefArrayFiltered ++ interceptArray), fitIntercept = true, isMultinomial = true) + + instancesConstantFeature.foreach(aggConstantFeature.add) + instancesConstantFeatureFiltered.foreach(aggConstantFeatureFiltered.add) // constant features should not affect gradient - def validateGradient(grad: Vector): Unit = { - assert(grad(0) === 0.0) - grad.toArray.foreach { gradientValue => - assert(!gradientValue.isNaN && - gradientValue > Double.NegativeInfinity && gradientValue < Double.PositiveInfinity) + def validateGradient(grad: Vector, gradFiltered: Vector, numCoefficientSets: Int): Unit = { + for (i <- 0 until numCoefficientSets) { + assert(grad(i) === 0.0) + assert(grad(numCoefficientSets + i) == gradFiltered(i)) } } - validateGradient(aggConstantFeature.gradient) + validateGradient(aggConstantFeature.gradient, aggConstantFeatureFiltered.gradient, 3) val binaryCoefArray = Array(1.0, 2.0) + val binaryCoefArrayFiltered = Array(2.0) val intercept = 1.0 val aggConstantFeatureBinary = getNewAggregator(binaryInstances, Vectors.dense(binaryCoefArray ++ Array(intercept)), fitIntercept = true, isMultinomial = false) - instances.foreach(aggConstantFeatureBinary.add) + val aggConstantFeatureBinaryFiltered = getNewAggregator(binaryInstancesFiltered, + Vectors.dense(binaryCoefArrayFiltered ++ Array(intercept)), fitIntercept = true, + isMultinomial = false) + binaryInstances.foreach(aggConstantFeatureBinary.add) + binaryInstancesFiltered.foreach(aggConstantFeatureBinaryFiltered.add) + // constant features should not affect gradient - validateGradient(aggConstantFeatureBinary.gradient) + validateGradient(aggConstantFeatureBinary.gradient, + aggConstantFeatureBinaryFiltered.gradient, 1) } }