
Commit d56c262

Authored by WeichenXu123, committed by jkbradley
[SPARK-21681][ML] fix bug of MLOR do not work correctly when featureStd contains zero
## What changes were proposed in this pull request?

Fix a bug where MLOR does not work correctly when featureStd contains zero.

The bug can be reproduced with the following dataset, which includes a zero-variance feature; training on it produces a wrong result (all coefficients become 0):

```
val multinomialDatasetWithZeroVar = {
  val nPoints = 100
  val coefficients = Array(
    -0.57997, 0.912083, -0.371077,
    -0.16624, -0.84355, -0.048509)

  val xMean = Array(5.843, 3.0)
  val xVariance = Array(0.6856, 0.0) // including zero variance

  val testData = generateMultinomialLogisticInput(
    coefficients, xMean, xVariance, addIntercept = true, nPoints, seed)

  val df = sc.parallelize(testData, 4).toDF().withColumn("weight", lit(1.0))
  df.cache()
  df
}
```

## How was this patch tested?

Test case added.

Author: WeichenXu <[email protected]>

Closes #18896 from WeichenXu123/fix_mlor_stdvalue_zero_bug.
1 parent 01a8e46 commit d56c262

3 files changed: +118 additions, −9 deletions

mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregator.scala

7 additions & 5 deletions

```diff
@@ -270,11 +270,13 @@ private[ml] class LogisticAggregator(
 
     val margins = new Array[Double](numClasses)
     features.foreachActive { (index, value) =>
-      val stdValue = value / localFeaturesStd(index)
-      var j = 0
-      while (j < numClasses) {
-        margins(j) += localCoefficients(index * numClasses + j) * stdValue
-        j += 1
+      if (localFeaturesStd(index) != 0.0 && value != 0.0) {
+        val stdValue = value / localFeaturesStd(index)
+        var j = 0
+        while (j < numClasses) {
+          margins(j) += localCoefficients(index * numClasses + j) * stdValue
+          j += 1
+        }
       }
     }
     var i = 0
```
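The guard matters because of how IEEE floating-point handles division by zero. A minimal sketch of the failure mode (hypothetical values, not code from the Spark source):

```scala
// Hypothetical values illustrating the failure the guard above prevents.
val value = 3.0
val featureStd = 0.0                       // zero-variance feature
val stdValue = value / featureStd          // Double.PositiveInfinity
val coefficient = 0.0                      // e.g. an initial coefficient of zero
val contribution = coefficient * stdValue  // 0.0 * Infinity == NaN
// Once a NaN is added into margins(j), the margins, loss and gradient all
// become NaN, so skipping features with localFeaturesStd(index) == 0.0
// keeps them from contributing anything at all.
println(contribution.isNaN)                // prints: true
```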

mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala

78 additions & 0 deletions

```diff
@@ -46,6 +46,7 @@ class LogisticRegressionSuite
   @transient var smallMultinomialDataset: Dataset[_] = _
   @transient var binaryDataset: Dataset[_] = _
   @transient var multinomialDataset: Dataset[_] = _
+  @transient var multinomialDatasetWithZeroVar: Dataset[_] = _
   private val eps: Double = 1e-5
 
   override def beforeAll(): Unit = {
@@ -99,6 +100,23 @@ class LogisticRegressionSuite
       df.cache()
       df
     }
+
+    multinomialDatasetWithZeroVar = {
+      val nPoints = 100
+      val coefficients = Array(
+        -0.57997, 0.912083, -0.371077,
+        -0.16624, -0.84355, -0.048509)
+
+      val xMean = Array(5.843, 3.0)
+      val xVariance = Array(0.6856, 0.0)
+
+      val testData = generateMultinomialLogisticInput(
+        coefficients, xMean, xVariance, addIntercept = true, nPoints, seed)
+
+      val df = sc.parallelize(testData, 4).toDF().withColumn("weight", lit(1.0))
+      df.cache()
+      df
+    }
   }
 
   /**
@@ -112,6 +130,11 @@ class LogisticRegressionSuite
     multinomialDataset.rdd.map { case Row(label: Double, features: Vector, weight: Double) =>
       label + "," + weight + "," + features.toArray.mkString(",")
     }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDataset")
+    multinomialDatasetWithZeroVar.rdd.map {
+      case Row(label: Double, features: Vector, weight: Double) =>
+        label + "," + weight + "," + features.toArray.mkString(",")
+    }.repartition(1)
+      .saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDatasetWithZeroVar")
   }
 
   test("params") {
@@ -1392,6 +1415,61 @@ class LogisticRegressionSuite
     assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
   }
 
+  test("multinomial logistic regression with zero variance (SPARK-21681)") {
+    val sqlContext = multinomialDatasetWithZeroVar.sqlContext
+    import sqlContext.implicits._
+    val mlr = new LogisticRegression().setFamily("multinomial").setFitIntercept(true)
+      .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setWeightCol("weight")
+
+    val model = mlr.fit(multinomialDatasetWithZeroVar)
+
+    /*
+      Use the following R code to load the data and train the model using glmnet package.
+
+      library("glmnet")
+      data <- read.csv("path", header=FALSE)
+      label = as.factor(data$V1)
+      w = data$V2
+      features = as.matrix(data.frame(data$V3, data$V4))
+      coefficients = coef(glmnet(features, label, weights=w, family="multinomial",
+        alpha = 0, lambda = 0))
+      coefficients
+      $`0`
+      3 x 1 sparse Matrix of class "dgCMatrix"
+                       s0
+               0.2658824
+      data.V3  0.1881871
+      data.V4  .
+
+      $`1`
+      3 x 1 sparse Matrix of class "dgCMatrix"
+                        s0
+               0.53604701
+      data.V3  -0.02412645
+      data.V4  .
+
+      $`2`
+      3 x 1 sparse Matrix of class "dgCMatrix"
+                       s0
+              -0.8019294
+      data.V3 -0.1640607
+      data.V4  .
+     */
+
+    val coefficientsR = new DenseMatrix(3, 2, Array(
+      0.1881871, 0.0,
+      -0.02412645, 0.0,
+      -0.1640607, 0.0), isTransposed = true)
+    val interceptsR = Vectors.dense(0.2658824, 0.53604701, -0.8019294)
+
+    model.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps))
+
+    assert(model.coefficientMatrix ~== coefficientsR relTol 0.05)
+    assert(model.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
+    assert(model.interceptVector ~== interceptsR relTol 0.05)
+    assert(model.interceptVector.toArray.sum ~== 0.0 absTol eps)
+  }
+
   test("multinomial logistic regression with intercept without regularization with bound") {
     // Bound constrained optimization with bound on one side.
     val lowerBoundsOnCoefficients = Matrices.dense(3, 4, Array.fill(12)(1.0))
```
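For reference, a standalone sketch of the user-facing behavior this new test pins down, using hypothetical toy data and an assumed `spark` session (spark-shell style); it is not code from the test suite:

```scala
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.Vectors

// Toy data: the second feature is constant (zero variance) across all rows.
val data = spark.createDataFrame(Seq(
  (0.0, Vectors.dense(1.2, 3.0)),
  (1.0, Vectors.dense(0.4, 3.0)),
  (2.0, Vectors.dense(2.5, 3.0)),
  (1.0, Vectors.dense(0.7, 3.0))
)).toDF("label", "features")

val model = new LogisticRegression()
  .setFamily("multinomial")
  .setFitIntercept(true)
  .fit(data)

// With the fix, the constant feature's column of coefficients should be zero
// while the rest stay finite; before the fix the whole matrix came back as zeros.
println(model.coefficientMatrix)
```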

mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregatorSuite.scala

33 additions & 4 deletions

```diff
@@ -28,6 +28,7 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   @transient var instances: Array[Instance] = _
   @transient var instancesConstantFeature: Array[Instance] = _
+  @transient var instancesConstantFeatureFiltered: Array[Instance] = _
 
   override def beforeAll(): Unit = {
     super.beforeAll()
@@ -41,6 +42,11 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
       Instance(1.0, 0.5, Vectors.dense(1.0, 1.0)),
       Instance(2.0, 0.3, Vectors.dense(1.0, 0.5))
     )
+    instancesConstantFeatureFiltered = Array(
+      Instance(0.0, 0.1, Vectors.dense(2.0)),
+      Instance(1.0, 0.5, Vectors.dense(1.0)),
+      Instance(2.0, 0.3, Vectors.dense(0.5))
+    )
   }
 
   /** Get summary statistics for some data and create a new LogisticAggregator. */
@@ -233,21 +239,44 @@ class LogisticAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext {
     val binaryInstances = instancesConstantFeature.map { instance =>
       if (instance.label <= 1.0) instance else Instance(0.0, instance.weight, instance.features)
     }
+    val binaryInstancesFiltered = instancesConstantFeatureFiltered.map { instance =>
+      if (instance.label <= 1.0) instance else Instance(0.0, instance.weight, instance.features)
+    }
     val coefArray = Array(1.0, 2.0, -2.0, 3.0, 0.0, -1.0)
+    val coefArrayFiltered = Array(3.0, 0.0, -1.0)
     val interceptArray = Array(4.0, 2.0, -3.0)
     val aggConstantFeature = getNewAggregator(instancesConstantFeature,
       Vectors.dense(coefArray ++ interceptArray), fitIntercept = true, isMultinomial = true)
-    instances.foreach(aggConstantFeature.add)
+    val aggConstantFeatureFiltered = getNewAggregator(instancesConstantFeatureFiltered,
+      Vectors.dense(coefArrayFiltered ++ interceptArray), fitIntercept = true, isMultinomial = true)
+
+    instancesConstantFeature.foreach(aggConstantFeature.add)
+    instancesConstantFeatureFiltered.foreach(aggConstantFeatureFiltered.add)
+
     // constant features should not affect gradient
-    assert(aggConstantFeature.gradient(0) === 0.0)
+    def validateGradient(grad: Vector, gradFiltered: Vector, numCoefficientSets: Int): Unit = {
+      for (i <- 0 until numCoefficientSets) {
+        assert(grad(i) === 0.0)
+        assert(grad(numCoefficientSets + i) == gradFiltered(i))
+      }
+    }
+
+    validateGradient(aggConstantFeature.gradient, aggConstantFeatureFiltered.gradient, 3)
 
     val binaryCoefArray = Array(1.0, 2.0)
+    val binaryCoefArrayFiltered = Array(2.0)
     val intercept = 1.0
     val aggConstantFeatureBinary = getNewAggregator(binaryInstances,
       Vectors.dense(binaryCoefArray ++ Array(intercept)), fitIntercept = true,
       isMultinomial = false)
-    instances.foreach(aggConstantFeatureBinary.add)
+    val aggConstantFeatureBinaryFiltered = getNewAggregator(binaryInstancesFiltered,
+      Vectors.dense(binaryCoefArrayFiltered ++ Array(intercept)), fitIntercept = true,
+      isMultinomial = false)
+    binaryInstances.foreach(aggConstantFeatureBinary.add)
+    binaryInstancesFiltered.foreach(aggConstantFeatureBinaryFiltered.add)
+
     // constant features should not affect gradient
-    assert(aggConstantFeatureBinary.gradient(0) === 0.0)
+    validateGradient(aggConstantFeatureBinary.gradient,
+      aggConstantFeatureBinaryFiltered.gradient, 1)
   }
 }
```
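A short sketch (not part of the patch) of the coefficient layout `validateGradient` relies on: `LogisticAggregator` addresses multinomial coefficients feature-major as `index * numClasses + j`, and the constant feature is feature 0 in `instancesConstantFeature`, so its gradient entries occupy the first block.

```scala
// With 2 features and 3 classes, feature 0 (the constant one) owns slots 0..2
// of the gradient and feature 1 owns slots 3..5; validateGradient asserts the
// first block is all zeros and the second matches the aggregator built on the
// dataset with the constant feature removed.
val numClasses = 3
def slot(featureIndex: Int, classIndex: Int): Int = featureIndex * numClasses + classIndex

assert(slot(0, 0) == 0 && slot(0, 2) == 2)  // constant feature: slots 0..2
assert(slot(1, 0) == 3 && slot(1, 2) == 5)  // varying feature: slots 3..5
```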
