Skip to content

Commit f85bca6

Browse files
committed
[SPARK-11207] Improve test case with many feature datasets
1 parent 28427d2 commit f85bca6

File tree

2 files changed

+67
-9
lines changed

2 files changed

+67
-9
lines changed

mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,59 @@ object LinearDataGenerator {
124124
y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2)))
125125
}
126126

127+
/**
128+
*
129+
* @param intercept Data intercept
130+
* @param weights Weights to be applied.
131+
* @param xMean the mean of the generated features. Lots of time, if the features are not properly
132+
* standardized, the algorithm with poor implementation will have difficulty
133+
* to converge.
134+
* @param xVariance the variance of the generated features.
135+
* @param nPoints Number of points in sample.
136+
* @param seed Random seed
137+
* @param eps Epsilon scaling factor.
138+
* @return Seq of LabeledPoint includes sparse vectors..
139+
*/
140+
@Since("1.6.0")
141+
def generateLinearSparseInput(
142+
intercept: Double,
143+
weights: Array[Double],
144+
xMean: Array[Double],
145+
xVariance: Array[Double],
146+
nPoints: Int,
147+
seed: Int,
148+
eps: Double): Seq[LabeledPoint] = {
149+
val rnd = new Random(seed)
150+
val x = Array.fill[Array[Double]](nPoints)(
151+
Array.fill[Double](weights.length)(rnd.nextDouble()))
152+
153+
x.foreach { v =>
154+
var i = 0
155+
val len = v.length
156+
while (i < len) {
157+
if (rnd.nextDouble() < 0.7) {
158+
v(i) = 0.0
159+
} else {
160+
v(i) = (v(i) - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
161+
}
162+
i += 1
163+
}
164+
}
165+
166+
val y = x.map { xi =>
167+
blas.ddot(weights.length, xi, 1, weights, 1) + intercept + eps * rnd.nextGaussian()
168+
}
169+
170+
val sparseX = x.map { (v: Array[Double]) =>
171+
v.zipWithIndex.filter{
172+
case (d: Double, i: Int) => d != 0.0
173+
}.map {
174+
case (d: Double, i: Int) => (i, d)
175+
}
176+
}
177+
y.zip(sparseX).map(p => LabeledPoint(p._1, Vectors.sparse(weights.length, p._2)))
178+
}
179+
127180
/**
128181
* Generate an RDD containing sample data for Linear Regression models - including Ridge, Lasso,
129182
* and unregularized variants.

mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
3434
private val seed: Int = 42
3535
@transient var dataset: DataFrame = _
3636
@transient var datasetWithoutIntercept: DataFrame = _
37-
@transient var datasetWithBigFeature: DataFrame = _
37+
@transient var datasetWithManyFeature: DataFrame = _
3838

3939
/*
4040
In `LinearRegressionSuite`, we will make sure that the model trained by SparkML
@@ -52,22 +52,27 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
5252
super.beforeAll()
5353
dataset = sqlContext.createDataFrame(
5454
sc.parallelize(LinearDataGenerator.generateLinearInput(
55-
6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2))
55+
intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
56+
xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2))
5657
/*
5758
datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating
5859
training model without intercept
5960
*/
6061
datasetWithoutIntercept = sqlContext.createDataFrame(
6162
sc.parallelize(LinearDataGenerator.generateLinearInput(
62-
0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2))
63+
intercept = 0.0, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
64+
xVariance = Array(0.7, 1.2), nPoints = 10000, seed = seed, eps = 0.1), 2))
6365

6466
val r = new Random(seed)
67+
// When the feature size is larger than 4096, the normal-equation optimizer is chosen
68+
// as the solver of linear regression in the case of "auto" mode.
6569
val featureSize = 4100
66-
datasetWithBigFeature = sqlContext.createDataFrame(
67-
sc.parallelize(LinearDataGenerator.generateLinearInput(
68-
0.0, Seq.fill(featureSize)(r.nextDouble).toArray,
69-
Seq.fill(featureSize)(r.nextDouble).toArray,
70-
Seq.fill(featureSize)(r.nextDouble).toArray, 200, seed, 0.1
70+
datasetWithManyFeature = sqlContext.createDataFrame(
71+
sc.parallelize(LinearDataGenerator.generateLinearSparseInput(
72+
intercept = 0.0, weights = Seq.fill(featureSize)(r.nextDouble).toArray,
73+
xMean = Seq.fill(featureSize)(r.nextDouble).toArray,
74+
xVariance = Seq.fill(featureSize)(r.nextDouble).toArray, nPoints = 200,
75+
seed = seed, eps = 0.1
7176
), 2))
7277
}
7378

@@ -696,7 +701,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {
696701

697702
test("linear regression model with l-bfgs with big feature datasets") {
698703
val trainer = new LinearRegression().setSolver("auto")
699-
val model = trainer.fit(datasetWithBigFeature)
704+
val model = trainer.fit(datasetWithManyFeature)
700705

701706
// Training results for the model should be available
702707
assert(model.hasSummary)

0 commit comments

Comments
 (0)