1818package org .apache .spark .ml .clustering
1919
2020import org .apache .spark .SparkFunSuite
21- import org .apache .spark .ml .linalg .{Matrices , Vector , Vectors }
21+ import org .apache .spark .ml .linalg .{DenseMatrix , Matrices , Vector , Vectors }
2222import org .apache .spark .ml .param .ParamMap
23+ import org .apache .spark .ml .stat .distribution .MultivariateGaussian
2324import org .apache .spark .ml .util .{DefaultReadWriteTest , MLTestingUtils }
2425import org .apache .spark .ml .util .TestingUtils ._
2526import org .apache .spark .mllib .util .MLlibTestSparkContext
@@ -33,6 +34,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
3334 import GaussianMixtureSuite ._
3435
3536 final val k = 5
37+ private val seed = 538009335
3638 @ transient var dataset : Dataset [_] = _
3739 @ transient var denseDataset : Dataset [_] = _
3840 @ transient var sparseDataset : Dataset [_] = _
@@ -45,7 +47,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
4547 dataset = KMeansSuite .generateKMeansData(spark, 50 , 3 , k)
4648 denseDataset = denseData.map(FeatureData ).toDF()
4749 sparseDataset = denseData.map { point =>
48- FeatureData (Vectors .sparse( 1 , Array ( 0 ), point.toArray) )
50+ FeatureData (point.toSparse )
4951 }.toDF()
5052 decompositionDataset = decompositionData.map(FeatureData ).toDF()
5153 rDataset = rData.map(FeatureData ).toDF()
@@ -144,40 +146,36 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
144146
145147 test(" univariate dense data with two clusters" ) {
146148 val weights = Array (2.0 / 3.0 , 1.0 / 3.0 )
147- val mean = Array (Vectors .dense(5.1604 ), Vectors .dense(- 4.3673 ))
148- val cov = Array (Matrices .dense(1 , 1 , Array (0.86644 )), Matrices .dense(1 , 1 , Array (1.1098 )))
149-
150- val gmm = new GaussianMixture ().setK(2 ).fit(denseDataset)
149+ val means = Array (Vectors .dense(5.1604 ), Vectors .dense(- 4.3673 ))
150+ val covs = Array (Matrices .dense(1 , 1 , Array (0.86644 )), Matrices .dense(1 , 1 , Array (1.1098 )))
151+ val gaussians = means.zip(covs).map { case (mean, cov) =>
152+ new MultivariateGaussian (mean, cov)
153+ }
151154
152- assert(gmm.weights(0 ) ~== weights(0 ) absTol 1E-3 )
153- assert(gmm.weights(1 ) ~== weights(1 ) absTol 1E-3 )
154- assert(gmm.gaussians(0 ).mean ~== mean(0 ) absTol 1E-3 )
155- assert(gmm.gaussians(1 ).mean ~== mean(1 ) absTol 1E-3 )
156- assert(gmm.gaussians(0 ).cov ~== cov(0 ) absTol 1E-3 )
157- assert(gmm.gaussians(1 ).cov ~== cov(1 ) absTol 1E-3 )
155+ val expected = new GaussianMixtureModel (" dummy" , weights, gaussians)
156+ val actual = new GaussianMixture ().setK(2 ).setSeed(seed).fit(denseDataset)
157+ modelEquals(expected, actual)
158158 }
159159
160160 test(" univariate sparse data with two clusters" ) {
161161 val weights = Array (2.0 / 3.0 , 1.0 / 3.0 )
162- val mean = Array (Vectors .dense(5.1604 ), Vectors .dense(- 4.3673 ))
163- val cov = Array (Matrices .dense(1 , 1 , Array (0.86644 )), Matrices .dense(1 , 1 , Array (1.1098 )))
164-
165- val gmm = new GaussianMixture ().setK(2 ).fit(sparseDataset)
162+ val means = Array (Vectors .dense(5.1604 ), Vectors .dense(- 4.3673 ))
163+ val covs = Array (Matrices .dense(1 , 1 , Array (0.86644 )), Matrices .dense(1 , 1 , Array (1.1098 )))
164+ val gaussians = means.zip(covs).map { case (mean, cov) =>
165+ new MultivariateGaussian (mean, cov)
166+ }
166167
167- assert(gmm.weights(0 ) ~== weights(0 ) absTol 1E-3 )
168- assert(gmm.weights(1 ) ~== weights(1 ) absTol 1E-3 )
169- assert(gmm.gaussians(0 ).mean ~== mean(0 ) absTol 1E-3 )
170- assert(gmm.gaussians(1 ).mean ~== mean(1 ) absTol 1E-3 )
171- assert(gmm.gaussians(0 ).cov ~== cov(0 ) absTol 1E-3 )
172- assert(gmm.gaussians(1 ).cov ~== cov(1 ) absTol 1E-3 )
168+ val expected = new GaussianMixtureModel (" dummy" , weights, gaussians)
169+ val actual = new GaussianMixture ().setK(2 ).setSeed(seed).fit(sparseDataset)
170+ modelEquals(expected, actual)
173171 }
174172
175173 test(" check distributed decomposition" ) {
176174 val k = 5
177175 val d = decompositionData.head.size
178176 assert(GaussianMixture .shouldDistributeGaussians(k, d))
179177
180- val gmm = new GaussianMixture ().setK(k).fit(decompositionDataset)
178+ val gmm = new GaussianMixture ().setK(k).setSeed(seed). fit(decompositionDataset)
181179 assert(gmm.getK === k)
182180 }
183181
@@ -213,18 +211,16 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
213211 [2,] 0.1607830 1.008878
214212 */
215213 val weights = Array (0.5333333 , 0.4666667 )
216- val mean = Array (Vectors .dense(10.363673 , 9.897081 ), Vectors .dense(0.11731091 , - 0.06192351 ))
217- val cov = Array (Matrices .dense(2 , 2 , Array (0.2961543 , 0.1607830 , 0.160783 , 1.008878 )),
214+ val means = Array (Vectors .dense(10.363673 , 9.897081 ), Vectors .dense(0.11731091 , - 0.06192351 ))
215+ val covs = Array (Matrices .dense(2 , 2 , Array (0.2961543 , 0.1607830 , 0.160783 , 1.008878 )),
218216 Matrices .dense(2 , 2 , Array (0.62049934 , 0.06880802 , 0.06880802 , 1.27431874 )))
217+ val gaussians = means.zip(covs).map { case (mean, cov) =>
218+ new MultivariateGaussian (mean, cov)
219+ }
219220
220- val gmm = new GaussianMixture ().setK(2 ).fit(rDataset)
221-
222- assert(gmm.weights(0 ) ~== weights(0 ) absTol 1E-3 )
223- assert(gmm.weights(1 ) ~== weights(1 ) absTol 1E-3 )
224- assert(gmm.gaussians(0 ).mean ~== mean(0 ) absTol 1E-3 )
225- assert(gmm.gaussians(1 ).mean ~== mean(1 ) absTol 1E-3 )
226- assert(gmm.gaussians(0 ).cov ~== cov(0 ) absTol 1E-3 )
227- assert(gmm.gaussians(1 ).cov ~== cov(1 ) absTol 1E-3 )
221+ val expected = new GaussianMixtureModel (" dummy" , weights, gaussians)
222+ val actual = new GaussianMixture ().setK(2 ).setSeed(seed).fit(rDataset)
223+ modelEquals(expected, actual)
228224 }
229225
230226 test(" upper triangular matrix unpacking" ) {
@@ -238,12 +234,13 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
238234 val triangularValues = Array (1.0 , 2.5 , 2.0 , 3.8 , 7.2 , 3.0 , 0.9 , 3.8 , 1.0 , 4.0 )
239235 val symmetricValues = Array (1.0 , 2.5 , 3.8 , 0.9 , 2.5 , 2.0 , 7.2 , 3.8 ,
240236 3.8 , 7.2 , 3.0 , 1.0 , 0.9 , 3.8 , 1.0 , 4.0 )
241- val expected = GaussianMixture .unpackUpperTriangularMatrix(4 , triangularValues)
242- assert(symmetricValues === expected)
237+ val symmetricMatrix = new DenseMatrix (4 , 4 , symmetricValues)
238+ val expectedMatrix = GaussianMixture .unpackUpperTriangularMatrix(4 , triangularValues)
239+ assert(symmetricMatrix === expectedMatrix)
243240 }
244241}
245242
246- object GaussianMixtureSuite {
243+ object GaussianMixtureSuite extends SparkFunSuite {
247244 /**
248245 * Mapping from all Params to valid settings which differ from the defaults.
249246 * This is useful for tests which need to exercise all Params, such as save/load.
@@ -281,4 +278,12 @@ object GaussianMixtureSuite {
281278 )
282279
283280 case class FeatureData (features : Vector )
281+
/**
 * Asserts that two Gaussian mixture models are approximately equal.
 *
 * The models must have the same number of components, and for every component index
 * the mixing weight, the mean and the covariance must agree within an absolute
 * tolerance of 1E-3. Components are compared position by position, so both models
 * are expected to list their components in the same order.
 *
 * @param m1 the first model (typically the expected one)
 * @param m2 the second model (typically the fitted one)
 */
def modelEquals(m1: GaussianMixtureModel, m2: GaussianMixtureModel): Unit = {
  assert(m1.weights.length === m2.weights.length)
  // Guard the indexed access below so a size mismatch fails as an assertion, not an index error.
  assert(m1.gaussians.length === m2.gaussians.length)
  for (i <- m1.weights.indices) {
    // The mixing weights were checked by the per-test assertions this helper replaces.
    assert(m1.weights(i) ~== m2.weights(i) absTol 1E-3)
    assert(m1.gaussians(i).mean ~== m2.gaussians(i).mean absTol 1E-3)
    assert(m1.gaussians(i).cov ~== m2.gaussians(i).cov absTol 1E-3)
  }
}
284289}
0 commit comments