Skip to content

Commit fcd013c

Browse files
zero323 authored and jkbradley committed
[SPARK-12006][ML][PYTHON] Fix GMM failure if initialModel is not None
If the initial model passed to GMM is not None, training fails with `net.razorvine.pickle.PickleException`. This is fixed by converting `initialModel.weights` to a `list`. Author: zero323 <[email protected]> Closes #9986 from zero323/SPARK-12006.
1 parent ea489f1 commit fcd013c

File tree

2 files changed

+13
-1
lines changed

2 files changed

+13
-1
lines changed

python/pyspark/mllib/clustering.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None, initia
346346
if initialModel.k != k:
347347
raise Exception("Mismatched cluster count, initialModel.k = %s, however k = %s"
348348
% (initialModel.k, k))
349-
initialModelWeights = initialModel.weights
349+
initialModelWeights = list(initialModel.weights)
350350
initialModelMu = [initialModel.gaussians[i].mu for i in range(initialModel.k)]
351351
initialModelSigma = [initialModel.gaussians[i].sigma for i in range(initialModel.k)]
352352
java_model = callMLlibFunc("trainGaussianMixtureModel", rdd.map(_convert_to_vector),

python/pyspark/mllib/tests.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,18 @@ def test_gmm_deterministic(self):
475475
for c1, c2 in zip(clusters1.weights, clusters2.weights):
476476
self.assertEqual(round(c1, 7), round(c2, 7))
477477

478+
def test_gmm_with_initial_model(self):
479+
from pyspark.mllib.clustering import GaussianMixture
480+
data = self.sc.parallelize([
481+
(-10, -5), (-9, -4), (10, 5), (9, 4)
482+
])
483+
484+
gmm1 = GaussianMixture.train(data, 2, convergenceTol=0.001,
485+
maxIterations=10, seed=63)
486+
gmm2 = GaussianMixture.train(data, 2, convergenceTol=0.001,
487+
maxIterations=10, seed=63, initialModel=gmm1)
488+
self.assertAlmostEqual((gmm1.weights - gmm2.weights).sum(), 0.0)
489+
478490
def test_classification(self):
479491
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
480492
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\

0 commit comments

Comments
 (0)