From c42f5dd69c5ee073e54c3f79e3e7eddbe8ec88c4 Mon Sep 17 00:00:00 2001
From: GayathriMurali
Date: Mon, 25 Apr 2016 17:05:25 -0700
Subject: [PATCH 1/7] [SPARK-14894][PySpark] Add result summary api to Gaussian Mixture

---
 python/pyspark/ml/clustering.py | 121 +++++++++++++++++++++++++++++++-
 python/pyspark/ml/tests.py      |  52 ++++++++++++++
 2 files changed, 172 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 75d9a0e8cac1..29ccb788d6f1 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -17,7 +17,7 @@
 
 from pyspark import since, keyword_only
 from pyspark.ml.util import *
-from pyspark.ml.wrapper import JavaEstimator, JavaModel
+from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaWrapper
 from pyspark.ml.param.shared import *
 from pyspark.ml.common import inherit_doc
 
@@ -56,6 +56,125 @@ def gaussiansDF(self):
         """
         return self._call_java("gaussiansDF")
 
+    @property
+    @since("2.0.0")
+    def summary(self):
+        """
+        Gets summary of model on
+        training set. An exception is thrown if
+        `trainingSummary is None`.
+        """
+        java_gmt_summary = self._call_java("summary")
+        return GaussianMixtureTrainingSummary(java_gmt_summary)
+
+    @property
+    @since("2.0.0")
+    def hasSummary(self):
+        """
+        Indicates whether a training summary exists for this model
+        instance.
+        """
+        return self._call_java("hasSummary")
+
+    @since("2.0.0")
+    def evaluate(self, dataset):
+        """
+        Evaluates the model on a test dataset.
+
+        :param dataset:
+          Test dataset to evaluate model on, where dataset is an
+          instance of :py:class:`pyspark.sql.DataFrame`
+        """
+        if not isinstance(dataset, DataFrame):
+            raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
+        java_gmt_summary = self._call_java("evaluate", dataset)
+        return GaussianMixtureSummary(java_gmt_summary)
+
+
+class GaussianMixtureSummary(JavaWrapper):
+    """
+    Abstraction for Gaussian Mixture Results for a given model.
+
+    .. versionadded:: 2.0.0
+    """
+
+    @property
+    @since("2.0.0")
+    def predictions(self):
+        """
+        Dataframe outputted by the model's `transform` method.
+        """
+        return self._call_java("predictions")
+
+    @property
+    @since("2.0.0")
+    def probabilityCol(self):
+        """
+        Field in "predictions" which gives the probability
+        of each class.
+        """
+        return self._call_java("probabilityCol")
+
+    @property
+    @since("2.0.0")
+    def featuresCol(self):
+        """
+        Field in "predictions" which gives the features of each instance.
+        """
+        return self._call_java("featuresCol")
+
+    @property
+    @since("2.0.0")
+    def cluster(self):
+        """
+        Cluster centers of the transformed data.
+        """
+        return self._call_java("cluster")
+
+    @property
+    @since("2.0.0")
+    def probability(self):
+        """
+        Probability of each cluster.
+        """
+        return self._call_java("probability")
+
+    @property
+    @since("2.0.0")
+    def clusterSizes(self):
+        """
+        Size of (number of data points in) each cluster.
+        """
+        return self._call_java("clusterSizes")
+
+
+@inherit_doc
+class GaussianMixtureTrainingSummary(GaussianMixtureSummary):
+    """
+    Abstraction for Gaussian Mixture Training results.
+    Currently, the training summary ignores the training weights except
+    for the objective trace.
+
+    .. versionadded:: 2.0.0
+    """
+
+    @property
+    @since("2.0.0")
+    def objectiveHistory(self):
+        """
+        Objective function (scaled loss + regularization) at each
+        iteration.
+ """ + return self._call_java("objectiveHistory") + + @property + @since("2.0.0") + def totalIterations(self): + """ + Number of training iterations until termination. + """ + return self._call_java("totalIterations") + @inherit_doc class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed, diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 981ed9dda042..18fcc085e94d 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1070,6 +1070,58 @@ def test_logistic_regression_summary(self): sameSummary = model.evaluate(df) self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) + def test_gaussian_mixture_summary(self): + from pyspark.mllib.linalg import Vectors + sqlContext = SQLContext(self.sc) + df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), + (0.0, 2.0, Vectors.sparse(1, [], []))], + ["features"]) + gm = GaussianMixture(k=3, tol=0.0001, maxIter=10, seed=10) + model = gm.fit(df) + self.assertTrue(model.hasSummary) + s = model.summary + # test that api is callable and returns expected types + self.assertGreater(s.totalIterations, 0) + self.assertTrue(isinstance(s.predictions, DataFrame)) + self.assertEqual(s.predictionCol, "prediction") + self.assertEqual(s.featuresCol, "features") + objHist = s.objectiveHistory + cluster_sizes = s.clusterSizes + self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) + self.assertTrue(isinstance(s.cluster, DataFrame)) + self.assertTrue(isinstance(s.probability, DataFrame)) + self.assertEqual(isinstance(cluster_sizes[0], long)) + # test evaluation (with a training dataset) produces a summary with same values + # one check is enough to verify a summary is returned, Scala version runs full test + sameSummary = model.evaluate(df) + self.assertAlmostEqual(sameSummary.cluster, s.cluster) + + def test_gaussian_mixture_summary(self): + from pyspark.mllib.linalg import Vectors + sqlContext = SQLContext(self.sc) + df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), + (0.0, 2.0, Vectors.sparse(1, [], []))], + ["features"]) + gm = GaussianMixture(k=3, tol=0.0001, maxIter=10, seed=10) + model = gm.fit(df) + self.assertTrue(model.hasSummary) + s = model.summary + # test that api is callable and returns expected types + self.assertGreater(s.totalIterations, 0) + self.assertTrue(isinstance(s.predictions, DataFrame)) + self.assertEqual(s.predictionCol, "prediction") + self.assertEqual(s.featuresCol, "features") + objHist = s.objectiveHistory + cluster_sizes = s.clusterSizes + self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) + self.assertTrue(isinstance(s.cluster, DataFrame)) + self.assertTrue(isinstance(s.probability, DataFrame)) + self.assertEqual(isinstance(cluster_sizes[0], long)) + # test evaluation (with a training dataset) produces a summary with same values + # one check is enough to verify a summary is returned, Scala version runs full test + sameSummary = model.evaluate(df) + self.assertAlmostEqual(sameSummary.cluster, s.cluster) + class OneVsRestTests(SparkSessionTestCase): From 7db9c0dd8458b2516310ad3c8c0ae01a4061343d Mon Sep 17 00:00:00 2001 From: GayathriMurali Date: Mon, 25 Apr 2016 17:10:30 -0700 Subject: [PATCH 2/7] Fixed a duplication in tests.py --- python/pyspark/ml/tests.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 18fcc085e94d..bcd5d8a481f9 100755 --- a/python/pyspark/ml/tests.py +++ 
@@ -1070,32 +1070,6 @@ def test_logistic_regression_summary(self):
         sameSummary = model.evaluate(df)
         self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
 
-    def test_gaussian_mixture_summary(self):
-        from pyspark.mllib.linalg import Vectors
-        sqlContext = SQLContext(self.sc)
-        df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
-                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
-                                        ["features"])
-        gm = GaussianMixture(k=3, tol=0.0001, maxIter=10, seed=10)
-        model = gm.fit(df)
-        self.assertTrue(model.hasSummary)
-        s = model.summary
-        # test that api is callable and returns expected types
-        self.assertGreater(s.totalIterations, 0)
-        self.assertTrue(isinstance(s.predictions, DataFrame))
-        self.assertEqual(s.predictionCol, "prediction")
-        self.assertEqual(s.featuresCol, "features")
-        objHist = s.objectiveHistory
-        cluster_sizes = s.clusterSizes
-        self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
-        self.assertTrue(isinstance(s.cluster, DataFrame))
-        self.assertTrue(isinstance(s.probability, DataFrame))
-        self.assertEqual(isinstance(cluster_sizes[0], long))
-        # test evaluation (with a training dataset) produces a summary with same values
-        # one check is enough to verify a summary is returned, Scala version runs full test
-        sameSummary = model.evaluate(df)
-        self.assertAlmostEqual(sameSummary.cluster, s.cluster)
-
     def test_gaussian_mixture_summary(self):
         from pyspark.mllib.linalg import Vectors
         sqlContext = SQLContext(self.sc)

From b19756cb573f52692ceb253e3772d293a056d916 Mon Sep 17 00:00:00 2001
From: GayathriMurali
Date: Tue, 26 Apr 2016 14:29:30 -0700
Subject: [PATCH 3/7] Fixed line break in PyDoc

---
 python/pyspark/ml/clustering.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 29ccb788d6f1..445a5ed7dd87 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -60,8 +60,7 @@ def gaussiansDF(self):
     @since("2.0.0")
     def summary(self):
         """
-        Gets summary of model on
-        training set. An exception is thrown if
+        Gets summary of model on training set. An exception is thrown if
         `trainingSummary is None`.
""" java_gmt_summary = self._call_java("summary") From 7d16a2370fb17b3fcb82ff9fb48e83ad46244c7f Mon Sep 17 00:00:00 2001 From: GayathriMurali Date: Fri, 17 Jun 2016 17:53:03 -0700 Subject: [PATCH 4/7] Fixed python style issues --- python/pyspark/ml/tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index bcd5d8a481f9..621fbd100617 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1097,6 +1097,7 @@ def test_gaussian_mixture_summary(self): self.assertAlmostEqual(sameSummary.cluster, s.cluster) + class OneVsRestTests(SparkSessionTestCase): def test_copy(self): From 3cf080a54899ad164dc5ad0643ff2dc6ba69743f Mon Sep 17 00:00:00 2001 From: GayathriMurali Date: Fri, 17 Jun 2016 20:24:42 -0700 Subject: [PATCH 5/7] Fixed python style issues --- python/pyspark/ml/tests.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 621fbd100617..bcd5d8a481f9 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1097,7 +1097,6 @@ def test_gaussian_mixture_summary(self): self.assertAlmostEqual(sameSummary.cluster, s.cluster) - class OneVsRestTests(SparkSessionTestCase): def test_copy(self): From c2b1aef45d98110c263f8ae53e6402871724b8d2 Mon Sep 17 00:00:00 2001 From: GayathriMurali Date: Sun, 19 Jun 2016 14:02:50 -0700 Subject: [PATCH 6/7] Fixed python unit test issues --- python/pyspark/ml/clustering.py | 46 +-------------------------------- python/pyspark/ml/tests.py | 15 ++--------- 2 files changed, 3 insertions(+), 58 deletions(-) diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 445a5ed7dd87..23e88765e0b6 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -64,7 +64,7 @@ def summary(self): `trainingSummary is None`. """ java_gmt_summary = self._call_java("summary") - return GaussianMixtureTrainingSummary(java_gmt_summary) + return GaussianMixtureSummary(java_gmt_summary) @property @since("2.0.0") @@ -75,20 +75,6 @@ def hasSummary(self): """ return self._call_java("hasSummary") - @since("2.0.0") - def evaluate(self, dataset): - """ - Evaluates the model on a test dataset. - - :param dataset: - Test dataset to evaluate model on, where dataset is an - instance of :py:class:`pyspark.sql.DataFrame` - """ - if not isinstance(dataset, DataFrame): - raise ValueError("dataset must be a DataFrame but got %s." % type(dataset)) - java_gmt_summary = self._call_java("evaluate", dataset) - return GaussianMixtureSummary(java_gmt_summary) - class GaussianMixtureSummary(JavaWrapper): """ @@ -146,36 +132,6 @@ def clusterSizes(self): """ return self._call_java("clusterSizes") - -@inherit_doc -class GaussianMixtureTrainingSummary(GaussianMixtureSummary): - """ - Abstraction for Gaussian Mixture Training results. - Currently, the training summary ignores the training weights except - for the objective trace. - - .. versionadded:: 2.0.0 - """ - - @property - @since("2.0.0") - def objectiveHistory(self): - """ - Objective function (scaled loss + regularization) at each - iteration. - """ - return self._call_java("objectiveHistory") - - @property - @since("2.0.0") - def totalIterations(self): - """ - Number of training iterations until termination. 
- """ - return self._call_java("totalIterations") - - -@inherit_doc class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed, HasProbabilityCol, JavaMLWritable, JavaMLReadable): """ diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index bcd5d8a481f9..ebfc13ac740e 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1072,29 +1072,18 @@ def test_logistic_regression_summary(self): def test_gaussian_mixture_summary(self): from pyspark.mllib.linalg import Vectors - sqlContext = SQLContext(self.sc) - df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)), - (0.0, 2.0, Vectors.sparse(1, [], []))], - ["features"]) + df = self.spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt") gm = GaussianMixture(k=3, tol=0.0001, maxIter=10, seed=10) model = gm.fit(df) self.assertTrue(model.hasSummary) s = model.summary # test that api is callable and returns expected types - self.assertGreater(s.totalIterations, 0) self.assertTrue(isinstance(s.predictions, DataFrame)) - self.assertEqual(s.predictionCol, "prediction") self.assertEqual(s.featuresCol, "features") - objHist = s.objectiveHistory cluster_sizes = s.clusterSizes - self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float)) self.assertTrue(isinstance(s.cluster, DataFrame)) self.assertTrue(isinstance(s.probability, DataFrame)) - self.assertEqual(isinstance(cluster_sizes[0], long)) - # test evaluation (with a training dataset) produces a summary with same values - # one check is enough to verify a summary is returned, Scala version runs full test - sameSummary = model.evaluate(df) - self.assertAlmostEqual(sameSummary.cluster, s.cluster) + self.assertTrue(isinstance(cluster_sizes[0], int)) class OneVsRestTests(SparkSessionTestCase): From 3bc75a3f413d1c4bdfd774cafbd1034ef50d216c Mon Sep 17 00:00:00 2001 From: GayathriMurali Date: Sun, 19 Jun 2016 14:13:10 -0700 Subject: [PATCH 7/7] Style issue --- python/pyspark/ml/clustering.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 23e88765e0b6..8dbabc6d502a 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -132,6 +132,7 @@ def clusterSizes(self): """ return self._call_java("clusterSizes") + class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol, HasSeed, HasProbabilityCol, JavaMLWritable, JavaMLReadable): """