From 8d69a3291afed0911a80973b8f9c17564916324e Mon Sep 17 00:00:00 2001 From: Martin MENESTRET Date: Thu, 6 Aug 2015 18:28:48 +0200 Subject: [PATCH 1/5] SPARK-9690 Adding the possibility to set the seed of the rand in the CrossValidator fold --- python/pyspark/ml/tuning.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 705ee53685752..345c5a4211ef7 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -129,7 +129,7 @@ class CrossValidator(Estimator): numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross validation") @keyword_only - def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3): + def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, seed=0): """ __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3) """ @@ -144,6 +144,8 @@ def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numF self, "evaluator", "evaluator used to select hyper-parameters that maximize the cross-validated metric") #: param for number of folds for cross validation + self._setDefault(seed=0) + self.seed = Param(self, "seed", "seed value used for k-fold") self.numFolds = Param(self, "numFolds", "number of folds for cross validation") self._setDefault(numFolds=3) kwargs = self.__init__._input_kwargs @@ -227,7 +229,7 @@ def _fit(self, dataset): nFolds = self.getOrDefault(self.numFolds) h = 1.0 / nFolds randCol = self.uid + "_rand" - df = dataset.select("*", rand(0).alias(randCol)) + df = dataset.select("*", rand(self.getOrDefault(self.seed)).alias(randCol)) metrics = np.zeros(numModels) for i in range(nFolds): validateLB = i * h From 85aae68f25f81e709973a9adba0b43e3f087fd1c Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Fri, 11 Dec 2015 12:51:10 -0800 Subject: [PATCH 2/5] Fixed to use HasSeed --- python/pyspark/ml/tuning.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 345c5a4211ef7..2a467aa592b49 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -19,7 +19,7 @@ import numpy as np from pyspark import since -from pyspark.ml.param import Params, Param +from pyspark.ml.param import HasSeed, Params, Param from pyspark.ml import Estimator, Model from pyspark.ml.util import keyword_only from pyspark.sql.functions import rand @@ -89,7 +89,7 @@ def build(self): return [dict(zip(keys, prod)) for prod in itertools.product(*grid_values)] -class CrossValidator(Estimator): +class CrossValidator(Estimator, HasSeed): """ K-fold cross validation. @@ -106,7 +106,7 @@ class CrossValidator(Estimator): >>> lr = LogisticRegression() >>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() >>> evaluator = BinaryClassificationEvaluator() - >>> cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) + >>> cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, seed=42) >>> cvModel = cv.fit(dataset) >>> evaluator.evaluate(cvModel.transform(dataset)) 0.8333... @@ -129,9 +129,11 @@ class CrossValidator(Estimator): numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross validation") @keyword_only - def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, seed=0): + def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, + seed=None): """ - __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3) + __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, + seed=None) """ super(CrossValidator, self).__init__() #: param for estimator to be cross-validated @@ -144,8 +146,6 @@ def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numF self, "evaluator", "evaluator used to select hyper-parameters that maximize the cross-validated metric") #: param for number of folds for cross validation - self._setDefault(seed=0) - self.seed = Param(self, "seed", "seed value used for k-fold") self.numFolds = Param(self, "numFolds", "number of folds for cross validation") self._setDefault(numFolds=3) kwargs = self.__init__._input_kwargs @@ -153,9 +153,11 @@ def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numF @keyword_only @since("1.4.0") - def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3): + def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, + seed=None): """ - setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3): + setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, + seed=None): Sets params for cross validator. """ kwargs = self.setParams._input_kwargs @@ -227,9 +229,10 @@ def _fit(self, dataset): numModels = len(epm) eva = self.getOrDefault(self.evaluator) nFolds = self.getOrDefault(self.numFolds) + seed = self.getOrDefault(self.seed) h = 1.0 / nFolds randCol = self.uid + "_rand" - df = dataset.select("*", rand(self.getOrDefault(self.seed)).alias(randCol)) + df = dataset.select("*", rand(seed).alias(randCol)) metrics = np.zeros(numModels) for i in range(nFolds): validateLB = i * h From 2f606079747c9935954254e615135d7c08624c58 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Fri, 11 Dec 2015 14:19:53 -0800 Subject: [PATCH 3/5] fixed import --- python/pyspark/ml/tuning.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 2a467aa592b49..59bdeb6afdf63 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -19,8 +19,9 @@ import numpy as np from pyspark import since -from pyspark.ml.param import HasSeed, Params, Param from pyspark.ml import Estimator, Model +from pyspark.ml.param import Params, Param +from pyspark.ml.param.shared import HasSeed from pyspark.ml.util import keyword_only from pyspark.sql.functions import rand From 0bd1457a67a59b49f2976f74a34e6eb8f6db2078 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Tue, 15 Dec 2015 16:58:38 -0800 Subject: [PATCH 4/5] fixed doc issue --- python/pyspark/ml/tuning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 59bdeb6afdf63..97bdebffb4e12 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -133,7 +133,7 @@ class CrossValidator(Estimator, HasSeed): def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, seed=None): """ - __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, + __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3,\ seed=None) """ super(CrossValidator, self).__init__() @@ -157,7 +157,7 @@ def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numF def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, seed=None): """ - setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, + setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3,\ seed=None): Sets params for cross validator. """ From 9bf75adc5f39cfa7015b6c9934350bfa40470010 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Wed, 16 Dec 2015 11:09:52 -0800 Subject: [PATCH 5/5] removed seed from CrossValidator example in tuning.py --- python/pyspark/ml/tuning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 97bdebffb4e12..08f8db57f4400 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -107,7 +107,7 @@ class CrossValidator(Estimator, HasSeed): >>> lr = LogisticRegression() >>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() >>> evaluator = BinaryClassificationEvaluator() - >>> cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, seed=42) + >>> cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator) >>> cvModel = cv.fit(dataset) >>> evaluator.evaluate(cvModel.transform(dataset)) 0.8333...