From aa2785136e8be6aa1805b26a10964956e6650fc2 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 30 May 2018 16:49:31 -0700 Subject: [PATCH 1/8] [SPARK-24333][ML][PYTHON]Add fit with validation set to spark.ml GBT: Python API --- python/pyspark/ml/classification.py | 40 +++++++++++++++---- .../ml/param/_shared_params_code_gen.py | 5 ++- python/pyspark/ml/param/shared.py | 22 ++++++++++ python/pyspark/ml/regression.py | 40 +++++++++++++++---- 4 files changed, 92 insertions(+), 15 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index ce028512357f2..dc75182355366 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1176,8 +1176,8 @@ def trees(self): @inherit_doc class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable, - JavaMLReadable): + GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, + HasValidationIndicatorCol, JavaMLWritable, JavaMLReadable): """ `Gradient-Boosted Trees (GBTs) `_ learning algorithm for classification. @@ -1242,6 +1242,11 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol [0.25..., 0.23..., 0.21..., 0.19..., 0.18...] >>> model.numClasses 2 + >>> gbt = gbt.setValidationIndicatorCol("validationIndicator") + >>> gbt.getValidationIndicatorCol() + 'validationIndicator' + >>> gbt.getValidationTol() + 0.01 .. versionadded:: 1.4.0 """ @@ -1256,18 +1261,25 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol "the contribution of each estimator.", typeConverter=TypeConverters.toFloat) + validationTol = Param(Params._dummy(), "validationTol", + "Threshold for stopping early when fit with validation is used. " + + "If the error rate on the validation input changes by less than the " + + "validationTol, then learning will stop early (before `maxIter`). " + + "This parameter is ignored when fit without validation is used.", + typeConverter=TypeConverters.toFloat) + @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, - featureSubsetStrategy="all"): + featureSubsetStrategy="all", validationTol=0.01): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \ - featureSubsetStrategy="all") + featureSubsetStrategy="all", validationTol=0.01) """ super(GBTClassifier, self).__init__() self._java_obj = self._new_java_obj( @@ -1275,7 +1287,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1, subsamplingRate=1.0, - featureSubsetStrategy="all") + featureSubsetStrategy="all", validationTol=0.01) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1285,13 +1297,13 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, - featureSubsetStrategy="all"): + featureSubsetStrategy="all", validationTol=0.01): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \ - featureSubsetStrategy="all") + featureSubsetStrategy="all", validationTol=0.01) Sets params for Gradient Boosted Tree Classification. """ kwargs = self._input_kwargs @@ -1321,6 +1333,20 @@ def setFeatureSubsetStrategy(self, value): """ return self._set(featureSubsetStrategy=value) + @since("2.4.0") + def setValidationTol(self, value): + """ + Sets the value of :py:attr:`validationTol`. + """ + return self._set(validationTol=value) + + @since("2.4.0") + def getValidationTol(self): + """ + Gets the value of validationTol or its default value. + """ + return self.getOrDefault(self.validationTol) + class GBTClassificationModel(TreeEnsembleModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable): diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index e45ba840b412b..1b0c8c5d28b78 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -164,7 +164,10 @@ def get$Name(self): "False", "TypeConverters.toBoolean"), ("loss", "the loss function to be optimized.", None, "TypeConverters.toString"), ("distanceMeasure", "the distance measure. Supported options: 'euclidean' and 'cosine'.", - "'euclidean'", "TypeConverters.toString")] + "'euclidean'", "TypeConverters.toString"), + ("validationIndicatorCol", "name of the column that indicates whether each row is for " + + "training or for validation. False indicates training; true indicates validation.", + None, "TypeConverters.toString")] code = [] for name, doc, defaultValueStr, typeConverter in shared: diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 618f5bf0a8103..d73ea0e2e6ace 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -814,3 +814,25 @@ def getDistanceMeasure(self): """ return self.getOrDefault(self.distanceMeasure) + +class HasValidationIndicatorCol(Params): + """ + Mixin for param validationIndicatorCol: name of the column that indicates whether each row is for training or for validation. false indicates training; true indicates validation. + """ + + validationIndicatorCol = Param(Params._dummy(), "validationIndicatorCol", "name of the column that indicates whether each row is for training or for validation. false indicates training; true indicates validation.", typeConverter=TypeConverters.toString) + + def __init__(self): + super(HasValidationIndicatorCol, self).__init__() + + def setValidationIndicatorCol(self, value): + """ + Sets the value of :py:attr:`validationIndicatorCol`. + """ + return self._set(validationIndicatorCol=value) + + def getValidationIndicatorCol(self): + """ + Gets the value of validationIndicatorCol or its default value. + """ + return self.getOrDefault(self.validationIndicatorCol) \ No newline at end of file diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 98f4361351847..c336154355d1b 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -1031,8 +1031,8 @@ def featureImportances(self): @inherit_doc class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable, - JavaMLReadable, TreeRegressorParams): + GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, + HasValidationIndicatorCol, JavaMLWritable, JavaMLReadable, TreeRegressorParams): """ `Gradient-Boosted Trees (GBTs) `_ learning algorithm for regression. @@ -1079,6 +1079,11 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, ... ["label", "features"]) >>> model.evaluateEachIteration(validation, "squared") [0.0, 0.0, 0.0, 0.0, 0.0] + >>> gbt = gbt.setValidationIndicatorCol("validationIndicator") + >>> gbt.getValidationIndicatorCol() + 'validationIndicator' + >>> gbt.getValidationTol() + 0.01 .. versionadded:: 1.4.0 """ @@ -1093,25 +1098,32 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, "the contribution of each estimator.", typeConverter=TypeConverters.toFloat) + validationTol = Param(Params._dummy(), "validationTol", + "Threshold for stopping early when fit with validation is used. " + + "If the error rate on the validation input changes by less than the " + + "validationTol, then learning will stop early (before `maxIter`). " + + "This parameter is ignored when fit without validation is used.", + typeConverter=TypeConverters.toFloat) + @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, - impurity="variance", featureSubsetStrategy="all"): + impurity="variance", featureSubsetStrategy="all", validationTol=0.01): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, \ - impurity="variance", featureSubsetStrategy="all") + impurity="variance", featureSubsetStrategy="all", validationTol=0.01) """ super(GBTRegressor, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, - impurity="variance", featureSubsetStrategy="all") + impurity="variance", featureSubsetStrategy="all", validationTol=0.01) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1121,13 +1133,13 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, - impuriy="variance", featureSubsetStrategy="all"): + impuriy="variance", featureSubsetStrategy="all", validationTol=0.01): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, \ - impurity="variance", featureSubsetStrategy="all") + impurity="variance", featureSubsetStrategy="all", validationTol=0.01) Sets params for Gradient Boosted Tree Regression. """ kwargs = self._input_kwargs @@ -1157,6 +1169,20 @@ def setFeatureSubsetStrategy(self, value): """ return self._set(featureSubsetStrategy=value) + @since("2.4.0") + def setValidationTol(self, value): + """ + Sets the value of :py:attr:`validationTol`. + """ + return self._set(validationTol=value) + + @since("2.4.0") + def getValidationTol(self): + """ + Gets the value of validationTol or its default value. + """ + return self.getOrDefault(self.validationTol) + class GBTRegressionModel(TreeEnsembleModel, JavaPredictionModel, JavaMLWritable, JavaMLReadable): """ From 43ff084ed34a8d3c3942c1964b50bd48bb317af7 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Fri, 8 Jun 2018 15:51:27 -0700 Subject: [PATCH 2/8] add validationIndicatorCol in init --- python/pyspark/ml/classification.py | 8 ++++---- python/pyspark/ml/regression.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index dc75182355366..f87ede67704d9 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1273,13 +1273,13 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, - featureSubsetStrategy="all", validationTol=0.01): + featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \ - featureSubsetStrategy="all", validationTol=0.01) + featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None) """ super(GBTClassifier, self).__init__() self._java_obj = self._new_java_obj( @@ -1297,13 +1297,13 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, - featureSubsetStrategy="all", validationTol=0.01): + featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \ - featureSubsetStrategy="all", validationTol=0.01) + featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None) Sets params for Gradient Boosted Tree Classification. """ kwargs = self._input_kwargs diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index c336154355d1b..bc26947e523b1 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -1110,13 +1110,15 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, - impurity="variance", featureSubsetStrategy="all", validationTol=0.01): + impurity="variance", featureSubsetStrategy="all", validationTol=0.01, + validationIndicatorCol=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, \ - impurity="variance", featureSubsetStrategy="all", validationTol=0.01) + impurity="variance", featureSubsetStrategy="all", validationTol=0.01, \ + validationIndicatorCol=None) """ super(GBTRegressor, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid) @@ -1133,13 +1135,15 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, - impuriy="variance", featureSubsetStrategy="all", validationTol=0.01): + impuriy="variance", featureSubsetStrategy="all", validationTol=0.01, + validationIndicatorCol=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None, \ - impurity="variance", featureSubsetStrategy="all", validationTol=0.01) + impurity="variance", featureSubsetStrategy="all", validationTol=0.01, \ + validationIndicatorCol=None) Sets params for Gradient Boosted Tree Regression. """ kwargs = self._input_kwargs From c0e57571c3eb72540e591c9e864fee1c11d5a659 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 19 Nov 2018 11:03:41 -0800 Subject: [PATCH 3/8] change version to 3.0 --- python/pyspark/ml/classification.py | 4 ++-- python/pyspark/ml/regression.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index f87ede67704d9..e12182bad7856 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1333,14 +1333,14 @@ def setFeatureSubsetStrategy(self, value): """ return self._set(featureSubsetStrategy=value) - @since("2.4.0") + @since("3.0.0") def setValidationTol(self, value): """ Sets the value of :py:attr:`validationTol`. """ return self._set(validationTol=value) - @since("2.4.0") + @since("3.0.0") def getValidationTol(self): """ Gets the value of validationTol or its default value. diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index bc26947e523b1..df78251df50bf 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -1173,14 +1173,14 @@ def setFeatureSubsetStrategy(self, value): """ return self._set(featureSubsetStrategy=value) - @since("2.4.0") + @since("3.0.0") def setValidationTol(self, value): """ Sets the value of :py:attr:`validationTol`. """ return self._set(validationTol=value) - @since("2.4.0") + @since("3.0.0") def getValidationTol(self): """ Gets the value of validationTol or its default value. From 3919057e6d6942752ee9bd8533a8637067034842 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 21 Nov 2018 09:55:59 -0800 Subject: [PATCH 4/8] address comment --- python/pyspark/ml/classification.py | 41 ++++---------------- python/pyspark/ml/regression.py | 59 +++++++++++++++++------------ 2 files changed, 41 insertions(+), 59 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index e12182bad7856..e385f55e734a1 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -23,7 +23,7 @@ from pyspark.ml import Estimator, Model from pyspark.ml.param.shared import * from pyspark.ml.regression import DecisionTreeModel, DecisionTreeRegressionModel, \ - RandomForestParams, TreeEnsembleModel, TreeEnsembleParams + GBTParams, RandomForestParams, TreeEnsembleModel, TreeEnsembleParams from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams from pyspark.ml.wrapper import JavaWrapper @@ -895,15 +895,6 @@ def getImpurity(self): return self.getOrDefault(self.impurity) -class GBTParams(TreeEnsembleParams): - """ - Private class to track supported GBT params. - - .. versionadded:: 1.4.0 - """ - supportedLossTypes = ["logistic"] - - @inherit_doc class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasProbabilityCol, HasRawPredictionCol, DecisionTreeParams, @@ -1175,9 +1166,8 @@ def trees(self): @inherit_doc -class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, - HasValidationIndicatorCol, JavaMLWritable, JavaMLReadable): +class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, GBTParams, + HasCheckpointInterval, HasSeed, JavaMLWritable, JavaMLReadable): """ `Gradient-Boosted Trees (GBTs) `_ learning algorithm for classification. @@ -1256,17 +1246,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol "Supported options: " + ", ".join(GBTParams.supportedLossTypes), typeConverter=TypeConverters.toString) - stepSize = Param(Params._dummy(), "stepSize", - "Step size (a.k.a. learning rate) in interval (0, 1] for shrinking " + - "the contribution of each estimator.", - typeConverter=TypeConverters.toFloat) - - validationTol = Param(Params._dummy(), "validationTol", - "Threshold for stopping early when fit with validation is used. " + - "If the error rate on the validation input changes by less than the " + - "validationTol, then learning will stop early (before `maxIter`). " + - "This parameter is ignored when fit without validation is used.", - typeConverter=TypeConverters.toFloat) + supportedLossTypes = ["logistic"] @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", @@ -1334,18 +1314,11 @@ def setFeatureSubsetStrategy(self, value): return self._set(featureSubsetStrategy=value) @since("3.0.0") - def setValidationTol(self, value): - """ - Sets the value of :py:attr:`validationTol`. - """ - return self._set(validationTol=value) - - @since("3.0.0") - def getValidationTol(self): + def setValidationIndicatorCol(self, value): """ - Gets the value of validationTol or its default value. + Sets the value of :py:attr:`validationIndicatorCol`. """ - return self.getOrDefault(self.validationTol) + return self._set(validationIndicatorCol=value) class GBTClassificationModel(TreeEnsembleModel, JavaClassificationModel, JavaMLWritable, diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index df78251df50bf..8131c4f9e6dd3 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -705,12 +705,38 @@ def getNumTrees(self): return self.getOrDefault(self.numTrees) -class GBTParams(TreeEnsembleParams): +class GBTParams(TreeEnsembleParams, HasMaxIter, HasStepSize, HasValidationIndicatorCol): """ Private class to track supported GBT params. """ supportedLossTypes = ["squared", "absolute"] + stepSize = Param(Params._dummy(), "stepSize", + "Step size (a.k.a. learning rate) in interval (0, 1] for shrinking " + + "the contribution of each estimator.", + typeConverter=TypeConverters.toFloat) + + validationTol = Param(Params._dummy(), "validationTol", + "Threshold for stopping early when fit with validation is used. " + + "If the error rate on the validation input changes by less than the " + + "validationTol, then learning will stop early (before `maxIter`). " + + "This parameter is ignored when fit without validation is used.", + typeConverter=TypeConverters.toFloat) + + @since("3.0.0") + def setValidationTol(self, value): + """ + Sets the value of :py:attr:`validationTol`. + """ + return self._set(validationTol=value) + + @since("3.0.0") + def getValidationTol(self): + """ + Gets the value of validationTol or its default value. + """ + return self.getOrDefault(self.validationTol) + @inherit_doc class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, @@ -1030,9 +1056,9 @@ def featureImportances(self): @inherit_doc -class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, - HasValidationIndicatorCol, JavaMLWritable, JavaMLReadable, TreeRegressorParams): +class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, GBTParams, + HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable, JavaMLReadable, + TreeRegressorParams): """ `Gradient-Boosted Trees (GBTs) `_ learning algorithm for regression. @@ -1093,17 +1119,7 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, "Supported options: " + ", ".join(GBTParams.supportedLossTypes), typeConverter=TypeConverters.toString) - stepSize = Param(Params._dummy(), "stepSize", - "Step size (a.k.a. learning rate) in interval (0, 1] for shrinking " + - "the contribution of each estimator.", - typeConverter=TypeConverters.toFloat) - - validationTol = Param(Params._dummy(), "validationTol", - "Threshold for stopping early when fit with validation is used. " + - "If the error rate on the validation input changes by less than the " + - "validationTol, then learning will stop early (before `maxIter`). " + - "This parameter is ignored when fit without validation is used.", - typeConverter=TypeConverters.toFloat) + supportedLossTypes = ["squared", "absolute"] @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", @@ -1174,18 +1190,11 @@ def setFeatureSubsetStrategy(self, value): return self._set(featureSubsetStrategy=value) @since("3.0.0") - def setValidationTol(self, value): - """ - Sets the value of :py:attr:`validationTol`. - """ - return self._set(validationTol=value) - - @since("3.0.0") - def getValidationTol(self): + def setValidationIndicatorCol(self, value): """ - Gets the value of validationTol or its default value. + Sets the value of :py:attr:`validationIndicatorCol`. """ - return self.getOrDefault(self.validationTol) + return self._set(validationIndicatorCol=value) class GBTRegressionModel(TreeEnsembleModel, JavaPredictionModel, JavaMLWritable, JavaMLReadable): From c0fcbb397ae3c954092e461bbabf2b8f8cf85386 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 28 Nov 2018 14:23:53 -0800 Subject: [PATCH 5/8] add GBTClassifierParams and GBTRegressorParams --- python/pyspark/ml/classification.py | 63 ++++++++++++++++------------- python/pyspark/ml/regression.py | 59 ++++++++++++++------------- 2 files changed, 66 insertions(+), 56 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index e385f55e734a1..c5b567615c633 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -23,7 +23,7 @@ from pyspark.ml import Estimator, Model from pyspark.ml.param.shared import * from pyspark.ml.regression import DecisionTreeModel, DecisionTreeRegressionModel, \ - GBTParams, RandomForestParams, TreeEnsembleModel, TreeEnsembleParams + GBTParams, HasVarianceImpurity, RandomForestParams, TreeEnsembleModel, TreeEnsembleParams from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams from pyspark.ml.wrapper import JavaWrapper @@ -1165,9 +1165,32 @@ def trees(self): return [DecisionTreeClassificationModel(m) for m in list(self._call_java("trees"))] +class GBTClassifierParams(GBTParams, HasVarianceImpurity): + """ + Private class to track supported GBTClassifier params. + + .. versionadded:: 3.0.0 + """ + + supportedLossTypes = ["logistic"] + + lossType = Param(Params._dummy(), "lossType", + "Loss function which GBT tries to minimize (case-insensitive). " + + "Supported options: " + ", ".join(supportedLossTypes), + typeConverter=TypeConverters.toString) + + @since("3.0.0") + def setLossType(self, value): + """ + Sets the value of :py:attr:`lossType`. + """ + return self._set(lossType=value) + + @inherit_doc -class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, GBTParams, - HasCheckpointInterval, HasSeed, JavaMLWritable, JavaMLReadable): +class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, + GBTClassifierParams, HasCheckpointInterval, HasSeed, JavaMLWritable, + JavaMLReadable): """ `Gradient-Boosted Trees (GBTs) `_ learning algorithm for classification. @@ -1241,25 +1264,19 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol .. versionadded:: 1.4.0 """ - lossType = Param(Params._dummy(), "lossType", - "Loss function which GBT tries to minimize (case-insensitive). " + - "Supported options: " + ", ".join(GBTParams.supportedLossTypes), - typeConverter=TypeConverters.toString) - - supportedLossTypes = ["logistic"] - @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", - maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, + maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, impurity="variance", featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \ - featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None) + impurity="variance", featureSubsetStrategy="all", validationTol=0.01, \ + validationIndicatorCol=None) """ super(GBTClassifier, self).__init__() self._java_obj = self._new_java_obj( @@ -1267,7 +1284,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1, subsamplingRate=1.0, - featureSubsetStrategy="all", validationTol=0.01) + impurity="variance", featureSubsetStrategy="all", validationTol=0.01) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -1277,13 +1294,15 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, - featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None): + impurity="variance", featureSubsetStrategy="all", validationTol=0.01, + validationIndicatorCol=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \ - featureSubsetStrategy="all", validationTol=0.01, validationIndicatorCol=None) + impurity="variance", featureSubsetStrategy="all", validationTol=0.01, + validationIndicatorCol=None) Sets params for Gradient Boosted Tree Classification. """ kwargs = self._input_kwargs @@ -1292,20 +1311,6 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return GBTClassificationModel(java_model) - @since("1.4.0") - def setLossType(self, value): - """ - Sets the value of :py:attr:`lossType`. - """ - return self._set(lossType=value) - - @since("1.4.0") - def getLossType(self): - """ - Gets the value of lossType or its default value. - """ - return self.getOrDefault(self.lossType) - @since("2.4.0") def setFeatureSubsetStrategy(self, value): """ diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 8131c4f9e6dd3..88da7341eaba6 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -650,19 +650,20 @@ def getFeatureSubsetStrategy(self): return self.getOrDefault(self.featureSubsetStrategy) -class TreeRegressorParams(Params): +class HasVarianceImpurity(Params): """ Private class to track supported impurity measures. """ supportedImpurities = ["variance"] + impurity = Param(Params._dummy(), "impurity", "Criterion used for information gain calculation (case-insensitive). " + "Supported options: " + ", ".join(supportedImpurities), typeConverter=TypeConverters.toString) def __init__(self): - super(TreeRegressorParams, self).__init__() + super(HasVarianceImpurity, self).__init__() @since("1.4.0") def setImpurity(self, value): @@ -679,6 +680,10 @@ def getImpurity(self): return self.getOrDefault(self.impurity) +class TreeRegressorParams(HasVarianceImpurity): + pass + + class RandomForestParams(TreeEnsembleParams): """ Private class to track supported random forest parameters. @@ -709,7 +714,6 @@ class GBTParams(TreeEnsembleParams, HasMaxIter, HasStepSize, HasValidationIndica """ Private class to track supported GBT params. """ - supportedLossTypes = ["squared", "absolute"] stepSize = Param(Params._dummy(), "stepSize", "Step size (a.k.a. learning rate) in interval (0, 1] for shrinking " + @@ -738,6 +742,28 @@ def getValidationTol(self): return self.getOrDefault(self.validationTol) +class GBTRegressorParams(GBTParams, TreeRegressorParams): + """ + Private class to track supported GBTRegressor params. + + .. versionadded:: 3.0.0 + """ + + supportedLossTypes = ["squared", "absolute"] + + lossType = Param(Params._dummy(), "lossType", + "Loss function which GBT tries to minimize (case-insensitive). " + + "Supported options: " + ", ".join(supportedLossTypes), + typeConverter=TypeConverters.toString) + + @since("1.4.0") + def setLossType(self, value): + """ + Sets the value of :py:attr:`lossType`. + """ + return self._set(lossType=value) + + @inherit_doc class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, DecisionTreeParams, TreeRegressorParams, HasCheckpointInterval, @@ -1056,9 +1082,9 @@ def featureImportances(self): @inherit_doc -class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, GBTParams, - HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable, JavaMLReadable, - TreeRegressorParams): +class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, + GBTRegressorParams, HasCheckpointInterval, HasSeed, JavaMLWritable, + JavaMLReadable): """ `Gradient-Boosted Trees (GBTs) `_ learning algorithm for regression. @@ -1114,13 +1140,6 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, .. versionadded:: 1.4.0 """ - lossType = Param(Params._dummy(), "lossType", - "Loss function which GBT tries to minimize (case-insensitive). " + - "Supported options: " + ", ".join(GBTParams.supportedLossTypes), - typeConverter=TypeConverters.toString) - - supportedLossTypes = ["squared", "absolute"] - @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, @@ -1168,20 +1187,6 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return GBTRegressionModel(java_model) - @since("1.4.0") - def setLossType(self, value): - """ - Sets the value of :py:attr:`lossType`. - """ - return self._set(lossType=value) - - @since("1.4.0") - def getLossType(self): - """ - Gets the value of lossType or its default value. - """ - return self.getOrDefault(self.lossType) - @since("2.4.0") def setFeatureSubsetStrategy(self, value): """ From c0586bdc98277e43a5d902b6d3fdfe97579d3821 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 28 Nov 2018 15:03:45 -0800 Subject: [PATCH 6/8] fix docstring problem --- python/pyspark/ml/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index c5b567615c633..abb453c5f2690 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1301,7 +1301,7 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ lossType="logistic", maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0, \ - impurity="variance", featureSubsetStrategy="all", validationTol=0.01, + impurity="variance", featureSubsetStrategy="all", validationTol=0.01, \ validationIndicatorCol=None) Sets params for Gradient Boosted Tree Classification. """ From 30a743d79ecd2b18b5d1fa997d63dee914b714a0 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 5 Dec 2018 14:24:59 -0800 Subject: [PATCH 7/8] address comments --- python/pyspark/ml/classification.py | 15 +++++++++++---- python/pyspark/ml/regression.py | 20 ++++++++++---------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index abb453c5f2690..6ddfce95a3d4d 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1179,12 +1179,12 @@ class GBTClassifierParams(GBTParams, HasVarianceImpurity): "Supported options: " + ", ".join(supportedLossTypes), typeConverter=TypeConverters.toString) - @since("3.0.0") - def setLossType(self, value): + @since("1.4.0") + def getLossType(self): """ - Sets the value of :py:attr:`lossType`. + Gets the value of lossType or its default value. """ - return self._set(lossType=value) + return self.getOrDefault(self.lossType) @inherit_doc @@ -1311,6 +1311,13 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return GBTClassificationModel(java_model) + @since("1.4.0") + def setLossType(self, value): + """ + Sets the value of :py:attr:`lossType`. + """ + return self._set(lossType=value) + @since("2.4.0") def setFeatureSubsetStrategy(self, value): """ diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 88da7341eaba6..78cb4a6703554 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -727,13 +727,6 @@ class GBTParams(TreeEnsembleParams, HasMaxIter, HasStepSize, HasValidationIndica "This parameter is ignored when fit without validation is used.", typeConverter=TypeConverters.toFloat) - @since("3.0.0") - def setValidationTol(self, value): - """ - Sets the value of :py:attr:`validationTol`. - """ - return self._set(validationTol=value) - @since("3.0.0") def getValidationTol(self): """ @@ -757,11 +750,11 @@ class GBTRegressorParams(GBTParams, TreeRegressorParams): typeConverter=TypeConverters.toString) @since("1.4.0") - def setLossType(self, value): + def getLossType(self): """ - Sets the value of :py:attr:`lossType`. + Gets the value of lossType or its default value. """ - return self._set(lossType=value) + return self.getOrDefault(self.lossType) @inherit_doc @@ -1187,6 +1180,13 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return GBTRegressionModel(java_model) + @since("1.4.0") + def setLossType(self, value): + """ + Sets the value of :py:attr:`lossType`. + """ + return self._set(lossType=value) + @since("2.4.0") def setFeatureSubsetStrategy(self, value): """ From 6fc95a77e0c541b25cc7acba434b263a8e378926 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Fri, 7 Dec 2018 10:35:29 -0800 Subject: [PATCH 8/8] regenerate shared.py --- python/pyspark/ml/param/shared.py | 93 ++++++++++++++++--------------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index d73ea0e2e6ace..6405b9fce7efb 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -702,6 +702,53 @@ def getLoss(self): return self.getOrDefault(self.loss) +class HasDistanceMeasure(Params): + """ + Mixin for param distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. + """ + + distanceMeasure = Param(Params._dummy(), "distanceMeasure", "the distance measure. Supported options: 'euclidean' and 'cosine'.", typeConverter=TypeConverters.toString) + + def __init__(self): + super(HasDistanceMeasure, self).__init__() + self._setDefault(distanceMeasure='euclidean') + + def setDistanceMeasure(self, value): + """ + Sets the value of :py:attr:`distanceMeasure`. + """ + return self._set(distanceMeasure=value) + + def getDistanceMeasure(self): + """ + Gets the value of distanceMeasure or its default value. + """ + return self.getOrDefault(self.distanceMeasure) + + +class HasValidationIndicatorCol(Params): + """ + Mixin for param validationIndicatorCol: name of the column that indicates whether each row is for training or for validation. False indicates training; true indicates validation. + """ + + validationIndicatorCol = Param(Params._dummy(), "validationIndicatorCol", "name of the column that indicates whether each row is for training or for validation. False indicates training; true indicates validation.", typeConverter=TypeConverters.toString) + + def __init__(self): + super(HasValidationIndicatorCol, self).__init__() + + def setValidationIndicatorCol(self, value): + """ + Sets the value of :py:attr:`validationIndicatorCol`. + """ + return self._set(validationIndicatorCol=value) + + def getValidationIndicatorCol(self): + """ + Gets the value of validationIndicatorCol or its default value. + """ + return self.getOrDefault(self.validationIndicatorCol) + + class DecisionTreeParams(Params): """ Mixin for Decision Tree parameters. @@ -790,49 +837,3 @@ def getCacheNodeIds(self): """ return self.getOrDefault(self.cacheNodeIds) - -class HasDistanceMeasure(Params): - """ - Mixin for param distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. - """ - - distanceMeasure = Param(Params._dummy(), "distanceMeasure", "the distance measure. Supported options: 'euclidean' and 'cosine'.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasDistanceMeasure, self).__init__() - self._setDefault(distanceMeasure='euclidean') - - def setDistanceMeasure(self, value): - """ - Sets the value of :py:attr:`distanceMeasure`. - """ - return self._set(distanceMeasure=value) - - def getDistanceMeasure(self): - """ - Gets the value of distanceMeasure or its default value. - """ - return self.getOrDefault(self.distanceMeasure) - - -class HasValidationIndicatorCol(Params): - """ - Mixin for param validationIndicatorCol: name of the column that indicates whether each row is for training or for validation. false indicates training; true indicates validation. - """ - - validationIndicatorCol = Param(Params._dummy(), "validationIndicatorCol", "name of the column that indicates whether each row is for training or for validation. false indicates training; true indicates validation.", typeConverter=TypeConverters.toString) - - def __init__(self): - super(HasValidationIndicatorCol, self).__init__() - - def setValidationIndicatorCol(self, value): - """ - Sets the value of :py:attr:`validationIndicatorCol`. - """ - return self._set(validationIndicatorCol=value) - - def getValidationIndicatorCol(self): - """ - Gets the value of validationIndicatorCol or its default value. - """ - return self.getOrDefault(self.validationIndicatorCol) \ No newline at end of file