From 7950cb92e9348ebbf2118a70532ef79b3eaf3046 Mon Sep 17 00:00:00 2001 From: zhengruifeng Date: Thu, 26 Sep 2019 16:21:35 +0800 Subject: [PATCH 1/5] create pr --- .../BinaryClassificationEvaluator.scala | 29 +++++++-- .../ml/evaluation/RegressionEvaluator.scala | 37 +++++++++--- .../evaluation/RegressionEvaluatorSuite.scala | 4 ++ python/pyspark/ml/evaluation.py | 59 ++++++++++++++++--- 4 files changed, 106 insertions(+), 23 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala index 2a7b3c579b078..09e8e7b232f3a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala @@ -59,6 +59,28 @@ class BinaryClassificationEvaluator @Since("1.4.0") (@Since("1.4.0") override va @Since("1.2.0") def setMetricName(value: String): this.type = set(metricName, value) + /** + * param for number of bins to down-sample the curves (ROC curve, PR curve) in area + * computation. If 0, no down-sampling will occur. + * Default: 1000. + * @group expertParam + */ + @Since("3.0.0") + val numBins: IntParam = new IntParam(this, "numBins", "Number of bins to down-sample " + + "the curves (ROC curve, PR curve) in area computation. If 0, no down-sampling will occur. " + + "Must be >= 0.", + ParamValidators.gtEq(0)) + + /** @group expertGetParam */ + @Since("3.0.0") + def getNumBins: Int = $(numBins) + + /** @group expertSetParam */ + @Since("3.0.0") + def setNumBins(value: Int): this.type = set(numBins, value) + + setDefault(numBins -> 1000) + /** @group setParam */ @Since("1.5.0") def setRawPredictionCol(value: String): this.type = set(rawPredictionCol, value) @@ -94,7 +116,7 @@ class BinaryClassificationEvaluator @Since("1.4.0") (@Since("1.4.0") override va case Row(rawPrediction: Double, label: Double, weight: Double) => (rawPrediction, label, weight) } - val metrics = new BinaryClassificationMetrics(scoreAndLabelsWithWeights) + val metrics = new BinaryClassificationMetrics(scoreAndLabelsWithWeights, $(numBins)) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() @@ -104,10 +126,7 @@ class BinaryClassificationEvaluator @Since("1.4.0") (@Since("1.4.0") override va } @Since("1.5.0") - override def isLargerBetter: Boolean = $(metricName) match { - case "areaUnderROC" => true - case "areaUnderPR" => true - } + override def isLargerBetter: Boolean = true @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala index dd667a85fa598..a8853d8425886 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala @@ -18,7 +18,7 @@ package org.apache.spark.ml.evaluation import org.apache.spark.annotation.Since -import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol, HasWeightCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics @@ -43,13 +43,14 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui * - `"mse"`: mean squared error * - `"r2"`: R^2^ metric * - `"mae"`: mean absolute error + * - `"var"`: explained variance * * @group param */ @Since("1.4.0") val metricName: Param[String] = { - val allowedParams = ParamValidators.inArray(Array("mse", "rmse", "r2", "mae")) - new Param(this, "metricName", "metric name in evaluation (mse|rmse|r2|mae)", allowedParams) + val allowedParams = ParamValidators.inArray(Array("mse", "rmse", "r2", "mae", "var")) + new Param(this, "metricName", "metric name in evaluation (mse|rmse|r2|mae|var)", allowedParams) } /** @group getParam */ @@ -60,6 +61,25 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui @Since("1.4.0") def setMetricName(value: String): this.type = set(metricName, value) + /** + * param for whether the regression is through the origin. + * Default: false. + * @group expertParam + */ + @Since("3.0.0") + val throughOrigin: BooleanParam = new BooleanParam(this, "throughOrigin", + "Whether the regression is through the origin.") + + /** @group expertGetParam */ + @Since("3.0.0") + def getThroughOrigin: Boolean = $(throughOrigin) + + /** @group expertSetParam */ + @Since("3.0.0") + def setThroughOrigin(value: Boolean): this.type = set(throughOrigin, value) + + setDefault(throughOrigin -> false) + /** @group setParam */ @Since("1.4.0") def setPredictionCol(value: String): this.type = set(predictionCol, value) @@ -86,22 +106,21 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui .rdd .map { case Row(prediction: Double, label: Double, weight: Double) => (prediction, label, weight) } - val metrics = new RegressionMetrics(predictionAndLabelsWithWeights) - val metric = $(metricName) match { + val metrics = new RegressionMetrics(predictionAndLabelsWithWeights, $(throughOrigin)) + $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError + case "var" => metrics.explainedVariance } - metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { - case "rmse" => false - case "mse" => false case "r2" => true - case "mae" => false + case "var" => true + case _ => false } @Since("1.5.0") diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala index c1a156959618e..f4f858c3e92dc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala @@ -76,6 +76,10 @@ class RegressionEvaluatorSuite // mae evaluator.setMetricName("mae") assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01) + + // var + evaluator.setMetricName("var") + assert(evaluator.evaluate(predictions) ~== 63.6944519 absTol 0.01) } test("read/write") { diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index d96cdd594a3f3..48bc83cc01fcf 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -139,6 +139,8 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction 0.70... >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"}) 0.82... + >>> evaluator.getNumBins + 1000 .. versionadded:: 1.4.0 """ @@ -147,12 +149,17 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction "metric name in evaluation (areaUnderROC|areaUnderPR)", typeConverter=TypeConverters.toString) + numBins = Param(Params._dummy(), "numBins", "Number of bins to down-sample the curves " + "(ROC curve, PR curve) in area computation. If 0, no down-sampling will " + "occur. Must be >= 0.", + typeConverter=TypeConverters.toInt) + @keyword_only def __init__(self, rawPredictionCol="rawPrediction", labelCol="label", - metricName="areaUnderROC", weightCol=None): + metricName="areaUnderROC", weightCol=None, numBins=1000): """ __init__(self, rawPredictionCol="rawPrediction", labelCol="label", \ - metricName="areaUnderROC", weightCol=None) + metricName="areaUnderROC", weightCol=None, numBins=1000) """ super(BinaryClassificationEvaluator, self).__init__() self._java_obj = self._new_java_obj( @@ -175,13 +182,27 @@ def getMetricName(self): """ return self.getOrDefault(self.metricName) + @since("3.0.0") + def setNumBins(self, value): + """ + Sets the value of :py:attr:`numBins`. + """ + return self._set(numBins=value) + + @since("3.0.0") + def getNumBins(self): + """ + Gets the value of numBins or its default value. + """ + return self.getOrDefault(self.numBins) + @keyword_only @since("1.4.0") def setParams(self, rawPredictionCol="rawPrediction", labelCol="label", - metricName="areaUnderROC", weightCol=None): + metricName="areaUnderROC", weightCol=None, numBins=1000): """ setParams(self, rawPredictionCol="rawPrediction", labelCol="label", \ - metricName="areaUnderROC", weightCol=None) + metricName="areaUnderROC", weightCol=None, numBins=1000) Sets params for binary classification evaluator. """ kwargs = self._input_kwargs @@ -218,6 +239,7 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeigh >>> evaluator = RegressionEvaluator(predictionCol="raw", weightCol="weight") >>> evaluator.evaluate(dataset) 2.740... + >>> evaluator.throughOrigin .. versionadded:: 1.4.0 """ @@ -226,15 +248,20 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeigh rmse - root mean squared error (default) mse - mean squared error r2 - r^2 metric - mae - mean absolute error.""", + mae - mean absolute error + var - explained variance.""", typeConverter=TypeConverters.toString) + throughOrigin = Param(Params._dummy(), "throughOrigin", + "whether the regression is through the origin.", + typeConverter=TypeConverters.toBoolean) + @keyword_only def __init__(self, predictionCol="prediction", labelCol="label", - metricName="rmse", weightCol=None): + metricName="rmse", weightCol=None, throughOrigin=False): """ __init__(self, predictionCol="prediction", labelCol="label", \ - metricName="rmse", weightCol=None) + metricName="rmse", weightCol=None, throughOrigin=False) """ super(RegressionEvaluator, self).__init__() self._java_obj = self._new_java_obj( @@ -257,13 +284,27 @@ def getMetricName(self): """ return self.getOrDefault(self.metricName) + @since("3.0.0") + def setThroughOrigin(self, value): + """ + Sets the value of :py:attr:`throughOrigin`. + """ + return self._set(throughOrigin=value) + + @since("3.0.0") + def getThroughOrigin(self): + """ + Gets the value of throughOrigin or its default value. + """ + return self.getOrDefault(self.throughOrigin) + @keyword_only @since("1.4.0") def setParams(self, predictionCol="prediction", labelCol="label", - metricName="rmse", weightCol=None): + metricName="rmse", weightCol=None, throughOrigin=False): """ setParams(self, predictionCol="prediction", labelCol="label", \ - metricName="rmse", weightCol=None) + metricName="rmse", weightCol=None, throughOrigin=False) Sets params for regression evaluator. """ kwargs = self._input_kwargs From aaef3f7e8897fd5bdf5b1f23c47fda514a92bfca Mon Sep 17 00:00:00 2001 From: zhengruifeng Date: Thu, 26 Sep 2019 16:32:37 +0800 Subject: [PATCH 2/5] nit --- python/pyspark/ml/evaluation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index 48bc83cc01fcf..b51d5f695a388 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -239,7 +239,8 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeigh >>> evaluator = RegressionEvaluator(predictionCol="raw", weightCol="weight") >>> evaluator.evaluate(dataset) 2.740... - >>> evaluator.throughOrigin + >>> evaluator.getThroughOrigin + False .. versionadded:: 1.4.0 """ @@ -254,7 +255,7 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeigh throughOrigin = Param(Params._dummy(), "throughOrigin", "whether the regression is through the origin.", - typeConverter=TypeConverters.toBoolean) + typeConverter=TypeConverters.toBoolean) @keyword_only def __init__(self, predictionCol="prediction", labelCol="label", From 932a17eb7bebdc200fddda4dda46ff49a4674b23 Mon Sep 17 00:00:00 2001 From: zhengruifeng Date: Thu, 26 Sep 2019 19:07:53 +0800 Subject: [PATCH 3/5] py add setDefault --- python/pyspark/ml/evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index b51d5f695a388..c7bd561d7500e 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -164,7 +164,7 @@ def __init__(self, rawPredictionCol="rawPrediction", labelCol="label", super(BinaryClassificationEvaluator, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator", self.uid) - self._setDefault(metricName="areaUnderROC") + self._setDefault(metricName="areaUnderROC", numBins=1000) kwargs = self._input_kwargs self._set(**kwargs) @@ -267,7 +267,7 @@ def __init__(self, predictionCol="prediction", labelCol="label", super(RegressionEvaluator, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid) - self._setDefault(metricName="rmse") + self._setDefault(metricName="rmse", throughOrigin=False) kwargs = self._input_kwargs self._set(**kwargs) From 4b4cbc6d97ae3fc155bff0a86dcf788d9745a328 Mon Sep 17 00:00:00 2001 From: zhengruifeng Date: Thu, 26 Sep 2019 19:59:50 +0800 Subject: [PATCH 4/5] nit --- python/pyspark/ml/evaluation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index c7bd561d7500e..6917cce615aa4 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -139,7 +139,7 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction 0.70... >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"}) 0.82... - >>> evaluator.getNumBins + >>> evaluator.getNumBins() 1000 .. versionadded:: 1.4.0 @@ -239,7 +239,7 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeigh >>> evaluator = RegressionEvaluator(predictionCol="raw", weightCol="weight") >>> evaluator.evaluate(dataset) 2.740... - >>> evaluator.getThroughOrigin + >>> evaluator.getThroughOrigin() False .. versionadded:: 1.4.0 From 45139b50776dd8f1bb5e592bad3e42bc2cf003d0 Mon Sep 17 00:00:00 2001 From: zhengruifeng Date: Fri, 27 Sep 2019 09:58:06 +0800 Subject: [PATCH 5/5] update match case --- .../org/apache/spark/ml/evaluation/RegressionEvaluator.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala index a8853d8425886..b0cafefe420a3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala @@ -118,8 +118,7 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { - case "r2" => true - case "var" => true + case "r2" | "var" => true case _ => false }