From 7950cb92e9348ebbf2118a70532ef79b3eaf3046 Mon Sep 17 00:00:00 2001
From: zhengruifeng <ruifengz@foxmail.com>
Date: Thu, 26 Sep 2019 16:21:35 +0800
Subject: [PATCH 1/5] create pr

---
 .../BinaryClassificationEvaluator.scala       | 29 +++++++--
 .../ml/evaluation/RegressionEvaluator.scala   | 37 +++++++++---
 .../evaluation/RegressionEvaluatorSuite.scala |  4 ++
 python/pyspark/ml/evaluation.py               | 59 ++++++++++++++++---
 4 files changed, 106 insertions(+), 23 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
index 2a7b3c579b078..09e8e7b232f3a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -59,6 +59,28 @@ class BinaryClassificationEvaluator @Since("1.4.0") (@Since("1.4.0") override va
   @Since("1.2.0")
   def setMetricName(value: String): this.type = set(metricName, value)
 
+  /**
+   * param for number of bins to down-sample the curves (ROC curve, PR curve) in area
+   * computation. If 0, no down-sampling will occur.
+   * Default: 1000.
+   * @group expertParam
+   */
+  @Since("3.0.0")
+  val numBins: IntParam = new IntParam(this, "numBins", "Number of bins to down-sample " +
+    "the curves (ROC curve, PR curve) in area computation. If 0, no down-sampling will occur. " +
+    "Must be >= 0.",
+    ParamValidators.gtEq(0))
+
+  /** @group expertGetParam */
+  @Since("3.0.0")
+  def getNumBins: Int = $(numBins)
+
+  /** @group expertSetParam */
+  @Since("3.0.0")
+  def setNumBins(value: Int): this.type = set(numBins, value)
+
+  setDefault(numBins -> 1000)
+
   /** @group setParam */
   @Since("1.5.0")
   def setRawPredictionCol(value: String): this.type = set(rawPredictionCol, value)
@@ -94,7 +116,7 @@ class BinaryClassificationEvaluator @Since("1.4.0") (@Since("1.4.0") override va
         case Row(rawPrediction: Double, label: Double, weight: Double) =>
           (rawPrediction, label, weight)
       }
-    val metrics = new BinaryClassificationMetrics(scoreAndLabelsWithWeights)
+    val metrics = new BinaryClassificationMetrics(scoreAndLabelsWithWeights, $(numBins))
     val metric = $(metricName) match {
       case "areaUnderROC" => metrics.areaUnderROC()
       case "areaUnderPR" => metrics.areaUnderPR()
@@ -104,10 +126,7 @@ class BinaryClassificationEvaluator @Since("1.4.0") (@Since("1.4.0") override va
   }
 
   @Since("1.5.0")
-  override def isLargerBetter: Boolean = $(metricName) match {
-    case "areaUnderROC" => true
-    case "areaUnderPR" => true
-  }
+  override def isLargerBetter: Boolean = true
 
   @Since("1.4.1")
   override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
index dd667a85fa598..a8853d8425886 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.evaluation
 
 import org.apache.spark.annotation.Since
-import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
+import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, ParamValidators}
 import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol, HasWeightCol}
 import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
 import org.apache.spark.mllib.evaluation.RegressionMetrics
@@ -43,13 +43,14 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui
    *  - `"mse"`: mean squared error
    *  - `"r2"`: R^2^ metric
    *  - `"mae"`: mean absolute error
+   *  - `"var"`: explained variance
    *
    * @group param
    */
   @Since("1.4.0")
   val metricName: Param[String] = {
-    val allowedParams = ParamValidators.inArray(Array("mse", "rmse", "r2", "mae"))
-    new Param(this, "metricName", "metric name in evaluation (mse|rmse|r2|mae)", allowedParams)
+    val allowedParams = ParamValidators.inArray(Array("mse", "rmse", "r2", "mae", "var"))
+    new Param(this, "metricName", "metric name in evaluation (mse|rmse|r2|mae|var)", allowedParams)
   }
 
   /** @group getParam */
@@ -60,6 +61,25 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui
   @Since("1.4.0")
   def setMetricName(value: String): this.type = set(metricName, value)
 
+  /**
+   * param for whether the regression is through the origin.
+   * Default: false.
+   * @group expertParam
+   */
+  @Since("3.0.0")
+  val throughOrigin: BooleanParam = new BooleanParam(this, "throughOrigin",
+    "Whether the regression is through the origin.")
+
+  /** @group expertGetParam */
+  @Since("3.0.0")
+  def getThroughOrigin: Boolean = $(throughOrigin)
+
+  /** @group expertSetParam */
+  @Since("3.0.0")
+  def setThroughOrigin(value: Boolean): this.type = set(throughOrigin, value)
+
+  setDefault(throughOrigin -> false)
+
   /** @group setParam */
   @Since("1.4.0")
   def setPredictionCol(value: String): this.type = set(predictionCol, value)
@@ -86,22 +106,21 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui
       .rdd
       .map { case Row(prediction: Double, label: Double, weight: Double) =>
         (prediction, label, weight) }
-    val metrics = new RegressionMetrics(predictionAndLabelsWithWeights)
-    val metric = $(metricName) match {
+    val metrics = new RegressionMetrics(predictionAndLabelsWithWeights, $(throughOrigin))
+    $(metricName) match {
       case "rmse" => metrics.rootMeanSquaredError
       case "mse" => metrics.meanSquaredError
       case "r2" => metrics.r2
       case "mae" => metrics.meanAbsoluteError
+      case "var" => metrics.explainedVariance
     }
-    metric
   }
 
   @Since("1.4.0")
   override def isLargerBetter: Boolean = $(metricName) match {
-    case "rmse" => false
-    case "mse" => false
     case "r2" => true
-    case "mae" => false
+    case "var" => true
+    case _ => false
   }
 
   @Since("1.5.0")
diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
index c1a156959618e..f4f858c3e92dc 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
@@ -76,6 +76,10 @@ class RegressionEvaluatorSuite
     // mae
     evaluator.setMetricName("mae")
     assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01)
+
+    // var
+    evaluator.setMetricName("var")
+    assert(evaluator.evaluate(predictions) ~== 63.6944519 absTol 0.01)
   }
 
   test("read/write") {
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index d96cdd594a3f3..48bc83cc01fcf 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -139,6 +139,8 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
     0.70...
     >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"})
     0.82...
+    >>> evaluator.getNumBins
+    1000
 
     .. versionadded:: 1.4.0
     """
@@ -147,12 +149,17 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
                        "metric name in evaluation (areaUnderROC|areaUnderPR)",
                        typeConverter=TypeConverters.toString)
 
+    numBins = Param(Params._dummy(), "numBins", "Number of bins to down-sample the curves "
+                    "(ROC curve, PR curve) in area computation. If 0, no down-sampling will "
+                    "occur. Must be >= 0.",
+                    typeConverter=TypeConverters.toInt)
+
     @keyword_only
     def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",
-                 metricName="areaUnderROC", weightCol=None):
+                 metricName="areaUnderROC", weightCol=None, numBins=1000):
         """
         __init__(self, rawPredictionCol="rawPrediction", labelCol="label", \
-                 metricName="areaUnderROC", weightCol=None)
+                 metricName="areaUnderROC", weightCol=None, numBins=1000)
         """
         super(BinaryClassificationEvaluator, self).__init__()
         self._java_obj = self._new_java_obj(
@@ -175,13 +182,27 @@ def getMetricName(self):
         """
         return self.getOrDefault(self.metricName)
 
+    @since("3.0.0")
+    def setNumBins(self, value):
+        """
+        Sets the value of :py:attr:`numBins`.
+        """
+        return self._set(numBins=value)
+
+    @since("3.0.0")
+    def getNumBins(self):
+        """
+        Gets the value of numBins or its default value.
+        """
+        return self.getOrDefault(self.numBins)
+
     @keyword_only
     @since("1.4.0")
     def setParams(self, rawPredictionCol="rawPrediction", labelCol="label",
-                  metricName="areaUnderROC", weightCol=None):
+                  metricName="areaUnderROC", weightCol=None, numBins=1000):
         """
         setParams(self, rawPredictionCol="rawPrediction", labelCol="label", \
-                  metricName="areaUnderROC", weightCol=None)
+                  metricName="areaUnderROC", weightCol=None, numBins=1000)
         Sets params for binary classification evaluator.
         """
         kwargs = self._input_kwargs
@@ -218,6 +239,7 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeigh
     >>> evaluator = RegressionEvaluator(predictionCol="raw", weightCol="weight")
     >>> evaluator.evaluate(dataset)
     2.740...
+    >>> evaluator.throughOrigin
 
     .. versionadded:: 1.4.0
     """
@@ -226,15 +248,20 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeigh
                        rmse - root mean squared error (default)
                        mse - mean squared error
                        r2 - r^2 metric
-                       mae - mean absolute error.""",
+                       mae - mean absolute error
+                       var - explained variance.""",
                        typeConverter=TypeConverters.toString)
 
+    throughOrigin = Param(Params._dummy(), "throughOrigin",
+                          "whether the regression is through the origin.",
+                         typeConverter=TypeConverters.toBoolean)
+
     @keyword_only
     def __init__(self, predictionCol="prediction", labelCol="label",
-                 metricName="rmse", weightCol=None):
+                 metricName="rmse", weightCol=None, throughOrigin=False):
         """
         __init__(self, predictionCol="prediction", labelCol="label", \
-                 metricName="rmse", weightCol=None)
+                 metricName="rmse", weightCol=None, throughOrigin=False)
         """
         super(RegressionEvaluator, self).__init__()
         self._java_obj = self._new_java_obj(
@@ -257,13 +284,27 @@ def getMetricName(self):
         """
         return self.getOrDefault(self.metricName)
 
+    @since("3.0.0")
+    def setThroughOrigin(self, value):
+        """
+        Sets the value of :py:attr:`throughOrigin`.
+        """
+        return self._set(throughOrigin=value)
+
+    @since("3.0.0")
+    def getThroughOrigin(self):
+        """
+        Gets the value of throughOrigin or its default value.
+        """
+        return self.getOrDefault(self.throughOrigin)
+
     @keyword_only
     @since("1.4.0")
     def setParams(self, predictionCol="prediction", labelCol="label",
-                  metricName="rmse", weightCol=None):
+                  metricName="rmse", weightCol=None, throughOrigin=False):
         """
         setParams(self, predictionCol="prediction", labelCol="label", \
-                  metricName="rmse", weightCol=None)
+                  metricName="rmse", weightCol=None, throughOrigin=False)
         Sets params for regression evaluator.
         """
         kwargs = self._input_kwargs

From aaef3f7e8897fd5bdf5b1f23c47fda514a92bfca Mon Sep 17 00:00:00 2001
From: zhengruifeng <ruifengz@foxmail.com>
Date: Thu, 26 Sep 2019 16:32:37 +0800
Subject: [PATCH 2/5] nit

---
 python/pyspark/ml/evaluation.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 48bc83cc01fcf..b51d5f695a388 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -239,7 +239,8 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeigh
     >>> evaluator = RegressionEvaluator(predictionCol="raw", weightCol="weight")
     >>> evaluator.evaluate(dataset)
     2.740...
-    >>> evaluator.throughOrigin
+    >>> evaluator.getThroughOrigin
+    False
 
     .. versionadded:: 1.4.0
     """
@@ -254,7 +255,7 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeigh
 
     throughOrigin = Param(Params._dummy(), "throughOrigin",
                           "whether the regression is through the origin.",
-                         typeConverter=TypeConverters.toBoolean)
+                          typeConverter=TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, predictionCol="prediction", labelCol="label",

From 932a17eb7bebdc200fddda4dda46ff49a4674b23 Mon Sep 17 00:00:00 2001
From: zhengruifeng <ruifengz@foxmail.com>
Date: Thu, 26 Sep 2019 19:07:53 +0800
Subject: [PATCH 3/5] py add setDefault

---
 python/pyspark/ml/evaluation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index b51d5f695a388..c7bd561d7500e 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -164,7 +164,7 @@ def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",
         super(BinaryClassificationEvaluator, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator", self.uid)
-        self._setDefault(metricName="areaUnderROC")
+        self._setDefault(metricName="areaUnderROC", numBins=1000)
         kwargs = self._input_kwargs
         self._set(**kwargs)
 
@@ -267,7 +267,7 @@ def __init__(self, predictionCol="prediction", labelCol="label",
         super(RegressionEvaluator, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid)
-        self._setDefault(metricName="rmse")
+        self._setDefault(metricName="rmse", throughOrigin=False)
         kwargs = self._input_kwargs
         self._set(**kwargs)
 

From 4b4cbc6d97ae3fc155bff0a86dcf788d9745a328 Mon Sep 17 00:00:00 2001
From: zhengruifeng <ruifengz@foxmail.com>
Date: Thu, 26 Sep 2019 19:59:50 +0800
Subject: [PATCH 4/5] nit

---
 python/pyspark/ml/evaluation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index c7bd561d7500e..6917cce615aa4 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -139,7 +139,7 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
     0.70...
     >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"})
     0.82...
-    >>> evaluator.getNumBins
+    >>> evaluator.getNumBins()
     1000
 
     .. versionadded:: 1.4.0
@@ -239,7 +239,7 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol, HasWeigh
     >>> evaluator = RegressionEvaluator(predictionCol="raw", weightCol="weight")
     >>> evaluator.evaluate(dataset)
     2.740...
-    >>> evaluator.getThroughOrigin
+    >>> evaluator.getThroughOrigin()
     False
 
     .. versionadded:: 1.4.0

From 45139b50776dd8f1bb5e592bad3e42bc2cf003d0 Mon Sep 17 00:00:00 2001
From: zhengruifeng <ruifengz@foxmail.com>
Date: Fri, 27 Sep 2019 09:58:06 +0800
Subject: [PATCH 5/5] update match case

---
 .../org/apache/spark/ml/evaluation/RegressionEvaluator.scala   | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
index a8853d8425886..b0cafefe420a3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
@@ -118,8 +118,7 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui
 
   @Since("1.4.0")
   override def isLargerBetter: Boolean = $(metricName) match {
-    case "r2" => true
-    case "var" => true
+    case "r2" | "var" => true
     case _ => false
   }