updated python code

imatiach-msft · imatiach-msft · commit a571adcf5dcb · 2019-02-23T23:54:29.000-05:00
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
@@ -106,7 +106,7 @@ def isLargerBetter(self):
 
 
 @inherit_doc
-class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol,
+class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPredictionCol, HasWeightCol,
                                     JavaMLReadable, JavaMLWritable):
     """
     .. note:: Experimental
@@ -130,6 +130,14 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
     >>> evaluator2 = BinaryClassificationEvaluator.load(bce_path)
     >>> str(evaluator2.getRawPredictionCol())
     'raw'
+    >>> scoreAndLabelsAndWeight = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1], x[2]),
+    ...    [(0.1, 0.0, 1.0), (0.1, 1.0, 0.9), (0.4, 0.0, 0.7), (0.6, 0.0, 0.9),
+    ...     (0.6, 1.0, 1.0), (0.6, 1.0, 0.3), (0.8, 1.0, 1.0)])
+    >>> dataset = spark.createDataFrame(scoreAndLabelsAndWeight, ["raw", "label", "weight"])
+    ...
+    >>> evaluator = BinaryClassificationEvaluator(rawPredictionCol="raw", weightCol="weight")
+    >>> evaluator.evaluate(dataset)
+    0.70...
 
     .. versionadded:: 1.4.0
     """
@@ -140,10 +148,10 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
 
     @keyword_only
     def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",
-                 metricName="areaUnderROC"):
+                 metricName="areaUnderROC", weightCol=None):
         """
         __init__(self, rawPredictionCol="rawPrediction", labelCol="label", \
-                 metricName="areaUnderROC")
+                 metricName="areaUnderROC", weightCol=None)
         """
         super(BinaryClassificationEvaluator, self).__init__()
         self._java_obj = self._new_java_obj(
@@ -169,10 +177,10 @@ def getMetricName(self):
     @keyword_only
     @since("1.4.0")
     def setParams(self, rawPredictionCol="rawPrediction", labelCol="label",
-                  metricName="areaUnderROC"):
+                  metricName="areaUnderROC", weightCol=None):
         """
         setParams(self, rawPredictionCol="rawPrediction", labelCol="label", \
-                  metricName="areaUnderROC")
+                  metricName="areaUnderROC", weightCol=None)
         Sets params for binary classification evaluator.
         """
         kwargs = self._input_kwargs
diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py
@@ -30,7 +30,7 @@ class BinaryClassificationMetrics(JavaModelWrapper):
     """
     Evaluator for binary classification.
 
-    :param scoreAndLabels: an RDD of (score, label) pairs
+    :param scoreAndLabelsWithOptWeight: an RDD of score, label and optional weight.
 
     >>> scoreAndLabels = sc.parallelize([
     ...     (0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)], 2)
@@ -40,16 +40,28 @@ class BinaryClassificationMetrics(JavaModelWrapper):
     >>> metrics.areaUnderPR
     0.83...
     >>> metrics.unpersist()
+    >>> scoreAndLabelsWithOptWeight = sc.parallelize([
+    ...     (0.1, 0.0, 1.0), (0.1, 1.0, 0.4), (0.4, 0.0, 0.2), (0.6, 0.0, 0.6), (0.6, 1.0, 0.9),
+    ...     (0.6, 1.0, 0.5), (0.8, 1.0, 0.7)], 2)
+    >>> metrics = BinaryClassificationMetrics(scoreAndLabelsWithOptWeight)
+    >>> metrics.areaUnderROC
+    0.70...
+    >>> metrics.areaUnderPR
+    0.83...
 
     .. versionadded:: 1.4.0
     """
 
-    def __init__(self, scoreAndLabels):
-        sc = scoreAndLabels.ctx
+    def __init__(self, scoreAndLabelsWithOptWeight):
+        sc = scoreAndLabelsWithOptWeight.ctx
         sql_ctx = SQLContext.getOrCreate(sc)
-        df = sql_ctx.createDataFrame(scoreAndLabels, schema=StructType([
+        numCol = len(scoreAndLabelsWithOptWeight.first())
+        schema = StructType([
             StructField("score", DoubleType(), nullable=False),
-            StructField("label", DoubleType(), nullable=False)]))
+            StructField("label", DoubleType(), nullable=False)])
+        if (numCol == 3):
+            schema.add("weight", DoubleType(), False)
+        df = sql_ctx.createDataFrame(scoreAndLabelsWithOptWeight, schema=schema)
         java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
         java_model = java_class(df._jdf)
         super(BinaryClassificationMetrics, self).__init__(java_model)