ENH: python, cache weightCol

facaiy · facaiy · commit 25d681f38ff6 · 2017-07-07T12:15:45.000+08:00
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
@@ -1546,7 +1546,10 @@ def _fit(self, dataset):
 
         numClasses = int(dataset.agg({labelCol: "max"}).head()["max("+labelCol+")"]) + 1
 
-        multiclassLabeled = dataset.select(labelCol, featuresCol)
+        if isinstance(classifier, HasWeightCol) and classifier.getWeightCol():
+            multiclassLabeled = dataset.select(labelCol, featuresCol, classifier.getWeightCol())
+        else:
+            multiclassLabeled = dataset.select(labelCol, featuresCol)
 
         # persist if underlying dataset is not persistent.
         handlePersistence = \
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
@@ -1255,6 +1255,16 @@ def test_output_columns(self):
         output = model.transform(df)
         self.assertEqual(output.columns, ["label", "features", "prediction"])
 
+    def test_cache_weightCol_if_necessary(self):
+        # fit() must succeed: OneVsRest has to keep the classifier's weightCol column.
+        df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8), 1.0),
+                                         (1.0, Vectors.sparse(2, [], []), 1.0),
+                                         (2.0, Vectors.dense(0.5, 0.5), 1.0)],
+                                        ["label", "features", "weight"])
+        lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight")
+        ovr = OneVsRest(classifier=lr)
+        self.assertIsNotNone(ovr.fit(df))
+
 
 class HashingTFTest(SparkSessionTestCase):