Add OneVsRest unit tests and make OneVsRestModel.copy build on Params.copy

yinxusen · yinxusen · commit e0cf36f79101 · 2016-04-14T16:28:42.000-07:00
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
@@ -1352,7 +1352,9 @@ def copy(self, extra=None):
         """
         if extra is None:
             extra = dict()
-        return self._copyValues(OneVsRestModel([model.copy(extra) for model in self.models]))
+        newModel = Params.copy(self, extra)
+        newModel.models = [model.copy(extra) for model in self.models]
+        return newModel
 
 
 if __name__ == "__main__":
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
@@ -42,7 +42,7 @@
 import numpy as np
 
 from pyspark.ml import Estimator, Model, Pipeline, PipelineModel, Transformer
-from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier
+from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, OneVsRest
 from pyspark.ml.clustering import KMeans
 from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
 from pyspark.ml.feature import *
@@ -831,6 +831,36 @@ def test_logistic_regression_summary(self):
         self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
 
 
+class OneVsRestTests(PySparkTestCase):
+
+    def test_copy(self):
+        sqlContext = SQLContext(self.sc)
+        df = sqlContext.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
+                                         (1.0, Vectors.sparse(2, [], [])),
+                                         (2.0, Vectors.dense(0.5, 0.5))],
+                                        ["label", "features"])
+        lr = LogisticRegression(maxIter=5, regParam=0.01)
+        ovr = OneVsRest(classifier=lr)
+        ovr1 = ovr.copy({lr.maxIter: 10})
+        self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
+        self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
+        model = ovr.fit(df)
+        model1 = model.copy({model.predictionCol: "indexed"})
+        self.assertEqual(model1.getPredictionCol(), "indexed")
+
+    def test_output_columns(self):
+        sqlContext = SQLContext(self.sc)
+        df = sqlContext.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
+                                         (1.0, Vectors.sparse(2, [], [])),
+                                         (2.0, Vectors.dense(0.5, 0.5))],
+                                        ["label", "features"])
+        lr = LogisticRegression(maxIter=5, regParam=0.01)
+        ovr = OneVsRest(classifier=lr)
+        model = ovr.fit(df)
+        output = model.transform(df)
+        self.assertEqual(output.columns, ["label", "features", "prediction"])
+
+
 if __name__ == "__main__":
     from pyspark.ml.tests import *
     if xmlrunner: