Address review comments on collectSubModels (param docs, Java round-trip, tests)

WeichenXu123 · WeichenXu123 · commit 80f07fb93a00 · 2018-04-13T18:35:51.000+08:00
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -157,7 +157,10 @@ def get$Name(self):
          "TypeConverters.toInt"),
         ("parallelism", "the number of threads to use when running parallel algorithms (>= 1).",
          "1", "TypeConverters.toInt"),
-        ("collectSubModels", "whether to collect a list of sub-models trained during tuning",
+        ("collectSubModels", "Param for whether to collect a list of sub-models trained during " +
+         "tuning. If set to false, then only the single best sub-model will be available after " +
+         "fitting. If set to true, then all sub-models will be available. Warning: For large " +
+         "models, collecting all sub-models can cause OOMs on the Spark driver.",
          "False", "TypeConverters.toBoolean"),
         ("loss", "the loss function to be optimized.", None, "TypeConverters.toString")]
 
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
@@ -657,10 +657,10 @@ def getParallelism(self):
 
 class HasCollectSubModels(Params):
     """
-    Mixin for param collectSubModels: whether to collect a list of sub-models trained during tuning
+    Mixin for param collectSubModels: Param for whether to collect a list of sub-models trained during tuning. If set to false, then only the single best sub-model will be available after fitting. If set to true, then all sub-models will be available. Warning: For large models, collecting all sub-models can cause OOMs on the Spark driver.
     """
 
-    collectSubModels = Param(Params._dummy(), "collectSubModels", "whether to collect a list of sub-models trained during tuning", typeConverter=TypeConverters.toBoolean)
+    collectSubModels = Param(Params._dummy(), "collectSubModels", "Param for whether to collect a list of sub-models trained during tuning. If set to false, then only the single best sub-model will be available after fitting. If set to true, then all sub-models will be available. Warning: For large models, collecting all sub-models can cause OOMs on the Spark driver.", typeConverter=TypeConverters.toBoolean)
 
     def __init__(self):
         super(HasCollectSubModels, self).__init__()
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
@@ -1037,9 +1037,9 @@ def test_expose_sub_models(self):
                             numFolds=numFolds, collectSubModels=True)
 
         def checkSubModels(subModels):
-            assert len(subModels) == numFolds
+            self.assertEqual(len(subModels), numFolds)
             for i in range(numFolds):
-                assert len(subModels[i]) == len(grid)
+                self.assertEqual(len(subModels[i]), len(grid))
 
         cvModel = cv.fit(dataset)
         checkSubModels(cvModel.subModels)
@@ -1050,11 +1050,13 @@ def checkSubModels(subModels):
         cvModel.save(savingPathWithSubModels)
         cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
         checkSubModels(cvModel3.subModels)
+        cvModel4 = cvModel3.copy()
+        checkSubModels(cvModel4.subModels)
 
         savingPathWithoutSubModels = testSubPath + "cvModel2"
         cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
         cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
-        assert cvModel2.subModels is None
+        self.assertEqual(cvModel2.subModels, None)
 
         for i in range(numFolds):
             for j in range(len(grid)):
@@ -1243,19 +1245,21 @@ def test_expose_sub_models(self):
         tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                                    collectSubModels=True)
         tvsModel = tvs.fit(dataset)
-        assert len(tvsModel.subModels) == len(grid)
+        self.assertEqual(len(tvsModel.subModels), len(grid))
 
         # Test the default value for option "persistSubModel" to be "true"
         testSubPath = temp_path + "/testTrainValidationSplitSubModels"
         savingPathWithSubModels = testSubPath + "cvModel3"
         tvsModel.save(savingPathWithSubModels)
         tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
-        assert len(tvsModel3.subModels) == len(grid)
+        self.assertEqual(len(tvsModel3.subModels), len(grid))
+        tvsModel4 = tvsModel3.copy()
+        self.assertEqual(len(tvsModel4.subModels), len(grid))
 
         savingPathWithoutSubModels = testSubPath + "cvModel2"
         tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
         tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
-        assert tvsModel2.subModels is None
+        self.assertEqual(tvsModel2.subModels, None)
 
         for i in range(len(grid)):
             self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid)
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
@@ -354,9 +354,11 @@ def _from_java(cls, java_stage):
         numFolds = java_stage.getNumFolds()
         seed = java_stage.getSeed()
         parallelism = java_stage.getParallelism()
+        collectSubModels = java_stage.getCollectSubModels()
         # Create a new instance of this stage.
         py_stage = cls(estimator=estimator, estimatorParamMaps=epms, evaluator=evaluator,
-                       numFolds=numFolds, seed=seed, parallelism=parallelism)
+                       numFolds=numFolds, seed=seed, parallelism=parallelism,
+                       collectSubModels=collectSubModels)
         py_stage._resetUid(java_stage.uid())
         return py_stage
 
@@ -376,6 +378,7 @@ def _to_java(self):
         _java_obj.setSeed(self.getSeed())
         _java_obj.setNumFolds(self.getNumFolds())
         _java_obj.setParallelism(self.getParallelism())
+        _java_obj.setCollectSubModels(self.getCollectSubModels())
 
         return _java_obj
 
@@ -410,6 +413,7 @@ def copy(self, extra=None):
         and some extra params. This copies the underlying bestModel,
         creates a deep copy of the embedded paramMap, and
         copies the embedded and extra parameters over.
+        It does not copy the extra Params into the subModels.
 
         :param extra: Extra parameters to copy to the new instance
         :return: Copy of this instance
@@ -628,9 +632,11 @@ def _from_java(cls, java_stage):
         trainRatio = java_stage.getTrainRatio()
         seed = java_stage.getSeed()
         parallelism = java_stage.getParallelism()
+        collectSubModels = java_stage.getCollectSubModels()
         # Create a new instance of this stage.
         py_stage = cls(estimator=estimator, estimatorParamMaps=epms, evaluator=evaluator,
-                       trainRatio=trainRatio, seed=seed, parallelism=parallelism)
+                       trainRatio=trainRatio, seed=seed, parallelism=parallelism,
+                       collectSubModels=collectSubModels)
         py_stage._resetUid(java_stage.uid())
         return py_stage
 
@@ -650,6 +656,7 @@ def _to_java(self):
         _java_obj.setTrainRatio(self.getTrainRatio())
         _java_obj.setSeed(self.getSeed())
         _java_obj.setParallelism(self.getParallelism())
+        _java_obj.setCollectSubModels(self.getCollectSubModels())
         return _java_obj
 
 
@@ -682,6 +689,7 @@ def copy(self, extra=None):
         creates a deep copy of the embedded paramMap, and
         copies the embedded and extra parameters over.
         And, this creates a shallow copy of the validationMetrics.
+        It does not copy the extra Params into the subModels.
 
         :param extra: Extra parameters to copy to the new instance
         :return: Copy of this instance