Refactor Trainer in advance of implementing Trainer.validate

EliaCereda · EliaCereda · commit edb3e834db22 · 2020-12-02T13:47:25.000+01:00
* Replace the `Trainer.testing` attribute with `Trainer.evaluating`, which is currently set to `'test'` if the top-level function called by the user was `Trainer.test(…)` and `None` otherwise. In the next PR, it will be set to `'validation'` when the user calls `Trainer.validate(…)`.
* Update the other components to use the new attribute instead of `Trainer.testing`.
* Disable the `EarlyStopping` and `ModelCheckpoint` callbacks when `Trainer.evaluating` is set. This has no effect when evaluating on the test set, since they were already disabled there, but it will be necessary when evaluating on the validation set.
* Rename a few other attributes of `Trainer` to clarify that they will be used by both `test(…)` and `validate(…)`.
diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
@@ -60,7 +60,7 @@ def broadcast(self, obj, src=0):
         return obj
 
     def train_or_test(self):
-        if self.trainer.testing:
+        if self.trainer.evaluating:
             results = self.trainer.run_test()
         else:
             results = self.trainer.train()
@@ -160,7 +160,7 @@ def early_stopping_should_stop(self, pl_module):
         return self.trainer.should_stop
 
     def setup_optimizers(self, model):
-        if self.trainer.testing is True:
+        if self.trainer.evaluating:
             return
 
         optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model)
diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
@@ -134,13 +134,13 @@ def on_load_checkpoint(self, checkpointed_state):
         self.patience = checkpointed_state['patience']
 
     def on_validation_end(self, trainer, pl_module):
-        if trainer.running_sanity_check:
+        if trainer.running_sanity_check or trainer.evaluating:
             return
 
         self._run_early_stopping_check(trainer, pl_module)
 
     def on_validation_epoch_end(self, trainer, pl_module):
-        if trainer.running_sanity_check:
+        if trainer.running_sanity_check or trainer.evaluating:
             return
 
         if self._validate_condition_metric(trainer.logger_connector.callback_metrics):
diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -220,6 +220,7 @@ def save_checkpoint(self, trainer, pl_module):
             or self.period < 1  # no models are saved
             or (epoch + 1) % self.period  # skip epoch
             or trainer.running_sanity_check  # don't save anything during sanity check
+            or trainer.evaluating  # don't save anything during evaluation: might delete the checkpoint being evaluated
             or self.last_global_step_saved == global_step  # already saved at the last step
         ):
             return
diff --git a/pytorch_lightning/trainer/configuration_validator.py b/pytorch_lightning/trainer/configuration_validator.py
@@ -31,12 +31,12 @@ def verify_loop_configurations(self, model: LightningModule):
             model: The model to check the configuration.
 
         """
-        if not self.trainer.testing:
+        if not self.trainer.evaluating:
             self.__verify_train_loop_configuration(model)
             self.__verify_eval_loop_configuration(model, 'validation')
         else:
-            # check test loop configuration
-            self.__verify_eval_loop_configuration(model, 'test')
+            # check evaluation loop configurations
+            self.__verify_eval_loop_configuration(model, self.trainer.evaluating)
 
     def __verify_train_loop_configuration(self, model):
         # -----------------------------------
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -265,7 +265,7 @@ def prepare_eval_loop_results(self):
         for dl_idx in range(self.trainer.evaluation_loop.num_dataloaders):
             self.add_to_eval_loop_results(dl_idx, has_been_initialized)
 
-    def get_evaluate_epoch_results(self, test_mode):
+    def get_evaluate_epoch_results(self):
         if not self.trainer.running_sanity_check:
             # log all the metrics as a single dict
             metrics_to_log = self.cached_results.get_epoch_log_metrics()
@@ -274,11 +274,11 @@ def get_evaluate_epoch_results(self, test_mode):
 
         self.prepare_eval_loop_results()
 
-        # log results of test
-        if test_mode and self.trainer.is_global_zero and self.trainer.verbose_test:
+        # log results of evaluation
+        if self.trainer.evaluating and self.trainer.is_global_zero and self.trainer.verbose_evaluate:
             print('-' * 80)
             for result_idx, results in enumerate(self.eval_loop_results):
-                print(f'DATALOADER:{result_idx} TEST RESULTS')
+                print(f'DATALOADER:{result_idx} {self.trainer.evaluating.upper()} RESULTS')
                 pprint(results)
                 print('-' * 80)
 
diff --git a/pytorch_lightning/trainer/connectors/model_connector.py b/pytorch_lightning/trainer/connectors/model_connector.py
@@ -36,7 +36,11 @@ def copy_trainer_model_properties(self, model):
             m.use_ddp2 = self.trainer.use_ddp2
             m.use_ddp = self.trainer.use_ddp
             m.use_amp = self.trainer.amp_backend is not None
-            m.testing = self.trainer.testing
+            # Currently, the only users of m.testing appear to be DP and DDP,
+            # which use it to determine whether the model is currently inside
+            # the validation or test loop. For this reason it must check if
+            # trainer.evaluating is equal to "test" specifically.
+            m.testing = self.trainer.evaluating == 'test'
             m.use_single_gpu = self.trainer.use_single_gpu
             m.use_tpu = self.trainer.use_tpu
             m.tpu_local_core_rank = self.trainer.tpu_local_core_rank
diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import torch
 
+import pytorch_lightning as pl
 from pytorch_lightning.core.step_result import EvalResult, Result
 from pytorch_lightning.trainer.supporters import PredictionCollection
 from pytorch_lightning.utilities.distributed import rank_zero_warn
@@ -22,7 +23,7 @@
 
 
 class EvaluationLoop(object):
-    def __init__(self, trainer):
+    def __init__(self, trainer: 'pl.Trainer'):
         self.trainer = trainer
         self.testing = False
         self.outputs = []
@@ -39,13 +40,15 @@ def on_trainer_init(self):
         self.trainer.test_dataloaders = None
         self.trainer.val_dataloaders = None
         self.trainer.running_sanity_check = False
-        self.trainer.testing = False
 
-        # when .test() is called, it sets this
-        self.trainer.tested_ckpt_path = None
+        # .validate() sets this to 'validation' and .test() sets this to 'test'
+        self.trainer.evaluating = None
 
-        # when true, prints test results
-        self.trainer.verbose_test = True
+        # .validate() and .test() set this when they load a checkpoint
+        self.trainer.evaluated_ckpt_path = None
+
+        # when true, print evaluation results in .validate() and .test()
+        self.trainer.verbose_evaluate = True
 
     def get_evaluation_dataloaders(self, max_batches):
         # select dataloaders
@@ -216,7 +219,7 @@ def evaluation_epoch_end(self):
 
     def log_epoch_metrics_on_evaluation_end(self):
         # get the final loop results
-        eval_loop_results = self.trainer.logger_connector.get_evaluate_epoch_results(self.testing)
+        eval_loop_results = self.trainer.logger_connector.get_evaluate_epoch_results()
         return eval_loop_results
 
     def __run_eval_epoch_end(self, num_dataloaders, using_eval_result):
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
@@ -441,10 +441,6 @@ def fit(
         # hook
         self.data_connector.prepare_data(model)
 
-        # bookkeeping
-        # we reuse fit in .test() but change its behavior using this flag
-        self.testing = os.environ.get('PL_TESTING_MODE', self.testing)
-
         # ----------------------------
         # SET UP TRAINING
         # ----------------------------
@@ -720,33 +716,31 @@ def test(
         datamodule: Optional[LightningDataModule] = None,
     ):
         r"""
-
-        Separates from fit to make sure you never run on your test set until you want to.
+        Perform one evaluation epoch over the test set. It's separated from
+        fit to make sure you never run on your test set until you want to.
 
         Args:
             ckpt_path: Either ``best`` or path to the checkpoint you wish to test.
-                If ``None``, use the weights from the last epoch to test. Default to ``best``.
-
+                If ``None``, use the current weights of the model. Default to ``best``.
             datamodule: A instance of :class:`LightningDataModule`.
-
-            model: The model to test.
-
-            test_dataloaders: Either a single
-                Pytorch Dataloader or a list of them, specifying validation samples.
-
-            verbose: If True, prints the test results
+            model: The model to evaluate.
+            test_dataloaders: Either a single PyTorch DataLoader or a list of them,
+                specifying test samples.
+            verbose: If True, prints the test results.
 
         Returns:
-            The final test result dictionary. If no test_epoch_end is defined returns a list of dictionaries
+            The dictionary with final test results returned by test_epoch_end.
+            If test_epoch_end is not defined, the output is a list of the dictionaries
+            returned by test_step.
         """
         # --------------------
         # SETUP HOOK
         # --------------------
-        self.verbose_test = verbose
+        self.verbose_evaluate = verbose
 
         self.logger_connector.set_stage("test")
 
-        # If you supply a datamodule you can't supply train_dataloader or val_dataloaders
+        # If you supply a datamodule you can't supply test_dataloaders
         if test_dataloaders and datamodule:
             raise MisconfigurationException(
                 'You cannot pass test_dataloaders to trainer.test if you supply a datamodule'
@@ -756,15 +750,15 @@ def test(
         self.data_connector.attach_datamodule(model or self.get_model(), datamodule, 'test')
 
         if model is not None:
-            results = self.__test_given_model(model, test_dataloaders)
+            results = self.__evaluate_given_model(model, test_dataloaders, 'test')
         else:
-            results = self.__test_using_best_weights(ckpt_path, test_dataloaders)
+            results = self.__evaluate_using_best_weights(ckpt_path, test_dataloaders, 'test')
 
         self.teardown('test')
 
         return results
 
-    def __test_using_best_weights(self, ckpt_path, test_dataloaders):
+    def __evaluate_using_best_weights(self, ckpt_path, test_dataloaders, stage: str):
         model = self.get_model()
 
         # if user requests the best checkpoint but we don't have it, error
@@ -796,40 +790,56 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders):
             self.data_connector.attach_dataloaders(model, test_dataloaders=test_dataloaders)
 
         # run tests
-        self.tested_ckpt_path = ckpt_path
-        self.testing = True
-        os.environ['PL_TESTING_MODE'] = '1'
+        self.evaluating = stage
+        self.evaluated_ckpt_path = ckpt_path
         self.model = model
         results = self.fit(model)
-        self.testing = False
-        del os.environ['PL_TESTING_MODE']
+        self.evaluating = None
 
         # teardown
         if self.is_function_implemented('teardown'):
             model_ref = self.get_model()
-            model_ref.teardown('test')
+            model_ref.teardown(stage)
 
         return results
 
-    def __test_given_model(self, model, test_dataloaders):
+    def __evaluate_given_model(self, model, test_dataloaders, stage: str):
 
         # attach data
         if test_dataloaders is not None:
             self.data_connector.attach_dataloaders(model, test_dataloaders=test_dataloaders)
 
         # run test
         # sets up testing so we short circuit to eval
-        self.testing = True
+        self.evaluating = stage
         self.model = model
         results = self.fit(model)
-        self.testing = False
+        self.evaluating = None
 
         # teardown
         if self.is_function_implemented('teardown'):
-            model.teardown('test')
+            model.teardown(stage)
 
         return results
 
+    @property
+    def testing(self):
+        warnings.warn(
+            'Trainer.testing has been deprecated in v1.1 and will be removed '
+            'in v1.3, use Trainer.evaluating instead.',
+            DeprecationWarning, stacklevel=2
+        )
+        return bool(self.evaluating)
+
+    @property
+    def tested_ckpt_path(self):
+        warnings.warn(
+            'Trainer.tested_ckpt_path has been renamed Trainer.evaluated_ckpt_path '
+            'in v1.1 and will be removed in v1.3.',
+            DeprecationWarning, stacklevel=2
+        )
+        return self.evaluated_ckpt_path
+
     def tune(
         self,
         model: LightningModule,
@@ -856,11 +866,17 @@ def tune(
 
     def call_setup_hook(self, model):
         # call setup after the ddp process has connected
-        stage_name = 'test' if self.testing else 'fit'
+        stage_name = self.evaluating or 'fit'
+
         if self.datamodule is not None:
-            called = self.datamodule.has_setup_test if self.testing else self.datamodule.has_setup_fit
+            called = {
+                None: self.datamodule.has_setup_fit,
+                'test': self.datamodule.has_setup_test,
+            }[self.evaluating]
+
             if not called:
                 self.datamodule.setup(stage_name)
+
         self.setup(model, stage_name)
         model.setup(stage_name)
 
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
@@ -161,7 +161,7 @@ def setup_training(self, model: LightningModule):
             ref_model.on_pretrain_routine_start()
 
         # print model summary
-        if self.trainer.is_global_zero and self.trainer.weights_summary is not None and not self.trainer.testing:
+        if self.trainer.is_global_zero and self.trainer.weights_summary is not None and not self.trainer.evaluating:
             if self.trainer.weights_summary in ModelSummary.MODES:
                 ref_model.summarize(mode=self.trainer.weights_summary)
             else:
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
@@ -728,12 +728,12 @@ def test_test_checkpoint_path(tmpdir, ckpt_path, save_top_k):
                 trainer.test(ckpt_path=ckpt_path)
         else:
             trainer.test(ckpt_path=ckpt_path)
-            assert trainer.tested_ckpt_path == trainer.checkpoint_callback.best_model_path
+            assert trainer.evaluated_ckpt_path == trainer.checkpoint_callback.best_model_path
     elif ckpt_path is None:
         # ckpt_path is None, meaning we don't load any checkpoints and
         # use the weights from the end of training
         trainer.test(ckpt_path=ckpt_path)
-        assert trainer.tested_ckpt_path is None
+        assert trainer.evaluated_ckpt_path is None
     else:
         # specific checkpoint, pick one from saved ones
         if save_top_k == 0:
@@ -746,7 +746,7 @@ def test_test_checkpoint_path(tmpdir, ckpt_path, save_top_k):
                 ].absolute()
             )
             trainer.test(ckpt_path=ckpt_path)
-            assert trainer.tested_ckpt_path == ckpt_path
+            assert trainer.evaluated_ckpt_path == ckpt_path
 
 
 def test_disabled_training(tmpdir):