
Commit 6104a63

ananthsub, ethanwharris, and Borda authored
[1/2] Deprecate outputs in on_train_epoch_end hooks (#7339)
* Remove outputs from on_train_epoch_end
* iterate
* Update callback_hook.py
* update
* Update training_loop.py
* Update test_training_loop.py
* early stop?
* fix
* update tests
* Update test_hooks.py
* Update pytorch_lightning/trainer/callback_hook.py
  Co-authored-by: Ethan Harris <[email protected]>
* Update pytorch_lightning/trainer/training_loop.py
  Co-authored-by: Ethan Harris <[email protected]>
* Update trainer.py
* Update pytorch_lightning/trainer/trainer.py
  Co-authored-by: Jirka Borovec <[email protected]>

Co-authored-by: Ethan Harris <[email protected]>
Co-authored-by: Jirka Borovec <[email protected]>
1 parent f9ff354 commit 6104a63

16 files changed: 148 additions, 51 deletions

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
@@ -207,6 +207,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Deprecated
 
+- Deprecated `outputs` in both `LightningModule.on_train_epoch_end` and `Callback.on_train_epoch_end` hooks ([#7339](https://github.com/PyTorchLightning/pytorch-lightning/pull/7339))
+
 
 - Deprecated `Trainer.truncated_bptt_steps` in favor of `LightningModule.truncated_bptt_steps` ([#7323](https://github.com/PyTorchLightning/pytorch-lightning/pull/7323))
 
@@ -217,7 +219,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Deprecated the `save_function` property from the `ModelCheckpoint` callback ([#7201](https://github.com/PyTorchLightning/pytorch-lightning/pull/7201))
 
 
-- Deprecated `LightningModule.write_predictions` and `LigtningModule.write_predictions_dict` ([#7066](https://github.com/PyTorchLightning/pytorch-lightning/pull/7066))
+- Deprecated `LightningModule.write_predictions` and `LightningModule.write_predictions_dict` ([#7066](https://github.com/PyTorchLightning/pytorch-lightning/pull/7066))
 
 
 - Deprecated `TrainerLoggingMixin` in favor of a separate utilities module for metric handling ([#7180](https://github.com/PyTorchLightning/pytorch-lightning/pull/7180))
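
For downstream users, this deprecation boils down to dropping `outputs` from `on_train_epoch_end` overrides. A minimal sketch of the migration follows; the `LitModel` class and its bodies are illustrative only and not part of this commit. Logic that still needs the epoch's outputs can move to `training_epoch_end`, which continues to receive them:

    from pytorch_lightning import LightningModule

    class LitModel(LightningModule):
        # Before v1.3 (now deprecated): the hook received the collected step outputs.
        # def on_train_epoch_end(self, outputs):
        #     ...

        # After: override the hook without `outputs` ...
        def on_train_epoch_end(self):
            print("train epoch finished")

        # ... and keep output-dependent logic in `training_epoch_end`, which still receives them.
        def training_epoch_end(self, outputs):
            pass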

pytorch_lightning/accelerators/accelerator.py

Lines changed: 3 additions & 7 deletions
@@ -28,7 +28,7 @@
 from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, rank_zero_warn
 from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device
 from pytorch_lightning.utilities.enums import AMPType, GradClipAlgorithmType, LightningEnum
-from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT
+from pytorch_lightning.utilities.types import STEP_OUTPUT
 
 if _NATIVE_AMP_AVAILABLE:
     from torch.cuda.amp import GradScaler
@@ -354,12 +354,8 @@ def clip_gradients(
             model=self.model,
         )
 
-    def on_train_epoch_end(self, outputs: EPOCH_OUTPUT) -> None:
-        """Hook to do something on the end of an training epoch
-
-        Args:
-            outputs: the outputs of the training steps
-        """
+    def on_train_epoch_end(self) -> None:
+        """Hook to do something on the end of an training epoch."""
         pass
 
     def on_train_end(self) -> None:

pytorch_lightning/callbacks/base.py

Lines changed: 3 additions & 1 deletion
@@ -98,7 +98,9 @@ def on_train_epoch_start(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningMo
         """Called when the train epoch begins."""
         pass
 
-    def on_train_epoch_end(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule', outputs: EPOCH_OUTPUT) -> None:
+    def on_train_epoch_end(
+        self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule', unused: Optional = None
+    ) -> None:
         """Called when the train epoch ends."""
         pass
 
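
The `unused: Optional = None` placeholder keeps the base class call-compatible while third-party callbacks migrate. A rough sketch of a callback written against the new signature (`MyCallback` and its body are illustrative, not part of this commit):

    from pytorch_lightning.callbacks import Callback

    class MyCallback(Callback):
        # Deprecated signature: on_train_epoch_end(self, trainer, pl_module, outputs)
        def on_train_epoch_end(self, trainer, pl_module):
            print(f"finished epoch {trainer.current_epoch}")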

pytorch_lightning/callbacks/early_stopping.py

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ def _should_skip_check(self, trainer) -> bool:
         from pytorch_lightning.trainer.states import TrainerFn
         return trainer.state.fn != TrainerFn.FITTING or trainer.sanity_checking
 
-    def on_train_epoch_end(self, trainer, pl_module, outputs) -> None:
+    def on_train_epoch_end(self, trainer, pl_module) -> None:
         if not self._check_on_train_epoch_end or self._should_skip_check(trainer):
             return
         self._run_early_stopping_check(trainer)

pytorch_lightning/callbacks/pruning.py

Lines changed: 1 addition & 1 deletion
@@ -373,7 +373,7 @@ def on_before_accelerator_backend_setup(self, trainer, pl_module: LightningModul
             self._original_layers.setdefault(id_, {"data": deepcopy(module), "names": []})
             self._original_layers[id_]["names"].append((i, name))
 
-    def on_train_epoch_end(self, trainer, pl_module: LightningModule, outputs):
+    def on_train_epoch_end(self, trainer, pl_module: LightningModule):
         current_epoch = trainer.current_epoch
         prune = self._apply_pruning(current_epoch) if isinstance(self._apply_pruning, Callable) else self._apply_pruning
         amount = self.amount(current_epoch) if isinstance(self.amount, Callable) else self.amount

pytorch_lightning/core/hooks.py

Lines changed: 1 addition & 1 deletion
@@ -235,7 +235,7 @@ def on_train_epoch_start(self) -> None:
         Called in the training loop at the very beginning of the epoch.
         """
 
-    def on_train_epoch_end(self, outputs: EPOCH_OUTPUT) -> None:
+    def on_train_epoch_end(self, unused: Optional = None) -> None:
         """
         Called in the training loop at the very end of the epoch.
         """

pytorch_lightning/trainer/callback_hook.py

Lines changed: 9 additions & 1 deletion
@@ -96,7 +96,15 @@ def on_train_epoch_end(self, outputs: EPOCH_OUTPUT):
             outputs: List of outputs on each ``train`` epoch
         """
         for callback in self.callbacks:
-            callback.on_train_epoch_end(self, self.lightning_module, outputs)
+            if is_param_in_hook_signature(callback.on_train_epoch_end, "outputs"):
+                warning_cache.warn(
+                    "The signature of `Callback.on_train_epoch_end` has changed in v1.3."
+                    " `outputs` parameter has been removed."
+                    " Support for the old signature will be removed in v1.5", DeprecationWarning
+                )
+                callback.on_train_epoch_end(self, self.lightning_module, outputs)
+            else:
+                callback.on_train_epoch_end(self, self.lightning_module)
 
     def on_validation_epoch_start(self):
         """Called when the epoch begins."""

pytorch_lightning/trainer/trainer.py

Lines changed: 5 additions & 0 deletions
@@ -1211,6 +1211,11 @@ def _cache_logged_metrics(self):
         self.logger_connector.cache_logged_metrics()
 
     def call_hook(self, hook_name: str, *args, **kwargs) -> Any:
+        # Note this implementation is copy/pasted into the TrainLoop class in TrainLoop._on_train_epoch_end_hook
+        # This was done to manage the deprecation of an argument to on_train_epoch_end
+        # If making chnages to this function, ensure that those changes are also made to
+        # TrainLoop._on_train_epoch_end_hook
+
         # set hook_name to model + reset Result obj
         skip = self._reset_result_and_set_hook_fx_name(hook_name)

pytorch_lightning/trainer/training_loop.py

Lines changed: 62 additions & 5 deletions
@@ -31,6 +31,7 @@
 from pytorch_lightning.utilities.grads import grad_norm
 from pytorch_lightning.utilities.model_helpers import is_overridden
 from pytorch_lightning.utilities.parsing import AttributeDict
+from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature
 from pytorch_lightning.utilities.warnings import WarningCache
 
 
@@ -197,16 +198,14 @@ def reset_train_val_dataloaders(self, model) -> None:
 
     def track_epoch_end_reduce_metrics(self, epoch_output, batch_end_outputs):
 
+        hook_overridden = self._should_add_batch_output_to_epoch_output()
+
         # track the outputs to reduce at the end of the epoch
         for opt_idx, opt_outputs in enumerate(batch_end_outputs):
             sample_output = opt_outputs[-1]
 
             # decide if we need to reduce at the end of the epoch automatically
             auto_reduce_tng_result = isinstance(sample_output, Result) and sample_output.should_reduce_on_epoch_end
-            hook_overridden = (
-                is_overridden("training_epoch_end", model=self.trainer.lightning_module)
-                or is_overridden("on_train_epoch_end", model=self.trainer.lightning_module)
-            )
 
             # only track when a) it needs to be autoreduced OR b) the user wants to manually reduce on epoch end
             if not (hook_overridden or auto_reduce_tng_result):
@@ -218,6 +217,22 @@ def track_epoch_end_reduce_metrics(self, epoch_output, batch_end_outputs):
 
             epoch_output[opt_idx].append(opt_outputs)
 
+    def _should_add_batch_output_to_epoch_output(self) -> bool:
+        # We add to the epoch outputs if
+        # 1. The model defines training_epoch_end OR
+        # 2. The model overrides on_train_epoch_end which has `outputs` in the signature
+        # TODO: in v1.5 this only needs to check if training_epoch_end is overridden
+        lightning_module = self.trainer.lightning_module
+        if is_overridden("training_epoch_end", model=lightning_module):
+            return True
+
+        if is_overridden("on_train_epoch_end", model=lightning_module):
+            model_hook_fx = getattr(lightning_module, "on_train_epoch_end")
+            if is_param_in_hook_signature(model_hook_fx, "outputs"):
+                return True
+
+        return False
+
     def get_optimizers_iterable(self, batch_idx=None):
         """
         Generates an iterable with (idx, optimizer) for each optimizer.
@@ -593,9 +608,51 @@ def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None:
         self.trainer.logger_connector.cache_logged_metrics()
 
         # call train epoch end hooks
-        self.trainer.call_hook('on_train_epoch_end', processed_epoch_output)
+        self._on_train_epoch_end_hook(processed_epoch_output)
         self.trainer.call_hook('on_epoch_end')
 
+    def _on_train_epoch_end_hook(self, processed_epoch_output) -> None:
+        # We cannot rely on Trainer.call_hook because the signatures might be different across
+        # lightning module and callback
+        # As a result, we need to inspect if the module accepts `outputs` in `on_train_epoch_end`
+
+        # This implementation is copied from Trainer.call_hook
+        hook_name = "on_train_epoch_end"
+
+        # set hook_name to model + reset Result obj
+        skip = self.trainer._reset_result_and_set_hook_fx_name(hook_name)
+
+        # always profile hooks
+        with self.trainer.profiler.profile(hook_name):
+
+            # first call trainer hook
+            if hasattr(self.trainer, hook_name):
+                trainer_hook = getattr(self.trainer, hook_name)
+                trainer_hook(processed_epoch_output)
+
+            # next call hook in lightningModule
+            model_ref = self.trainer.lightning_module
+            if is_overridden(hook_name, model_ref):
+                hook_fx = getattr(model_ref, hook_name)
+                if is_param_in_hook_signature(hook_fx, "outputs"):
+                    self.warning_cache.warn(
+                        "The signature of `ModelHooks.on_train_epoch_end` has changed in v1.3."
+                        " `outputs` parameter has been deprecated."
+                        " Support for the old signature will be removed in v1.5", DeprecationWarning
+                    )
+                    model_ref.on_train_epoch_end(processed_epoch_output)
+                else:
+                    model_ref.on_train_epoch_end()
+
+            # if the PL module doesn't have the hook then call the accelerator
+            # used to auto-reduce things for the user with Results obj
+            elif hasattr(self.trainer.accelerator, hook_name):
+                accelerator_hook = getattr(self.trainer.accelerator, hook_name)
+                accelerator_hook()
+
+        if not skip:
+            self.trainer._cache_logged_metrics()
+
     def run_training_batch(self, batch, batch_idx, dataloader_idx):
         # track grad norms
         grad_norm_dic = {}
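
During the deprecation window, a LightningModule that still declares `outputs` keeps receiving the processed epoch output but triggers the warning above. A hedged sketch of that behaviour; `OldStyleModel` and the helper import path are assumptions, not taken from this commit:

    import pytorch_lightning as pl
    from tests.helpers.boring_model import BoringModel  # assumed helper path

    class OldStyleModel(BoringModel):
        def on_train_epoch_end(self, outputs):  # old signature: warned, but still called with outputs
            assert isinstance(outputs, list)

    trainer = pl.Trainer(fast_dev_run=True)
    # Expected to emit: "The signature of `ModelHooks.on_train_epoch_end` has changed in v1.3. ..."
    trainer.fit(OldStyleModel())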

tests/callbacks/test_callback_hook_outputs.py

Lines changed: 1 addition & 4 deletions
@@ -34,9 +34,6 @@ def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx,
         def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
             assert 'x' in outputs
 
-        def on_train_epoch_end(self, trainer, pl_module, outputs):
-            assert len(outputs) == trainer.num_training_batches
-
     class TestModel(BoringModel):
 
         def on_train_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx: int) -> None:
@@ -48,7 +45,7 @@ def on_validation_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx
         def on_test_batch_end(self, outputs, batch, batch_idx: int, dataloader_idx: int) -> None:
             assert 'x' in outputs
 
-        def on_train_epoch_end(self, outputs) -> None:
+        def training_epoch_end(self, outputs) -> None:
             assert len(outputs) == self.trainer.num_training_batches
 
     model = TestModel()
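
A regression test along these lines (a sketch, not the exact test added by this commit; the helper import path is an assumption) can assert that old-signature callbacks still work but raise the deprecation warning:

    import pytest
    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import Callback
    from tests.helpers.boring_model import BoringModel  # assumed helper path

    class OldSignatureCallback(Callback):
        def on_train_epoch_end(self, trainer, pl_module, outputs):
            assert len(outputs) == trainer.num_training_batches

    def test_on_train_epoch_end_old_signature_warns(tmpdir):
        trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, callbacks=[OldSignatureCallback()])
        with pytest.deprecated_call(match="old signature will be removed in v1.5"):
            trainer.fit(BoringModel())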
