Commit 1b8336e

Move tracking epoch end outputs logic to the EvaluationEpochLoop (#9261)
1 parent: 6d381a3 · commit: 1b8336e

File tree

6 files changed (+72 −56 lines)

CHANGELOG.md
pytorch_lightning/loops/dataloader/evaluation_loop.py
pytorch_lightning/loops/epoch/evaluation_epoch_loop.py
pytorch_lightning/trainer/connectors/logger_connector/result.py
pytorch_lightning/utilities/memory.py
tests/trainer/loops/test_evaluation_loop.py


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -21,6 +21,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed bug where data-loading functions where not getting the correct running stage passed ([#8858](https://github.com/PyTorchLightning/pytorch-lightning/pull/8858))
 
 
+- Fixed intra-epoch evaluation outputs staying in memory when the respective `*_epoch_end` hook wasn't overridden ([#9261](https://github.com/PyTorchLightning/pytorch-lightning/pull/9261))
+
+
 - Fixed error handling in DDP process reconciliation when `_sync_dir` was not initialized ([#9267](https://github.com/PyTorchLightning/pytorch-lightning/pull/9267))
 
 

pytorch_lightning/loops/dataloader/evaluation_loop.py

Lines changed: 2 additions & 13 deletions
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from typing import Any, List, Optional, Sequence, Union
 
 from deprecate.utils import void
@@ -30,7 +29,7 @@ class EvaluationLoop(DataLoaderLoop):
 
     def __init__(self):
         super().__init__()
-        self.outputs = []
+        self.outputs: List[EPOCH_OUTPUT] = []
         self.epoch_loop = EvaluationEpochLoop()
 
         self._results = ResultCollection(training=False)
@@ -112,8 +111,7 @@ def advance(self, *args: Any, **kwargs: Any) -> None:
         )
 
         # store batch level output per dataloader
-        if self.should_track_batch_outputs_for_epoch_end:
-            self.outputs.append(dl_outputs)
+        self.outputs.append(dl_outputs)
 
         if not self.trainer.sanity_checking:
             # indicate the loop has run
@@ -174,8 +172,6 @@ def reload_evaluation_dataloaders(self) -> None:
 
     def on_evaluation_start(self, *args: Any, **kwargs: Any) -> None:
         """Runs ``on_{validation/test}_start`` hooks"""
-        self.should_track_batch_outputs_for_epoch_end: bool = self._should_track_batch_outputs_for_epoch_end()
-
         assert self._results is not None
         self._results.to(device=self.trainer.lightning_module.device)
 
@@ -224,13 +220,6 @@ def on_evaluation_epoch_start(self, *args: Any, **kwargs: Any) -> None:
         else:
             self.trainer.call_hook("on_validation_epoch_start", *args, **kwargs)
 
-    def _should_track_batch_outputs_for_epoch_end(self) -> bool:
-        """Whether the batch outputs should be stored for later usage"""
-        model = self.trainer.lightning_module
-        if self.trainer.testing:
-            return is_overridden("test_epoch_end", model)
-        return is_overridden("validation_epoch_end", model)
-
     def evaluation_epoch_end(self, outputs: EPOCH_OUTPUT) -> None:
         """Runs ``{validation/test}_epoch_end``"""
         # inform logger the batch loop has finished
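
For orientation, here is a schematic sketch (simplified stand-in classes, not the real `EvaluationLoop`/`EvaluationEpochLoop`) of how the responsibility is split after this change: the dataloader-level loop appends whatever the epoch loop hands back unconditionally, while the epoch loop now owns the decision of whether per-batch outputs are kept at all.

```python
from typing import Any, List


class EpochLoopSketch:
    def __init__(self, epoch_end_overridden: bool) -> None:
        self.epoch_end_overridden = epoch_end_overridden

    def run(self, batches: List[Any]) -> List[Any]:
        outputs: List[Any] = []
        for batch in batches:
            step_output = {"loss": batch}      # stand-in for the *_step result
            if self.epoch_end_overridden:      # the gating now lives in the epoch loop
                outputs.append(step_output)
        return outputs                         # [] when the hook is not overridden


class DataLoaderLoopSketch:
    def __init__(self, epoch_loop: EpochLoopSketch) -> None:
        self.epoch_loop = epoch_loop
        self.outputs: List[List[Any]] = []

    def advance(self, dataloader: List[Any]) -> None:
        dl_outputs = self.epoch_loop.run(dataloader)
        self.outputs.append(dl_outputs)        # unconditional append, as in the diff above


loop = DataLoaderLoopSketch(EpochLoopSketch(epoch_end_overridden=False))
loop.advance([1, 2, 3])
assert loop.outputs == [[]]                    # nothing accumulates when the hook is absent
```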

pytorch_lightning/loops/epoch/evaluation_epoch_loop.py

Lines changed: 21 additions & 22 deletions
@@ -13,17 +13,18 @@
 # limitations under the License.
 
 from collections import OrderedDict
-from typing import Any, Dict, Iterator, List, Optional, Union
+from functools import lru_cache
+from typing import Any, Dict, Iterator, Optional, Union
 
 from deprecate import void
-from torch import Tensor
 
 from pytorch_lightning.loops.base import Loop
 from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection
 from pytorch_lightning.trainer.progress import Progress
 from pytorch_lightning.trainer.supporters import PredictionCollection
 from pytorch_lightning.utilities.memory import recursive_detach
-from pytorch_lightning.utilities.types import STEP_OUTPUT
+from pytorch_lightning.utilities.model_helpers import is_overridden
+from pytorch_lightning.utilities.types import EPOCH_OUTPUT, STEP_OUTPUT
 
 
 class EvaluationEpochLoop(Loop):
@@ -38,7 +39,7 @@ def __init__(self) -> None:
         self.dataloader: Optional[Iterator] = None
         self._dl_max_batches: Optional[int] = None
         self._num_dataloaders: Optional[int] = None
-        self.outputs: List[STEP_OUTPUT] = []
+        self.outputs: EPOCH_OUTPUT = []
         self.batch_progress = Progress()
 
     @property
@@ -121,9 +122,12 @@ def advance(
         self.trainer.logger_connector.update_eval_step_metrics()
 
         # track epoch level outputs
-        self.outputs = self._track_output_for_epoch_end(self.outputs, output)
+        if self._should_track_batch_outputs_for_epoch_end():
+            output = recursive_detach(output, to_cpu=self.trainer.move_metrics_to_cpu)
+            if output is not None:
+                self.outputs.append(output)
 
-    def on_run_end(self) -> List[STEP_OUTPUT]:
+    def on_run_end(self) -> EPOCH_OUTPUT:
         """Returns the outputs of the whole run"""
         outputs = self.outputs
         # free memory
@@ -239,19 +243,14 @@ def _build_kwargs(self, batch: Any, batch_idx: int, dataloader_idx: int) -> Dict
 
         return step_kwargs
 
-    def _track_output_for_epoch_end(
-        self,
-        outputs: List[Union[ResultCollection, Dict, Tensor]],
-        output: Optional[Union[ResultCollection, Dict, Tensor]],
-    ) -> List[Union[ResultCollection, Dict, Tensor]]:
-        if output is not None:
-            if isinstance(output, ResultCollection):
-                output = output.detach()
-                if self.trainer.move_metrics_to_cpu:
-                    output = output.cpu()
-            elif isinstance(output, dict):
-                output = recursive_detach(output, to_cpu=self.trainer.move_metrics_to_cpu)
-            elif isinstance(output, Tensor) and output.is_cuda and self.trainer.move_metrics_to_cpu:
-                output = output.cpu()
-            outputs.append(output)
-        return outputs
+    @lru_cache(1)
+    def _should_track_batch_outputs_for_epoch_end(self) -> bool:
+        """Whether the batch outputs should be stored for later usage"""
+        model = self.trainer.lightning_module
+        if self.trainer.testing:
+            return is_overridden("test_epoch_end", model)
+        return is_overridden("validation_epoch_end", model)
+
+    def teardown(self) -> None:
+        # in case the model changes
+        self._should_track_batch_outputs_for_epoch_end.cache_clear()
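
The `@lru_cache(1)` + `teardown()` pairing above means the `is_overridden` check runs at most once per loop instance and can be reset when the attached model changes. A minimal, self-contained sketch of that caching pattern (illustrative names, not Lightning code):

```python
from functools import lru_cache


class LoopWithCachedCheck:
    def __init__(self, model_has_epoch_end: bool) -> None:
        self.model_has_epoch_end = model_has_epoch_end
        self.calls = 0

    @lru_cache(1)
    def _should_track_batch_outputs_for_epoch_end(self) -> bool:
        # the body only runs on a cache miss
        self.calls += 1
        return self.model_has_epoch_end

    def teardown(self) -> None:
        # in case the model changes, drop the cached answer
        self._should_track_batch_outputs_for_epoch_end.cache_clear()


loop = LoopWithCachedCheck(model_has_epoch_end=False)
assert loop._should_track_batch_outputs_for_epoch_end() is False
assert loop._should_track_batch_outputs_for_epoch_end() is False
assert loop.calls == 1  # second call was served from the cache

loop.model_has_epoch_end = True
loop.teardown()  # stale cached value is cleared
assert loop._should_track_batch_outputs_for_epoch_end() is True
```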

pytorch_lightning/trainer/connectors/logger_connector/result.py

Lines changed: 2 additions & 6 deletions
@@ -17,7 +17,6 @@
 from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
 
 import torch
-from torch.functional import Tensor
 from torchmetrics import Metric
 
 from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin
@@ -26,6 +25,7 @@
 from pytorch_lightning.utilities.data import extract_batch_size
 from pytorch_lightning.utilities.enums import LightningEnum
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.memory import recursive_detach
 from pytorch_lightning.utilities.metrics import metrics_to_scalars
 from pytorch_lightning.utilities.warnings import WarningCache
 
@@ -436,11 +436,7 @@ def log(
         """See :meth:`~pytorch_lightning.core.lightning.LightningModule.log`"""
         # no metrics should be logged with graphs
         if not enable_graph:
-
-            def detach_fn(tensor: Tensor) -> Tensor:
-                return tensor.detach()
-
-            value = apply_to_collection(value, Tensor, detach_fn)
+            value = recursive_detach(value)
 
         # move metrics to cpu on TPU.
         if isinstance(value, torch.Tensor) and value.device.type == "xla":
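
The `log()` change above swaps a locally defined `detach_fn` + `apply_to_collection` call for the shared `recursive_detach` utility. A small sketch of why the two are interchangeable for this purpose (`recursive_detach` defaults to `to_cpu=False`, so devices are untouched):

```python
import torch

from pytorch_lightning.utilities.apply_func import apply_to_collection
from pytorch_lightning.utilities.memory import recursive_detach

value = {"loss": torch.tensor(2.0, requires_grad=True) * 3}  # tensor with a grad_fn

# old inline form: detach every tensor in the (possibly nested) collection
old = apply_to_collection(value, torch.Tensor, lambda t: t.detach())
# new form: the shared utility performs the same walk
new = recursive_detach(value)

assert old["loss"].grad_fn is None and new["loss"].grad_fn is None
assert torch.equal(old["loss"], new["loss"])
```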

pytorch_lightning/utilities/memory.py

Lines changed: 15 additions & 14 deletions
@@ -13,11 +13,14 @@
 # limitations under the License.
 
 import gc
+from typing import Any
 
 import torch
 
+from pytorch_lightning.utilities.apply_func import apply_to_collection
 
-def recursive_detach(in_dict: dict, to_cpu: bool = False) -> dict:
+
+def recursive_detach(in_dict: Any, to_cpu: bool = False) -> Any:
     """Detach all tensors in `in_dict`.
 
     May operate recursively if some of the values in `in_dict` are dictionaries
@@ -31,19 +34,17 @@ def recursive_detach(in_dict: dict, to_cpu: bool = False) -> dict:
     Return:
         out_dict: Dictionary with detached tensors
     """
-    out_dict = {}
-    for k, v in in_dict.items():
-        if isinstance(v, dict):
-            v = recursive_detach(v, to_cpu=to_cpu)
-        elif callable(getattr(v, "detach", None)):
-            v = v.detach()
-            if to_cpu:
-                v = v.cpu()
-        out_dict[k] = v
-    return out_dict
-
-
-def is_oom_error(exception):
+
+    def detach_and_move(t: torch.Tensor, to_cpu: bool) -> torch.Tensor:
+        t = t.detach()
+        if to_cpu:
+            t = t.cpu()
+        return t
+
+    return apply_to_collection(in_dict, torch.Tensor, detach_and_move, to_cpu=to_cpu)
+
+
+def is_oom_error(exception: BaseException) -> bool:
     return is_cuda_out_of_memory(exception) or is_cudnn_snafu(exception) or is_out_of_cpu_memory(exception)
 
 
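
Because the rewritten `recursive_detach` is now built on `apply_to_collection`, it accepts any nested collection of tensors rather than only dictionaries. A rough usage sketch under that assumption (plain CPU tensors, so `to_cpu=True` is effectively a no-op here):

```python
import torch

from pytorch_lightning.utilities.memory import recursive_detach

outputs = {
    "loss": torch.tensor(1.0, requires_grad=True) * 2,                        # tensor with a grad_fn
    "extras": [torch.ones(2, requires_grad=True) * 2, {"acc": torch.tensor(0.9)}],
    "count": 3,                                                                # non-tensors pass through untouched
}

detached = recursive_detach(outputs, to_cpu=True)

assert detached["loss"].grad_fn is None       # graph reference dropped
assert detached["extras"][0].grad_fn is None  # lists and nested dicts are handled too
assert detached["count"] == 3
```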

tests/trainer/loops/test_evaluation_loop.py

Lines changed: 29 additions & 1 deletion
@@ -16,7 +16,8 @@
 import torch
 from torch.utils.data import DataLoader
 
-from pytorch_lightning import Trainer
+from pytorch_lightning import LightningModule, Trainer
+from pytorch_lightning.loops import EvaluationEpochLoop
 from tests.helpers.boring_model import BoringModel, RandomDataset
 from tests.helpers.runif import RunIf
 
@@ -101,3 +102,30 @@ def validation_step(self, batch, batch_idx):
     torch.cuda.empty_cache()
     trainer = Trainer(gpus=1, default_root_dir=tmpdir, fast_dev_run=2, move_metrics_to_cpu=True, weights_summary=None)
     trainer.fit(BoringLargeBatchModel())
+
+
+def test_evaluation_loop_doesnt_store_outputs_if_epoch_end_not_overridden(tmpdir):
+    did_assert = False
+
+    class TestModel(BoringModel):
+        def on_test_batch_end(self, outputs, *_):
+            # check `test_step` returns something
+            assert outputs is not None
+
+    class TestLoop(EvaluationEpochLoop):
+        def on_advance_end(self):
+            # should be empty
+            assert not self.outputs
+            # sanity check
+            nonlocal did_assert
+            did_assert = True
+            super().on_advance_end()
+
+    model = TestModel()
+    # make sure this hook is not overridden
+    model.test_epoch_end = LightningModule.test_epoch_end
+
+    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=3)
+    trainer.test_loop.connect(TestLoop())
+    trainer.test(model)
+    assert did_assert
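
As a complementary sketch (not part of this commit), the opposite case still behaves as before: when `test_epoch_end` is overridden, the epoch loop keeps the per-batch outputs and passes them to the hook.

```python
from pytorch_lightning import Trainer
from tests.helpers.boring_model import BoringModel


class TrackingModel(BoringModel):
    def test_epoch_end(self, outputs):
        # the hook is overridden, so one entry per test batch is retained
        assert len(outputs) > 0


def test_outputs_are_kept_when_epoch_end_overridden(tmpdir):
    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=3)
    trainer.test(TrackingModel())
```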
