Commit 6b31973

Merge branch 'master' into bugfix/batch-device

2 parents: 3998873 + 8193bae

30 files changed: +409 −161 lines

.azure-pipelines/ipu-tests.yml

Lines changed: 2 additions & 1 deletion
@@ -81,7 +81,7 @@ jobs:
   - bash: |
       source ${{ variables.poplar_sdk }}/poplar-ubuntu*/enable.sh
       source ${{ variables.poplar_sdk }}/popart-ubuntu*/enable.sh
-
+      export POPTORCH_WAIT_FOR_IPU=1
       python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
     env:
       MKL_THREADING_LAYER: "GNU"
@@ -90,6 +90,7 @@ jobs:
   - bash: |
       source ${{ variables.poplar_sdk }}/poplar-ubuntu*/enable.sh
       source ${{ variables.poplar_sdk }}/popart-ubuntu*/enable.sh
+      export POPTORCH_WAIT_FOR_IPU=1
       bash tests/special_tests.sh
     env:
       MKL_THREADING_LAYER: "GNU"

CHANGELOG.md

Lines changed: 15 additions & 10 deletions
@@ -127,6 +127,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `max_depth` parameter in `ModelSummary` ([#8062](https://github.com/PyTorchLightning/pytorch-lightning/pull/8062))


+- Added `restore` function and `restarting` attribute to base `Loop` ([#8247](https://github.com/PyTorchLightning/pytorch-lightning/pull/8247))
+
+
 ### Changed


@@ -167,6 +170,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 * Refactored trainer `_run_*` functions and separate evaluation loops ([#8065](https://github.com/PyTorchLightning/pytorch-lightning/pull/8065))
 * Refactored prediction loop interface; added new classes `PredictionLoop`, `PredictionEpochLoop` ([#7700](https://github.com/PyTorchLightning/pytorch-lightning/pull/7700), [#8077](https://github.com/PyTorchLightning/pytorch-lightning/pull/8077))
 * Removed `pytorch_lightning/trainer/predict_loop.py` ([#8094](https://github.com/PyTorchLightning/pytorch-lightning/pull/8094))
+* Moved result teardown to the loops ([#8245](https://github.com/PyTorchLightning/pytorch-lightning/pull/8245))


 - Refactored logging
@@ -341,6 +345,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed a bug where using `precision=64` would cause buffers with complex dtype to be cast to real ([#8208](https://github.com/PyTorchLightning/pytorch-lightning/pull/8208))


+- Fixed a bug where `truncated_bptt_steps` would throw an AttributeError when the target RNN has multiple hidden states ([#8145](https://github.com/PyTorchLightning/pytorch-lightning/pull/8145))
+
+
+- Fixes access to `callback_metrics` in ddp_spawn ([#7916](https://github.com/PyTorchLightning/pytorch-lightning/pull/7916))
+
+
+- Fixed moving batch to device before sending it to the `on_*_batch_start`/`on_*_batch_end` callbacks and model hooks ([#7378](https://github.com/PyTorchLightning/pytorch-lightning/pull/7378))
+
+
+- Fixed passing a custom `DDPPlugin` when choosing `accelerator="ddp_cpu"` for the accelerator ([#6208](https://github.com/PyTorchLightning/pytorch-lightning/pull/6208))
+

 ## [1.3.8] - 2021-07-01

@@ -357,16 +372,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed SWA to also work with `IterableDataset` ([#8172](https://github.com/PyTorchLightning/pytorch-lightning/pull/8172))


-
-- Fixed a bug where `truncated_bptt_steps` would throw an AttributeError when the target RNN has multiple hidden states ([#8145](https://github.com/PyTorchLightning/pytorch-lightning/pull/8145))
-
-
-- Fixes access to `callback_metrics` in ddp_spawn ([#7916](https://github.com/PyTorchLightning/pytorch-lightning/pull/7916))
-
-
-- Fixed moving batch to device before sending it to the `on_*_batch_start`/`on_*_batch_end` callbacks and model hooks ([#7378](https://github.com/PyTorchLightning/pytorch-lightning/pull/7378))
-
-
 ## [1.3.7] - 2021-06-22

 ### Fixed

dockers/nvidia/Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -13,7 +13,7 @@
 # limitations under the License.

 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes
-FROM nvcr.io/nvidia/pytorch:21.05-py3
+FROM nvcr.io/nvidia/pytorch:21.06-py3

 LABEL maintainer="PyTorchLightning <https://github.com/PyTorchLightning>"

@@ -39,7 +39,7 @@ RUN \

     # Installations
     python -c "fname = './pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \
-    pip install "Pillow>=8.2" "cryptography>=3.4" "py>=1.10" --no-cache-dir --upgrade-strategy only-if-needed && \
+    pip install "Pillow>=8.2, !=8.3.0" "cryptography>=3.4" "py>=1.10" --no-cache-dir --upgrade-strategy only-if-needed && \
     pip install -r ./pytorch-lightning/requirements/extra.txt --no-cache-dir --upgrade-strategy only-if-needed && \
     pip install -r ./pytorch-lightning/requirements/examples.txt --no-cache-dir --upgrade-strategy only-if-needed && \
     pip install ./pytorch-lightning --no-cache-dir && \

pytorch_lightning/callbacks/model_checkpoint.py

Lines changed: 2 additions & 2 deletions
@@ -102,7 +102,7 @@ class ModelCheckpoint(Callback):
             saved (``model.save_weights(filepath)``), else the full model
             is saved (``model.save(filepath)``).
         every_n_train_steps: Number of training steps between checkpoints.
-            If ``every_n_train_steps == None or every_n_train_steps == 0``, we skip saving during training
+            If ``every_n_train_steps == None or every_n_train_steps == 0``, we skip saving during training.
             To disable, set ``every_n_train_steps = 0``. This value must be ``None`` or non-negative.
             This must be mutually exclusive with ``train_time_interval`` and ``every_n_val_epochs``.
         train_time_interval: Checkpoints are monitored at the specified time interval.
@@ -111,7 +111,7 @@ class ModelCheckpoint(Callback):
             guaranteed to execute at the exact time specified, but should be close.
             This must be mutually exclusive with ``every_n_train_steps`` and ``every_n_val_epochs``.
         every_n_val_epochs: Number of validation epochs between checkpoints.
-            If ``every_n_val_epochs == None or every_n_val_epochs == 0``, we skip saving on validation end
+            If ``every_n_val_epochs == None or every_n_val_epochs == 0``, we skip saving on validation end.
             To disable, set ``every_n_val_epochs = 0``. This value must be ``None`` or non-negative.
             This must be mutually exclusive with ``every_n_train_steps`` and ``train_time_interval``.
         Setting both ``ModelCheckpoint(..., every_n_val_epochs=V)`` and
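As a usage illustration of the triggers documented above (a minimal sketch; the `dirpath` value is an arbitrary example, not from this commit):

```python
from pytorch_lightning.callbacks import ModelCheckpoint

# Save every 500 optimizer steps. `every_n_train_steps`,
# `train_time_interval` and `every_n_val_epochs` are mutually exclusive,
# so only one is set here; 0 or None disables step-based checkpointing.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints/",
    every_n_train_steps=500,
)
```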

pytorch_lightning/core/lightning.py

Lines changed: 1 addition & 1 deletion
@@ -543,7 +543,7 @@ def write_prediction(
             ' and will be removed in v1.5.'
         )

-        self.trainer.evaluation_loop.predictions._add_prediction(name, value, filename)
+        self.trainer._evaluation_loop.predictions._add_prediction(name, value, filename)

    def write_prediction_dict(self, predictions_dict: Dict[str, Any], filename: str = 'predictions.pt'):
        """

pytorch_lightning/loops/base.py

Lines changed: 19 additions & 3 deletions
@@ -46,6 +46,15 @@ class Loop(ABC):
     def __init__(self) -> None:
         self.iteration_count: int = 0
         self.trainer: Optional['pl.Trainer'] = None
+        self._restarting = False
+
+    @property
+    def restarting(self) -> bool:
+        return self._restarting
+
+    @restarting.setter
+    def restarting(self, restarting: bool) -> None:
+        self._restarting = restarting

     @property
     @abstractmethod
@@ -87,7 +96,12 @@ def run(self, *args: Any, **kwargs: Any) -> Optional[Any]:
         if self.skip:
             return self.on_skip()

-        self.reset()
+        if self.restarting:
+            self.restore()
+            self.restarting = False
+        else:
+            self.reset()
+
         self.on_run_start(*args, **kwargs)

         while not self.done:
@@ -100,9 +114,11 @@ def run(self, *args: Any, **kwargs: Any) -> Optional[Any]:
                 break

         output = self.on_run_end()
-        self.teardown()
         return output

+    def restore(self) -> None:
+        """Restore the internal state of the loop at the beginning of :attr:`run` if ``restarting`` is ``True``."""
+
     @abstractmethod
     def reset(self) -> None:
         """Resets the internal state of the loop at the beginning of each call to :attr:`run`."""
@@ -132,7 +148,7 @@ def on_run_end(self) -> Any:
         """Hook to be called at the end of the run. Its return argument is returned from :attr:`run`."""

     def teardown(self) -> None:
-        """The very last method called inside :meth:`run`. Use to release memory etc."""
+        """Use to release memory etc."""

     def load_state_dict(self, state_dict: Dict) -> None:
         """Restore the loop state from the provided state_dict."""

pytorch_lightning/loops/batch/training_batch_loop.py

Lines changed: 7 additions & 5 deletions
@@ -29,6 +29,7 @@
 from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection
 from pytorch_lightning.trainer.supporters import TensorRunningAccum
 from pytorch_lightning.utilities import AMPType, AttributeDict, DeviceType, grad_norm
+from pytorch_lightning.utilities.apply_func import apply_to_collection
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.finite_checks import detect_nan_parameters
 from pytorch_lightning.utilities.imports import _TPU_AVAILABLE
@@ -47,7 +48,7 @@ def __init__(self) -> None:
         self.running_loss: TensorRunningAccum = TensorRunningAccum(window_length=20)
         self.batch_idx: int = 0
         self.split_idx: Optional[int] = None
-        self.warning_cache: WarningCache = WarningCache()
+        self._warning_cache: WarningCache = WarningCache()

         self._hiddens: Optional[Tensor] = None
         self._optimizer_freq_cumsum: Optional[int] = None
@@ -75,7 +76,7 @@ def run(self, batch: Any, batch_idx: int, dataloader_idx: int) -> AttributeDict:
             dataloader_idx: the index of the dataloader producing the current batch
         """
         if batch is None:
-            self.warning_cache.warn("train_dataloader yielded None. If this was on purpose, ignore this warning...")
+            self._warning_cache.warn("train_dataloader yielded None. If this was on purpose, ignore this warning...")
             return AttributeDict(signal=0, training_step_output=[[]])

         # hook
@@ -349,7 +350,8 @@ def _process_training_step_output(self, training_step_output: STEP_OUTPUT) -> Op
         if isinstance(training_step_output, dict):
             loss = training_step_output.pop("loss", None)
             hiddens = training_step_output.pop("hiddens", None)
-
+            # detach hiddens to avoid `RuntimeError: Trying to backward through the graph a second time`
+            hiddens = apply_to_collection(hiddens, Tensor, lambda t: t.detach())
             results.extra = training_step_output

         # handle scalar return
@@ -546,7 +548,7 @@ def training_step_and_backward(
                 self._check_finite(result.loss)

         else:
-            self.warning_cache.warn(
+            self._warning_cache.warn(
                 "training_step returned None. If this was on purpose, ignore this warning..."
             )

@@ -648,7 +650,7 @@ def _build_kwargs(self, batch: Any, batch_idx: int, opt_idx: int, hiddens: Optio
         has_opt_idx_in_train_step = is_param_in_hook_signature(training_step_fx, "optimizer_idx")
         if has_opt_idx_in_train_step:
             if not lightning_module.automatic_optimization:
-                self.warning_cache.deprecation(
+                self._warning_cache.deprecation(
                     "`training_step` hook signature has changed in v1.3."
                     " `optimizer_idx` argument has been removed in case of manual optimization. Support for"
                     " the old signature will be removed in v1.5"

pytorch_lightning/loops/dataloader/evaluation_loop.py

Lines changed: 9 additions & 8 deletions
@@ -33,9 +33,11 @@ def __init__(self):
         super().__init__()
         self._max_batches: Optional[Union[int, Sequence[int]]] = None
         self.outputs = []
+
         self.epoch_loop = EvaluationEpochLoop()
-        self._has_run: bool = False
+
         self._results = ResultCollection(training=False)
+        self._has_run: bool = False

     @property
     def num_dataloaders(self) -> int:
@@ -57,11 +59,6 @@ def dataloaders(self) -> Sequence[DataLoader]:
             return self.trainer.test_dataloaders
         return self.trainer.val_dataloaders

-    @property
-    def results(self) -> ResultCollection:
-        """Returns the current results"""
-        return self._results
-
     @property
     def predictions(self):
         """Returns the predictions from all dataloaders"""
@@ -184,8 +181,8 @@ def on_evaluation_start(self, *args: Any, **kwargs: Any) -> None:
         """Runs ``on_{validation/test}_start`` hooks"""
         self.should_track_batch_outputs_for_epoch_end: bool = self._should_track_batch_outputs_for_epoch_end()

-        assert self.results is not None
-        self.results.to(device=self.trainer.lightning_module.device)
+        assert self._results is not None
+        self._results.to(device=self.trainer.lightning_module.device)

         if self.trainer.testing:
             self.trainer.call_hook("on_test_start", *args, **kwargs)
@@ -266,3 +263,7 @@ def on_evaluation_epoch_end(self) -> None:
         self.trainer.call_hook(hook_name)
         self.trainer.call_hook("on_epoch_end")
         self.trainer.logger_connector.on_epoch_end()
+
+    def teardown(self) -> None:
+        self._results.cpu()
+        self.epoch_loop.teardown()
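The new `teardown` is the counterpart of `on_evaluation_start` above: results are moved to the model's device for the run and back to CPU afterwards, releasing accelerator memory. A rough sketch of the round trip (illustrative only; the device string is an assumption):

```python
from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection

results = ResultCollection(training=False)
results.to(device="cuda:0")  # on_evaluation_start: metrics live on the GPU
# ... evaluation batches update `results` here ...
results.cpu()                # teardown: move back to host, freeing GPU memory
```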

pytorch_lightning/loops/dataloader/prediction_loop.py

Lines changed: 4 additions & 1 deletion
@@ -16,9 +16,12 @@ class PredictionLoop(DataLoaderLoop):

     def __init__(self):
         super().__init__()
-        self.epoch_loop: PredictionEpochLoop = PredictionEpochLoop()
         self.predictions: Optional[List[List[Any]]] = None
         self.epoch_batch_indices: Optional[List[List[int]]] = None
+
+        self.epoch_loop: PredictionEpochLoop = PredictionEpochLoop()
+
+        self._results = None  # for `trainer._results` access
         self._return_predictions: bool = False

     @property

pytorch_lightning/loops/epoch/evaluation_epoch_loop.py

Lines changed: 3 additions & 4 deletions
@@ -122,11 +122,10 @@ def advance(

     def on_run_end(self) -> List[STEP_OUTPUT]:
         """Returns the outputs of the whole run"""
-        return self.outputs
-
-    def teardown(self) -> None:
-        """Frees memory of tracked outputs"""
+        outputs = self.outputs
+        # free memory
         self.outputs = []
+        return outputs

     def evaluation_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> Optional[STEP_OUTPUT]:
         """The evaluation step (validation_step or test_step depending on the trainer's state).
