Commit 9979454

Apply #11556
1 parent 3849964 commit 9979454

File tree

3 files changed: +49 -41 lines changed

pytorch_lightning/loops/epoch/training_epoch_loop.py

Lines changed: 12 additions & 0 deletions

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import math
 from collections import defaultdict
 from typing import Any, Dict, Generator, Iterator, List, Optional, overload, Tuple, Union

@@ -137,6 +138,17 @@ def reset(self) -> None:
         # seen per epoch, this is useful for tracking when validation is run multiple times per epoch
         self.val_loop.epoch_loop.batch_progress.total.reset()

+        ft_enabled = self.trainer.state._fault_tolerant_mode.is_enabled
+        if not ft_enabled and self.restarting and self.trainer.num_training_batches not in (0, float("inf")):
+            expected_steps = math.ceil(self.trainer.num_training_batches / self.trainer.accumulate_grad_batches)
+            if self.global_step % expected_steps != 0:
+                rank_zero_warn(
+                    "You're resuming from a checkpoint that ended mid-epoch."
+                    " This can cause unreliable results if further training is done,"
+                    " consider using an end of epoch checkpoint or use fault-tolerant training"
+                    " to restart as if training did not stop."
+                )
+
         self._outputs = []

     def on_run_start(self, data_fetcher: AbstractDataFetcher) -> None:  # type: ignore[override]

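For reference, the arithmetic behind the added check: with gradient accumulation, an epoch of num_training_batches batches produces ceil(num_training_batches / accumulate_grad_batches) optimizer steps, so a global_step that is not a multiple of that count means the checkpoint was written mid-epoch. A minimal standalone sketch of that logic (the helper name and the example numbers are illustrative, not part of this commit):

import math

def ended_mid_epoch(global_step: int, num_training_batches: int, accumulate_grad_batches: int) -> bool:
    # one optimizer step per `accumulate_grad_batches` batches, rounded up for a trailing partial group
    expected_steps = math.ceil(num_training_batches / accumulate_grad_batches)
    return global_step % expected_steps != 0

# 8 batches, no accumulation: 8 optimizer steps per epoch, so global_step=8 is an epoch boundary
assert not ended_mid_epoch(global_step=8, num_training_batches=8, accumulate_grad_batches=1)
# stopping after 4 of those 8 batches leaves global_step=4, which is not a multiple of 8
assert ended_mid_epoch(global_step=4, num_training_batches=8, accumulate_grad_batches=1)
# with accumulate_grad_batches=2 the same epoch only takes ceil(8 / 2) = 4 steps
assert not ended_mid_epoch(global_step=4, num_training_batches=8, accumulate_grad_batches=2)
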
pytorch_lightning/loops/fit_loop.py

Lines changed: 1 addition & 20 deletions

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import math
 from typing import Optional

 from pytorch_lightning.loops import Loop
@@ -22,10 +21,9 @@
 from pytorch_lightning.trainer.connectors.logger_connector.result import _ResultCollection
 from pytorch_lightning.trainer.progress import Progress
 from pytorch_lightning.trainer.supporters import TensorRunningAccum
-from pytorch_lightning.utilities.enums import _FaultTolerantMode
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.model_helpers import is_overridden
-from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn
+from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation

 log = logging.getLogger(__name__)

@@ -191,23 +189,6 @@ def on_run_start(self) -> None:  # type: ignore[override]
         # reset train dataloader and val dataloader
         self.trainer.reset_train_val_dataloaders(self.trainer.lightning_module)

-        ft_enabled = _FaultTolerantMode.detect_current_mode().is_enabled
-        if not ft_enabled and self.restarting and self.trainer.num_training_batches not in (0, float("inf")):
-            self.trainer.accumulate_grad_batches = self.trainer.accumulation_scheduler.get_accumulate_grad_batches(
-                self.trainer.current_epoch
-            )
-            expected_steps = math.ceil(self.trainer.num_training_batches / self.trainer.accumulate_grad_batches)
-
-            # global_step is incremented during checkpointing (#11555)
-            if (self.trainer.global_step - 1) % expected_steps != 0:
-                rank_zero_warn(
-                    "You're resuming from a checkpoint that ended mid-epoch."
-                    " Training will start from the beginning of the next epoch."
-                    " This can cause unreliable results if further training is done,"
-                    " consider using an end of epoch checkpoint or use fault-tolerant training"
-                    " to restart as if training did not stop."
-                )
-
         self._is_fresh_start_epoch = True
         self._results.to(device=self.trainer.lightning_module.device)
         self.trainer._call_callback_hooks("on_train_start")

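The warning text points to fault-tolerant training as the way to resume a mid-epoch checkpoint without losing state. A hedged usage sketch, assuming only what the test below exercises (the PL_FAULT_TOLERANT_TRAINING environment variable and fit(..., ckpt_path=...)); the function name, model and checkpoint path are placeholders:

import os

from pytorch_lightning import Trainer

def resume_fault_tolerant(model, ckpt_path: str) -> Trainer:
    # Set the flag before constructing the Trainer, mirroring the test below
    # (which patches it via mock.patch.dict before building a new Trainer).
    os.environ["PL_FAULT_TOLERANT_TRAINING"] = "1"
    trainer = Trainer(max_epochs=2)
    # With fault tolerance enabled, resuming mid-epoch does not emit the warning added above.
    trainer.fit(model, ckpt_path=ckpt_path)
    return trainer
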
tests/models/test_restore.py

Lines changed: 36 additions & 21 deletions

@@ -33,7 +33,7 @@
 from tests.helpers.datamodules import ClassifDataModule
 from tests.helpers.runif import RunIf
 from tests.helpers.simple_models import ClassificationModel
-from tests.helpers.utils import no_warning_call
+from tests.loops.test_loops import CustomException


 class ModelTrainerPropertyParity(Callback):
@@ -774,44 +774,59 @@ def test_model_pickle(tmpdir):
     cloudpickle.dumps(model)


-@pytest.mark.parametrize("stop_batch_idx", [4, 7])
-def test_restarting_mid_epoch_raises_warning(tmpdir, stop_batch_idx):
-    """Test that a warning is raised if training is restarted from mid-epoch."""
+class ExceptionModel(BoringModel):
+    def __init__(self, stop_batch_idx):
+        super().__init__()
+        self.stop_batch_idx = stop_batch_idx

-    class CustomModel(BoringModel):
-        def __init__(self, stop_batch_idx):
-            super().__init__()
-            self.stop_batch_idx = stop_batch_idx
+    def training_step(self, batch, batch_idx):
+        if batch_idx == self.stop_batch_idx:
+            raise CustomException()
+        return super().training_step(batch, batch_idx)

-        def training_step(self, batch, batch_idx):
-            if (batch_idx + 1) == self.stop_batch_idx:
-                self.trainer.should_stop = True

-            return super().training_step(batch, batch_idx)
+class ShouldStopModel(ExceptionModel):
+    def training_step(self, batch, batch_idx):
+        if batch_idx == self.stop_batch_idx:
+            # setting should_stop is treated differently to raising an exception.
+            # checking both tests that this warning is raised in the correct loop
+            self.trainer.should_stop = True
+        return super().training_step(batch, batch_idx)

-    limit_train_batches = 7
+
+@pytest.mark.parametrize("stop_in_the_middle", (True, False))
+@pytest.mark.parametrize("model_cls", (ExceptionModel, ShouldStopModel))
+def test_restarting_mid_epoch_raises_warning(tmpdir, stop_in_the_middle, model_cls):
+    """Test that a warning is raised if training is restarted from mid-epoch."""
+    limit_train_batches = 8
     trainer_kwargs = {
         "default_root_dir": tmpdir,
         "limit_train_batches": limit_train_batches,
+        "limit_val_batches": 0,
         "enable_progress_bar": False,
         "enable_model_summary": False,
     }
     trainer = Trainer(max_epochs=1, **trainer_kwargs)
-    model = CustomModel(stop_batch_idx)
-    trainer.fit(model)
+    model = model_cls(limit_train_batches // 2 if stop_in_the_middle else -1)
+
+    if stop_in_the_middle:
+        with pytest.raises(CustomException):
+            trainer.fit(model)
+    else:
+        trainer.fit(model)

     ckpt_path = str(tmpdir / "resume.ckpt")
     trainer.save_checkpoint(ckpt_path)

-    trainer = Trainer(max_epochs=2, limit_val_batches=0, **trainer_kwargs)
+    trainer = Trainer(max_epochs=2, **trainer_kwargs)
+    model.stop_batch_idx = -1

-    warning_raised = limit_train_batches != stop_batch_idx
-    context_manager = pytest.warns if warning_raised else no_warning_call
+    context_manager = pytest.warns if stop_in_the_middle else tutils.no_warning_call
     with context_manager(UserWarning, match="resuming from a checkpoint that ended mid-epoch"):
         trainer.fit(model, ckpt_path=ckpt_path)

-    if warning_raised:
+    if stop_in_the_middle:
         with mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}):
-            trainer = Trainer(max_epochs=2, limit_val_batches=0, **trainer_kwargs)
-            with no_warning_call(UserWarning, match="resuming from a checkpoint that ended mid-epoch"):
+            trainer = Trainer(max_epochs=2, **trainer_kwargs)
+            with tutils.no_warning_call(UserWarning, match="resuming from a checkpoint that ended mid-epoch"):
                 trainer.fit(model, ckpt_path=ckpt_path)

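Outside the test helpers, the same warning can be reproduced with a minimal LightningModule that stops itself mid-epoch, mirroring ShouldStopModel above. A condensed end-to-end sketch (TinyModel, the batch counts and the checkpoint path are illustrative, not part of this commit):

import torch
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning import LightningModule, Trainer

class TinyModel(LightningModule):
    def __init__(self, stop_batch_idx: int = -1):
        super().__init__()
        self.stop_batch_idx = stop_batch_idx
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        if batch_idx == self.stop_batch_idx:
            # ask the trainer to stop partway through the epoch
            self.trainer.should_stop = True
        (x,) = batch
        return self.layer(x).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

    def train_dataloader(self):
        # 64 samples with batch_size=8 -> 8 training batches per epoch
        return DataLoader(TensorDataset(torch.randn(64, 32)), batch_size=8)

trainer_kwargs = dict(limit_train_batches=8, enable_progress_bar=False, enable_model_summary=False)

model = TinyModel(stop_batch_idx=4)  # stop halfway through the first epoch
trainer = Trainer(max_epochs=1, **trainer_kwargs)
trainer.fit(model)
trainer.save_checkpoint("resume.ckpt")

model.stop_batch_idx = -1  # let the resumed run finish
# Resuming from the mid-epoch checkpoint emits the UserWarning added in this commit.
Trainer(max_epochs=2, **trainer_kwargs).fit(model, ckpt_path="resume.ckpt")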