Commit 17ebb0c

Apply #11556
1 parent 9228953 commit 17ebb0c

3 files changed: +52 / -43 lines changed

pytorch_lightning/loops/epoch/training_epoch_loop.py

Lines changed: 11 additions & 0 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import math
 from collections import defaultdict
 from typing import Any, Dict, Generator, Iterator, List, Optional, overload, Tuple, Union


@@ -129,6 +130,16 @@ def reset(self) -> None:
             self.batch_progress.reset_on_restart()
             self.scheduler_progress.reset_on_restart()
             self.batch_loop.optimizer_loop.optim_progress.reset_on_restart()
+
+            trainer = self.trainer
+            if not trainer.state._fault_tolerant_mode.is_enabled and trainer.num_training_batches != float("inf"):
+                expected_steps = math.ceil(trainer.num_training_batches / trainer.accumulate_grad_batches)
+                if self.global_step % expected_steps != 0:
+                    rank_zero_warn(
+                        "You're resuming from a checkpoint that ended before the epoch ended. This can cause unreliable"
+                        " results if further training is done. Consider using an end-of-epoch checkpoint or enabling"
+                        " fault-tolerant training."
+                    )
         else:
             self.batch_progress.reset_on_run()
             self.scheduler_progress.reset_on_run()
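
The check added to reset() is plain modular arithmetic: an epoch is expected to take ceil(num_training_batches / accumulate_grad_batches) optimizer steps, so a restored global_step that is not a multiple of that count implies the checkpoint was written mid-epoch. Below is a standalone sketch of that arithmetic with assumed numbers; none of the values come from the commit, and it is an illustration rather than Lightning code.

# Standalone sketch of the mid-epoch check (assumed values, not taken from the commit).
import math

num_training_batches = 8     # stands in for trainer.num_training_batches
accumulate_grad_batches = 2  # stands in for trainer.accumulate_grad_batches
expected_steps = math.ceil(num_training_batches / accumulate_grad_batches)  # 4 optimizer steps per epoch

for global_step in (2, 4):
    mid_epoch = global_step % expected_steps != 0
    print(global_step, "-> warn (mid-epoch)" if mid_epoch else "-> no warning (epoch boundary)")

The new condition skips the check when fault-tolerant mode is enabled and when trainer.num_training_batches is infinite, since expected_steps cannot be computed from an unknown epoch length.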

pytorch_lightning/loops/fit_loop.py

Lines changed: 4 additions & 21 deletions
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
-import math
 import os
 from functools import partial
-from typing import Optional, Type
+from typing import Optional
+from typing import Type

 import pytorch_lightning as pl
 from pytorch_lightning.accelerators import GPUAccelerator

@@ -26,7 +26,6 @@
 from pytorch_lightning.trainer.connectors.logger_connector.result import _ResultCollection
 from pytorch_lightning.trainer.progress import Progress
 from pytorch_lightning.trainer.supporters import TensorRunningAccum
-from pytorch_lightning.utilities.enums import _FaultTolerantMode
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.fetching import (
     AbstractDataFetcher,

@@ -35,7 +34,8 @@
     InterBatchParallelDataFetcher,
 )
 from pytorch_lightning.utilities.model_helpers import is_overridden
-from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn
+from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation
+from pytorch_lightning.utilities.rank_zero import rank_zero_warn
 from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature

 log = logging.getLogger(__name__)

@@ -205,23 +205,6 @@ def on_run_start(self) -> None: # type: ignore[override]
         data_fetcher_cls = _select_data_fetcher(self.trainer)
         self._data_fetcher = data_fetcher_cls()

-        ft_enabled = _FaultTolerantMode.detect_current_mode().is_enabled
-        if not ft_enabled and self.restarting and self.trainer.num_training_batches not in (0, float("inf")):
-            self.trainer.accumulate_grad_batches = self.trainer.accumulation_scheduler.get_accumulate_grad_batches(
-                self.trainer.current_epoch
-            )
-            expected_steps = math.ceil(self.trainer.num_training_batches / self.trainer.accumulate_grad_batches)
-
-            # global_step is incremented during checkpointing (#11555)
-            if (self.trainer.global_step - 1) % expected_steps != 0:
-                rank_zero_warn(
-                    "You're resuming from a checkpoint that ended mid-epoch."
-                    " Training will start from the beginning of the next epoch."
-                    " This can cause unreliable results if further training is done,"
-                    " consider using an end of epoch checkpoint or use fault-tolerant training"
-                    " to restart as if training did not stop."
-                )
-
         self._is_fresh_start_epoch = True
         self._results.to(device=self.trainer.lightning_module.device)

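
Compared with its replacement in TrainingEpochLoop.reset, the block removed here differed in two ways: it re-read accumulate_grad_batches from the accumulation scheduler for the current epoch, and it compared against global_step - 1 because the counter had already been incremented while the checkpoint was written (#11555). The sketch below lays the two condition shapes side by side with assumed numbers; it illustrates the off-by-one handling only and makes no claim about either loop's internal state.

# Assumed numbers; this only contrasts the shape of the old and new conditions.
import math

num_training_batches = 8
accumulate_grad_batches = 1
expected_steps = math.ceil(num_training_batches / accumulate_grad_batches)  # 8 steps per epoch

# Old location (FitLoop.on_run_start): an end-of-epoch checkpoint carried an already
# incremented counter (#11555), hence the explicit "- 1".
global_step_in_fit_loop = 9   # assumed
print((global_step_in_fit_loop - 1) % expected_steps != 0)  # False -> no warning

# New location (TrainingEpochLoop.reset): the restored counter is compared directly.
global_step_in_epoch_loop = 8  # assumed
print(global_step_in_epoch_loop % expected_steps != 0)      # False -> no warning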

tests/models/test_restore.py

Lines changed: 37 additions & 22 deletions
@@ -33,7 +33,7 @@
 from tests.helpers.datamodules import ClassifDataModule
 from tests.helpers.runif import RunIf
 from tests.helpers.simple_models import ClassificationModel
-from tests.helpers.utils import no_warning_call
+from tests.loops.test_loops import CustomException


 class ModelTrainerPropertyParity(Callback):

@@ -774,44 +774,59 @@ def test_model_pickle(tmpdir):
     cloudpickle.dumps(model)


-@pytest.mark.parametrize("stop_batch_idx", [4, 7])
-def test_restarting_mid_epoch_raises_warning(tmpdir, stop_batch_idx):
-    """Test that a warning is raised if training is restarted from mid-epoch."""
+class ExceptionModel(BoringModel):
+    def __init__(self, stop_batch_idx):
+        super().__init__()
+        self.stop_batch_idx = stop_batch_idx

-    class CustomModel(BoringModel):
-        def __init__(self, stop_batch_idx):
-            super().__init__()
-            self.stop_batch_idx = stop_batch_idx
+    def training_step(self, batch, batch_idx):
+        if batch_idx == self.stop_batch_idx:
+            raise CustomException()
+        return super().training_step(batch, batch_idx)

-        def training_step(self, batch, batch_idx):
-            if (batch_idx + 1) == self.stop_batch_idx:
-                self.trainer.should_stop = True

-            return super().training_step(batch, batch_idx)
+class ShouldStopModel(ExceptionModel):
+    def training_step(self, batch, batch_idx):
+        if batch_idx == self.stop_batch_idx:
+            # setting should_stop is treated differently to raising an exception.
+            # checking both tests that this warning is raised in the correct loop
+            self.trainer.should_stop = True
+        return super().training_step(batch, batch_idx)

-    limit_train_batches = 7
+
+@pytest.mark.parametrize("stop_in_the_middle", (True, False))
+@pytest.mark.parametrize("model_cls", (ExceptionModel, ShouldStopModel))
+def test_restarting_mid_epoch_raises_warning(tmpdir, stop_in_the_middle, model_cls):
+    """Test that a warning is raised if training is restarted from mid-epoch."""
+    limit_train_batches = 8
     trainer_kwargs = {
         "default_root_dir": tmpdir,
         "limit_train_batches": limit_train_batches,
+        "limit_val_batches": 0,
         "enable_progress_bar": False,
         "enable_model_summary": False,
     }
     trainer = Trainer(max_epochs=1, **trainer_kwargs)
-    model = CustomModel(stop_batch_idx)
-    trainer.fit(model)
+    model = model_cls(limit_train_batches // 2 if stop_in_the_middle else -1)
+
+    if stop_in_the_middle:
+        with pytest.raises(CustomException):
+            trainer.fit(model)
+    else:
+        trainer.fit(model)

     ckpt_path = str(tmpdir / "resume.ckpt")
     trainer.save_checkpoint(ckpt_path)

-    trainer = Trainer(max_epochs=2, limit_val_batches=0, **trainer_kwargs)
+    trainer = Trainer(max_epochs=2, **trainer_kwargs)
+    model.stop_batch_idx = -1

-    warning_raised = limit_train_batches != stop_batch_idx
-    context_manager = pytest.warns if warning_raised else no_warning_call
-    with context_manager(UserWarning, match="resuming from a checkpoint that ended mid-epoch"):
+    context_manager = pytest.warns if stop_in_the_middle else tutils.no_warning_call
+    with context_manager(UserWarning, match="resuming from a checkpoint that ended"):
         trainer.fit(model, ckpt_path=ckpt_path)

-    if warning_raised:
+    if stop_in_the_middle:
         with mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}):
-            trainer = Trainer(max_epochs=2, limit_val_batches=0, **trainer_kwargs)
-            with no_warning_call(UserWarning, match="resuming from a checkpoint that ended mid-epoch"):
+            trainer = Trainer(max_epochs=2, **trainer_kwargs)
+            with tutils.no_warning_call(UserWarning, match="resuming from a checkpoint that ended"):
                 trainer.fit(model, ckpt_path=ckpt_path)
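
The last block of the updated test covers the remedy the warning points to: when fault-tolerant training is enabled through the PL_FAULT_TOLERANT_TRAINING environment variable, the new check in TrainingEpochLoop.reset is skipped and no warning is emitted on resume. A minimal resume sketch along those lines follows; TinyModel and resume.ckpt are placeholders, not part of the commit, and the checkpoint is assumed to exist from an earlier interrupted run.

# Hypothetical resume script; the env var should be set before the Trainer is created.
import os

os.environ["PL_FAULT_TOLERANT_TRAINING"] = "1"

import torch
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning import LightningModule, Trainer


class TinyModel(LightningModule):
    """Placeholder model used only for this sketch."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)


train_data = DataLoader(TensorDataset(torch.randn(32, 4), torch.randn(32, 1)), batch_size=4)
trainer = Trainer(max_epochs=2, enable_progress_bar=False)
# resume.ckpt is a placeholder path to a checkpoint saved by a previous, interrupted run
trainer.fit(TinyModel(), train_dataloaders=train_data, ckpt_path="resume.ckpt")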
