
Commit 0cb64fb

rohitgr7, carmocca, and Borda authored
Fix mid-epoch warning call while resuming (#11556)
Co-authored-by: Carlos Mocholí <[email protected]>
Co-authored-by: Jirka <[email protected]>
1 parent d43fd0d commit 0cb64fb

File tree

5 files changed (+71, -17 lines changed)


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -497,6 +497,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Disbled sampler replacement when using `IterableDataset` ([#11507](https://github.com/PyTorchLightning/pytorch-lightning/pull/11507))
 
 
+- Fixed the mid-epoch warning call while resuming training ([#11556](https://github.com/PyTorchLightning/pytorch-lightning/pull/11556))
+
+
 - Fixed an issue in `RichProgressbar` to display the metrics logged only on main progress bar ([#11690](https://github.com/PyTorchLightning/pytorch-lightning/pull/11690))
 
 
pytorch_lightning/loops/fit_loop.py

Lines changed: 21 additions & 0 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import math
 from typing import Optional
 
 from pytorch_lightning.loops import Loop
@@ -22,8 +23,10 @@
 from pytorch_lightning.trainer.progress import Progress
 from pytorch_lightning.trainer.supporters import TensorRunningAccum
 from pytorch_lightning.utilities import rank_zero_deprecation
+from pytorch_lightning.utilities.enums import _FaultTolerantMode
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.model_helpers import is_overridden
+from pytorch_lightning.utilities.warnings import rank_zero_warn
 
 log = logging.getLogger(__name__)
 
@@ -181,6 +184,24 @@ def on_run_start(self) -> None:  # type: ignore[override]
         """Calls the ``on_train_start`` hook."""
         # reset train dataloader and val dataloader
         self.trainer.reset_train_val_dataloaders(self.trainer.lightning_module)
+
+        ft_enabled = _FaultTolerantMode.detect_current_mode().is_enabled
+        if not ft_enabled and self.restarting and self.trainer.num_training_batches not in (0, float("inf")):
+            self.trainer.accumulate_grad_batches = self.trainer.accumulation_scheduler.get_accumulate_grad_batches(
+                self.trainer.current_epoch
+            )
+            expected_steps = math.ceil(self.trainer.num_training_batches / self.trainer.accumulate_grad_batches)
+
+            # global_step is incremented during checkpointing (#11555)
+            if (self.trainer.global_step - 1) % expected_steps != 0:
+                rank_zero_warn(
+                    "You're resuming from a checkpoint that ended mid-epoch."
+                    " Training will start from the beginning of the next epoch."
+                    " This can cause unreliable results if further training is done,"
+                    " consider using an end of epoch checkpoint or use fault-tolerant training"
+                    " to restart as if training did not stop."
+                )
+
         self._is_fresh_start_epoch = True
         self._results.to(device=self.trainer.lightning_module.device)
         self.trainer._call_callback_hooks("on_train_start")

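For reference, the condition added to ``FitLoop.on_run_start`` boils down to a single modulo test on the restored ``global_step``. Below is a minimal standalone sketch of that check; the helper name and the example numbers (mirroring the test added in this commit) are illustrative, not part of the Lightning API.

import math


def ended_mid_epoch(global_step: int, num_training_batches: int, accumulate_grad_batches: int) -> bool:
    """Rough equivalent of the new check in FitLoop.on_run_start (illustrative helper)."""
    if num_training_batches in (0, float("inf")):
        # dataloader length unknown or empty: nothing meaningful to compare against
        return False
    expected_steps = math.ceil(num_training_batches / accumulate_grad_batches)
    # global_step is incremented once during checkpointing (#11555), hence the -1
    return (global_step - 1) % expected_steps != 0


# 7 batches per epoch, no gradient accumulation, global_step already bumped at save time:
print(ended_mid_epoch(5, num_training_batches=7, accumulate_grad_batches=1))  # True  -> mid-epoch, warn
print(ended_mid_epoch(8, num_training_batches=7, accumulate_grad_batches=1))  # False -> end-of-epoch checkpoint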
pytorch_lightning/trainer/connectors/checkpoint_connector.py

Lines changed: 1 addition & 16 deletions
@@ -24,7 +24,7 @@
 from pytorch_lightning.loops.utilities import _is_max_limit_reached
 from pytorch_lightning.plugins.environments import SLURMEnvironment
 from pytorch_lightning.trainer.states import TrainerFn
-from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, rank_zero_deprecation, rank_zero_info, rank_zero_warn
+from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, rank_zero_deprecation, rank_zero_info
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.imports import _fault_tolerant_training
@@ -248,21 +248,6 @@ def restore_loops(self) -> None:
                 f" but you have set Trainer(max_epochs={self.trainer.max_epochs})."
             )
 
-        # Division deals with global step stepping once per accumulated batch
-        # Inequality deals with different global step for odd vs even num_training_batches
-        self.trainer.accumulate_grad_batches = self.trainer.accumulation_scheduler.get_accumulate_grad_batches(
-            self.trainer.current_epoch
-        )
-        n_accum = 1 if self.trainer.accumulate_grad_batches is None else self.trainer.accumulate_grad_batches
-        expected_steps = self.trainer.num_training_batches / n_accum
-        if self.trainer.num_training_batches != 0 and self.trainer.global_step % expected_steps > 1:
-            rank_zero_warn(
-                "You're resuming from a checkpoint that ended mid-epoch."
-                " Training will start from the beginning of the next epoch."
-                " This can cause unreliable results if further training is done,"
-                " consider using an end of epoch checkpoint."
-            )
-
     def restore_optimizers_and_schedulers(self) -> None:
         """Restores the optimizers and learning rate scheduler states from the pre-loaded checkpoint."""
        if not self._loaded_checkpoint:

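One thing the relocated check guards against is an undetermined dataloader length. A quick sketch of why a bare modulo test is meaningless in that case; it assumes ``num_training_batches`` still holds the Trainer's initial ``float("inf")`` before the dataloaders are loaded, and the concrete values are illustrative:

# assumed initial state before reset_train_val_dataloaders() has run
num_training_batches = float("inf")
expected_steps = num_training_batches / 1             # old check: plain division
print(5 % expected_steps > 1)                         # True  -> the old inequality could fire spuriously
print(num_training_batches in (0, float("inf")))      # True  -> the relocated check bails out instead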
pytorch_lightning/utilities/cloud_io.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 
 
 def load(
-    path_or_url: Union[str, IO, Path],
+    path_or_url: Union[IO, _PATH],
     map_location: Optional[
         Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]]
     ] = None,

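The ``load`` signature change only widens the accepted path types. A small usage sketch, assuming ``_PATH`` covers plain strings and ``pathlib.Path`` objects (its definition lives elsewhere in Lightning):

from io import BytesIO

import torch

from pytorch_lightning.utilities.cloud_io import load

buffer = BytesIO()
torch.save({"epoch": 3}, buffer)
buffer.seek(0)
print(load(buffer))                 # file-like object (IO)
# print(load("path/to/last.ckpt"))  # str or pathlib.Path also accepted (file must exist)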
tests/models/test_restore.py

Lines changed: 45 additions & 0 deletions
@@ -17,6 +17,7 @@
 import pickle
 from copy import deepcopy
 from typing import Generic, Mapping, TypeVar
+from unittest import mock
 
 import cloudpickle
 import pytest
@@ -32,6 +33,7 @@
 from tests.helpers.datamodules import ClassifDataModule
 from tests.helpers.runif import RunIf
 from tests.helpers.simple_models import ClassificationModel
+from tests.helpers.utils import no_warning_call
 
 
 class ModelTrainerPropertyParity(Callback):
@@ -776,3 +778,46 @@ def test_model_pickle(tmpdir):
     model = BoringModel()
     pickle.dumps(model)
     cloudpickle.dumps(model)
+
+
+@pytest.mark.parametrize("stop_batch_idx", [4, 7])
+def test_restarting_mid_epoch_raises_warning(tmpdir, stop_batch_idx):
+    """Test that a warning is raised if training is restarted from mid-epoch."""
+
+    class CustomModel(BoringModel):
+        def __init__(self, stop_batch_idx):
+            super().__init__()
+            self.stop_batch_idx = stop_batch_idx
+
+        def training_step(self, batch, batch_idx):
+            if (batch_idx + 1) == self.stop_batch_idx:
+                self.trainer.should_stop = True
+
+            return super().training_step(batch, batch_idx)
+
+    limit_train_batches = 7
+    trainer_kwargs = {
+        "default_root_dir": tmpdir,
+        "limit_train_batches": limit_train_batches,
+        "enable_progress_bar": False,
+        "enable_model_summary": False,
+    }
+    trainer = Trainer(max_epochs=1, **trainer_kwargs)
+    model = CustomModel(stop_batch_idx)
+    trainer.fit(model)
+
+    ckpt_path = str(tmpdir / "resume.ckpt")
+    trainer.save_checkpoint(ckpt_path)
+
+    trainer = Trainer(max_epochs=2, limit_val_batches=0, **trainer_kwargs)
+
+    warning_raised = limit_train_batches != stop_batch_idx
+    context_manager = pytest.warns if warning_raised else no_warning_call
+    with context_manager(UserWarning, match="resuming from a checkpoint that ended mid-epoch"):
+        trainer.fit(model, ckpt_path=ckpt_path)
+
+    if warning_raised:
+        with mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}):
+            trainer = Trainer(max_epochs=2, limit_val_batches=0, **trainer_kwargs)
+            with no_warning_call(UserWarning, match="resuming from a checkpoint that ended mid-epoch"):
+                trainer.fit(model, ckpt_path=ckpt_path)

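As the new test also exercises, the warning is skipped entirely when fault-tolerant training is enabled; a minimal sketch of opting in via the environment variable the test patches:

import os

# Enable fault-tolerant training before constructing the Trainer, so resuming can
# "restart as if training did not stop" instead of warning (as mocked in the test above).
os.environ["PL_FAULT_TOLERANT_TRAINING"] = "1"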