Skip to content

Commit 7afb814

Browse files
committed
Comments requested by Thomas
1 parent 1be025d commit 7afb814

File tree

3 files changed

+5
-1
lines changed

3 files changed

+5
-1
lines changed

pytorch_lightning/callbacks/stochastic_weight_avg.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ def on_train_epoch_end(self, trainer: "pl.Trainer", *args):
221221
trainer.fit_loop._skip_backward = False
222222

223223
def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"):
224+
# the trainer increases the current epoch before this hook is called
224225
if self._model_contains_batch_norm and trainer.current_epoch - 1 == self.swa_end + 1:
225226
# BatchNorm epoch update. Reset state
226227
trainer.accumulate_grad_batches = self._accumulate_grad_batches

pytorch_lightning/loops/epoch/training_epoch_loop.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,8 @@ def reset(self) -> None:
127127
rank_zero_warn(
128128
"You're resuming from a checkpoint that ended before the epoch ended. This can cause unreliable"
129129
" results if further training is done. Consider using an end-of-epoch checkpoint or enabling"
130-
" fault-tolerant training."
130+
" fault-tolerant training:"
131+
" https://pytorch-lightning.readthedocs.io/en/stable/advanced/fault_tolerant_training.html"
131132
)
132133
else:
133134
self.batch_progress.reset_on_run()

pytorch_lightning/loops/fit_loop.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,8 @@ def done(self) -> bool:
169169
"""Evaluates when to leave the loop."""
170170
# TODO(@awaelchli): Move track steps inside training loop and move part of these condition inside training loop
171171
stop_steps = _is_max_limit_reached(self.global_step, self.max_steps)
172+
# `processed` is increased before `on_train_epoch_end`, the hook where checkpoints are typically saved.
173+
# we use it here because the checkpoint data won't have `completed` increased yet
172174
stop_epochs = _is_max_limit_reached(self.epoch_progress.current.processed, self.max_epochs)
173175

174176
should_stop = False

0 commit comments

Comments (0)