
Commit f9bb47e

Fix current_epoch value on training end
1 parent 43a89eb commit f9bb47e

24 files changed: +114 / -113 lines changed

CHANGELOG.md

Lines changed: 10 additions & 3 deletions
@@ -224,9 +224,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - DeepSpeed does not require lightning module zero 3 partitioning ([#10655](https://github.com/PyTorchLightning/pytorch-lightning/pull/10655))


-
-- Deprecated `training_type_plugin` property in favor of `strategy` in `Trainer` and updated the references ([#11141](https://github.com/PyTorchLightning/pytorch-lightning/pull/11141))
-
-
 - Moved `Strategy` classes to the `strategies` directory ([#11226](https://github.com/PyTorchLightning/pytorch-lightning/pull/11226))


@@ -247,6 +244,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 - Changed `MisconfigurationException` to `ModuleNotFoundError` when `rich` isn't available ([#11360](https://github.com/PyTorchLightning/pytorch-lightning/pull/11360))

+
+- The `trainer.current_epoch` value is now increased by 1 during and after `on_train_end` ([#8578](https://github.com/PyTorchLightning/pytorch-lightning/pull/8578))
+
+
 - Inherit from `ABC` for `Accelerator`: Users need to implement `auto_device_count` ([#11521](https://github.com/PyTorchLightning/pytorch-lightning/pull/11521))


@@ -261,11 +262,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 - When using DP (data-parallel), Lightning will no longer automatically reduce all tensors returned in training_step; it will only reduce the loss unless `training_step_end` is overridden ([#11594](https://github.com/PyTorchLightning/pytorch-lightning/pull/11594))

+
 - When using DP (data-parallel), the `training_epoch_end` hook will no longer receive reduced outputs from `training_step` and instead get the full tensor of results from all GPUs ([#11594](https://github.com/PyTorchLightning/pytorch-lightning/pull/11594))
+
 ### Deprecated

+- Deprecated `training_type_plugin` property in favor of `strategy` in `Trainer` and updated the references ([#11141](https://github.com/PyTorchLightning/pytorch-lightning/pull/11141))
+
+
 - Deprecated `Trainer.{validated,tested,predicted}_ckpt_path` and replaced with read-only property `Trainer.ckpt_path` set when checkpoints loaded via `Trainer.{fit,validate,test,predict}` ([#11696](https://github.com/PyTorchLightning/pytorch-lightning/pull/11696))

+
 - Deprecated `ClusterEnvironment.master_{address,port}` in favor of `ClusterEnvironment.main_{address,port}` ([#10103](https://github.com/PyTorchLightning/pytorch-lightning/pull/10103))


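The CHANGELOG entry for #8578 above is the user-visible part of this commit. A minimal sketch of the effect, assuming a toy LightningModule and dataset that are not part of the commit:

import torch
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl


class TinyModule(pl.LightningModule):
    # Illustrative module, not part of the commit.
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def training_step(self, batch, batch_idx):
        (x,) = batch
        return self.layer(x).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


data = DataLoader(TensorDataset(torch.randn(8, 4)), batch_size=4)
trainer = pl.Trainer(max_epochs=3, limit_val_batches=0, logger=False, enable_checkpointing=False)
trainer.fit(TinyModule(), data)
# Before this commit, current_epoch here still reported the last epoch index (2);
# with the change it is incremented during/after `on_train_end` and reports 3.
print(trainer.current_epoch)
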
pytorch_lightning/callbacks/stochastic_weight_avg.py

Lines changed: 2 additions & 2 deletions
@@ -221,13 +221,13 @@ def on_train_epoch_end(self, trainer: "pl.Trainer", *args):
         trainer.fit_loop._skip_backward = False

     def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"):
-        if self._model_contains_batch_norm and trainer.current_epoch == self.swa_end + 1:
+        if self._model_contains_batch_norm and trainer.current_epoch - 1 == self.swa_end + 1:
             # BatchNorm epoch update. Reset state
             trainer.accumulate_grad_batches = self._accumulate_grad_batches
             trainer.num_training_batches -= 1
             trainer.fit_loop.max_epochs -= 1
             self.reset_momenta()
-        elif trainer.current_epoch == self.swa_end:
+        elif trainer.current_epoch - 1 == self.swa_end:
             # Last SWA epoch. Transfer weights from average model to pl_module
             self.transfer_weights(self._average_model, pl_module)

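For context on the two hunks above: inside `on_train_end`, `trainer.current_epoch` now holds the number of finished epochs rather than the index of the last one, so the SWA callback subtracts 1 to keep comparing against an epoch index. A small arithmetic illustration with made-up values, not taken from the commit:

swa_end = 3                        # hypothetical index of the last SWA epoch
current_epoch_in_on_train_end = 5  # with this commit: count of finished epochs (indices 0..4)

# old check compared an epoch index:    trainer.current_epoch == swa_end + 1
# new check recovers that index first:  trainer.current_epoch - 1 == swa_end + 1
assert current_epoch_in_on_train_end - 1 == swa_end + 1
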
pytorch_lightning/loops/base.py

Lines changed: 1 addition & 4 deletions
@@ -21,7 +21,6 @@
 import pytorch_lightning as pl
 from pytorch_lightning.trainer.connectors.logger_connector.result import _ResultCollection
 from pytorch_lightning.trainer.progress import BaseProgress
-from pytorch_lightning.utilities.enums import _FaultTolerantMode
 from pytorch_lightning.utilities.exceptions import MisconfigurationException

 T = TypeVar("T")  # the output type of `run`
@@ -288,11 +287,9 @@ def state_dict(self, destination: Optional[Dict] = None, prefix: str = "") -> Di

         destination[prefix + "state_dict"] = self.on_save_checkpoint()

-        # do not get the mode from `self.trainer` because it might not have been attached yet
-        ft_enabled = _FaultTolerantMode.detect_current_mode().is_enabled
         for k, v in self.__dict__.items():
             key = prefix + k
-            if ft_enabled and isinstance(v, BaseProgress):
+            if isinstance(v, BaseProgress):
                 destination[key] = v.state_dict()
             elif isinstance(v, Loop):
                 v.state_dict(destination, key + ".")

pytorch_lightning/loops/epoch/training_epoch_loop.py

Lines changed: 11 additions & 0 deletions
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 from collections import defaultdict
 from typing import Any, Dict, Generator, Iterator, List, Optional, overload, Tuple, Union

@@ -35,6 +36,9 @@
 _OUTPUTS_TYPE = List[_BATCH_OUTPUTS_TYPE]


+log = logging.getLogger(__name__)
+
+
 class TrainingEpochLoop(loops.Loop[_OUTPUTS_TYPE]):
     """Runs over all batches in a dataloader (one epoch).

@@ -99,6 +103,13 @@ def _is_validation_done(self) -> bool:
     @property
     def done(self) -> bool:
         """Evaluates when to leave the loop."""
+        if self.trainer.should_stop and self.min_steps:
+            self.trainer.should_stop = self.global_step >= self.min_steps
+            if not self.trainer.should_stop:
+                log.info(
+                    f"Trainer was signaled to stop but required minimum steps ({self.min_steps}) has not been met."
+                    " Training will continue..."
+                )
         return (self._is_training_done and self._is_validation_done) or self.trainer.should_stop

     def connect(  # type: ignore[override]

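The block added to `TrainingEpochLoop.done` re-evaluates a stop request against `min_steps`. A simplified, standalone restatement of that rule (illustrative only, not the library function):

from typing import Optional


def stop_overridden_by_min_steps(should_stop: bool, global_step: int, min_steps: Optional[int]) -> bool:
    # A stop signal is ignored while fewer than `min_steps` optimizer steps have run.
    return bool(should_stop and min_steps and global_step < min_steps)


assert stop_overridden_by_min_steps(True, global_step=10, min_steps=100)       # training continues
assert not stop_overridden_by_min_steps(True, global_step=100, min_steps=100)  # stop is honoured
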
pytorch_lightning/loops/fit_loop.py

Lines changed: 26 additions & 24 deletions
@@ -40,7 +40,7 @@ class FitLoop(Loop[None]):

     def __init__(
         self,
-        min_epochs: Optional[int] = 1,
+        min_epochs: int = 0,
         max_epochs: int = 1000,
     ) -> None:
         super().__init__()
@@ -121,6 +121,21 @@ def running_loss(self) -> TensorRunningAccum:
         """Returns the running loss."""
         return self.epoch_loop.batch_loop.running_loss

+    @Loop.restarting.setter
+    def restarting(self, restarting: bool) -> None:
+        # if the last epoch completely finished, we are not actually restarting, we can check this to see if all
+        # current values are equal
+        values = (
+            self.epoch_progress.current.ready,
+            self.epoch_progress.current.started,
+            self.epoch_progress.current.processed,
+        )
+        finished_before_on_train_end = any(v != self.epoch_progress.current.completed for v in values)
+        if finished_before_on_train_end:
+            self.epoch_progress.current.completed = self.epoch_progress.current.processed
+        restarting &= finished_before_on_train_end
+        Loop.restarting.fset(self, restarting)  # call the parent setter
+
     @property
     def _skip_backward(self) -> bool:
         """Determines whether the loop will skip backward during automatic optimization."""
@@ -144,31 +159,23 @@ def done(self) -> bool:
         """Evaluates when to leave the loop."""
         # TODO(@awaelchli): Move track steps inside training loop and move part of these condition inside training loop
         stop_steps = _is_max_limit_reached(self.global_step, self.max_steps)
-        stop_epochs = _is_max_limit_reached(self.epoch_progress.current.completed, self.max_epochs)
-
-        should_stop = False
-        if self.trainer.should_stop:
-            # early stopping
-            met_min_epochs = self.epoch_progress.current.completed >= self.min_epochs if self.min_epochs else True
-            met_min_steps = self.global_step >= self.min_steps if self.min_steps else True
-            if met_min_epochs and met_min_steps:
-                should_stop = True
-            else:
+        stop_epochs = _is_max_limit_reached(self.epoch_progress.current.processed, self.max_epochs)
+
+        if self.trainer.should_stop and self.min_epochs:
+            self.trainer.should_stop = self.epoch_progress.current.processed >= self.min_epochs
+            if not self.trainer.should_stop:
                 log.info(
-                    "Trainer was signaled to stop but required minimum epochs"
-                    f" ({self.min_epochs}) or minimum steps ({self.min_steps}) has"
-                    " not been met. Training will continue..."
+                    f"Trainer was signaled to stop but required minimum epochs ({self.min_epochs}) has not been met."
+                    " Training will continue..."
                 )
-        self.trainer.should_stop = should_stop
-
-        return stop_steps or should_stop or stop_epochs or self.trainer.num_training_batches == 0
+        return stop_steps or self.trainer.should_stop or stop_epochs or self.trainer.num_training_batches == 0

     @property
     def skip(self) -> bool:
         """Whether we should skip the training and immediately return from the call to :meth:`run`."""
         # since `trainer.num_training_batches` depends on the `train_dataloader` but that won't be called
         # until `on_run_start`, we use `limit_train_batches` instead
-        return self.done or self.trainer.limit_train_batches == 0
+        return self.trainer.limit_train_batches == 0

     def connect(self, epoch_loop: TrainingEpochLoop) -> None:  # type: ignore[override]
         """Connects a training epoch loop to this fit loop."""
@@ -225,7 +232,7 @@ def on_advance_start(self) -> None:  # type: ignore[override]
             getattr(self.trainer.train_dataloader.sampler, "set_epoch", None)
         ):
             # set seed for distributed sampler (enables shuffling for each epoch)
-            self.trainer.train_dataloader.sampler.set_epoch(self.epoch_progress.current.completed)
+            self.trainer.train_dataloader.sampler.set_epoch(self.epoch_progress.current.processed)

         # changing gradient according accumulation_scheduler
         self.trainer.accumulation_scheduler.on_train_epoch_start(self.trainer, self.trainer.lightning_module)
@@ -309,11 +316,6 @@ def on_advance_end(self) -> None:
     def on_run_end(self) -> None:
         """Calls the ``on_train_end`` hook."""
         log.detail(f"{self.__class__.__name__}: train run ended")
-        # NOTE: the current_epoch is already incremented
-        # Lightning today does not increment the current epoch at the last epoch run in Trainer.fit
-        # To simulate that current behavior, we decrement here.
-        # TODO: must be fixed by https://github.com/PyTorchLightning/pytorch-lightning/issues/5007
-        self.epoch_progress.current.completed = max(self.epoch_progress.current.completed - 1, 0)

         # hook
         self.trainer._call_callback_hooks("on_train_end")

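The new `restarting` setter in `FitLoop` inspects the epoch progress counters to decide whether a checkpoint was taken mid-epoch. A rough standalone sketch of that heuristic, using a toy counter type and made-up values:

from dataclasses import dataclass


@dataclass
class EpochCounters:
    # Toy stand-in for the progress counters used above; values below are illustrative.
    ready: int
    started: int
    processed: int
    completed: int


def checkpointed_before_on_train_end(c: EpochCounters) -> bool:
    # Mirrors `finished_before_on_train_end`: True when `completed` has not yet caught up
    # with the other counters, i.e. the state was saved before the final epoch bookkeeping.
    return any(v != c.completed for v in (c.ready, c.started, c.processed))


assert checkpointed_before_on_train_end(EpochCounters(ready=3, started=3, processed=3, completed=2))
assert not checkpointed_before_on_train_end(EpochCounters(ready=3, started=3, processed=3, completed=3))
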
pytorch_lightning/loops/utilities.py

Lines changed: 7 additions & 2 deletions
@@ -71,7 +71,7 @@ def _parse_loop_limits(
     min_epochs: Optional[int],
     max_epochs: int,
     max_time: Optional[Union[str, timedelta, Dict[str, int]]],
-) -> Tuple[Optional[int], int, Optional[int], int, Optional[Union[str, timedelta, Dict[str, int]]]]:
+) -> Tuple[Optional[int], int, int, int, Optional[Union[str, timedelta, Dict[str, int]]]]:
     """This utility computes the default values for the minimum and maximum number of steps and epochs given the
     values the user has selected.

@@ -95,7 +95,12 @@
             max_epochs = 1000
         else:
             max_epochs = -1
-    min_epochs = 1 if (min_epochs is None and min_steps is None and max_time is None) else min_epochs
+    if min_epochs is None and min_steps is not None:
+        # setting this allows FitLoop.done to re-evaluate should_stop when it gets triggered `on_fit_start`
+        min_epochs = 1
+    if min_epochs is None:
+        # the default value is 0 so no training will be done when should_stop is triggered `on_fit_start`
+        min_epochs = 0
     return min_steps, max_steps, min_epochs, max_epochs, max_time


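The rewritten defaulting of `min_epochs` in `_parse_loop_limits` can be restated on its own; this sketch mirrors the branch added above and is illustrative only:

from typing import Optional


def resolve_min_epochs(min_epochs: Optional[int], min_steps: Optional[int]) -> int:
    # The default becomes 1 only when `min_steps` is set, otherwise 0.
    if min_epochs is None and min_steps is not None:
        return 1
    return 0 if min_epochs is None else min_epochs


assert resolve_min_epochs(None, None) == 0   # plain default: no mandatory epochs
assert resolve_min_epochs(None, 50) == 1     # min_steps forces at least one pass through the epoch loop
assert resolve_min_epochs(5, None) == 5      # an explicit value is kept
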
pytorch_lightning/trainer/connectors/checkpoint_connector.py

Lines changed: 4 additions & 14 deletions
@@ -21,7 +21,6 @@
 from torchmetrics import Metric

 import pytorch_lightning as pl
-from pytorch_lightning.loops.utilities import _is_max_limit_reached
 from pytorch_lightning.plugins.environments import SLURMEnvironment
 from pytorch_lightning.trainer.states import TrainerFn
 from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE
@@ -228,7 +227,7 @@ def restore_loops(self) -> None:
         assert self.trainer.state.fn is not None
         state_dict = self._loaded_checkpoint.get("loops")
         if state_dict is not None:
-            if self.trainer.state.fn == TrainerFn.FITTING:
+            if self.trainer.state.fn in (TrainerFn.FITTING, TrainerFn.TUNING):
                 self.trainer.fit_loop.load_state_dict(state_dict["fit_loop"])
             elif self.trainer.state.fn == TrainerFn.VALIDATING:
                 self.trainer.validate_loop.load_state_dict(state_dict["validate_loop"])
@@ -327,21 +326,12 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
             LightningDataModule.__class__.__name__: pl DataModule's state
         }
         """
-
-        # dump epoch/global_step/pytorch-lightning_version
-        current_epoch = self.trainer.current_epoch
-        global_step = self.trainer.global_step
-        has_reached_max_steps = _is_max_limit_reached(global_step, self.trainer.max_steps)
-
-        global_step += 1
-        if not has_reached_max_steps:
-            current_epoch += 1
-
         model = self.trainer.lightning_module

         checkpoint = {
-            "epoch": current_epoch,
-            "global_step": global_step,
+            # the epoch is saved for compatibility but it's not relevant for restoration
+            "epoch": self.trainer.current_epoch,
+            "global_step": self.trainer.global_step + 1,
             "pytorch-lightning_version": pl.__version__,
             "state_dict": self._get_lightning_module_state_dict(),
             "loops": self._get_loops_state_dict(),

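With the bookkeeping in `dump_checkpoint` simplified above, the saved `epoch` and `global_step` fields can be inspected directly. A hedged sketch, assuming `last.ckpt` is a checkpoint previously written by `trainer.save_checkpoint` (the path is hypothetical):

import torch

ckpt = torch.load("last.ckpt", map_location="cpu")

# "epoch" is now trainer.current_epoch at save time, kept for backward compatibility;
# it is no longer bumped by one when max_steps has not been reached.
# "global_step" is still stored as trainer.global_step + 1.
print(ckpt["epoch"], ckpt["global_step"])

# Restoration goes through the "loops" state dict (see restore_loops above), not the
# top-level "epoch" value.
print(sorted(ckpt["loops"]))
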
pytorch_lightning/trainer/trainer.py

Lines changed: 1 addition & 1 deletion
@@ -2439,7 +2439,7 @@ def max_epochs(self) -> int:
         return self.fit_loop.max_epochs

     @property
-    def min_epochs(self) -> Optional[int]:
+    def min_epochs(self) -> int:
         return self.fit_loop.min_epochs

     @property

pytorch_lightning/tuner/batch_size_scaling.py

Lines changed: 0 additions & 2 deletions
@@ -60,10 +60,8 @@ def scale_batch_size(

     # Save initial model, that is loaded after batch size is found
     ckpt_path = os.path.join(trainer.default_root_dir, f".scale_batch_size_{uuid.uuid4()}.ckpt")
-    trainer.fit_loop.epoch_progress.current.completed -= 1
     trainer.fit_loop.global_step -= 1
     trainer.save_checkpoint(ckpt_path)
-    trainer.fit_loop.epoch_progress.current.completed += 1
     trainer.fit_loop.global_step += 1
     params = __scale_batch_dump_params(trainer)

pytorch_lightning/tuner/lr_finder.py

Lines changed: 0 additions & 2 deletions
@@ -204,10 +204,8 @@ def lr_find(

     # Save initial model, that is loaded after learning rate is found
     ckpt_path = os.path.join(trainer.default_root_dir, f".lr_find_{uuid.uuid4()}.ckpt")
-    trainer.fit_loop.epoch_progress.current.completed -= 1
     trainer.fit_loop.global_step -= 1
     trainer.save_checkpoint(ckpt_path)
-    trainer.fit_loop.epoch_progress.current.completed += 1
     trainer.fit_loop.global_step += 1
     params = __lr_finder_dump_params(trainer)
