Lightning-AI
diff --git a/‎pytorch_lightning/callbacks/model_checkpoint.py‎
Lines changed: 7 additions & 16 deletions b/‎pytorch_lightning/callbacks/model_checkpoint.py‎
Lines changed: 7 additions & 16 deletions
diff --git a/‎pytorch_lightning/loops/epoch/training_epoch_loop.py‎
Lines changed: 11 additions & 4 deletions b/‎pytorch_lightning/loops/epoch/training_epoch_loop.py‎
Lines changed: 11 additions & 4 deletions
diff --git a/‎pytorch_lightning/loops/fit_loop.py‎
Lines changed: 5 additions & 16 deletions b/‎pytorch_lightning/loops/fit_loop.py‎
Lines changed: 5 additions & 16 deletions
diff --git a/‎pytorch_lightning/trainer/connectors/checkpoint_connector.py‎
Lines changed: 9 additions & 5 deletions b/‎pytorch_lightning/trainer/connectors/checkpoint_connector.py‎
Lines changed: 9 additions & 5 deletions
diff --git a/‎pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py‎
Lines changed: 3 additions & 3 deletions b/‎pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎pytorch_lightning/trainer/trainer.py‎
Lines changed: 1 addition & 1 deletion b/‎pytorch_lightning/trainer/trainer.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pytorch_lightning/tuner/batch_size_scaling.py‎
Lines changed: 0 additions & 2 deletions b/‎pytorch_lightning/tuner/batch_size_scaling.py‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎pytorch_lightning/tuner/lr_finder.py‎
Lines changed: 0 additions & 2 deletions b/‎pytorch_lightning/tuner/lr_finder.py‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎tests/callbacks/test_rich_progress_bar.py‎
Lines changed: 7 additions & 7 deletions b/‎tests/callbacks/test_rich_progress_bar.py‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎tests/callbacks/test_tqdm_progress_bar.py‎
Lines changed: 7 additions & 7 deletions b/‎tests/callbacks/test_tqdm_progress_bar.py‎
Lines changed: 7 additions & 7 deletions
@@ -222,7 +222,7 @@ def __init__(
         self.save_weights_only = save_weights_only
         self.auto_insert_metric_name = auto_insert_metric_name
         self._save_on_train_epoch_end = save_on_train_epoch_end
-        self._last_global_step_saved = -1
+        self._last_global_step_saved = 0  # no need to save when no steps were taken
         self._last_time_checked: Optional[float] = None
         self.current_score = None
         self.best_k_models = {}
@@ -275,8 +275,7 @@ def on_train_batch_end(
         """Save checkpoint on train batch end if we meet the criteria for `every_n_train_steps`"""
         if self._should_skip_saving_checkpoint(trainer):
             return
-        step = trainer.global_step
-        skip_batch = self._every_n_train_steps < 1 or ((step + 1) % self._every_n_train_steps != 0)
+        skip_batch = self._every_n_train_steps < 1 or (trainer.global_step % self._every_n_train_steps != 0)
 
         train_time_interval = self._train_time_interval
         skip_time = True
@@ -297,16 +296,13 @@ def on_train_batch_end(
 
     def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
         """Save a checkpoint at the end of the training epoch."""
-        # as we advance one step at end of training, we use `global_step - 1` to avoid saving duplicates
-        trainer.fit_loop.global_step -= 1
         if (
             not self._should_skip_saving_checkpoint(trainer)
             and self._save_on_train_epoch_end
             and self._every_n_epochs > 0
             and (trainer.current_epoch + 1) % self._every_n_epochs == 0
         ):
             self.save_checkpoint(trainer)
-        trainer.fit_loop.global_step += 1
 
     def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
         """Save a checkpoint at the end of the validation stage."""
@@ -329,11 +325,8 @@ def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -
             return
         if self.verbose:
             rank_zero_info("Saving latest checkpoint...")
-        # as we advance one step at end of training, we use `global_step - 1` to avoid saving duplicates
-        monitor_candidates = self._monitor_candidates(trainer, trainer.current_epoch, trainer.global_step - 1)
-        trainer.fit_loop.global_step -= 1
+        monitor_candidates = self._monitor_candidates(trainer, trainer.current_epoch, trainer.global_step)
         self._save_last_checkpoint(trainer, monitor_candidates)
-        trainer.fit_loop.global_step += 1
 
     def on_save_checkpoint(
         self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", checkpoint: Dict[str, Any]
@@ -368,12 +361,8 @@ def save_checkpoint(self, trainer: "pl.Trainer") -> None:
         """
         self._validate_monitor_key(trainer)
 
-        # track epoch when ckpt was last checked
-        global_step = trainer.global_step
-        self._last_global_step_saved = global_step
-
         # what can be monitored
-        monitor_candidates = self._monitor_candidates(trainer, epoch=trainer.current_epoch, step=global_step)
+        monitor_candidates = self._monitor_candidates(trainer, epoch=trainer.current_epoch, step=trainer.global_step)
 
         # callback supports multiple simultaneous modes
         # here we call each mode sequentially
@@ -638,6 +627,7 @@ def _monitor_candidates(self, trainer: "pl.Trainer", epoch: int, step: int) -> D
     def _save_last_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, _METRIC]) -> None:
         if not self.save_last:
             return
+        self._last_global_step_saved = monitor_candidates.get("step", trainer.global_step)
 
         filepath = self.format_checkpoint_name(monitor_candidates, self.CHECKPOINT_NAME_LAST)
         # set the last model path before saving because it will be part of the state.
@@ -649,9 +639,9 @@ def _save_last_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[
     def _save_top_k_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, _METRIC]) -> None:
         if self.monitor is None or self.save_top_k == 0:
             return
+        self._last_global_step_saved = monitor_candidates.get("step", trainer.global_step)
 
         current = monitor_candidates.get(self.monitor)
-
         if self.check_monitor_top_k(trainer, current):
             self._update_best_and_save(current, trainer, monitor_candidates)
         elif self.verbose:
@@ -662,6 +652,7 @@ def _save_top_k_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict
     def _save_none_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, _METRIC]) -> None:
         if self.monitor is not None or self.save_top_k == 0:
             return
+        self._last_global_step_saved = monitor_candidates.get("step", trainer.global_step)
 
         filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, trainer)
         # set the best model path before saving because it will be part of the state.
 
@@ -59,7 +59,6 @@ def __init__(self, min_steps: Optional[int] = None, max_steps: int = -1) -> None
         self.min_steps = min_steps
         self.max_steps = max_steps
 
-        self.global_step: int = 0
         self.batch_progress = BatchProgress()
         self.scheduler_progress = SchedulerProgress()
 
@@ -72,6 +71,7 @@ def __init__(self, min_steps: Optional[int] = None, max_steps: int = -1) -> None
         self._dataloader_iter: Optional[Iterator] = None
         # caches the loaded dataloader state until dataloader objects are available
         self._dataloader_state_dict: Dict[str, Any] = {}
+        self._legacy_global_step: int = 0
 
     @property
     def total_batch_idx(self) -> int:
@@ -87,6 +87,13 @@ def batch_idx(self) -> int:
         # but before the next `ready` increase
         return self.batch_progress.current.ready - 1
 
+    @property
+    def global_step(self) -> int:
+        lightning_module = self.trainer.lightning_module
+        if lightning_module is None or lightning_module.automatic_optimization:
+            return self.batch_loop.optimizer_loop.optim_progress.optimizer_steps
+        return self.batch_loop.manual_loop.optim_step_progress.total.completed
+
     @property
     def _is_training_done(self) -> bool:
         max_steps_reached = _is_max_limit_reached(self.global_step, self.max_steps)
@@ -253,9 +260,9 @@ def on_advance_end(self) -> None:
         # update plateau LR scheduler after metrics are logged
         self.update_lr_schedulers("step", update_plateau_schedulers=True)
 
-        if not self._should_accumulate():
-            # progress global step according to grads progress
-            self.global_step += 1
+        if not self._should_accumulate():
+            # this is increased once per batch disregarding multiple optimizers or tbptt on purpose for loggers
+            self._legacy_global_step += 1
 
         # if training finished, defer exit to the parent. this assumes there will be enough time in between
         # which might not be the case depending on what's in the `*_epoch_end` hooks
 
@@ -68,16 +68,6 @@ def __init__(
         self._outputs: _EPOCH_OUTPUTS_TYPE = []
         self._data_fetcher: Optional[AbstractDataFetcher] = None
 
-    @property
-    def global_step(self) -> int:
-        """Returns the global step."""
-        return self.epoch_loop.global_step
-
-    @global_step.setter
-    def global_step(self, value: int) -> None:
-        """Sets the global step (forwards to epoch_loop)"""
-        self.epoch_loop.global_step = value
-
     @property
     def total_batch_idx(self) -> int:
         """Returns the current batch index (across epochs)"""
@@ -168,16 +158,16 @@ def _results(self) -> _ResultCollection:
     def done(self) -> bool:
         """Evaluates when to leave the loop."""
         # TODO(@awaelchli): Move track steps inside training loop and move part of these condition inside training loop
-        stop_steps = _is_max_limit_reached(self.global_step, self.max_steps)
+        stop_steps = _is_max_limit_reached(self.epoch_loop.global_step, self.max_steps)
         # `processed` is increased before `on_train_epoch_end`, the hook where checkpoints are typically saved.
         # we use it here because the checkpoint data won't have `completed` increased yet
         stop_epochs = _is_max_limit_reached(self.epoch_progress.current.processed, self.max_epochs)
 
         should_stop = False
         if self.trainer.should_stop:
             # early stopping
             met_min_epochs = self.epoch_progress.current.processed >= self.min_epochs if self.min_epochs else True
-            met_min_steps = self.global_step >= self.min_steps if self.min_steps else True
+            met_min_steps = self.epoch_loop.global_step >= self.min_steps if self.min_steps else True
             if met_min_epochs and met_min_steps:
                 should_stop = True
             else:
@@ -308,14 +298,13 @@ def on_advance_end(self) -> None:
 
         self.epoch_progress.increment_completed()
 
-        # the global step is manually decreased here due to backwards compatibility with existing loggers
+        # the legacy global step is manually decreased here due to backwards compatibility with existing loggers
         # as they expect that the same step is used when logging epoch end metrics even when the batch loop has
         # finished. this means the attribute does not exactly track the number of optimizer steps applied.
-        # TODO(@carmocca): deprecate and rename so users don't get confused
-        self.global_step -= 1
+        self.epoch_loop._legacy_global_step -= 1
         # log epoch metrics
         self.trainer.logger_connector.update_train_epoch_metrics()
-        self.global_step += 1
+        self.epoch_loop._legacy_global_step += 1
 
         # if fault tolerant is enabled and process has been notified, exit.
         self.trainer._exit_gracefully_on_signal()
 
@@ -221,16 +221,20 @@ def restore_loops(self) -> None:
         if not self._loaded_checkpoint:
             return
 
-        self.trainer.fit_loop.global_step = self._loaded_checkpoint["global_step"]
+        fit_loop = self.trainer.fit_loop
+        # set the `global_step` value for old checkpoints without the progress tracking state.
+        # it will be overwritten by the loop's state if it was also saved
+        optimizer_loop = fit_loop.epoch_loop.batch_loop.optimizer_loop
+        optimizer_loop.optim_progress.optimizer.step.total.completed = self._loaded_checkpoint["global_step"]
         # set the `current_epoch` value for old checkpoints without the progress tracking state.
         # it will be overwritten by the loop's state if it was also saved
-        self.trainer.fit_loop.epoch_progress.current.completed = self._loaded_checkpoint["epoch"]
+        fit_loop.epoch_progress.current.completed = self._loaded_checkpoint["epoch"]
 
         assert self.trainer.state.fn is not None
         state_dict = self._loaded_checkpoint.get("loops")
         if state_dict is not None:
             if self.trainer.state.fn in (TrainerFn.FITTING, TrainerFn.TUNING):
-                self.trainer.fit_loop.load_state_dict(state_dict["fit_loop"])
+                fit_loop.load_state_dict(state_dict["fit_loop"])
             elif self.trainer.state.fn == TrainerFn.VALIDATING:
                 self.trainer.validate_loop.load_state_dict(state_dict["validate_loop"])
             elif self.trainer.state.fn == TrainerFn.TESTING:
@@ -331,9 +335,9 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
         model = self.trainer.lightning_module
 
         checkpoint = {
-            # the epoch is saved for compatibility but it's not relevant for restoration
+            # the epoch and global step are saved for compatibility but they're not relevant for restoration
             "epoch": self.trainer.current_epoch,
-            "global_step": self.trainer.global_step + 1,
+            "global_step": self.trainer.global_step,
             "pytorch-lightning_version": pl.__version__,
             "state_dict": self._get_lightning_module_state_dict(),
             "loops": self._get_loops_state_dict(),
 
@@ -67,12 +67,12 @@ def on_trainer_init(
 
     @property
     def should_flush_logs(self) -> bool:
-        should_flush = (self.trainer.global_step + 1) % self.trainer.flush_logs_every_n_steps == 0
+        should_flush = self.trainer.global_step % self.trainer.flush_logs_every_n_steps == 0
         return should_flush or self.trainer.should_stop
 
     @property
     def should_update_logs(self) -> bool:
-        should_log_every_n_steps = (self.trainer.global_step + 1) % self.trainer.log_every_n_steps == 0
+        should_log_every_n_steps = self.trainer.global_step % self.trainer.log_every_n_steps == 0
         return should_log_every_n_steps or self.trainer.should_stop
 
     def configure_logger(self, logger: Union[bool, LightningLoggerBase, Iterable[LightningLoggerBase]]) -> None:
@@ -111,7 +111,7 @@ def log_metrics(self, metrics: _OUT_DICT, step: Optional[int] = None) -> None:
         if step is None:
             # added metrics for convenience
             scalar_metrics.setdefault("epoch", self.trainer.current_epoch)
-            step = self.trainer.global_step
+            step = self.trainer.fit_loop.epoch_loop._legacy_global_step
 
         # log actual metrics
         self.trainer.logger.agg_and_log_metrics(scalar_metrics, step=step)
 
@@ -2427,7 +2427,7 @@ def sanity_checking(self, val: bool) -> None:
 
     @property
     def global_step(self) -> int:
-        return self.fit_loop.global_step
+        return self.fit_loop.epoch_loop.global_step
 
     @property
     def current_epoch(self) -> int:
 
@@ -60,9 +60,7 @@ def scale_batch_size(
 
     # Save initial model, that is loaded after batch size is found
     ckpt_path = os.path.join(trainer.default_root_dir, f".scale_batch_size_{uuid.uuid4()}.ckpt")
-    trainer.fit_loop.global_step -= 1
     trainer.save_checkpoint(ckpt_path)
-    trainer.fit_loop.global_step += 1
     params = __scale_batch_dump_params(trainer)
 
     # Set to values that are required by the algorithm
 
@@ -204,9 +204,7 @@ def lr_find(
 
     # Save initial model, that is loaded after learning rate is found
     ckpt_path = os.path.join(trainer.default_root_dir, f".lr_find_{uuid.uuid4()}.ckpt")
-    trainer.fit_loop.global_step -= 1
     trainer.save_checkpoint(ckpt_path)
-    trainer.fit_loop.global_step += 1
     params = __lr_finder_dump_params(trainer)
 
     # Set to values that are required by the algorithm
 
@@ -354,15 +354,15 @@ def test_step(self, batch, batch_idx):
     trainer.fit(model)
     assert pbar.calls["fit"] == [
         ("sanity_check", 0, 0, {"b": 0}),
-        ("train", 0, 0, {}),
         ("train", 0, 1, {}),
-        ("validate", 0, 1, {"b": 1}),  # validation end
+        ("train", 0, 2, {}),
+        ("validate", 0, 2, {"b": 2}),  # validation end
         # epoch end over, `on_epoch=True` metrics are computed
-        ("train", 0, 2, {"a": 1, "b": 1}),  # training epoch end
-        ("train", 1, 2, {"a": 1, "b": 1}),
-        ("train", 1, 3, {"a": 1, "b": 1}),
-        ("validate", 1, 3, {"a": 1, "b": 3}),  # validation end
-        ("train", 1, 4, {"a": 3, "b": 3}),  # training epoch end
+        ("train", 0, 2, {"a": 1, "b": 2}),  # training epoch end
+        ("train", 1, 3, {"a": 1, "b": 2}),
+        ("train", 1, 4, {"a": 1, "b": 2}),
+        ("validate", 1, 4, {"a": 1, "b": 4}),  # validation end
+        ("train", 1, 4, {"a": 3, "b": 4}),  # training epoch end
     ]
 
     trainer.validate(model, verbose=False)
 
@@ -608,15 +608,15 @@ def test_step(self, batch, batch_idx):
     trainer.fit(model)
     assert pbar.calls["fit"] == [
         ("sanity_check", 0, 0, {"b": 0}),
-        ("train", 0, 0, {}),
         ("train", 0, 1, {}),
-        ("validate", 0, 1, {"b": 1}),  # validation end
+        ("train", 0, 2, {}),
+        ("validate", 0, 2, {"b": 2}),  # validation end
         # epoch end over, `on_epoch=True` metrics are computed
-        ("train", 0, 2, {"a": 1, "b": 1}),  # training epoch end
-        ("train", 1, 2, {"a": 1, "b": 1}),
-        ("train", 1, 3, {"a": 1, "b": 1}),
-        ("validate", 1, 3, {"a": 1, "b": 3}),  # validation end
-        ("train", 1, 4, {"a": 3, "b": 3}),  # training epoch end
+        ("train", 0, 2, {"a": 1, "b": 2}),  # training epoch end
+        ("train", 1, 3, {"a": 1, "b": 2}),
+        ("train", 1, 4, {"a": 1, "b": 2}),
+        ("validate", 1, 4, {"a": 1, "b": 4}),  # validation end
+        ("train", 1, 4, {"a": 3, "b": 4}),  # training epoch end
     ]
 
     trainer.validate(model, verbose=False)