
Commit a304dba

Resolve FIXME: implement option 3b
1 parent ea8299d commit a304dba

File tree

8 files changed: +44 −32 lines

pytorch_lightning/loops/base.py

Lines changed: 14 additions & 3 deletions
@@ -48,7 +48,7 @@ class Loop(ABC, Generic[T]):
     """

     def __init__(self) -> None:
-        self.restarting = False
+        self._restarting = False
         self._trainer: Optional["pl.Trainer"] = None

     @property
@@ -69,6 +69,17 @@ def trainer(self, trainer: "pl.Trainer") -> None:
             if isinstance(v, Loop):
                 v.trainer = trainer

+    @property
+    def restarting(self) -> bool:
+        return self._restarting
+
+    @restarting.setter
+    def restarting(self, restarting: bool) -> None:
+        self._restarting = restarting
+        for loop in vars(self).values():
+            if isinstance(loop, Loop):
+                loop.restarting = restarting
+
     @property
     @abstractmethod
     def done(self) -> bool:
@@ -189,7 +200,7 @@ def run(self, *args, **kwargs):
                 self.on_advance_start(*args, **kwargs)
                 self.advance(*args, **kwargs)
                 self.on_advance_end()
-                self.restarting = False
+                self._restarting = False
             except StopIteration:
                 break

@@ -298,6 +309,7 @@ def load_state_dict(
         for k, v in self.__dict__.items():
             if isinstance(v, Loop):
                 v.load_state_dict(state_dict.copy(), prefix + k + ".")
+        self.restarting = True

     def _load_from_state_dict(self, state_dict: Dict, prefix: str, metrics: Optional[Dict[str, Metric]] = None) -> None:
         for k, v in self.__dict__.items():
@@ -333,4 +345,3 @@ def _load_from_state_dict(self, state_dict: Dict, prefix: str, metrics: Optional

         if prefix + "state_dict" in state_dict:  # compatibility with old checkpoints
             self.on_load_checkpoint(state_dict[prefix + "state_dict"])
-        self.restarting = True
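
The new property turns `restarting` into state that cascades: setting it on a loop walks the loop's attributes and applies the same value to every nested `Loop`. A minimal standalone sketch of that pattern, using a made-up `MiniLoop` class rather than the real Lightning `Loop`:

# Minimal sketch of the cascading `restarting` setter above (MiniLoop is a
# stand-in for illustration, not the real pytorch_lightning Loop class).
class MiniLoop:
    def __init__(self) -> None:
        self._restarting = False

    @property
    def restarting(self) -> bool:
        return self._restarting

    @restarting.setter
    def restarting(self, restarting: bool) -> None:
        self._restarting = restarting
        for child in vars(self).values():  # propagate to any nested loops
            if isinstance(child, MiniLoop):
                child.restarting = restarting

outer = MiniLoop()
outer.inner = MiniLoop()  # a nested loop stored as an attribute
outer.restarting = True
assert outer.inner.restarting  # the flag cascaded to the child loop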

pytorch_lightning/loops/epoch/training_epoch_loop.py

Lines changed: 5 additions & 8 deletions
@@ -88,7 +88,7 @@ def batch_idx(self) -> int:
         """Returns the current batch index (within this epoch)"""
         # use `ready` instead of `completed` in case this is accessed after `completed` has been increased
         # but before the next `ready` increase
-        return self.batch_progress.current.ready - 1
+        return max(self.batch_progress.current.ready - 1, 0)

     @property
     def _is_training_done(self) -> bool:
@@ -130,12 +130,6 @@ def reset(self) -> None:
             self.batch_progress.reset_on_restart()
             self.scheduler_progress.reset_on_restart()
             self.batch_loop.optimizer_loop.optim_progress.reset_on_restart()
-            # FIXME: fuck me this makes
-            # 1) test_restore::test_correct_step_and_epoch pass
-            # 2) test_model_checkpoint::test_checkpoint_repeated_strategy_extended fail
-            # 1) restarts after on_train_end (ce: 2, gs: 4) -> (ce: 4, gs: 8)
-            # 2) restarts after on_train_epoch_end (ce: 1, gs: 4) -> (ce: 2, gs: 4)
-            # if not self.restarting or self.done:
         else:
             self.batch_progress.reset_on_run()
             self.scheduler_progress.reset_on_run()
@@ -148,7 +142,7 @@ def reset(self) -> None:

     def on_run_start(self, data_fetcher: AbstractDataFetcher) -> None:  # type: ignore[override]
         self._reload_dataloader_state_dict(data_fetcher)
-        self._dataloader_iter = _update_dataloader_iter(data_fetcher, self.batch_idx + 1)
+        self._dataloader_iter = _update_dataloader_iter(data_fetcher, self.batch_idx)

     def advance(self, data_fetcher: AbstractDataFetcher) -> None:  # type: ignore[override]
         """Runs a single training batch.
@@ -159,6 +153,9 @@ def advance(self, data_fetcher: AbstractDataFetcher) -> None:  # type: ignore[override]
         if self.restarting and self._should_check_val_fx(self.batch_idx, self.batch_progress.is_last_batch):
             # skip training and run validation in `on_advance_end`
             return
+        else:
+            # we are going to train first so the val loop does not need to restart
+            self.val_loop.restarting = False

         assert self._dataloader_iter is not None
         batch_idx, (batch, self.batch_progress.is_last_batch) = next(self._dataloader_iter)
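
Of the changes above, the `batch_idx` clamp is the easiest to look at in isolation: `ready` is 0 before any batch has started, so the old `ready - 1` expression could return -1, while the new expression bottoms out at 0. A tiny standalone sketch, with a plain int standing in for the progress counter:

# Sketch of the `batch_idx` clamp above; `ready` stands in for
# `batch_progress.current.ready` (the count of batches that have become ready).
def batch_idx(ready: int) -> int:
    # old behaviour was `ready - 1`, which is -1 before the first batch is ready
    return max(ready - 1, 0)

assert batch_idx(0) == 0  # before any batch: clamped instead of -1
assert batch_idx(3) == 2  # after three batches have become ready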

pytorch_lightning/loops/fit_loop.py

Lines changed: 16 additions & 14 deletions
@@ -56,16 +56,6 @@ def __init__(
         self._is_fresh_start_epoch: bool = True
         self._outputs: _EPOCH_OUTPUTS_TYPE = []

-    @property
-    def current_epoch(self) -> int:
-        """Return the current epoch."""
-        return self.epoch_progress.current.completed
-
-    @current_epoch.setter
-    def current_epoch(self, value: int) -> None:
-        """Setter for the current epoch."""
-        self.epoch_progress.current.completed = value
-
     @property
     def global_step(self) -> int:
         """Returns the global step."""
@@ -129,6 +119,18 @@ def running_loss(self) -> TensorRunningAccum:
         """Returns the running loss."""
         return self.epoch_loop.batch_loop.running_loss

+    @Loop.restarting.setter
+    def restarting(self, restarting: bool) -> None:
+        # if the last epoch completely finished, we are not actually restarting, we can check this to see if all
+        # current values are equal
+        values = (
+            self.epoch_progress.current.ready,
+            self.epoch_progress.current.started,
+            self.epoch_progress.current.processed,
+        )
+        restarting &= any(v != self.epoch_progress.current.completed for v in values)
+        Loop.restarting.fset(self, restarting)  # call the parent setter
+
     @property
     def _skip_backward(self) -> bool:
         """Determines whether the loop will skip backward during automatic optimization."""
@@ -152,11 +154,11 @@ def done(self) -> bool:
         """Evaluates when to leave the loop."""
         # TODO(@awaelchli): Move track steps inside training loop and move part of these condition inside training loop
         stop_steps = _is_max_limit_reached(self.global_step, self.max_steps)
-        stop_epochs = _is_max_limit_reached(self.current_epoch, self.max_epochs)
+        stop_epochs = _is_max_limit_reached(self.epoch_progress.current.processed, self.max_epochs)

         should_stop = self.trainer.should_stop
         if should_stop:
-            should_stop = self.current_epoch >= self.min_epochs if self.min_epochs else True
+            should_stop = self.epoch_progress.current.processed >= self.min_epochs if self.min_epochs else True
             if not should_stop:
                 log.info(
                     f"Trainer was signaled to stop but required minimum epochs ({self.min_epochs}) has not been met."
@@ -169,7 +171,7 @@ def skip(self) -> bool:
         """Whether we should skip the training and immediately return from the call to :meth:`run`."""
         # since `trainer.num_training_batches` depends on the `train_dataloader` but that won't be called
         # until `on_run_start`, we use `limit_train_batches` instead
-        return self.done or self.trainer.limit_train_batches == 0
+        return self.trainer.limit_train_batches == 0

     def connect(self, epoch_loop: TrainingEpochLoop) -> None:  # type: ignore[override]
         """Connects a training epoch loop to this fit loop."""
@@ -207,7 +209,7 @@ def on_advance_start(self) -> None:  # type: ignore[override]
             getattr(self.trainer.train_dataloader.sampler, "set_epoch", None)
         ):
             # set seed for distributed sampler (enables shuffling for each epoch)
-            self.trainer.train_dataloader.sampler.set_epoch(self.current_epoch)
+            self.trainer.train_dataloader.sampler.set_epoch(self.epoch_progress.current.processed)

         # changing gradient according accumulation_scheduler
         self.trainer.accumulation_scheduler.on_train_epoch_start(self.trainer, self.trainer.lightning_module)
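
The overriding `restarting` setter carries the new behaviour: when the checkpoint was taken at a clean epoch boundary, `ready`, `started` and `processed` all equal `completed`, so the fit loop decides it is not actually restarting and the next epoch begins normally. A small sketch of that decision with plain integers in place of `epoch_progress.current` (hypothetical values, not the real progress objects):

# Sketch of the "are we really restarting?" check in the setter above.
def effective_restarting(restarting: bool, ready: int, started: int, processed: int, completed: int) -> bool:
    # if every counter already equals `completed`, the previous epoch finished
    # cleanly and there is nothing to resume mid-epoch
    return restarting and any(v != completed for v in (ready, started, processed))

assert effective_restarting(True, ready=2, started=2, processed=2, completed=2) is False  # clean epoch boundary
assert effective_restarting(True, ready=2, started=2, processed=1, completed=1) is True   # stopped mid-epoch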

pytorch_lightning/trainer/connectors/checkpoint_connector.py

Lines changed: 3 additions & 2 deletions
@@ -212,8 +212,9 @@ def restore_loops(self) -> None:
             return

         self.trainer.fit_loop.global_step = self._loaded_checkpoint["global_step"]
-        # FIXME: keep in mind old checkpoints without progress tracking
-        self.trainer.fit_loop.current_epoch = self._loaded_checkpoint["epoch"]
+        # set the `current_epoch` value for old checkpoints without the progress tracking state
+        # it will be overwritten by the loop's state if it was also saved
+        self.trainer.fit_loop.epoch_progress.current.completed = self._loaded_checkpoint["epoch"]

         assert self.trainer.state.fn is not None
         state_dict = self._loaded_checkpoint.get("loops")

pytorch_lightning/trainer/trainer.py

Lines changed: 2 additions & 1 deletion
@@ -2342,7 +2342,8 @@ def global_step(self) -> int:

     @property
     def current_epoch(self) -> int:
-        return self.fit_loop.current_epoch
+        """The current epoch, updated after the epoch end hooks are run."""
+        return self.fit_loop.epoch_progress.current.completed

     @property
     def max_epochs(self) -> int:
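
With `current_epoch` now reading `epoch_progress.current.completed`, it only advances once the epoch-end hooks have finished, as the new docstring states. A rough way to observe the underlying counters from user code, using a hypothetical callback written only for illustration:

import pytorch_lightning as pl

# Hypothetical callback, only here to show where the counters live.
class EpochCounterLogger(pl.Callback):
    def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        current = trainer.fit_loop.epoch_progress.current
        # `trainer.current_epoch` is backed by `current.completed`, which per the
        # docstring above is only updated after the epoch-end hooks have run
        print(f"ready={current.ready} processed={current.processed} completed={current.completed}")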

pytorch_lightning/tuner/batch_size_scaling.py

Lines changed: 0 additions & 3 deletions
@@ -103,7 +103,6 @@ def __scale_batch_dump_params(trainer: "pl.Trainer") -> None:
     # Prevent going into infinite loop
     trainer.__dumped_params = {
         "auto_lr_find": trainer.auto_lr_find,
-        "current_epoch": trainer.current_epoch,
         "global_step": trainer.global_step,
         "max_steps": trainer.max_steps,
         "logger": trainer.logger,
@@ -118,7 +117,6 @@ def __scale_batch_reset_params(trainer: "pl.Trainer", model: "pl.LightningModule", steps_per_trial: int) -> None:
 def __scale_batch_reset_params(trainer: "pl.Trainer", model: "pl.LightningModule", steps_per_trial: int) -> None:
     trainer.auto_scale_batch_size = None  # prevent recursion
     trainer.auto_lr_find = False  # avoid lr find being called multiple times
-    trainer.fit_loop.current_epoch = 0
     trainer.fit_loop.max_steps = steps_per_trial  # take few steps
     trainer.logger = DummyLogger() if trainer.logger is not None else None
     trainer.callbacks = []  # not needed before full run
@@ -129,7 +127,6 @@ def __scale_batch_reset_params(trainer: "pl.Trainer", model: "pl.LightningModule", steps_per_trial: int) -> None:

 def __scale_batch_restore_params(trainer: "pl.Trainer") -> None:
     trainer.auto_lr_find = trainer.__dumped_params["auto_lr_find"]
-    trainer.fit_loop.current_epoch = trainer.__dumped_params["current_epoch"]
     trainer.fit_loop.global_step = trainer.__dumped_params["global_step"]
     trainer.fit_loop.max_steps = trainer.__dumped_params["max_steps"]
     trainer.logger = trainer.__dumped_params["logger"]

tests/checkpointing/test_model_checkpoint.py

Lines changed: 3 additions & 0 deletions
@@ -980,14 +980,17 @@ def assert_checkpoint_log_dir(idx):
     trainer.fit(model, ckpt_path=chk)
     assert trainer.global_step == epochs * limit_train_batches
     assert trainer.current_epoch == epochs
+    assert trainer.fit_loop.epoch_progress.current.processed == epochs

     trainer.validate(model)
     assert trainer.global_step == epochs * limit_train_batches
     assert trainer.current_epoch == epochs
+    assert trainer.fit_loop.epoch_progress.current.processed == epochs

     trainer.fit(model)
     assert trainer.global_step == epochs * limit_train_batches
     assert trainer.current_epoch == epochs
+    assert trainer.fit_loop.epoch_progress.current.processed == epochs
     assert_checkpoint_log_dir(idx)

tests/trainer/test_trainer.py

Lines changed: 1 addition & 1 deletion
@@ -331,7 +331,7 @@ def mock_save_function(filepath, *args):

     # emulate callback's calls during the training
     for i, loss in enumerate(losses):
-        trainer.fit_loop.current_epoch = i
+        trainer.fit_loop.epoch_progress.current.completed = i  # sets `trainer.current_epoch`
         trainer.fit_loop.global_step = i
         trainer.callback_metrics.update({"checkpoint_on": torch.tensor(loss)})
         checkpoint_callback.on_validation_end(trainer, trainer.lightning_module)
