Commit b156d49

Refactor global step update

1 parent 374ff75 commit b156d49

File tree: 2 files changed, 8 additions and 36 deletions

pytorch_lightning/trainer/training_loop.py (8 additions, 35 deletions)
```diff
@@ -512,28 +512,17 @@ def run_training_epoch(self):
             self.update_train_loop_lr_schedulers(monitor_metrics=monitor_metrics)
             self.trainer.checkpoint_connector.has_trained = True
 
-            # max steps reached, end training
-            if (
-                self.trainer.max_steps is not None and self.trainer.max_steps <= self.trainer.global_step + 1
-                and self._accumulated_batches_reached()
-            ):
-                break
-
-            # end epoch early
-            # stop when the flag is changed or we've gone past the amount
-            # requested in the batches
-            if self.trainer.should_stop:
-                break
-
             self.trainer.total_batch_idx += 1
 
-            # stop epoch if we limited the number of training batches
-            if self._num_training_batches_reached(is_last_batch):
-                break
-
             # progress global step according to grads progress
             self.increment_accumulated_grad_global_step()
 
+            max_steps_reached = (
+                self.trainer.max_steps is not None and self.trainer.max_steps <= self.trainer.global_step
+            )
+            if max_steps_reached or self.trainer.should_stop or self._num_training_batches_reached(is_last_batch):
+                break
+
         if batch_idx is None:
             # dataloader/iterator did not produce a batch
             return
```
```diff
@@ -552,18 +541,6 @@ def run_training_epoch(self):
         if (val_loop_called and not should_check_val) or should_train_only:
             self.trainer.optimizer_connector.update_learning_rates(interval='epoch')
 
-        if should_train_only:
-            self.check_checkpoint_callback(True)
-
-        if should_check_val:
-            self.trainer.validating = True
-            self.trainer.run_evaluation(on_epoch=True)
-            self.trainer.training = True
-
-        # increment the global step once
-        # progress global step according to grads progress
-        self.increment_accumulated_grad_global_step()
-
     def on_train_epoch_end(self, epoch_output: List[List[List[Result]]]) -> None:
         # inform logger the batch loop has finished
         self.trainer.logger_connector.on_train_epoch_end()
```
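Taken together, the two hunks above collapse three scattered `break` checks (max steps, `should_stop`, batch limit) into a single condition evaluated after `increment_accumulated_grad_global_step()`, and drop the duplicated end-of-epoch increment. A minimal, self-contained sketch of the resulting control flow, using a toy trainer stand-in rather than the real `pytorch_lightning` API:

```python
# Toy sketch of the consolidated exit check (assumption: ToyTrainer and
# run_epoch are illustrative stand-ins, not pytorch_lightning classes).
from dataclasses import dataclass
from typing import Optional


@dataclass
class ToyTrainer:
    max_steps: Optional[int] = None
    global_step: int = 0
    should_stop: bool = False


def run_epoch(trainer: ToyTrainer, num_batches: int) -> None:
    for batch_idx in range(num_batches):
        # stands in for increment_accumulated_grad_global_step()
        trainer.global_step += 1
        is_last_batch = batch_idx == num_batches - 1

        # one combined check, evaluated *after* the step increment, so the
        # old `global_step + 1` off-by-one guard is no longer needed
        max_steps_reached = (
            trainer.max_steps is not None
            and trainer.max_steps <= trainer.global_step
        )
        if max_steps_reached or trainer.should_stop or is_last_batch:
            break


trainer = ToyTrainer(max_steps=3)
run_epoch(trainer, num_batches=10)
assert trainer.global_step == 3  # training stops exactly at max_steps
```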
```diff
@@ -863,16 +840,12 @@ def _should_check_val_fx(self, batch_idx: int, is_last_batch: bool, on_epoch: bool
         elif self.trainer.val_check_batch != float('inf'):
             is_val_check_batch = (batch_idx + 1) % self.trainer.val_check_batch == 0
 
-        # Note: num_training_batches is also inf for iterable datasets with no length defined
-        epoch_end_val_check = (batch_idx + 1) % self.trainer.num_training_batches == 0
         is_last_batch_for_infinite_dataset = is_last_batch and self.trainer.val_check_batch == float("inf")
 
         if on_epoch:
-            return (
-                is_val_check_batch and epoch_end_val_check
-            ) or self.trainer.should_stop or is_last_batch_for_infinite_dataset
+            return is_val_check_batch or self.trainer.should_stop or is_last_batch_for_infinite_dataset
         else:
-            return is_val_check_batch and not epoch_end_val_check
+            return is_val_check_batch
 
     def build_train_args(self, batch, batch_idx, opt_idx, hiddens):
         # enable not needing to add opt_idx to training_step
```
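The last hunk also simplifies the validation-check decision: the `epoch_end_val_check` term is gone, so the mid-epoch and on-epoch paths now share the single `val_check_batch` trigger. A hedged sketch of the new decision logic (a free function with explicit boolean arguments, not the real `_should_check_val_fx` signature, which derives them from trainer state):

```python
# Sketch of the simplified decision; the argument names mirror the local
# variables in the diff above but are passed in directly here.
def should_check_val(
    is_val_check_batch: bool,
    should_stop: bool,
    is_last_batch_for_infinite_dataset: bool,
    on_epoch: bool,
) -> bool:
    if on_epoch:
        # epoch-end checks also fire on early stop, or on the last batch of
        # an infinite (length-less) iterable dataset
        return is_val_check_batch or should_stop or is_last_batch_for_infinite_dataset
    return is_val_check_batch


# e.g. a mid-epoch batch that hits val_check_batch now triggers validation
# regardless of whether it also happens to be the final batch of the epoch:
assert should_check_val(True, False, False, on_epoch=False)
```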

tests/trainer/test_trainer.py (0 additions, 1 deletion)
```diff
@@ -800,7 +800,6 @@ def training_step(self, batch, batch_idx, optimizer_idx=None):
 
     with pytest.raises(ValueError, match=r".*The loss returned in `training_step` is.*"):
         trainer.fit(model)
-        assert trainer.global_step == model.test_step_inf_loss
 
     for param in model.parameters():
         assert torch.isfinite(param).all()
```
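Assuming the deleted assertion sat inside the `with` block (as the reconstructed indentation suggests), it could never execute: `trainer.fit(model)` raises, and control jumps straight out of the `pytest.raises` context. A small self-contained reproduction of that pattern, with toy names and assuming only that `pytest` is installed:

```python
import pytest


def boom() -> None:
    raise ValueError("The loss returned in `training_step` is nan or inf.")


def test_statement_after_raise_never_runs() -> None:
    executed = []
    with pytest.raises(ValueError, match=r".*The loss returned in `training_step` is.*"):
        boom()
        executed.append("unreachable")  # like the removed assert, never runs
    assert executed == []  # confirms the line above was dead code
```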
