
Commit 8bf5b71

awaelchli authored and lexierule committed
Increment the total batch idx before the accumulation early exit (#7692)
* Increment the total batch idx before the accumulation early exit
* Update CHANGELOG
1 parent a1376ed commit 8bf5b71
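
Why the ordering matters, in a minimal sketch (illustrative only, not the actual Lightning training loop; the counter name is reused purely for clarity): when the counter is incremented at the bottom of the batch loop, any early exit taken above it skips the increment, so the last batch of the run is processed but never counted.

```python
# Minimal sketch of the off-by-one this commit fixes (illustrative, not Lightning code).

def run_epoch_old(num_batches: int, stop_at: int) -> int:
    total_batch_idx = 0
    for batch_idx in range(num_batches):
        # ... training step / gradient accumulation would happen here ...
        if batch_idx + 1 >= stop_at:   # early exit, e.g. max_steps reached or should_stop
            break                      # the increment below is skipped for the last batch
        total_batch_idx += 1
    return total_batch_idx


def run_epoch_new(num_batches: int, stop_at: int) -> int:
    total_batch_idx = 0
    for batch_idx in range(num_batches):
        total_batch_idx += 1           # count the batch before any early exit can skip it
        if batch_idx + 1 >= stop_at:
            break
    return total_batch_idx


print(run_epoch_old(200, 200))  # 199: the last processed batch is not counted
print(run_epoch_new(200, 200))  # 200: every processed batch is counted
```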

File tree: 3 files changed (+13, -19 lines)

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed dataloaders are not reset when tuning the model ([#7566](https://github.com/PyTorchLightning/pytorch-lightning/pull/7566))
 - Fixed print errors in `ProgressBar` when `trainer.fit` is not called ([#7674](https://github.com/PyTorchLightning/pytorch-lightning/pull/7674))
 - Fixed global step update when the epoch is skipped ([#7677](https://github.com/PyTorchLightning/pytorch-lightning/pull/7677))
+- Fixed training loop total batch counter when accumulate grad batches was enabled ([#7692](https://github.com/PyTorchLightning/pytorch-lightning/pull/7692))

 ## [1.3.2] - 2021-05-18

pytorch_lightning/trainer/training_loop.py

Lines changed: 2 additions & 2 deletions
@@ -526,6 +526,8 @@ def run_training_epoch(self):
             self.update_train_loop_lr_schedulers(monitor_metrics=monitor_metrics)
             self.trainer.checkpoint_connector.has_trained = True

+            self.trainer.total_batch_idx += 1
+
             # max steps reached, end training
             if (
                 self.trainer.max_steps is not None and self.trainer.max_steps <= self.trainer.global_step + 1
@@ -539,8 +541,6 @@ def run_training_epoch(self):
             if self.trainer.should_stop:
                 break

-            self.trainer.total_batch_idx += 1
-
             # stop epoch if we limited the number of training batches
             if self._num_training_batches_reached(is_last_batch):
                 break
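
With this ordering, `total_batch_idx` counts every batch the loop consumes, including the one on which `max_steps`, `should_stop`, or the batch limit triggers the `break`. Under gradient accumulation the raw batch counter runs ahead of the optimizer-step counter that the `max_steps` check is based on; a rough sketch of that relationship (an illustration of the bookkeeping, not Lightning's actual implementation):

```python
# Rough sketch of raw batch counting vs. optimizer steps under gradient accumulation
# (an illustration, not Lightning internals).
accumulate_grad_batches = 2
total_batch_idx = 0   # advances once per raw batch
global_step = 0       # advances once per optimizer step

for _ in range(200):  # 200 raw batches
    total_batch_idx += 1
    if total_batch_idx % accumulate_grad_batches == 0:
        global_step += 1  # optimizer steps only after a full accumulation window

print(total_batch_idx, global_step)  # 200 100
```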

tests/tuner/test_lr_finder.py

Lines changed: 10 additions & 17 deletions
@@ -197,31 +197,24 @@ def test_datamodule_parameter(tmpdir):


 def test_accumulation_and_early_stopping(tmpdir):
-    """ Test that early stopping of learning rate finder works, and that
-    accumulation also works for this feature """
+    """ Test that early stopping of learning rate finder works, and that accumulation also works for this feature """

-    hparams = EvalModelTemplate.get_default_hparams()
-    model = EvalModelTemplate(**hparams)
+    class TestModel(BoringModel):

-    before_lr = hparams.get('learning_rate')
-    # logger file to get meta
+        def __init__(self):
+            super().__init__()
+            self.lr = 1e-3
+
+    model = TestModel()
     trainer = Trainer(
         default_root_dir=tmpdir,
         accumulate_grad_batches=2,
     )
-
     lrfinder = trainer.tuner.lr_find(model, early_stop_threshold=None)
-    after_lr = lrfinder.suggestion()

-    expected_num_lrs = 100
-    expected_batch_idx = 200 - 1
-
-    assert before_lr != after_lr, \
-        'Learning rate was not altered after running learning rate finder'
-    assert len(lrfinder.results['lr']) == expected_num_lrs, \
-        'Early stopping for learning rate finder did not work'
-    assert lrfinder._total_batch_idx == expected_batch_idx, \
-        'Accumulation parameter did not work'
+    assert lrfinder.suggestion() != 1e-3
+    assert len(lrfinder.results['lr']) == 100
+    assert lrfinder._total_batch_idx == 200


 def test_suggestion_parameters_work(tmpdir):
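
The updated expectations line up with that counting: assuming `lr_find`'s default of 100 candidate learning rates (`num_training=100`), and with `accumulate_grad_batches=2` from the test's Trainer, each candidate corresponds to one optimizer step and therefore two raw batches, so the run consumes 200 batches. With the counter now incremented before the early exit, `_total_batch_idx` ends at 200 instead of the old off-by-one `200 - 1`. A back-of-the-envelope check:

```python
# Back-of-the-envelope check of the new test expectations
# (assumes lr_find's default num_training=100; accumulate_grad_batches=2 comes from the test).
num_training = 100            # candidate learning rates tried by the finder
accumulate_grad_batches = 2   # from the Trainer in the updated test

batches_processed = num_training * accumulate_grad_batches
print(num_training)           # 100 -> len(lrfinder.results['lr'])
print(batches_processed)      # 200 -> lrfinder._total_batch_idx with the fix
print(batches_processed - 1)  # 199 -> the old expectation (200 - 1), one batch short
```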
