
Commit f278ac4

Revert/Fix: epoch indexing from 1, to be from 0 (#2289)
* Revert "deprecated: epoch indexing from 1 (#2206)" This reverts commit f94b919 * chlog * grad index * Apply suggestions from code review * tests * fix * test
1 parent 554fb47 commit f278ac4

11 files changed (+27, -25 lines changed)

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Changed

+- Changed epoch indexing from 0 instead of 1 ([#2289](https://github.com/PyTorchLightning/pytorch-lightning/pull/2289))
+
 ### Deprecated

 ### Removed

pytorch_lightning/callbacks/gradient_accumulation_scheduler.py

Lines changed: 4 additions & 8 deletions
@@ -17,10 +17,6 @@ class GradientAccumulationScheduler(Callback):
     Args:
         scheduling: scheduling in format {epoch: accumulation_factor}

-    .. warning::
-        Epochs indexing starts from "1" until v0.6.x,
-        but will start from "0" in v0.8.0.
-
     Example::

         >>> from pytorch_lightning import Trainer

@@ -42,13 +38,13 @@ def __init__(self, scheduling: dict):

         for key in scheduling:
             if not isinstance(key, int) or not isinstance(scheduling[key], int):
-                raise TypeError("All epochs and accumulation factor must be integers")
+                raise TypeError("All epoches and accumulation factor must be integers")

         minimal_epoch = min(scheduling.keys())
-        if minimal_epoch < 1:
+        if minimal_epoch < 0:
             raise IndexError(f"Epochs indexing from 1, epoch {minimal_epoch} cannot be interpreted correct")
-        if minimal_epoch != 1:  # if user didnt define first epoch accumulation factor
-            scheduling.update({1: 1})
+        if minimal_epoch != 0:  # if user didnt define first epoch accumulation factor
+            scheduling.update({0: 1})

         self.scheduling = scheduling
         self.epochs = sorted(scheduling.keys())
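The scheduler now accepts epoch 0 as the first key and fills in an accumulation factor of 1 for epoch 0 when the user starts the schedule later. A minimal standalone sketch of that normalization (the helper name is illustrative, not a pytorch_lightning API):

# Minimal sketch of the 0-based scheduling validation above; normalize_scheduling
# is a hypothetical helper, not part of pytorch_lightning.
def normalize_scheduling(scheduling: dict) -> dict:
    for epoch, factor in scheduling.items():
        if not isinstance(epoch, int) or not isinstance(factor, int):
            raise TypeError("All epochs and accumulation factors must be integers")

    minimal_epoch = min(scheduling.keys())
    if minimal_epoch < 0:
        raise IndexError(f"Epochs index from 0, epoch {minimal_epoch} cannot be interpreted")
    if minimal_epoch != 0:  # user did not define a factor for the first epoch
        scheduling.update({0: 1})
    return scheduling

# {4: 2} becomes {0: 1, 4: 2}: accumulate 1 batch for epochs 0-3, 2 batches from epoch 4 on.
assert normalize_scheduling({4: 2}) == {0: 1, 4: 2}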

pytorch_lightning/callbacks/progress.py

Lines changed: 2 additions & 2 deletions
@@ -96,7 +96,7 @@ def total_val_batches(self) -> int:
         if trainer.fast_dev_run and trainer.val_dataloaders is not None:
             total_val_batches = len(trainer.val_dataloaders)
         elif not self.trainer.disable_validation:
-            is_val_epoch = trainer.current_epoch % trainer.check_val_every_n_epoch == 0
+            is_val_epoch = (trainer.current_epoch + 1) % trainer.check_val_every_n_epoch == 0
             total_val_batches = sum(trainer.num_val_batches) if is_val_epoch else 0
         return total_val_batches

@@ -318,7 +318,7 @@ def on_epoch_start(self, trainer, pl_module):
         total_batches = total_train_batches + total_val_batches
         if not self.main_progress_bar.disable:
             self.main_progress_bar.reset(convert_inf(total_batches))
-        self.main_progress_bar.set_description(f'Epoch {trainer.current_epoch}')
+        self.main_progress_bar.set_description(f'Epoch {trainer.current_epoch + 1}')

     def on_batch_end(self, trainer, pl_module):
         super().on_batch_end(trainer, pl_module)
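Both hunks compensate for the 0-based counter: the progress-bar label prints current_epoch + 1 so the first epoch still reads "Epoch 1", and the validation-batch total uses (current_epoch + 1) % check_val_every_n_epoch. A small sketch of the latter check (standalone function with an illustrative name):

# Sketch of the 0-based validation-epoch check above; is_val_epoch is an
# illustrative helper, not a pytorch_lightning API.
def is_val_epoch(current_epoch: int, check_val_every_n_epoch: int) -> bool:
    # current_epoch starts at 0, so epoch N is the (N + 1)-th epoch
    return (current_epoch + 1) % check_val_every_n_epoch == 0

# With check_val_every_n_epoch=2, validation runs on epochs 1, 3, 5 (the 2nd, 4th, 6th epochs).
assert [e for e in range(6) if is_val_epoch(e, 2)] == [1, 3, 5]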

pytorch_lightning/trainer/distrib_data_parallel.py

Lines changed: 1 addition & 1 deletion
@@ -518,7 +518,7 @@ def ddp_train(self, process_idx, model, is_master=False, proc_offset=0):

         # AMP
         # run through amp wrapper before going to distributed DP
-        # TODO: remove in v0.8.0
+        # TODO: remove with dropping NVIDIA AMP support
         if self.use_amp and not self.use_native_amp:
             model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
             self.optimizers = optimizers

pytorch_lightning/trainer/distrib_parts.py

Lines changed: 2 additions & 2 deletions
@@ -174,7 +174,7 @@ def single_gpu_train(self, model):
         # allow for lr schedulers as well
         self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

-        # TODO: update for 0.8.0
+        # TODO: remove with dropping NVIDIA AMP support
         if self.use_amp and not self.use_native_amp:
             # An example
             model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)

@@ -240,7 +240,7 @@ def dp_train(self, model):
         # wrap the user's forward in autocast and give it back at the end
         model.forward = torch.cuda.amp.autocast()(model.forward)

-        # TODO: remove in v0.8.0
+        # TODO: remove with dropping NVIDIA AMP support
         # check for this bug (amp + dp + !01 doesn't work)
         # https://github.com/NVIDIA/apex/issues/227
         if self.use_dp and self.use_amp and not self.use_native_amp:

pytorch_lightning/trainer/training_io.py

Lines changed: 1 addition & 1 deletion
@@ -323,7 +323,7 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
             structured dictionary
         """
         checkpoint = {
-            'epoch': self.current_epoch,
+            'epoch': self.current_epoch + 1,
             'global_step': self.global_step + 1,
             'pytorch-ligthning_version': pytorch_lightning.__version__,
         }
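With the counter starting at 0, the checkpoint stores current_epoch + 1, i.e. the next epoch to run rather than the one just finished. A rough sketch of that round trip, assuming the restore path reads 'epoch' straight back into the trainer's current epoch (standalone helpers, not Trainer methods):

# Sketch of the epoch save/resume convention above; dump_epoch/restore_epoch
# are hypothetical helpers, and the restore behaviour is an assumption.
def dump_epoch(current_epoch: int) -> dict:
    return {'epoch': current_epoch + 1}  # next 0-based epoch to run

def restore_epoch(checkpoint: dict) -> int:
    return checkpoint['epoch']

# Finishing epoch 4 (the 5th epoch) saves 5, so training would resume at epoch 5.
assert restore_epoch(dump_epoch(4)) == 5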

pytorch_lightning/trainer/training_loop.py

Lines changed: 4 additions & 4 deletions
@@ -346,8 +346,8 @@ def train(self):
         model.on_train_start()

         try:
-            # run all epochs from actual + 1 till the maximal
-            for epoch in range(self.current_epoch + 1, self.max_epochs + 1):
+            # run all epochs
+            for epoch in range(self.current_epoch, self.max_epochs):
                 # reset train dataloader
                 if self.reload_dataloaders_every_epoch:
                     self.reset_train_dataloader(model)

@@ -382,7 +382,7 @@ def train(self):
                 self.update_learning_rates(interval='epoch')

                 # early stopping
-                met_min_epochs = epoch >= self.min_epochs
+                met_min_epochs = epoch >= self.min_epochs - 1
                 met_min_steps = self.global_step >= self.min_steps if self.min_steps else True

                 # TODO wrap this logic into the callback

@@ -476,7 +476,7 @@ def run_training_epoch(self):
             # RUN VAL STEP
             # ---------------
             is_val_check_batch = (batch_idx + 1) % self.val_check_batch == 0
-            can_check_epoch = self.current_epoch % self.check_val_every_n_epoch == 0
+            can_check_epoch = (self.current_epoch + 1) % self.check_val_every_n_epoch == 0
             can_check_val = not self.disable_validation and can_check_epoch
             should_check_val = is_val_check_batch or early_stop_epoch
             should_check_val = should_check_val or (is_last_batch and self.val_check_batch == float('inf'))
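The loop and early-stopping bounds shift by one to match: epochs iterate over range(current_epoch, max_epochs), and the minimum-epoch check subtracts 1 because epoch is now 0-based. A compact sketch of those boundaries (illustrative helpers, not Trainer code):

# Sketch of the 0-based loop boundaries above; helper names are illustrative.
def epoch_indices(current_epoch: int, max_epochs: int) -> list:
    # max_epochs=3 now yields epochs 0, 1, 2 instead of 1, 2, 3
    return list(range(current_epoch, max_epochs))

def met_min_epochs(epoch: int, min_epochs: int) -> bool:
    # epoch is 0-based while min_epochs counts whole epochs, hence the "- 1"
    return epoch >= min_epochs - 1

assert epoch_indices(0, 3) == [0, 1, 2]
assert met_min_epochs(2, min_epochs=3) and not met_min_epochs(1, min_epochs=3)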

tests/callbacks/test_callbacks.py

Lines changed: 1 addition & 1 deletion
@@ -286,7 +286,7 @@ def training_step(self, *args, **kwargs):
     result = trainer.fit(model)

     assert result == 1, 'training failed to complete'
-    assert trainer.current_epoch <= trainer.max_epochs
+    assert trainer.current_epoch < trainer.max_epochs


 def test_pickling(tmpdir):

tests/models/test_hooks.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def training_epoch_end(self, outputs):
     # a metric shared in both methods gets overwritten by epoch_end
     assert metrics['shared_metric'] == 111
     # metrics are kept after each epoch
-    for i in range(1, num_epochs + 1):
+    for i in range(num_epochs):
         assert metrics[f'epoch_metric_{i}'] == i
tests/models/test_restore.py

Lines changed: 1 addition & 1 deletion
@@ -172,7 +172,7 @@ def test_dp_resume(tmpdir):
     result = trainer.fit(model)

     # track epoch before saving. Increment since we finished the current epoch, don't want to rerun
-    real_global_epoch = trainer.current_epoch
+    real_global_epoch = trainer.current_epoch + 1

     # correct result and ok accuracy
     assert result == 1, 'amp + dp model failed to complete'
