
Commit b090e4f

standardize order of loop hooks
1 parent 57b4a32 commit b090e4f

File tree

pytorch_lightning/loops/base.py
pytorch_lightning/loops/batch_loop.py
pytorch_lightning/loops/epoch_loop.py
pytorch_lightning/loops/training_loop.py

4 files changed (+125 -125 lines)

pytorch_lightning/loops/base.py

Lines changed: 19 additions & 19 deletions
@@ -1,11 +1,11 @@
 from _weakref import proxy
-from abc import ABCMeta, abstractmethod
-from typing import Any, Counter, List, Optional
+from abc import abstractmethod, ABC
+from typing import Any, Optional

 import pytorch_lightning as pl


-class Loop(metaclass=ABCMeta):
+class Loop(ABC):

     def __init__(self):
         self.iteration_count: int = 0
@@ -21,22 +21,6 @@ def connect(self, trainer, *args, **kwargs):
     def done(self):
         """Property indicating when loop is finished"""

-    @abstractmethod
-    def advance(self, *args: Any, **kwargs: Any):
-        """What to do within a single step"""
-
-    def on_run_start(self, *args: Any, **kwargs: Any) -> None:
-        pass
-
-    def on_run_end(self) -> Any:
-        pass
-
-    def on_advance_start(self, *args: Any, **kwargs: Any) -> None:
-        pass
-
-    def on_advance_end(self) -> None:
-        pass
-
     def run(self, *args: Any, **kwargs: Any):
         self.on_run_start(*args, **kwargs)

@@ -49,5 +33,21 @@ def run(self, *args: Any, **kwargs: Any):

         return self.on_run_end()

+    def on_run_start(self, *args: Any, **kwargs: Any) -> None:
+        pass
+
+    def on_advance_start(self, *args: Any, **kwargs: Any) -> None:
+        pass
+
+    @abstractmethod
+    def advance(self, *args: Any, **kwargs: Any):
+        """What to do within a single step"""
+
+    def on_advance_end(self) -> None:
+        pass
+
+    def on_run_end(self) -> Any:
+        pass
+
     def state_dict(self):
         return dict()
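Note: as a reading aid, here is a minimal, self-contained sketch of the hook order the reordered base class now reflects: on_run_start once, then on_advance_start / advance / on_advance_end per iteration, then on_run_end. The names SketchLoop and CountToThree are invented for illustration, and the while-not-done driver mirrors the run() override shown in training_loop.py below; this is not the actual pytorch_lightning.loops API.

from abc import ABC, abstractmethod
from typing import Any


class SketchLoop(ABC):
    def __init__(self) -> None:
        self.iteration_count: int = 0

    @property
    @abstractmethod
    def done(self) -> bool:
        """Property indicating when loop is finished"""

    def run(self, *args: Any, **kwargs: Any) -> Any:
        self.on_run_start(*args, **kwargs)
        while not self.done:
            try:
                self.on_advance_start(*args, **kwargs)
                self.advance(*args, **kwargs)
                self.on_advance_end()
            except StopIteration:
                break
            self.iteration_count += 1
        return self.on_run_end()

    # hooks below are declared in the order run() calls them
    def on_run_start(self, *args: Any, **kwargs: Any) -> None:
        pass

    def on_advance_start(self, *args: Any, **kwargs: Any) -> None:
        pass

    @abstractmethod
    def advance(self, *args: Any, **kwargs: Any) -> None:
        """What to do within a single step"""

    def on_advance_end(self) -> None:
        pass

    def on_run_end(self) -> Any:
        pass


class CountToThree(SketchLoop):
    @property
    def done(self) -> bool:
        return self.iteration_count >= 3

    def advance(self) -> None:
        print("step", self.iteration_count)


CountToThree().run()  # prints: step 0 / step 1 / step 2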

pytorch_lightning/loops/batch_loop.py

Lines changed: 25 additions & 25 deletions
@@ -44,6 +44,31 @@ def connect(self, trainer, *args, **kwargs):
     def done(self):
         return len(self._remaining_splits) == 0

+    def run(self, batch, batch_idx, dataloader_idx):
+        if batch is None:
+            return AttributeDict(signal=0, grad_norm_dic={})
+
+        # hook
+        response = self.trainer.call_hook("on_batch_start")
+        if response == -1:
+            return AttributeDict(signal=-1, grad_norm_dic={})
+
+        # hook
+        response = self.trainer.call_hook("on_train_batch_start", batch, batch_idx, dataloader_idx)
+        if response == -1:
+            return AttributeDict(signal=-1, grad_norm_dic={})
+
+        super().run(batch, batch_idx, dataloader_idx)
+
+        output = AttributeDict(
+            signal=0,
+            # todo: Properly aggregate grad_norm accros opt_idx and split_idx
+            # grad_norm_dict=grad_norm_dict,
+            grad_norm_dict={},
+            training_step_output_for_epoch_end=self.batch_outputs,
+        )
+        return output
+
     def on_run_start(self, batch, batch_idx, dataloader_idx):
         self._hiddens = None
         self._remaining_splits = list(enumerate(self.tbptt_split_batch(batch)))
@@ -70,31 +95,6 @@ def advance(self, batch, batch_idx, dataloader_idx):
            if result:
                self.batch_outputs[0].append(result.training_step_output_for_epoch_end)

-    def run(self, batch, batch_idx, dataloader_idx):
-        if batch is None:
-            return AttributeDict(signal=0, grad_norm_dic={})
-
-        # hook
-        response = self.trainer.call_hook("on_batch_start")
-        if response == -1:
-            return AttributeDict(signal=-1, grad_norm_dic={})
-
-        # hook
-        response = self.trainer.call_hook("on_train_batch_start", batch, batch_idx, dataloader_idx)
-        if response == -1:
-            return AttributeDict(signal=-1, grad_norm_dic={})
-
-        super().run(batch, batch_idx, dataloader_idx)
-
-        output = AttributeDict(
-            signal=0,
-            # todo: Properly aggregate grad_norm accros opt_idx and split_idx
-            # grad_norm_dict=grad_norm_dict,
-            grad_norm_dict={},
-            training_step_output_for_epoch_end=self.batch_outputs,
-        )
-        return output
-
     # ------------------------------------------------------------------------------------------------------------
     # HELPER --- TO BE CLEANED UP
     # ------------------------------------------------------------------------------------------------------------
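Note: the relocated run() above short-circuits a batch when either start hook answers -1. Below is a reduced sketch of that control flow using stub stand-ins (StubTrainer, run_batch are invented names; they are not the real Trainer, call_hook, or AttributeDict).

from typing import Optional


class StubTrainer:
    # Stand-in for the real Trainer: call_hook() returns -1 only for the
    # configured hook name, mimicking a hook that requests an early exit.
    def __init__(self, abort_hook: str = "") -> None:
        self.abort_hook = abort_hook

    def call_hook(self, name: str, *args) -> Optional[int]:
        return -1 if name == self.abort_hook else None


def run_batch(trainer: StubTrainer, batch) -> dict:
    # Mirrors the early-exit shape of the relocated run(): a None batch or a
    # -1 response from either start hook skips the splits entirely.
    if batch is None:
        return {"signal": 0, "grad_norm_dic": {}}
    for hook in ("on_batch_start", "on_train_batch_start"):
        if trainer.call_hook(hook, batch) == -1:
            return {"signal": -1, "grad_norm_dic": {}}
    # ...the real loop would now run the tbptt splits via super().run()...
    return {"signal": 0, "grad_norm_dic": {}}


print(run_batch(StubTrainer("on_train_batch_start"), batch=[1, 2, 3]))
# -> {'signal': -1, 'grad_norm_dic': {}}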

pytorch_lightning/loops/epoch_loop.py

Lines changed: 44 additions & 44 deletions
@@ -73,13 +73,6 @@ def connect(self, trainer: 'pl.Trainer', *args, **kwargs):
         self.trainer = trainer
         self.training_loop.connect(trainer)

-    # TODO: is it used anywhere?
-    def should_accumulate(self):
-        return self.training_loop.batch_loop.should_accumulate()
-
-    def get_active_optimizers(self, batch_idx):
-        return self.training_loop.batch_loop.get_active_optimizers(batch_idx)
-
     @property
     def done(self) -> bool:
         # TODO: Move track steps inside training loop and move part of these condition inside training loop
@@ -109,36 +102,6 @@ def on_run_start(self):
         # hook
         self.trainer.call_hook("on_train_start")

-    def on_run_end(self):
-        if self._teardown_already_run:
-            return
-        self._teardown_already_run = True
-
-        # trigger checkpoint check. need to temporarily decrease the global step to avoid saving duplicates
-        # when a checkpoint was saved at the last step
-        self.training_loop.global_step -= 1
-        # TODO: see discussion/rework https://github.com/PyTorchLightning/pytorch-lightning/issues/7406
-        self.check_checkpoint_callback(should_update=True, is_last=True)
-        self.training_loop.global_step += 1
-
-        # hook
-        self.trainer.call_hook("on_train_end")
-
-        # todo: TPU 8 cores hangs in flush with TensorBoard. Might do for all loggers.
-        # It might be related to xla tensors blocked when moving the cpu
-        # kill loggers
-        if self.trainer.logger is not None:
-            self.trainer.logger.finalize("success")
-
-        # summarize profile results
-        self.trainer.profiler.describe()
-
-        # give accelerators a chance to finish
-        self.trainer.accelerator.on_train_end()
-
-        # reset bookkeeping
-        self.trainer._running_stage = None
-
     def on_advance_start(self):  # equal to on train epoch start
         # implemented here since this code has to be run always no matter the actual epoch implementation
         epoch = self.iteration_count + 1
@@ -167,7 +130,14 @@ def on_advance_start(self):  # equal to on train epoch start
         self.trainer.call_hook("on_epoch_start")
         self.trainer.call_hook("on_train_epoch_start")

-    # why is this not the same as the old on_train_epoch_end?
+    def advance(self):
+
+        with self.trainer.profiler.profile("run_training_epoch"):
+            # run train epoch
+            epoch_output = self.training_loop.run()
+            # log epoch metrics
+            self.trainer.logger_connector.log_train_epoch_end_metrics(epoch_output)
+
     def on_advance_end(self):
         # # handle epoch_output on epoch end
         # self.on_train_epoch_end(outputs)  # Handled in on_run_end of training_loop now
@@ -193,13 +163,42 @@ def on_advance_end(self):
         # TODO: move inside training_loop.on_run_end? equivalent? order?
         self.training_loop.increment_accumulated_grad_global_step()

-    def advance(self):
+    # why is this not the same as the old on_train_epoch_end?
+    def on_run_end(self):
+        if self._teardown_already_run:
+            return
+        self._teardown_already_run = True

-        with self.trainer.profiler.profile("run_training_epoch"):
-            # run train epoch
-            epoch_output = self.training_loop.run()
-            # log epoch metrics
-            self.trainer.logger_connector.log_train_epoch_end_metrics(epoch_output)
+        # trigger checkpoint check. need to temporarily decrease the global step to avoid saving duplicates
+        # when a checkpoint was saved at the last step
+        self.training_loop.global_step -= 1
+        # TODO: see discussion/rework https://github.com/PyTorchLightning/pytorch-lightning/issues/7406
+        self.check_checkpoint_callback(should_update=True, is_last=True)
+        self.training_loop.global_step += 1
+
+        # hook
+        self.trainer.call_hook("on_train_end")
+
+        # todo: TPU 8 cores hangs in flush with TensorBoard. Might do for all loggers.
+        # It might be related to xla tensors blocked when moving the cpu
+        # kill loggers
+        if self.trainer.logger is not None:
+            self.trainer.logger.finalize("success")
+
+        # summarize profile results
+        self.trainer.profiler.describe()
+
+        # give accelerators a chance to finish
+        self.trainer.accelerator.on_train_end()
+
+        # reset bookkeeping
+        self.trainer._running_stage = None
+
+    def should_accumulate(self):
+        return self.training_loop.batch_loop.should_accumulate()
+
+    def get_active_optimizers(self, batch_idx):
+        return self.training_loop.batch_loop.get_active_optimizers(batch_idx)

     def check_checkpoint_callback(self, should_update, is_last=False):
         # TODO bake this logic into the ModelCheckpoint callback
@@ -213,3 +212,4 @@ def check_checkpoint_callback(self, should_update, is_last=False):

         for cb in callbacks:
             cb.on_validation_end(self.trainer, model)
+
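Note: read together with training_loop.py below, the trainer-level hooks visible at the call_hook(...) sites in this commit fire in the following order for a single training epoch. The recorder here is a stub, not the real Trainer.call_hook, and the per-batch hooks are elided.

fired = []


def call_hook(name: str, *args) -> None:
    fired.append(name)


call_hook("on_train_start")        # EpochLoop.on_run_start
call_hook("on_epoch_start")        # EpochLoop.on_advance_start
call_hook("on_train_epoch_start")  # EpochLoop.on_advance_start
# ...per-batch hooks such as on_batch_start / on_train_batch_start run here...
call_hook("on_epoch_end")          # TrainingLoop.on_run_end
call_hook("on_train_end")          # EpochLoop.on_run_end

print(fired)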

pytorch_lightning/loops/training_loop.py

Lines changed: 37 additions & 37 deletions
@@ -45,6 +45,43 @@ def connect(self, trainer: 'pl.Trainer', *args, **kwargs):
         self.batch_loop = BatchLoop()
         self.batch_loop.connect(trainer)

+    @property
+    def done(self):
+        # max steps reached, end training
+        if (
+            self.max_steps is not None and self.max_steps <= self.global_step + 1
+            and self.batch_loop._accumulated_batches_reached()
+        ):
+            return True
+
+        # end epoch early
+        # stop when the flag is changed or we've gone past the amount
+        # requested in the batches
+        if self.trainer.should_stop:
+            return True
+
+        # TODO: moved to on_advance_end, check if correct?
+        # self.total_batch_idx += 1
+
+        # stop epoch if we limited the number of training batches
+        if self._num_training_batches_reached(self.is_last_batch):
+            return True
+
+    def run(self, *args, **kwargs):
+        self.on_run_start()
+
+        while True:
+            try:
+                self.on_advance_start()
+                self.advance()
+                self.on_advance_end()
+            except StopIteration:
+                break
+
+            self.iteration_count += 1
+
+        return self.on_run_end()
+
     def on_run_start(self):
         # modify dataloader if needed (ddp, etc...)
         train_dataloader = self.trainer.accelerator.process_dataloader(self.trainer.train_dataloader)
@@ -121,28 +158,6 @@ def on_advance_end(self):
         # progress global step according to grads progress
         self.increment_accumulated_grad_global_step()

-    @property
-    def done(self):
-        # max steps reached, end training
-        if (
-            self.max_steps is not None and self.max_steps <= self.global_step + 1
-            and self.batch_loop._accumulated_batches_reached()
-        ):
-            return True
-
-        # end epoch early
-        # stop when the flag is changed or we've gone past the amount
-        # requested in the batches
-        if self.trainer.should_stop:
-            return True
-
-        # TODO: moved to on_advance_end, check if correct?
-        # self.total_batch_idx += 1
-
-        # stop epoch if we limited the number of training batches
-        if self._num_training_batches_reached(self.is_last_batch):
-            return True
-
     # this is the old on train_epoch_end?
     def on_run_end(self):
         # inform logger the batch loop has finished
@@ -176,21 +191,6 @@ def on_run_end(self):
         self.trainer.call_hook('on_epoch_end')
         return self.epoch_output

-    def run(self, *args, **kwargs):
-        self.on_run_start()
-
-        while True:
-            try:
-                self.on_advance_start()
-                self.advance()
-                self.on_advance_end()
-            except StopIteration:
-                break
-
-            self.iteration_count += 1
-
-        return self.on_run_end()
-
     # ------------------------------------------------------------------------------------------------------------
     # HELPER --- TO BE CLEANED UP
     # ------------------------------------------------------------------------------------------------------------
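Note: the run() override moved above relies on StopIteration as its exit signal; whichever of on_advance_start, advance, or on_advance_end raises it ends the epoch cleanly before on_run_end runs. Below is a reduced, hypothetical sketch of that control flow (run_epoch is an invented name, not part of the codebase).

def run_epoch(batches) -> list:
    # Reduced sketch of the relocated run(): loop until StopIteration, then
    # break out and hand results back (the on_run_end step in the real loop).
    outputs = []
    it = iter(batches)
    iteration_count = 0
    while True:
        try:
            batch = next(it)           # on_advance_start: fetch the next batch
            outputs.append(batch * 2)  # advance: process the batch
        except StopIteration:          # dataloader exhausted -> leave the loop
            break
        iteration_count += 1
    return outputs


print(run_epoch([1, 2, 3]))  # -> [2, 4, 6]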
