8 changes: 8 additions & 0 deletions pytorch_lightning/callbacks/base.py
@@ -46,6 +46,14 @@ def on_sanity_check_end(self, trainer, pl_module):
"""Called when the validation sanity check ends."""
pass

def on_train_batch_start(self, trainer, pl_module):
"""Called when the validation batch begins."""
pass

def on_train_batch_end(self, trainer, pl_module):
"""Called when the validation batch ends."""
pass

def on_train_epoch_start(self, trainer, pl_module):
"""Called when the train epoch begins."""
pass
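For reference, a minimal sketch of how a user-defined callback might adopt the two new hooks (the `PrintingCallback` name and the printed messages are purely illustrative):

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import Callback


class PrintingCallback(Callback):
    """Illustrative callback: reports progress around every training batch."""

    def on_train_batch_start(self, trainer, pl_module):
        # invoked by the trainer just before a training batch is processed
        print(f'starting batch {trainer.batch_idx} of epoch {trainer.current_epoch}')

    def on_train_batch_end(self, trainer, pl_module):
        # invoked by the trainer right after the training batch is processed
        print(f'finished batch {trainer.batch_idx}')


# the callback is registered on the Trainer as usual
trainer = Trainer(callbacks=[PrintingCallback()])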
2 changes: 1 addition & 1 deletion pytorch_lightning/callbacks/lr_logger.py
@@ -64,7 +64,7 @@ def on_train_start(self, trainer, pl_module):
# Initialize for storing values
self.lrs = {name: [] for name in names}

def on_batch_start(self, trainer, pl_module):
def on_train_batch_start(self, trainer, pl_module):
latest_stat = self._extract_lr(trainer, 'step')
if trainer.logger and latest_stat:
trainer.logger.log_metrics(latest_stat, step=trainer.global_step)
10 changes: 5 additions & 5 deletions pytorch_lightning/callbacks/progress.py
@@ -36,8 +36,8 @@ def __init__(self):
def disable(self):
self.enable = False

def on_batch_end(self, trainer, pl_module):
super().on_batch_end(trainer, pl_module) # don't forget this :)
def on_train_batch_end(self, trainer, pl_module):
super().on_train_batch_end(trainer, pl_module) # don't forget this :)
percent = (self.train_batch_idx / self.total_train_batches) * 100
sys.stdout.flush()
sys.stdout.write(f'{percent:.01f} percent complete \r')
@@ -138,7 +138,7 @@ def on_train_start(self, trainer, pl_module):
def on_epoch_start(self, trainer, pl_module):
self._train_batch_idx = 0

def on_batch_end(self, trainer, pl_module):
def on_train_batch_end(self, trainer, pl_module):
self._train_batch_idx += 1

def on_validation_start(self, trainer, pl_module):
@@ -318,8 +318,8 @@ def on_epoch_start(self, trainer, pl_module):
self.main_progress_bar.reset(convert_inf(total_batches))
self.main_progress_bar.set_description(f'Epoch {trainer.current_epoch + 1}')

def on_batch_end(self, trainer, pl_module):
super().on_batch_end(trainer, pl_module)
def on_train_batch_end(self, trainer, pl_module):
super().on_train_batch_end(trainer, pl_module)
if self.is_enabled and self.train_batch_idx % self.refresh_rate == 0:
self.main_progress_bar.update(self.refresh_rate)
self.main_progress_bar.set_postfix(trainer.progress_bar_dict)
21 changes: 21 additions & 0 deletions pytorch_lightning/core/hooks.py
@@ -77,6 +77,23 @@ def on_train_end(self) -> None:
"""
# do something at the end of training

def on_train_batch_start(self, batch: Any) -> None:
"""
Called in the training loop before anything happens for that batch.

If you return -1 here, you will skip training for the rest of the current epoch.

Args:
batch: The batched data as it is returned by the training DataLoader.
"""
# do something when the batch starts

def on_train_batch_end(self) -> None:
"""
Called in the training loop after the batch.
"""
# do something when the batch ends

def on_batch_start(self, batch: Any) -> None:
"""
Called in the training loop before anything happens for that batch.
@@ -85,12 +102,16 @@ def on_batch_start(self, batch: Any) -> None:

Args:
batch: The batched data as it is returned by the training DataLoader.

.. warning:: Deprecated in 0.9.0, will be removed in 1.0.0 (use `on_train_batch_start` instead)
"""
# do something when the batch starts

def on_batch_end(self) -> None:
"""
Called in the training loop after the batch.

.. warning:: Deprecated in 0.9.0, will be removed in 1.0.0 (use `on_train_batch_end` instead)
"""
# do something when the batch ends

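A minimal sketch of the module-level hooks above (the `LitModel` class and its skip condition are hypothetical): returning -1 from `on_train_batch_start` skips training for the rest of the current epoch, and the deprecated `on_batch_start`/`on_batch_end` pair maps one-to-one onto the new names:

import pytorch_lightning as pl


class LitModel(pl.LightningModule):

    def on_train_batch_start(self, batch):
        # hypothetical guard: abort the rest of the epoch on a sentinel batch
        if batch is None:
            return -1

    def on_train_batch_end(self):
        # runs after every training batch; replaces the deprecated on_batch_end
        pass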
2 changes: 1 addition & 1 deletion pytorch_lightning/core/lightning.py
@@ -1771,7 +1771,7 @@ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwarg
elif self.example_input_array is not None:
input_data = self.example_input_array
else:
raise ValueError(f'input_sample and example_input_array tensors are both missing.')
raise ValueError('input_sample and example_input_array tensors are both missing.')

if 'example_outputs' not in kwargs:
self.eval()
12 changes: 11 additions & 1 deletion pytorch_lightning/trainer/callback_hook.py
@@ -9,7 +9,7 @@ class TrainerCallbackHookMixin(ABC):
# this is just a summary on variables used in this abstract class,
# the proper values/initialisation should be done in child class
callbacks: List[Callback] = []
get_model: Callable = ...
get_model: Callable

def setup(self, stage: str):
"""Called in the beginning of fit and test"""
@@ -111,6 +111,16 @@ def on_batch_end(self):
for callback in self.callbacks:
callback.on_batch_end(self, self.get_model())

def on_train_batch_start(self):
"""Called when the training batch begins."""
for callback in self.callbacks:
callback.on_train_batch_start(self, self.get_model())

def on_train_batch_end(self):
"""Called when the training batch ends."""
for callback in self.callbacks:
callback.on_train_batch_end(self, self.get_model())

def on_validation_batch_start(self):
"""Called when the validation batch begins."""
for callback in self.callbacks:
2 changes: 1 addition & 1 deletion pytorch_lightning/trainer/lr_finder.py
@@ -382,7 +382,7 @@ def on_batch_start(self, trainer, pl_module):

self.lrs.append(trainer.lr_schedulers[0]['scheduler'].lr[0])

def on_batch_end(self, trainer, pl_module):
def on_train_batch_end(self, trainer, pl_module):
""" Called when the training batch ends, logs the calculated loss """
if (trainer.batch_idx + 1) % trainer.accumulate_grad_batches != 0:
return
19 changes: 19 additions & 0 deletions pytorch_lightning/trainer/training_loop.py
@@ -263,6 +263,8 @@ class TrainerTrainLoopMixin(ABC):
on_train_end: Callable
on_batch_start: Callable
on_batch_end: Callable
on_train_batch_start: Callable
on_train_batch_end: Callable
on_epoch_start: Callable
on_epoch_end: Callable
on_validation_end: Callable
@@ -690,6 +692,7 @@ def run_training_batch(self, batch, batch_idx):
return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic)

# Batch start events
# TODO: deprecate 1.0
with self.profiler.profile('on_batch_start'):
# callbacks
self.on_batch_start()
@@ -699,6 +702,15 @@
if response == -1:
return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

with self.profiler.profile('on_train_batch_start'):
# callbacks
self.on_train_batch_start()
# hooks
if self.is_function_implemented('on_train_batch_start'):
response = self.get_model().on_train_batch_start(batch)
if response == -1:
return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic)

splits = [batch]
if self.truncated_bptt_steps is not None:
model_ref = self.get_model()
@@ -785,6 +797,13 @@ def run_training_batch(self, batch, batch_idx):
if self.is_function_implemented('on_batch_end'):
self.get_model().on_batch_end()

with self.profiler.profile('on_train_batch_end'):
# callbacks
self.on_train_batch_end()
# model hooks
if self.is_function_implemented('on_train_batch_end'):
self.get_model().on_train_batch_end()

# collapse all metrics into one dict
batch_log_metrics = {k: v for d in batch_log_metrics for k, v in d.items()}

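Note that the loop above still fires the deprecated `on_batch_start`/`on_batch_end` events immediately before their `on_train_batch_*` counterparts, so existing callbacks keep working during the deprecation window. A small illustrative callback (hypothetical `EventOrderCallback`) that records the resulting call order:

from pytorch_lightning.callbacks import Callback


class EventOrderCallback(Callback):
    """Collects batch-level events; the deprecated hooks arrive before the new ones."""

    def __init__(self):
        self.events = []

    def on_batch_start(self, trainer, pl_module):
        self.events.append('on_batch_start')        # deprecated, still called

    def on_train_batch_start(self, trainer, pl_module):
        self.events.append('on_train_batch_start')  # new hook, fired right after

    def on_batch_end(self, trainer, pl_module):
        self.events.append('on_batch_end')          # deprecated, still called

    def on_train_batch_end(self, trainer, pl_module):
        self.events.append('on_train_batch_end')    # new hook, fired right after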
16 changes: 16 additions & 0 deletions tests/callbacks/test_callbacks.py
@@ -28,6 +28,8 @@ def __init__(self):
self.on_epoch_end_called = False
self.on_batch_start_called = False
self.on_batch_end_called = False
self.on_train_batch_start_called = False
self.on_train_batch_end_called = False
self.on_validation_batch_start_called = False
self.on_validation_batch_end_called = False
self.on_test_batch_start_called = False
@@ -87,6 +89,14 @@ def on_batch_end(self, trainer, pl_module):
_check_args(trainer, pl_module)
self.on_batch_end_called = True

def on_train_batch_start(self, trainer, pl_module):
_check_args(trainer, pl_module)
self.on_train_batch_start_called = True

def on_train_batch_end(self, trainer, pl_module):
_check_args(trainer, pl_module)
self.on_train_batch_end_called = True

def on_validation_batch_start(self, trainer, pl_module):
_check_args(trainer, pl_module)
self.on_validation_batch_start_called = True
@@ -150,6 +160,8 @@ def on_test_end(self, trainer, pl_module):
assert not test_callback.on_epoch_start_called
assert not test_callback.on_batch_start_called
assert not test_callback.on_batch_end_called
assert not test_callback.on_train_batch_start_called
assert not test_callback.on_train_batch_end_called
assert not test_callback.on_validation_batch_start_called
assert not test_callback.on_validation_batch_end_called
assert not test_callback.on_test_batch_start_called
@@ -177,6 +189,8 @@ def on_test_end(self, trainer, pl_module):
assert not test_callback.on_epoch_start_called
assert not test_callback.on_batch_start_called
assert not test_callback.on_batch_end_called
assert not test_callback.on_train_batch_start_called
assert not test_callback.on_train_batch_end_called
assert not test_callback.on_validation_batch_start_called
assert not test_callback.on_validation_batch_end_called
assert not test_callback.on_test_batch_start_called
@@ -202,6 +216,8 @@ def on_test_end(self, trainer, pl_module):
assert test_callback.on_epoch_start_called
assert test_callback.on_batch_start_called
assert test_callback.on_batch_end_called
assert test_callback.on_train_batch_start_called
assert test_callback.on_train_batch_end_called
assert test_callback.on_validation_batch_start_called
assert test_callback.on_validation_batch_end_called
assert test_callback.on_train_start_called
8 changes: 4 additions & 4 deletions tests/callbacks/test_progress_bar.py
@@ -153,12 +153,12 @@ class CurrentProgressBar(ProgressBar):
val_batches_seen = 0
test_batches_seen = 0

def on_batch_start(self, trainer, pl_module):
super().on_batch_start(trainer, pl_module)
def on_train_batch_start(self, trainer, pl_module):
super().on_train_batch_start(trainer, pl_module)
assert self.train_batch_idx == trainer.batch_idx

def on_batch_end(self, trainer, pl_module):
super().on_batch_end(trainer, pl_module)
def on_train_batch_end(self, trainer, pl_module):
super().on_train_batch_end(trainer, pl_module)
assert self.train_batch_idx == trainer.batch_idx + 1
if not self.is_disabled and self.train_batch_idx % self.refresh_rate == 0:
assert self.main_progress_bar.n == self.train_batch_idx
6 changes: 3 additions & 3 deletions tests/core/test_datamodules.py
@@ -50,17 +50,17 @@ def test_can_prepare_data(tmpdir):

# is_overridden prepare data = True
# has been called
# False
# False
dm._has_prepared_data = True
assert not trainer.can_prepare_data()

# has not been called
# True
# True
dm._has_prepared_data = False
assert trainer.can_prepare_data()

# is_overridden prepare data = False
# True
# True
dm.prepare_data = None
assert trainer.can_prepare_data()

2 changes: 1 addition & 1 deletion tests/loggers/test_all.py
@@ -214,7 +214,7 @@ class RankZeroLoggerCheck(Callback):
# this class has to be defined outside the test function, otherwise we get pickle error
# due to the way ddp process is launched

def on_batch_start(self, trainer, pl_module):
def on_train_batch_start(self, trainer, pl_module):
is_dummy = isinstance(trainer.logger.experiment, DummyExperiment)
if trainer.is_global_zero:
assert not is_dummy
4 changes: 2 additions & 2 deletions tests/trainer/test_trainer.py
@@ -377,7 +377,7 @@ def increment_on_load_checkpoint(self, _):
# Bind methods to keep track of epoch numbers, batch numbers it has seen
# as well as number of times it has called on_load_checkpoint()
model.on_epoch_end = types.MethodType(increment_epoch, model)
model.on_batch_start = types.MethodType(increment_batch, model)
model.on_train_batch_start = types.MethodType(increment_batch, model)
model.on_load_checkpoint = types.MethodType(increment_on_load_checkpoint, model)
return model

@@ -691,7 +691,7 @@ class InterruptCallback(Callback):
def __init__(self):
super().__init__()

def on_batch_start(self, trainer, pl_module):
def on_train_batch_start(self, trainer, pl_module):
raise KeyboardInterrupt

class HandleInterruptCallback(Callback):
2 changes: 1 addition & 1 deletion tests/utilities/test_dtype_device_mixin.py
@@ -27,7 +27,7 @@ def __init__(self, *args, **kwargs):

class DeviceAssertCallback(Callback):

def on_batch_start(self, trainer, model):
def on_train_batch_start(self, trainer, model):
rank = trainer.local_rank
assert isinstance(model, TopModule)
# index = None also means first device