From a774766e14408b56e7cbae615d9d8fa48cc35ed3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 5 May 2021 13:15:10 +0200
Subject: [PATCH 01/64] update train step

---
 pytorch_lightning/accelerators/accelerator.py | 2 +-
 pytorch_lightning/trainer/training_loop.py    | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 558fbc30d5c7c..0af6f375dd722 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -198,7 +198,7 @@ def training_step(
             - hiddens(:class:`~torch.Tensor`): Passed in if
               :paramref:`~pytorch_lightning.core.lightning.LightningModule.truncated_bptt_steps` > 0.
         """
-        args[0] = self.to_device(args[0])
+        # args[0] = self.to_device(args[0])
 
         with self.precision_plugin.train_step_context(), self.training_type_plugin.train_step_context():
             return self.training_type_plugin.training_step(*args)
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index a23c8ba28cad7..3a5b8ba660ad3 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -471,6 +471,8 @@ def run_training_epoch(self):
             # ------------------------------------
             # TRAINING_STEP + TRAINING_STEP_END
             # ------------------------------------
+            batch = self.trainer.accelerator.to_device(batch)
+
             with self.trainer.profiler.profile("run_training_batch"):
                 batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
 

From 64af97dd2d6d52a42232c30056347fcce59a2141 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 5 May 2021 13:45:01 +0200
Subject: [PATCH 02/64] test

---
 tests/callbacks/test_callbacks.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index a30b4fe0f609b..0b2ca996abf7f 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -14,7 +14,7 @@
 from unittest import mock
 from unittest.mock import ANY, call, MagicMock, Mock
 
-from pytorch_lightning import Trainer
+from pytorch_lightning import Trainer, Callback
 from tests.helpers import BoringModel
 
 
@@ -268,3 +268,24 @@ def configure_callbacks(self):
         trainer_fn(ckpt_path=None)
         callbacks_after = trainer.callbacks.copy()
         assert callbacks_after == callbacks_after_fit
+
+
+class BatchObserverCallback(Callback):
+
+    def on_train_batch_end(self, trainer, pl_module, **kwargs):
+        batch = kwargs.get("batch")
+        assert batch.device.type == "cuda"
+        assert batch.device == pl_module.device
+
+
+def test_callback_batch_on_device(tmpdir):
+
+    batch_callback = BatchObserverCallback()
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        gpus=1,
+        callbacks=[batch_callback],
+    )
+    trainer.fit(model)
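Patch 01 relies on the accelerator's existing transfer logic rather than introducing a new one. As a reference for what "moving the batch to device" does for collection-typed batches, here is a minimal sketch using Lightning's public `move_data_to_device` utility, the helper that the accelerator falls back to when no LightningModule is attached; the shapes are illustrative and a CUDA device is assumed:

    import torch
    from pytorch_lightning.utilities import move_data_to_device

    # Recurses into tuples, lists and dicts, moving every tensor it finds.
    batch = {"samples": torch.rand(4, 32), "targets": torch.zeros(4)}
    batch = move_data_to_device(batch, torch.device("cuda:0"))
    assert batch["samples"].device.type == "cuda"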
From 36d6a910da4bfdabde1c90635a8649e0467f8a6c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 5 May 2021 13:48:06 +0200
Subject: [PATCH 03/64] x

---
 tests/callbacks/test_callbacks.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index 0b2ca996abf7f..6490289aed39e 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -272,8 +272,7 @@ def configure_callbacks(self):
 
 class BatchObserverCallback(Callback):
 
-    def on_train_batch_end(self, trainer, pl_module, **kwargs):
-        batch = kwargs.get("batch")
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, *args):
         assert batch.device.type == "cuda"
         assert batch.device == pl_module.device
 

From cb6112ef24a8f33f51242778925ed1657cd7834f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 5 May 2021 13:49:44 +0200
Subject: [PATCH 04/64] limits

---
 tests/callbacks/test_callbacks.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index 6490289aed39e..380b8fad3795d 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -284,6 +284,10 @@ def test_callback_batch_on_device(tmpdir):
     model = BoringModel()
     trainer = Trainer(
         default_root_dir=tmpdir,
+        limit_train_batches=1,
+        limit_val_batches=1,
+        limit_test_batches=1,
+        limit_predict_batches=1,
         gpus=1,
         callbacks=[batch_callback],
     )

From 16728d1d10c539dd91fea1df869ae6515934aad7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 5 May 2021 13:51:50 +0200
Subject: [PATCH 05/64] val

---
 tests/callbacks/test_callbacks.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index 380b8fad3795d..b3dd2da1caa1c 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -276,6 +276,10 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, *args):
         assert batch.device.type == "cuda"
         assert batch.device == pl_module.device
 
+    def on_validation_batch_end(trainer, pl_module, outputs, batch, *args):
+        assert batch.device.type == "cuda"
+        assert batch.device == pl_module.device
+
 
 def test_callback_batch_on_device(tmpdir):
 

From de08636827513b71982147d92bc079d16abf4b8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 5 May 2021 13:52:07 +0200
Subject: [PATCH 06/64] typo

---
 tests/callbacks/test_callbacks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index b3dd2da1caa1c..04cdff51580f0 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -276,7 +276,7 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, *args):
         assert batch.device.type == "cuda"
         assert batch.device == pl_module.device
 
-    def on_validation_batch_end(trainer, pl_module, outputs, batch, *args):
+    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, *args):
         assert batch.device.type == "cuda"
         assert batch.device == pl_module.device
 
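Patches 03-06 settle the callback signatures by trial and error. For reference, the positional parameters that the `*args` in these tests absorb are `batch_idx` and `dataloader_idx`; a sketch of the unabbreviated batch-hook signatures, assuming the hook API documented around this release:

    from pytorch_lightning import Callback

    class VerboseSignatures(Callback):
        # Assumed full signatures; the tests above shorten the tail to *args.
        def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
            ...

        def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
            ...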
From 66934e4ff2706da8e3e92a6aa4142e5d8cbe3d9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 5 May 2021 13:56:03 +0200
Subject: [PATCH 07/64] x

---
 tests/callbacks/test_callbacks.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index 04cdff51580f0..4a9408324aa39 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -273,11 +273,15 @@ def configure_callbacks(self):
 class BatchObserverCallback(Callback):
 
     def on_train_batch_end(self, trainer, pl_module, outputs, batch, *args):
-        assert batch.device.type == "cuda"
         assert batch.device == pl_module.device
 
     def on_validation_batch_end(self, trainer, pl_module, outputs, batch, *args):
-        assert batch.device.type == "cuda"
+        assert batch.device == pl_module.device
+
+    def on_test_batch_end(self, trainer, pl_module, outputs, batch, *args):
+        assert batch.device == pl_module.device
+
+    def on_predict_batch_end(self, trainer, pl_module, outputs, batch, *args):
         assert batch.device == pl_module.device
 

From 89202c93c05dac83ba9895e5718555719bf0d700 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 5 May 2021 13:57:22 +0200
Subject: [PATCH 08/64] x

---
 tests/callbacks/test_callbacks.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index 4a9408324aa39..894af7fdf8d5b 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -272,17 +272,20 @@ def configure_callbacks(self):
 
 class BatchObserverCallback(Callback):
 
-    def on_train_batch_end(self, trainer, pl_module, outputs, batch, *args):
-        assert batch.device == pl_module.device
-
-    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, *args):
-        assert batch.device == pl_module.device
-
-    def on_test_batch_end(self, trainer, pl_module, outputs, batch, *args):
-        assert batch.device == pl_module.device
-
-    def on_predict_batch_end(self, trainer, pl_module, outputs, batch, *args):
+    def on_train_batch_start(self,trainer, pl_module, batch, *args):
         assert batch.device == pl_module.device
+    #
+    # def on_train_batch_end(self, trainer, pl_module, outputs, batch, *args):
+    #     assert batch.device == pl_module.device
+    #
+    # def on_validation_batch_end(self, trainer, pl_module, outputs, batch, *args):
+    #     assert batch.device == pl_module.device
+    #
+    # def on_test_batch_end(self, trainer, pl_module, outputs, batch, *args):
+    #     assert batch.device == pl_module.device
+    #
+    # def on_predict_batch_end(self, trainer, pl_module, outputs, batch, *args):
+    #     assert batch.device == pl_module.device
""" - batch = self.to_device(args[0]) + # batch = self.to_device(args[0]) - args[0] = batch + # args[0] = batch with self.precision_plugin.test_step_context(), self.training_type_plugin.test_step_context(): return self.training_type_plugin.test_step(*args) @@ -257,9 +257,9 @@ def predict_step(self, args: List[Union[Any, int]]) -> STEP_OUTPUT: (only if multiple predict dataloaders used). """ - batch = self.to_device(args[0]) + # batch = self.to_device(args[0]) - args[0] = batch + # args[0] = batch with self.precision_plugin.predict_step_context(), self.training_type_plugin.predict_step_context(): return self.training_type_plugin.predict_step(*args) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 105f8e6810d36..0fbd45f02957b 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -954,6 +954,9 @@ def run_evaluation(self, on_epoch: bool = False) -> _EVALUATE_OUTPUT: if batch_idx >= dl_max_batches: break + # TODO: where is the profile step for to_device? + batch = self.accelerator.to_device(batch) + # hook self.evaluation_loop.on_evaluation_batch_start(batch, batch_idx, dataloader_idx) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 3a5b8ba660ad3..0c620d9e6e948 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -471,6 +471,8 @@ def run_training_epoch(self): # ------------------------------------ # TRAINING_STEP + TRAINING_STEP_END # ------------------------------------ + + # TODO: where is the profile step for to_device? batch = self.trainer.accelerator.to_device(batch) with self.trainer.profiler.profile("run_training_batch"): diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 894af7fdf8d5b..e4c8f7e1b8823 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -272,20 +272,29 @@ def configure_callbacks(self): class BatchObserverCallback(Callback): - def on_train_batch_start(self,trainer, pl_module, batch, *args): + def on_train_batch_start(self, trainer, pl_module, batch, *args): + assert batch.device == pl_module.device + + def on_validation_batch_start(self, trainer, pl_module, batch, *args): + assert batch.device == pl_module.device + + def on_test_batch_start(self, trainer, pl_module, batch, *args): + assert batch.device == pl_module.device + + def on_predict_batch_start(self, trainer, pl_module, batch, *args): + assert batch.device == pl_module.device + + def on_train_batch_end(self, trainer, pl_module, outputs, batch, *args): + assert batch.device == pl_module.device + + def on_validation_batch_end(self, trainer, pl_module, outputs, batch, *args): + assert batch.device == pl_module.device + + def on_test_batch_end(self, trainer, pl_module, outputs, batch, *args): + assert batch.device == pl_module.device + + def on_predict_batch_end(self, trainer, pl_module, outputs, batch, *args): assert batch.device == pl_module.device - # - # def on_train_batch_end(self, trainer, pl_module, outputs, batch, *args): - # assert batch.device == pl_module.device - # - # def on_validation_batch_end(self, trainer, pl_module, outputs, batch, *args): - # assert batch.device == pl_module.device - # - # def on_test_batch_end(self, trainer, pl_module, outputs, batch, *args): - # assert batch.device == pl_module.device - # - # def on_predict_batch_end(self, trainer, pl_module, outputs, batch, *args): - # assert batch.device == pl_module.device 
From 0ff4f9ef7dd2fdc9427771cb0a3b2167549f3709 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 6 May 2021 09:37:20 +0200
Subject: [PATCH 10/64] min gpus

---
 tests/callbacks/test_callbacks.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index e4c8f7e1b8823..c8815ad33947b 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -16,6 +16,7 @@
 
 from pytorch_lightning import Trainer, Callback
 from tests.helpers import BoringModel
+from tests.helpers.runif import RunIf
 
 
 @mock.patch("torch.save")  # need to mock torch.save or we get pickle error
@@ -297,6 +298,7 @@ def on_predict_batch_end(self, trainer, pl_module, outputs, batch, *args):
         assert batch.device == pl_module.device
 
 
+@RunIf(min_gpus=1)
 def test_callback_batch_on_device(tmpdir):
 
     batch_callback = BatchObserverCallback()

From 0d3f47ae2d3fd45ed962aed573608d125097f14c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 6 May 2021 09:42:41 +0200
Subject: [PATCH 11/64] run all loops

---
 tests/callbacks/test_callbacks.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index c8815ad33947b..ce4064cf3367c 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -314,3 +314,6 @@ def test_callback_batch_on_device(tmpdir):
         callbacks=[batch_callback],
     )
     trainer.fit(model)
+    trainer.validate(model)
+    trainer.test(model)
+    trainer.predict(model)

From fb9ffed19aa9f1404985bab1bb9338111dc8ca9f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 6 May 2021 09:44:13 +0200
Subject: [PATCH 12/64] x

---
 pytorch_lightning/trainer/trainer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 0fbd45f02957b..1f431e102f2e2 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1080,6 +1080,8 @@ def run_predict(self) -> Optional[_PREDICT_OUTPUT]:
                 if batch_idx >= dl_max_batches:
                     break
 
+                batch = self.accelerator.to_device(batch)
+
                 # lightning module methods
                 with self.profiler.profile("predict_step"):
                     self.predict_loop.predict_step(batch, batch_idx, dataloader_idx)

From b7befc1ce918149c41215efce9e69c34d3c9d512 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 6 May 2021 09:45:20 +0200
Subject: [PATCH 13/64] limit test

---
 tests/callbacks/test_callbacks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index ce4064cf3367c..511ecc521115e 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -306,6 +306,7 @@ def test_callback_batch_on_device(tmpdir):
     model = BoringModel()
     trainer = Trainer(
         default_root_dir=tmpdir,
+        max_steps=1,
         limit_train_batches=1,
         limit_val_batches=1,
         limit_test_batches=1,

From fe065a592f9b8e752b8b1cf44926fb3dea92323f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 6 May 2021 09:49:47 +0200
Subject: [PATCH 14/64] profiler

---
 pytorch_lightning/trainer/trainer.py       | 7 ++++---
 pytorch_lightning/trainer/training_loop.py | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 1f431e102f2e2..28b9ab9cd30e9 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -954,8 +954,8 @@ def run_evaluation(self, on_epoch: bool = False) -> _EVALUATE_OUTPUT:
                 if batch_idx >= dl_max_batches:
                     break
 
-                # TODO: where is the profile step for to_device?
-                batch = self.accelerator.to_device(batch)
+                with self.profiler.profile("evaluation_batch_to_device"):
+                    batch = self.accelerator.to_device(batch)
 
                 # hook
                 self.evaluation_loop.on_evaluation_batch_start(batch, batch_idx, dataloader_idx)
@@ -1080,7 +1080,8 @@ def run_predict(self) -> Optional[_PREDICT_OUTPUT]:
                 if batch_idx >= dl_max_batches:
                     break
 
-                batch = self.accelerator.to_device(batch)
+                with self.profiler.profile("predict_batch_to_device"):
+                    batch = self.accelerator.to_device(batch)
 
                 # lightning module methods
                 with self.profiler.profile("predict_step"):
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 0c620d9e6e948..dabd4db48e240 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -472,8 +472,8 @@ def run_training_epoch(self):
             # TRAINING_STEP + TRAINING_STEP_END
             # ------------------------------------
 
-            # TODO: where is the profile step for to_device?
-            batch = self.trainer.accelerator.to_device(batch)
+            with self.trainer.profiler.profile("training_batch_to_device"):
+                batch = self.trainer.accelerator.to_device(batch)
 
             with self.trainer.profiler.profile("run_training_batch"):
                 batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
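With patch 14 the transfers become named profiler actions, so their cost shows up next to `run_training_batch` in the report. `profile` on Lightning profilers is a context manager keyed by an action name; a minimal standalone sketch with the built-in `SimpleProfiler` (the action name here is illustrative):

    import time
    from pytorch_lightning.profiler import SimpleProfiler

    profiler = SimpleProfiler()
    with profiler.profile("training_batch_to_device"):
        time.sleep(0.01)  # stand-in for the timed work, e.g. the device copy
    print(profiler.summary())  # per-action mean and total durations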
""" - # batch = self.to_device(args[0]) - - # args[0] = batch - with self.precision_plugin.predict_step_context(), self.training_type_plugin.predict_step_context(): return self.training_type_plugin.predict_step(*args) From 642bcf6e6f4ab4f73cd96d5ca1d7b6ae2b95d4e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 May 2021 10:00:52 +0200 Subject: [PATCH 16/64] move files --- tests/trainer/{data_flow => loops}/__init__.py | 0 tests/trainer/{data_flow => loops}/test_eval_loop_flow.py | 0 tests/trainer/{ => loops}/test_evaluation_loop.py | 0 tests/trainer/{data_flow => loops}/test_flow_warnings.py | 0 tests/trainer/{data_flow => loops}/test_train_loop_flow_dict.py | 0 tests/trainer/{data_flow => loops}/test_train_loop_flow_scalar.py | 0 tests/trainer/{ => loops}/test_training_loop.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename tests/trainer/{data_flow => loops}/__init__.py (100%) rename tests/trainer/{data_flow => loops}/test_eval_loop_flow.py (100%) rename tests/trainer/{ => loops}/test_evaluation_loop.py (100%) rename tests/trainer/{data_flow => loops}/test_flow_warnings.py (100%) rename tests/trainer/{data_flow => loops}/test_train_loop_flow_dict.py (100%) rename tests/trainer/{data_flow => loops}/test_train_loop_flow_scalar.py (100%) rename tests/trainer/{ => loops}/test_training_loop.py (100%) diff --git a/tests/trainer/data_flow/__init__.py b/tests/trainer/loops/__init__.py similarity index 100% rename from tests/trainer/data_flow/__init__.py rename to tests/trainer/loops/__init__.py diff --git a/tests/trainer/data_flow/test_eval_loop_flow.py b/tests/trainer/loops/test_eval_loop_flow.py similarity index 100% rename from tests/trainer/data_flow/test_eval_loop_flow.py rename to tests/trainer/loops/test_eval_loop_flow.py diff --git a/tests/trainer/test_evaluation_loop.py b/tests/trainer/loops/test_evaluation_loop.py similarity index 100% rename from tests/trainer/test_evaluation_loop.py rename to tests/trainer/loops/test_evaluation_loop.py diff --git a/tests/trainer/data_flow/test_flow_warnings.py b/tests/trainer/loops/test_flow_warnings.py similarity index 100% rename from tests/trainer/data_flow/test_flow_warnings.py rename to tests/trainer/loops/test_flow_warnings.py diff --git a/tests/trainer/data_flow/test_train_loop_flow_dict.py b/tests/trainer/loops/test_train_loop_flow_dict.py similarity index 100% rename from tests/trainer/data_flow/test_train_loop_flow_dict.py rename to tests/trainer/loops/test_train_loop_flow_dict.py diff --git a/tests/trainer/data_flow/test_train_loop_flow_scalar.py b/tests/trainer/loops/test_train_loop_flow_scalar.py similarity index 100% rename from tests/trainer/data_flow/test_train_loop_flow_scalar.py rename to tests/trainer/loops/test_train_loop_flow_scalar.py diff --git a/tests/trainer/test_training_loop.py b/tests/trainer/loops/test_training_loop.py similarity index 100% rename from tests/trainer/test_training_loop.py rename to tests/trainer/loops/test_training_loop.py From 2d1b7ca2a4ca76c28ff05d7ee16189f83cfd2290 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 May 2021 10:04:41 +0200 Subject: [PATCH 17/64] rename --- .../{test_eval_loop_flow.py => test_evaluation_loop_flow.py} | 0 ...st_train_loop_flow_dict.py => test_training_loop_flow_dict.py} | 0 ...rain_loop_flow_scalar.py => test_training_loop_flow_scalar.py} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename tests/trainer/loops/{test_eval_loop_flow.py => test_evaluation_loop_flow.py} (100%) rename 
From 2d1b7ca2a4ca76c28ff05d7ee16189f83cfd2290 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 6 May 2021 10:04:41 +0200
Subject: [PATCH 17/64] rename

---
 .../{test_eval_loop_flow.py => test_evaluation_loop_flow.py}      | 0
 ...st_train_loop_flow_dict.py => test_training_loop_flow_dict.py} | 0
 ...rain_loop_flow_scalar.py => test_training_loop_flow_scalar.py} | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename tests/trainer/loops/{test_eval_loop_flow.py => test_evaluation_loop_flow.py} (100%)
 rename tests/trainer/loops/{test_train_loop_flow_dict.py => test_training_loop_flow_dict.py} (100%)
 rename tests/trainer/loops/{test_train_loop_flow_scalar.py => test_training_loop_flow_scalar.py} (100%)

diff --git a/tests/trainer/loops/test_eval_loop_flow.py b/tests/trainer/loops/test_evaluation_loop_flow.py
similarity index 100%
rename from tests/trainer/loops/test_eval_loop_flow.py
rename to tests/trainer/loops/test_evaluation_loop_flow.py
diff --git a/tests/trainer/loops/test_train_loop_flow_dict.py b/tests/trainer/loops/test_training_loop_flow_dict.py
similarity index 100%
rename from tests/trainer/loops/test_train_loop_flow_dict.py
rename to tests/trainer/loops/test_training_loop_flow_dict.py
diff --git a/tests/trainer/loops/test_train_loop_flow_scalar.py b/tests/trainer/loops/test_training_loop_flow_scalar.py
similarity index 100%
rename from tests/trainer/loops/test_train_loop_flow_scalar.py
rename to tests/trainer/loops/test_training_loop_flow_scalar.py
From 50ac2a59ecc2381bd3e8eeac624489faaa453cb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 6 May 2021 10:11:56 +0200
Subject: [PATCH 18/64] move tests

---
 tests/callbacks/test_callbacks.py | 52 +-------------------------
 tests/trainer/loops/test_all.py   | 65 +++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 51 deletions(-)
 create mode 100644 tests/trainer/loops/test_all.py

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index 1d7d7ce3ed6b7..9b048e022c45b 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -14,9 +14,8 @@
 from unittest import mock
 from unittest.mock import ANY, call, MagicMock, Mock
 
-from pytorch_lightning import Trainer, Callback
+from pytorch_lightning import Trainer
 from tests.helpers import BoringModel
-from tests.helpers.runif import RunIf
 
 
 @mock.patch("torch.save")  # need to mock torch.save or we get pickle error
@@ -269,52 +268,3 @@ def configure_callbacks(self):
         trainer_fn(ckpt_path=None)
         callbacks_after = trainer.callbacks.copy()
         assert callbacks_after == callbacks_after_fit
diff --git a/tests/trainer/loops/test_all.py b/tests/trainer/loops/test_all.py
new file mode 100644
index 0000000000000..91201abf118a1
--- /dev/null
+++ b/tests/trainer/loops/test_all.py
@@ -0,0 +1,65 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pytorch_lightning import Callback, Trainer
+from tests.helpers import BoringModel
+from tests.helpers.runif import RunIf
+
+
+class BatchObserverCallback(Callback):
+
+    def on_train_batch_start(self, trainer, pl_module, batch, *args):
+        assert batch.device == pl_module.device
+
+    def on_validation_batch_start(self, trainer, pl_module, batch, *args):
+        assert batch.device == pl_module.device
+
+    def on_test_batch_start(self, trainer, pl_module, batch, *args):
+        assert batch.device == pl_module.device
+
+    def on_predict_batch_start(self, trainer, pl_module, batch, *args):
+        assert batch.device == pl_module.device
+
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, *args):
+        assert batch.device == pl_module.device
+
+    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, *args):
+        assert batch.device == pl_module.device
+
+    def on_test_batch_end(self, trainer, pl_module, outputs, batch, *args):
+        assert batch.device == pl_module.device
+
+    def on_predict_batch_end(self, trainer, pl_module, outputs, batch, *args):
+        assert batch.device == pl_module.device
+
+
+@RunIf(min_gpus=1)
+def test_callback_batch_on_device(tmpdir):
+
+    batch_callback = BatchObserverCallback()
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_steps=1,
+        limit_train_batches=1,
+        limit_val_batches=1,
+        limit_test_batches=1,
+        limit_predict_batches=1,
+        gpus=1,
+        callbacks=[batch_callback],
+    )
+    trainer.fit(model)
+    trainer.validate(model)
+    trainer.test(model)
+    trainer.predict(model)
From 1e3841314fe5d360b6d0f16d6c447a903984a1de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 6 May 2021 10:23:51 +0200
Subject: [PATCH 19/64] changelog

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a93337f8c7d2a..d8a71d1ca1d8a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
+
+- Fixed moving batch to device before sending it to the `on_*_batch_start`/`on_*_batch_end` callbacks and model hooks ([#7378](https://github.com/PyTorchLightning/pytorch-lightning/pull/7378))
+
+
 ## [1.3.0] - 2021-MM-DD
 
 ### Added

From b02164175d1905bf193671f0119e44ed63cc96fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 6 May 2021 10:29:23 +0200
Subject: [PATCH 20/64] reorder callbacks and model hooks

---
 tests/trainer/loops/test_all.py | 45 ++++++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/tests/trainer/loops/test_all.py b/tests/trainer/loops/test_all.py
index 91201abf118a1..fec5c5e09f318 100644
--- a/tests/trainer/loops/test_all.py
+++ b/tests/trainer/loops/test_all.py
@@ -16,39 +16,66 @@
 from tests.helpers.runif import RunIf
 
 
-class BatchObserverCallback(Callback):
+class BatchHookObserverCallback(Callback):
 
     def on_train_batch_start(self, trainer, pl_module, batch, *args):
         assert batch.device == pl_module.device
 
-    def on_validation_batch_start(self, trainer, pl_module, batch, *args):
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, *args):
         assert batch.device == pl_module.device
 
-    def on_test_batch_start(self, trainer, pl_module, batch, *args):
+    def on_validation_batch_start(self, trainer, pl_module, batch, *args):
         assert batch.device == pl_module.device
 
-    def on_predict_batch_start(self, trainer, pl_module, batch, *args):
+    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, *args):
         assert batch.device == pl_module.device
 
-    def on_train_batch_end(self, trainer, pl_module, outputs, batch, *args):
+    def on_test_batch_start(self, trainer, pl_module, batch, *args):
         assert batch.device == pl_module.device
 
-    def on_validation_batch_end(self, trainer, pl_module, outputs, batch, *args):
+    def on_test_batch_end(self, trainer, pl_module, outputs, batch, *args):
         assert batch.device == pl_module.device
 
-    def on_test_batch_end(self, trainer, pl_module, outputs, batch, *args):
+    def on_predict_batch_start(self, trainer, pl_module, batch, *args):
         assert batch.device == pl_module.device
 
     def on_predict_batch_end(self, trainer, pl_module, outputs, batch, *args):
         assert batch.device == pl_module.device
 
 
+class BatchHookObserverModel(BoringModel):
+
+    def on_train_batch_start(self, batch, *args):
+        assert batch.device == self.device
+
+    def on_train_batch_end(self, outputs, batch, *args):
+        assert batch.device == self.device
+
+    def on_validation_batch_start(self, batch, *args):
+        assert batch.device == self.device
+
+    def on_validation_batch_end(self, outputs, batch, *args):
+        assert batch.device == self.device
+
+    def on_test_batch_start(self, batch, *args):
+        assert batch.device == self.device
+
+    def on_test_batch_end(self, outputs, batch, *args):
+        assert batch.device == self.device
+
+    def on_predict_batch_start(self, batch, *args):
+        assert batch.device == self.device
+
+    def on_predict_batch_end(self, outputs, batch, *args):
+        assert batch.device == self.device
+
+
 @RunIf(min_gpus=1)
 def test_callback_batch_on_device(tmpdir):
 
-    batch_callback = BatchObserverCallback()
+    batch_callback = BatchHookObserverCallback()
 
-    model = BoringModel()
+    model = BatchHookObserverModel()
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_steps=1,
From cc79736e45254a0c9d28587ea6a484e5f727780b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 6 May 2021 10:37:13 +0200
Subject: [PATCH 21/64] add test description

---
 tests/trainer/loops/test_all.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/trainer/loops/test_all.py b/tests/trainer/loops/test_all.py
index fec5c5e09f318..e0527c2905cb2 100644
--- a/tests/trainer/loops/test_all.py
+++ b/tests/trainer/loops/test_all.py
@@ -72,6 +72,7 @@ def on_predict_batch_end(self, outputs, batch, *args):
 
 @RunIf(min_gpus=1)
 def test_callback_batch_on_device(tmpdir):
+    """ Test that the batch object sent to the on_*_batch_start/end hooks is on the right device."""
 
     batch_callback = BatchHookObserverCallback()

From 4ffc332a53c7334de33cd6e853967a0f9c80920c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 17 May 2021 10:35:14 +0200
Subject: [PATCH 22/64] replace unnecessary method

---
 pytorch_lightning/accelerators/accelerator.py | 7 -------
 pytorch_lightning/trainer/trainer.py          | 4 ++--
 pytorch_lightning/trainer/training_loop.py    | 2 +-
 3 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 9b2dfc3f1bccf..c7dcf14055b27 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -377,13 +377,6 @@ def setup_precision_plugin(self) -> None:
         self.optimizers = optimizers
         self.schedulers = schedulers
 
-    def to_device(self, step_kwargs: Dict[str, Union[Any, int]]) -> Dict[str, Union[Any, int]]:
-        """Pushes the batch to the root device"""
-        step_kwargs['batch'] = self.batch_to_device(
-            step_kwargs['batch'], self.root_device, dataloader_idx=step_kwargs.get('dataloader_idx', None)
-        )
-        return step_kwargs
-
     @property
     def amp_backend(self) -> Optional[LightningEnum]:
         if isinstance(self.precision_plugin, ApexMixedPrecisionPlugin):
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 67b72849d003c..883afc2b04a88 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -958,7 +958,7 @@ def _run_evaluation(self, on_epoch: bool = False) -> _EVALUATE_OUTPUT:
                     break
 
                 with self.profiler.profile("evaluation_batch_to_device"):
-                    batch = self.accelerator.to_device(batch)
+                    batch = self.accelerator.batch_to_device(batch, dataloader_idx=dataloader_idx)
 
                 # hook
                 self.evaluation_loop.on_evaluation_batch_start(batch, batch_idx, dataloader_idx)
@@ -1084,7 +1084,7 @@ def _run_predict(self) -> Optional[_PREDICT_OUTPUT]:
                     break
 
                 with self.profiler.profile("predict_batch_to_device"):
-                    batch = self.accelerator.to_device(batch)
+                    batch = self.accelerator.batch_to_device(batch, dataloader_idx=dataloader_idx)
 
                 # lightning module methods
                 with self.profiler.profile("predict_step"):
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 9d58f78ec3dbf..3c4d05716dec9 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -488,7 +488,7 @@ def run_training_epoch(self):
             # ------------------------------------
 
             with self.trainer.profiler.profile("training_batch_to_device"):
-                batch = self.trainer.accelerator.to_device(batch)
+                batch = self.trainer.accelerator.batch_to_device(batch, dataloader_idx=dataloader_idx)
 
             with self.trainer.profiler.profile("run_training_batch"):
                 batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
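With `to_device` gone, the loops call `batch_to_device` directly, and for a `LightningModule` that delegates to the batch-transfer hook chain rather than performing a bare tensor copy. A sketch of the chain behind `_apply_batch_transfer_handler`, with the hook order as documented (the function name and body here are illustrative, not the verbatim implementation):

    def apply_batch_transfer(model, batch, device, dataloader_idx):
        # 1) runs while the batch is still on the CPU
        batch = model.on_before_batch_transfer(batch, dataloader_idx)
        # 2) the actual move; overridable for custom batch objects
        batch = model.transfer_batch_to_device(batch, device, dataloader_idx)
        # 3) runs once the batch lives on the target device
        batch = model.on_after_batch_transfer(batch, dataloader_idx)
        return batch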
From 41f83efbf7370a562e82c6a38441357962ebf2c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 24 May 2021 20:55:14 +0200
Subject: [PATCH 23/64] fix chlog

---
 CHANGELOG.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 619d1b9bc9624..9e0c76beec45e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -136,6 +136,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed print errors in `ProgressBar` when `trainer.fit` is not called ([#7674](https://github.com/PyTorchLightning/pytorch-lightning/pull/7674))
 
 
+- Fixed moving batch to device before sending it to the `on_*_batch_start`/`on_*_batch_end` callbacks and model hooks ([#7378](https://github.com/PyTorchLightning/pytorch-lightning/pull/7378))
+
+
 ## [1.3.2] - 2021-05-18
 
 ### Changed
@@ -159,10 +162,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed local rank displayed in console log ([#7395](https://github.com/PyTorchLightning/pytorch-lightning/pull/7395))
 
 
-- Fixed moving batch to device before sending it to the `on_*_batch_start`/`on_*_batch_end` callbacks and model hooks ([#7378](https://github.com/PyTorchLightning/pytorch-lightning/pull/7378))
-
-
-
 ## [1.3.0] - 2021-05-06
 
 ### Added

From 5c2b7b2629322b292aa37c6503a7ea51634b59f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 26 May 2021 01:22:00 +0200
Subject: [PATCH 24/64] adjust batch_to_device for DP Plugin

---
 pytorch_lightning/accelerators/accelerator.py | 5 ++++-
 pytorch_lightning/accelerators/gpu.py         | 8 --------
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 991b0a54d1733..e5e52c237c2a6 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -22,6 +22,7 @@
 from torch.utils.data import DataLoader
 
 import pytorch_lightning as pl
+from pytorch_lightning.plugins import DataParallelPlugin
 from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin
 from pytorch_lightning.plugins.training_type import TrainingTypePlugin
 from pytorch_lightning.trainer.states import TrainerFn
@@ -174,7 +175,9 @@ def batch_to_device(
         """
         model = self.lightning_module
 
-        if model is not None:
+        # TODO: Add support to allow batch transfer to device in Lightning for DP mode.
+        if model is not None and not isinstance(self.training_type_plugin, DataParallelPlugin):
+            # no need to transfer batch to device in DP mode
             return model._apply_batch_transfer_handler(batch, device, dataloader_idx)
 
         return move_data_to_device(batch, device)
diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 7543a2b794b5d..7a0daefb4109a 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -54,11 +54,3 @@ def set_nvidia_flags(local_rank: int) -> None:
         all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())])
         devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids)
         _log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]")
-
-    def to_device(self, step_kwargs: Dict[str, Union[Any, int]]) -> Dict[str, Union[Any, int]]:
-        # no need to transfer batch to device in DP mode
-        # TODO: Add support to allow batch transfer to device in Lightning for DP mode.
-        if not isinstance(self.training_type_plugin, DataParallelPlugin):
-            step_kwargs = super().to_device(step_kwargs)
-
-        return step_kwargs
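Patch 24 keeps DP mode on the old behavior because `torch.nn.DataParallel` performs its own scatter inside `forward`: handing it a CPU batch is fine, so an eager per-batch transfer in the loop would be redundant work. A small plain-PyTorch illustration, assuming at least one CUDA device:

    import torch
    import torch.nn as nn

    model = nn.DataParallel(nn.Linear(32, 2).cuda(), device_ids=[0])
    x = torch.rand(8, 32)           # still on the CPU
    y = model(x)                    # DataParallel scatters x to the GPUs itself
    assert y.device.type == "cuda"  # output gathered on the first device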
From 88bc2fc5fd345fe447154ebbf7299f77888ba7a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 26 May 2021 01:24:39 +0200
Subject: [PATCH 25/64] update tests for dataloader idx

---
 tests/core/test_datamodules.py    | 6 +++---
 tests/models/test_hooks.py        | 6 +++---
 tests/trainer/test_dataloaders.py | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py
index e6500a15eeed1..a9d5beb33982a 100644
--- a/tests/core/test_datamodules.py
+++ b/tests/core/test_datamodules.py
@@ -383,14 +383,14 @@ class CurrentTestDM(LightningDataModule):
     on_after_batch_transfer_hook_rank = None
 
     def on_before_batch_transfer(self, batch, dataloader_idx):
-        assert dataloader_idx is None
+        assert dataloader_idx == 0
         self.on_before_batch_transfer_hook_rank = self.rank
         self.rank += 1
         batch.samples += 1
         return batch
 
     def on_after_batch_transfer(self, batch, dataloader_idx):
-        assert dataloader_idx is None
+        assert dataloader_idx == 0
         assert batch.samples.device == batch.targets.device == expected_device
         self.on_after_batch_transfer_hook_rank = self.rank
         self.rank += 1
@@ -398,7 +398,7 @@ def on_after_batch_transfer(self, batch, dataloader_idx):
         return batch
 
     def transfer_batch_to_device(self, batch, device, dataloader_idx):
-        assert dataloader_idx is None
+        assert dataloader_idx == 0
         self.transfer_batch_to_device_hook_rank = self.rank
         self.rank += 1
         batch.samples = batch.samples.to(device)
diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index e8351072d2cc0..23339e98ed436 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -148,14 +148,14 @@ class CurrentTestModel(BoringModel):
     on_after_batch_transfer_hook_rank = None
 
     def on_before_batch_transfer(self, batch, dataloader_idx):
-        assert dataloader_idx is None
+        assert dataloader_idx == 0
         self.on_before_batch_transfer_hook_rank = self.rank
         self.rank += 1
         batch.samples += 1
         return batch
 
     def on_after_batch_transfer(self, batch, dataloader_idx):
-        assert dataloader_idx is None
+        assert dataloader_idx == 0
         assert batch.samples.device == batch.targets.device == expected_device
         self.on_after_batch_transfer_hook_rank = self.rank
         self.rank += 1
@@ -163,7 +163,7 @@ def on_after_batch_transfer(self, batch, dataloader_idx):
         return batch
 
     def transfer_batch_to_device(self, batch, device, dataloader_idx):
-        assert dataloader_idx is None
+        assert dataloader_idx == 0
         self.transfer_batch_to_device_hook_rank = self.rank
         self.rank += 1
         batch.samples = batch.samples.to(device)
diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index c2e5e1c24ac78..b556ded813752 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -1514,7 +1514,7 @@ def __init__(self):
 
     def assert_dataloader_idx_hook(self, dataloader_idx):
         if self.trainer.training:
-            assert dataloader_idx is None
+            assert dataloader_idx == 0
         elif self.trainer.validating:
             assert dataloader_idx == (0 if self.val_call_count <= 5 else 1)
         elif self.trainer.testing:
From 9d3bebaa1fdb0eb6eb00297421691c6b5106b7e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 26 May 2021 01:27:08 +0200
Subject: [PATCH 26/64] unused imports

---
 pytorch_lightning/accelerators/gpu.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 7a0daefb4109a..43454482ca230 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -13,13 +13,11 @@
 # limitations under the License.
 import logging
 import os
-from typing import Any, Dict, Union
 
 import torch
 
 import pytorch_lightning as pl
 from pytorch_lightning.accelerators.accelerator import Accelerator
-from pytorch_lightning.plugins import DataParallelPlugin
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 _log = logging.getLogger(__name__)

From dea8cb4c339c92ef83cdec1a31de211a859f54b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Tue, 1 Jun 2021 17:12:08 +0200
Subject: [PATCH 27/64] hook change

---
 tests/models/test_hooks.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 172a6385e3404..f445b9cdfb4bb 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -235,10 +235,10 @@ def __init__(self):
         super().__init__()
         self.called = []
         self.train_batch = [
-            'on_train_batch_start',
             'on_before_batch_transfer',
             'transfer_batch_to_device',
             'on_after_batch_transfer',
+            'on_train_batch_start',
             'training_step',
             'on_before_zero_grad',
             'optimizer_zero_grad',
@@ -248,10 +248,10 @@ def __init__(self):
             'on_train_batch_end',
         ]
         self.val_batch = [
-            'on_validation_batch_start',
             'on_before_batch_transfer',
             'transfer_batch_to_device',
             'on_after_batch_transfer',
+            'on_validation_batch_start',
             'on_validation_batch_end',
         ]
 
@@ -560,10 +560,10 @@ def test_trainer_model_hook_system_validate(tmpdir):
         'on_validation_start',
         'on_epoch_start',
         'on_validation_epoch_start',
-        'on_validation_batch_start',
         'on_before_batch_transfer',
         'transfer_batch_to_device',
         'on_after_batch_transfer',
+        'on_validation_batch_start',
         'on_validation_batch_end',
         'validation_epoch_end',
         'on_validation_epoch_end',
@@ -594,10 +594,10 @@ def test_trainer_model_hook_system_test(tmpdir):
         'on_test_start',
         'on_epoch_start',
         'on_test_epoch_start',
-        'on_test_batch_start',
         'on_before_batch_transfer',
         'transfer_batch_to_device',
         'on_after_batch_transfer',
+        'on_test_batch_start',
         'on_test_batch_end',
         'on_test_epoch_end',
         'on_epoch_end',
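Patch 27 captures the user-visible consequence of the series: because the loops now transfer the batch before calling any batch-level hook, the transfer trio fires before `on_*_batch_start` instead of after it. A sketch of the new per-batch order, as encoded in the expected-call lists above (training shown; validation and test are analogous):

    # New per-batch hook order after this change:
    expected = [
        'on_before_batch_transfer',   # batch still on the CPU
        'transfer_batch_to_device',
        'on_after_batch_transfer',    # batch now on the target device
        'on_train_batch_start',       # sees the device-resident batch
        'training_step',
        'on_train_batch_end',
    ]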
From bf9981474ab65ea89c4e61e30b8e3955c31d39c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 3 Jun 2021 11:48:57 +0200
Subject: [PATCH 28/64] switch None

---
 tests/core/test_datamodules.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py
index a9d5beb33982a..2f145e02a1cb0 100644
--- a/tests/core/test_datamodules.py
+++ b/tests/core/test_datamodules.py
@@ -390,7 +390,7 @@ def on_before_batch_transfer(self, batch, dataloader_idx):
         return batch
 
     def on_after_batch_transfer(self, batch, dataloader_idx):
-        assert dataloader_idx == 0
+        assert dataloader_idx is None
         assert batch.samples.device == batch.targets.device == expected_device
         self.on_after_batch_transfer_hook_rank = self.rank
         self.rank += 1
@@ -398,7 +398,7 @@ def on_after_batch_transfer(self, batch, dataloader_idx):
         return batch
 
     def transfer_batch_to_device(self, batch, device, dataloader_idx):
-        assert dataloader_idx == 0
+        assert dataloader_idx is None
         self.transfer_batch_to_device_hook_rank = self.rank
         self.rank += 1
         batch.samples = batch.samples.to(device)

From b4a1348c2b1f69a25a5a3ba07c8fe49880842b21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 3 Jun 2021 11:54:52 +0200
Subject: [PATCH 29/64] clear memory

---
 pytorch_lightning/trainer/training_loop.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 84db9a36de8ac..307a4d160fd41 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -508,6 +508,9 @@ def run_training_epoch(self):
                 dataloader_idx,
             )
 
+            # clear memory before running validation
+            del batch
+
             # -----------------------------------------
             # SAVE METRICS TO LOGGERS
             # -----------------------------------------
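The `del` in patch 29 only drops the training loop's own reference; the CUDA memory returns to PyTorch's caching allocator once the last Python reference dies, so the effect depends on no hook having stashed the batch elsewhere. A tiny illustration of the reference-counting behavior, assuming a CUDA device:

    import torch

    t = torch.empty(1024, 1024, device="cuda")
    alias = t
    del t                                  # block still alive: `alias` refers to it
    print(torch.cuda.memory_allocated())   # still > 0
    del alias                              # last reference gone
    print(torch.cuda.memory_allocated())   # drops accordingly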
From 023e6190d09c36dbe3a87d80e1e5eca42cfedf9b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 3 Jun 2021 12:18:16 +0200
Subject: [PATCH 30/64] change to None

---
 tests/models/test_hooks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index f445b9cdfb4bb..82d19114f4283 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -148,14 +148,14 @@ class CurrentTestModel(BoringModel):
     on_after_batch_transfer_hook_rank = None
 
     def on_before_batch_transfer(self, batch, dataloader_idx):
-        assert dataloader_idx == 0
+        assert dataloader_idx is None
         self.on_before_batch_transfer_hook_rank = self.rank
         self.rank += 1
         batch.samples += 1
         return batch
 
     def on_after_batch_transfer(self, batch, dataloader_idx):
-        assert dataloader_idx == 0
+        assert dataloader_idx is None
         assert batch.samples.device == batch.targets.device == expected_device
         self.on_after_batch_transfer_hook_rank = self.rank
         self.rank += 1

From b71547ee285b18ba7dc27526d1e4bee06d6b0fdf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 3 Jun 2021 13:00:58 +0200
Subject: [PATCH 31/64] None

---
 tests/models/test_hooks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 82d19114f4283..34c28cce45e08 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -163,7 +163,7 @@ def on_after_batch_transfer(self, batch, dataloader_idx):
         return batch
 
     def transfer_batch_to_device(self, batch, device, dataloader_idx):
-        assert dataloader_idx == 0
+        assert dataloader_idx is None
         self.transfer_batch_to_device_hook_rank = self.rank
         self.rank += 1
         batch.samples = batch.samples.to(device)

From 7eab3bb0b2eabdccb82ce0383b97b5aaa4de4bc5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 3 Jun 2021 13:37:32 +0200
Subject: [PATCH 32/64] None

---
 tests/core/test_datamodules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py
index 2f145e02a1cb0..e6500a15eeed1 100644
--- a/tests/core/test_datamodules.py
+++ b/tests/core/test_datamodules.py
@@ -383,7 +383,7 @@ class CurrentTestDM(LightningDataModule):
     on_after_batch_transfer_hook_rank = None
 
     def on_before_batch_transfer(self, batch, dataloader_idx):
-        assert dataloader_idx == 0
+        assert dataloader_idx is None
         self.on_before_batch_transfer_hook_rank = self.rank
         self.rank += 1
         batch.samples += 1
         return batch

From 91f1387bb48af0738c1c931f7f735a11fdc780fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 3 Jun 2021 17:48:09 +0200
Subject: [PATCH 33/64] memory savings

---
 pytorch_lightning/trainer/trainer.py       | 6 ++++++
 pytorch_lightning/trainer/training_loop.py | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 2f268bd601349..7f94a19fcab08 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -988,6 +988,9 @@ def _run_evaluation(self) -> _EVALUATE_OUTPUT:
                 # hook + store predictions
                 self.evaluation_loop.on_evaluation_batch_end(output, batch, batch_idx, dataloader_idx)
 
+                # release memory before running any other hooks
+                del batch
+
                 # log batch metrics
                 self.logger_connector.log_evaluation_step_metrics()
 
@@ -1096,6 +1099,9 @@ def _run_predict(self) -> Optional[_PREDICT_OUTPUT]:
                 with self.profiler.profile("predict_step"):
                     self.predict_loop.predict_step(batch, batch_idx, dataloader_idx)
 
+                # release memory before running any other hooks
+                del batch
+
         # call hook
         results = self.predict_loop.on_predict_epoch_end()
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 307a4d160fd41..45cd320631ac9 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -508,7 +508,7 @@ def run_training_epoch(self):
                 dataloader_idx,
             )
 
-            # clear memory before running validation
+            # release memory before running any other hooks
             del batch
 
             # -----------------------------------------

From 01b72935e9dde741e3587c962c829dba5ce023aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sun, 6 Jun 2021 23:44:46 +0200
Subject: [PATCH 34/64] remove redundant todo

---
 pytorch_lightning/accelerators/accelerator.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index e5e52c237c2a6..3f2f40364bf84 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -174,8 +174,6 @@ def batch_to_device(
             dataloader_idx: The index of the dataloader to which the batch belongs.
         """
         model = self.lightning_module
-
-        # TODO: Add support to allow batch transfer to device in Lightning for DP mode.
         if model is not None and not isinstance(self.training_type_plugin, DataParallelPlugin):
             # no need to transfer batch to device in DP mode
             return model._apply_batch_transfer_handler(batch, device, dataloader_idx)
 
         return move_data_to_device(batch, device)

From 43a6d1edeb62a15ac69ef69ef2352581ba1947a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 9 Jun 2021 16:23:23 +0200
Subject: [PATCH 35/64] hack

---
 pytorch_lightning/trainer/trainer.py       | 4 ++++
 pytorch_lightning/trainer/training_loop.py | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 7f94a19fcab08..45255fd06d7fe 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -990,6 +990,8 @@ def _run_evaluation(self) -> _EVALUATE_OUTPUT:
 
                 # release memory before running any other hooks
                 del batch
+                import gc
+                gc.collect()
 
                 # log batch metrics
                 self.logger_connector.log_evaluation_step_metrics()
@@ -1101,6 +1103,8 @@ def _run_predict(self) -> Optional[_PREDICT_OUTPUT]:
 
                 # release memory before running any other hooks
                 del batch
+                import gc
+                gc.collect()
 
         # call hook
         results = self.predict_loop.on_predict_epoch_end()
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 45cd320631ac9..28ed1767eba32 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -510,6 +510,8 @@ def run_training_epoch(self):
 
             # release memory before running any other hooks
             del batch
+            import gc
+            gc.collect()
 
             # -----------------------------------------
             # SAVE METRICS TO LOGGERS
             # -----------------------------------------

From a8433bd0b4bd35f218993335f7d4ff18977ae423 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 9 Jun 2021 16:39:31 +0200
Subject: [PATCH 36/64] cheat

---
 pytorch_lightning/trainer/trainer.py       | 2 ++
 pytorch_lightning/trainer/training_loop.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 89e03243855d9..700f2b39fcbc4 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -992,6 +992,7 @@ def _run_evaluation(self) -> _EVALUATE_OUTPUT:
                 del batch
                 import gc
                 gc.collect()
+                torch.cuda.empty_cache()
 
                 # log batch metrics
                 self.logger_connector.update_eval_step_metrics()
@@ -1102,6 +1103,7 @@ def _run_predict(self) -> Optional[_PREDICT_OUTPUT]:
                 del batch
                 import gc
                 gc.collect()
+                torch.cuda.empty_cache()
 
         # call hook
         results = self.predict_loop.on_predict_epoch_end()
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 5c0c2a42c77f3..d4edfcef957b5 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -485,6 +485,7 @@ def run_training_epoch(self):
             del batch
             import gc
             gc.collect()
+            torch.cuda.empty_cache()
 
             # -----------------------------------------
             # SAVE METRICS TO LOGGERS AND PROGRESS_BAR
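`torch.cuda.empty_cache()` lives up to the commit subject: it hands cached blocks back to the driver so other processes (and `nvidia-smi` readings) see the memory, but it frees no live tensors, and calling it per batch adds synchronization and re-allocation overhead — which is why the next two patches revert both experiments. A small sketch of what it does and does not affect, assuming a CUDA device with no other allocations:

    import torch

    t = torch.empty(1024, 1024, device="cuda")
    del t
    # The block is still cached by the allocator, not returned to the driver:
    assert torch.cuda.memory_reserved() > 0
    torch.cuda.empty_cache()
    # Now the cache is handed back; live tensors would have been untouched.
    assert torch.cuda.memory_reserved() == 0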
--- pytorch_lightning/trainer/trainer.py | 2 -- pytorch_lightning/trainer/training_loop.py | 1 - 2 files changed, 3 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 700f2b39fcbc4..89e03243855d9 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -992,7 +992,6 @@ def _run_evaluation(self) -> _EVALUATE_OUTPUT: del batch import gc gc.collect() - torch.cuda.empty_cache() # log batch metrics self.logger_connector.update_eval_step_metrics() @@ -1103,7 +1102,6 @@ def _run_predict(self) -> Optional[_PREDICT_OUTPUT]: del batch import gc gc.collect() - torch.cuda.empty_cache() # call hook results = self.predict_loop.on_predict_epoch_end() diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index d4edfcef957b5..5c0c2a42c77f3 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -485,7 +485,6 @@ def run_training_epoch(self): del batch import gc gc.collect() - torch.cuda.empty_cache() # ----------------------------------------- # SAVE METRICS TO LOGGERS AND PROGRESS_BAR From 9467689e4c84171d304b44414bada9cc589f54f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 9 Jun 2021 16:23:23 +0200 Subject: [PATCH 38/64] Revert "hack" This reverts commit 43a6d1edeb62a15ac69ef69ef2352581ba1947a5. --- pytorch_lightning/trainer/trainer.py | 4 ---- pytorch_lightning/trainer/training_loop.py | 2 -- 2 files changed, 6 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 89e03243855d9..d1445d00e674d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -990,8 +990,6 @@ def _run_evaluation(self) -> _EVALUATE_OUTPUT: # release memory before running any other hooks del batch - import gc - gc.collect() # log batch metrics self.logger_connector.update_eval_step_metrics() @@ -1100,8 +1098,6 @@ def _run_predict(self) -> Optional[_PREDICT_OUTPUT]: # release memory before running any other hooks del batch - import gc - gc.collect() # call hook results = self.predict_loop.on_predict_epoch_end() diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 5c0c2a42c77f3..d2810f1cd3351 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -483,8 +483,6 @@ def run_training_epoch(self): # release memory before running any other hooks del batch - import gc - gc.collect() # ----------------------------------------- # SAVE METRICS TO LOGGERS AND PROGRESS_BAR From 5d0868029c042e97800bf9e598ad918e9acd0c7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 15 Jun 2021 23:07:17 +0200 Subject: [PATCH 39/64] update new epoch loop --- pytorch_lightning/loops/training_epoch_loop.py | 3 +++ tests/models/test_hooks.py | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/loops/training_epoch_loop.py b/pytorch_lightning/loops/training_epoch_loop.py index d029c525d71ac..bf762333f3b46 100644 --- a/pytorch_lightning/loops/training_epoch_loop.py +++ b/pytorch_lightning/loops/training_epoch_loop.py @@ -107,6 +107,9 @@ def advance(self, dataloader_iter: Iterator, **kwargs: Any) -> None: # ------------------------------------ # TRAINING_STEP + TRAINING_STEP_END # ------------------------------------ + with self.trainer.profiler.profile("training_batch_to_device"): + batch 
= self.trainer.accelerator.batch_to_device(batch, dataloader_idx=self._dataloader_idx) + with self.trainer.profiler.profile("run_training_batch"): batch_output = self.batch_loop.run(batch, self.iteration_count, self._dataloader_idx) self.batches_seen += 1 diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 6413ca8c930bd..4fecb2d951b7a 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -264,10 +264,10 @@ def test_epoch_end(self, *args, **kwargs): @staticmethod def _train_batch(): return [ - 'on_train_batch_start', 'on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer', + 'on_train_batch_start', 'forward', 'training_step', 'training_step_end', @@ -282,10 +282,10 @@ def _train_batch(): @staticmethod def _val_batch(): return [ - 'on_validation_batch_start', 'on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer', + 'on_validation_batch_start', 'forward', 'validation_step', 'validation_step_end', @@ -295,10 +295,10 @@ def _val_batch(): @staticmethod def _test_batch(): return [ - 'on_test_batch_start', 'on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer', + 'on_test_batch_start', 'forward', 'test_step', 'test_step_end', From 032055caaf67f3ed096e08f8eb241bf0cd748733 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 15 Jun 2021 23:07:32 +0200 Subject: [PATCH 40/64] remove from old loop code --- pytorch_lightning/trainer/training_loop.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 7170090961c59..f76568454b7ac 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -456,10 +456,6 @@ def run_training_epoch(self): # ------------------------------------ # TRAINING_STEP + TRAINING_STEP_END # ------------------------------------ - - with self.trainer.profiler.profile("training_batch_to_device"): - batch = self.trainer.accelerator.batch_to_device(batch, dataloader_idx=dataloader_idx) - with self.trainer.profiler.profile("run_training_batch"): batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx) @@ -476,9 +472,6 @@ def run_training_epoch(self): dataloader_idx, ) - # release memory before running any other hooks - del batch - # ----------------------------------------- # SAVE METRICS TO LOGGERS AND PROGRESS_BAR # ----------------------------------------- From 1b4f2af3be2d9bff3d53c52a94a7bd608f4a481f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 15 Jun 2021 23:09:42 +0200 Subject: [PATCH 41/64] update chlog --- CHANGELOG.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb6abca8aca8f..bbbf43f62d57e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -251,6 +251,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `BaseFinetuning` callback to properly handle parent modules w/ parameters ([#7931](https://github.com/PyTorchLightning/pytorch-lightning/pull/7931)) +- Fixed moving batch to device before sending it to the `on_*_batch_start`/`on_*_batch_end` callbacks and model hooks ([#7378](https://github.com/PyTorchLightning/pytorch-lightning/pull/7378)) + + ## [1.3.5] - 2021-06-08 ### Added @@ -293,10 +296,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed global step update when the epoch is skipped ([#7677](https://github.com/PyTorchLightning/pytorch-lightning/pull/7677)) - Fixed training loop total batch counter when accumulate grad batches was enabled ([#7692](https://github.com/PyTorchLightning/pytorch-lightning/pull/7692)) - -- Fixed moving batch to device before sending it to the `on_*_batch_start`/`on_*_batch_end` callbacks and model hooks ([#7378](https://github.com/PyTorchLightning/pytorch-lightning/pull/7378)) - - ## [1.3.2] - 2021-05-18 ### Changed From faaec6ab94fc756a1033732e3bf18434b994707d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 18 Jun 2021 14:42:13 +0200 Subject: [PATCH 42/64] update hook test --- tests/models/test_hooks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 1fd905f210da0..487e3ed559dee 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -704,9 +704,9 @@ def call(hook, fn, *args, **kwargs): dm = HookedDataModule(called) trainer.fit(model, datamodule=dm) batch_transfer = [ - dict(name='on_before_batch_transfer', args=(ANY, None)), - dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)), - dict(name='on_after_batch_transfer', args=(ANY, None)), + dict(name='on_before_batch_transfer', args=(ANY, 0)), + dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)), + dict(name='on_after_batch_transfer', args=(ANY, 0)), ] expected = [ dict(name='prepare_data'), From 053f37773e0594e533cc24f3b6c6027132303f49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 18 Jun 2021 14:46:05 +0200 Subject: [PATCH 43/64] changelog --- CHANGELOG.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6a2dbfa01718..31fcefb8fe5c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -280,6 +280,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Pass the `stage` argument of `Callback.{setup,teardown}` as a keyword ([#7973](https://github.com/PyTorchLightning/pytorch-lightning/pull/7973)) +- Fixed a bug where skipping an optimizer while using amp causes amp to trigger an assertion error ([#7975](https://github.com/PyTorchLightning/pytorch-lightning/pull/7975)) + + - Fixed moving batch to device before sending it to the `on_*_batch_start`/`on_*_batch_end` callbacks and model hooks ([#7378](https://github.com/PyTorchLightning/pytorch-lightning/pull/7378)) @@ -336,10 +339,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed training loop total batch counter when accumulate grad batches was enabled ([#7692](https://github.com/PyTorchLightning/pytorch-lightning/pull/7692)) - -- Fixed a bug where skipping an optimizer while using amp causes amp to trigger an assertion error ([#7975](https://github.com/PyTorchLightning/pytorch-lightning/pull/7975)) - - ## [1.3.2] - 2021-05-18 ### Changed From 14ea8a8c8f2a4ed4014b47f53df5b29593831b3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 18 Jun 2021 14:47:56 +0200 Subject: [PATCH 44/64] teardown --- pytorch_lightning/loops/training_epoch_loop.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/loops/training_epoch_loop.py b/pytorch_lightning/loops/training_epoch_loop.py index 2792bc7e01b4c..20e9b949dd8d3 100644 --- a/pytorch_lightning/loops/training_epoch_loop.py +++ b/pytorch_lightning/loops/training_epoch_loop.py @@ -210,6 +210,10 @@ def on_run_end(self) -> List[List[STEP_OUTPUT]]: self.trainer.logger_connector.on_epoch_end() return self.epoch_output + def teardown(self) -> None: + """Frees memory of tracked epoch outputs.""" + self.epoch_output = None + def _on_train_epoch_end_hook(self, processed_epoch_output: List[List[STEP_OUTPUT]]) -> None: """Runs ``on_train_epoch_end hook``.""" # We cannot rely on Trainer.call_hook because the signatures might be different across From 0733da2a5ed2f3c8870a1d9390cbff47cd9f6fb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 21 Jun 2021 13:29:54 +0200 Subject: [PATCH 45/64] integrate changes in new eval loop --- pytorch_lightning/loops/evaluation_epoch_loop.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/loops/evaluation_epoch_loop.py b/pytorch_lightning/loops/evaluation_epoch_loop.py index d42a8941630a1..c4f2d80344dc4 100644 --- a/pytorch_lightning/loops/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/evaluation_epoch_loop.py @@ -100,6 +100,9 @@ def advance( if batch is None: raise StopIteration + with self.trainer.profiler.profile("evaluation_batch_to_device"): + batch = self.trainer.accelerator.batch_to_device(batch, dataloader_idx=dataloader_idx) + # hook self.on_evaluation_batch_start(batch, batch_idx, dataloader_idx) From 708cf0e5ec332417bde579387ad5c07c8434c595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 21 Jun 2021 14:01:46 +0200 Subject: [PATCH 46/64] fix hook calls --- tests/models/test_hooks.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index d5b8700384be3..b953993165eda 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -335,12 +335,12 @@ def _eval_batch(fn, trainer, model, batches, key): outputs = {key: ANY} for i in range(batches): out.extend([ + dict(name='on_before_batch_transfer', args=(ANY, 0)), + dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)), + dict(name='on_after_batch_transfer', args=(ANY, 0)), # TODO: `{,Callback}.on_batch_{start,end}` dict(name=f'Callback.on_{fn}_batch_start', args=(trainer, model, ANY, i, 0)), dict(name=f'on_{fn}_batch_start', args=(ANY, i, 0)), - dict(name='on_before_batch_transfer', args=(ANY, None)), - dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)), - dict(name='on_after_batch_transfer', args=(ANY, None)), dict(name='forward', args=(ANY, )), dict(name=f'{fn}_step', args=(ANY, i)), dict(name=f'{fn}_step_end', args=(outputs, )), @@ -355,11 +355,11 @@ def 
_predict_batch(trainer, model, batches): for i in range(batches): out.extend([ # TODO: `{,Callback}.on_batch_{start,end}` + dict(name='on_before_batch_transfer', args=(ANY, 0)), + dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)), + dict(name='on_after_batch_transfer', args=(ANY, 0)), dict(name='Callback.on_predict_batch_start', args=(trainer, model, ANY, i, 0)), dict(name='on_predict_batch_start', args=(ANY, i, 0)), - dict(name='on_before_batch_transfer', args=(ANY, None)), - dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)), - dict(name='on_after_batch_transfer', args=(ANY, None)), dict(name='forward', args=(ANY, )), dict(name='predict_step', args=(ANY, i)), # TODO: `predict_step_end` From e5de5825c9fbb6b36f6f9a709ecd1e090cedd4b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 28 Jun 2021 00:20:23 +0200 Subject: [PATCH 47/64] add prediction step --- pytorch_lightning/loops/epoch/prediction_epoch_loop.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py index 258a81648a3e0..58ada1f48316e 100644 --- a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py @@ -83,6 +83,9 @@ def advance( if batch is None: raise StopIteration + with self.trainer.profiler.profile("predict_batch_to_device"): + batch = self.trainer.accelerator.batch_to_device(batch, dataloader_idx=dataloader_idx) + with self.trainer.profiler.profile("predict_step"): self._predict_step(batch, batch_idx, dataloader_idx) From 488080863cf012dcf04446be3b7d973b7340687e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 28 Jun 2021 00:23:04 +0200 Subject: [PATCH 48/64] bad merge --- tests/models/test_hooks.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 18e304eeb664c..0e0f2e1fa3a5b 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -291,9 +291,9 @@ def _train_batch(trainer, model, batches, current_epoch=0): dict(name='Callback.on_batch_start', args=(trainer, model)), dict(name='Callback.on_train_batch_start', args=(trainer, model, ANY, i, 0)), dict(name='on_train_batch_start', args=(ANY, i, 0)), - dict(name='on_before_batch_transfer', args=(ANY, None)), - dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)), - dict(name='on_after_batch_transfer', args=(ANY, None)), + dict(name='on_before_batch_transfer', args=(ANY, 0)), + dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)), + dict(name='on_after_batch_transfer', args=(ANY, 0)), dict(name='forward', args=(ANY, )), dict(name='training_step', args=(ANY, i)), dict(name='training_step_end', args=(dict(loss=ANY), )), @@ -338,12 +338,12 @@ def _eval_batch(fn, trainer, model, batches, key): outputs = {key: ANY} for i in range(batches): out.extend([ - dict(name='on_before_batch_transfer', args=(ANY, 0)), - dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)), - dict(name='on_after_batch_transfer', args=(ANY, 0)), # TODO: `{,Callback}.on_batch_{start,end}` dict(name=f'Callback.on_{fn}_batch_start', args=(trainer, model, ANY, i, 0)), dict(name=f'on_{fn}_batch_start', args=(ANY, i, 0)), + dict(name='on_before_batch_transfer', args=(ANY, 0)), + dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)), + 
dict(name='on_after_batch_transfer', args=(ANY, 0)), dict(name='forward', args=(ANY, )), dict(name=f'{fn}_step', args=(ANY, i)), dict(name=f'{fn}_step_end', args=(outputs, )), @@ -358,11 +358,11 @@ def _predict_batch(trainer, model, batches): for i in range(batches): out.extend([ # TODO: `{,Callback}.on_batch_{start,end}` + dict(name='Callback.on_predict_batch_start', args=(trainer, model, ANY, i, 0)), + dict(name='on_predict_batch_start', args=(ANY, i, 0)), dict(name='on_before_batch_transfer', args=(ANY, 0)), dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)), dict(name='on_after_batch_transfer', args=(ANY, 0)), - dict(name='Callback.on_predict_batch_start', args=(trainer, model, ANY, i, 0)), - dict(name='on_predict_batch_start', args=(ANY, i, 0)), dict(name='forward', args=(ANY, )), dict(name='predict_step', args=(ANY, i)), # TODO: `predict_step_end` From 3c3e87abf294773e7cf4d7f242331307e6eb8eee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 28 Jun 2021 00:23:04 +0200 Subject: [PATCH 49/64] Revert "bad merge" This reverts commit 488080863cf012dcf04446be3b7d973b7340687e. --- tests/models/test_hooks.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 0e0f2e1fa3a5b..18e304eeb664c 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -291,9 +291,9 @@ def _train_batch(trainer, model, batches, current_epoch=0): dict(name='Callback.on_batch_start', args=(trainer, model)), dict(name='Callback.on_train_batch_start', args=(trainer, model, ANY, i, 0)), dict(name='on_train_batch_start', args=(ANY, i, 0)), - dict(name='on_before_batch_transfer', args=(ANY, 0)), - dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)), - dict(name='on_after_batch_transfer', args=(ANY, 0)), + dict(name='on_before_batch_transfer', args=(ANY, None)), + dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)), + dict(name='on_after_batch_transfer', args=(ANY, None)), dict(name='forward', args=(ANY, )), dict(name='training_step', args=(ANY, i)), dict(name='training_step_end', args=(dict(loss=ANY), )), @@ -338,12 +338,12 @@ def _eval_batch(fn, trainer, model, batches, key): outputs = {key: ANY} for i in range(batches): out.extend([ - # TODO: `{,Callback}.on_batch_{start,end}` - dict(name=f'Callback.on_{fn}_batch_start', args=(trainer, model, ANY, i, 0)), - dict(name=f'on_{fn}_batch_start', args=(ANY, i, 0)), dict(name='on_before_batch_transfer', args=(ANY, 0)), dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)), dict(name='on_after_batch_transfer', args=(ANY, 0)), + # TODO: `{,Callback}.on_batch_{start,end}` + dict(name=f'Callback.on_{fn}_batch_start', args=(trainer, model, ANY, i, 0)), + dict(name=f'on_{fn}_batch_start', args=(ANY, i, 0)), dict(name='forward', args=(ANY, )), dict(name=f'{fn}_step', args=(ANY, i)), dict(name=f'{fn}_step_end', args=(outputs, )), @@ -358,11 +358,11 @@ def _predict_batch(trainer, model, batches): for i in range(batches): out.extend([ # TODO: `{,Callback}.on_batch_{start,end}` - dict(name='Callback.on_predict_batch_start', args=(trainer, model, ANY, i, 0)), - dict(name='on_predict_batch_start', args=(ANY, i, 0)), dict(name='on_before_batch_transfer', args=(ANY, 0)), dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)), dict(name='on_after_batch_transfer', args=(ANY, 0)), + dict(name='Callback.on_predict_batch_start', args=(trainer, model, ANY, 
i, 0)), + dict(name='on_predict_batch_start', args=(ANY, i, 0)), dict(name='forward', args=(ANY, )), dict(name='predict_step', args=(ANY, i)), # TODO: `predict_step_end` From da082690661d5b619b74e56767a04f1c68b1f1c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 28 Jun 2021 00:26:31 +0200 Subject: [PATCH 50/64] fix train batch hook test --- tests/models/test_hooks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 18e304eeb664c..13c9b9f13ec23 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -287,13 +287,13 @@ def _train_batch(trainer, model, batches, current_epoch=0): out = [] for i in range(batches): out.extend([ + dict(name='on_before_batch_transfer', args=(ANY, 0)), + dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)), + dict(name='on_after_batch_transfer', args=(ANY, 0)), # TODO: `on_batch_{start,end}` dict(name='Callback.on_batch_start', args=(trainer, model)), dict(name='Callback.on_train_batch_start', args=(trainer, model, ANY, i, 0)), dict(name='on_train_batch_start', args=(ANY, i, 0)), - dict(name='on_before_batch_transfer', args=(ANY, None)), - dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)), - dict(name='on_after_batch_transfer', args=(ANY, None)), dict(name='forward', args=(ANY, )), dict(name='training_step', args=(ANY, i)), dict(name='training_step_end', args=(dict(loss=ANY), )), From ebe3ce3ee3df681cba6586fcf870be6735993d88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 28 Jun 2021 00:27:04 +0200 Subject: [PATCH 51/64] rm -rf _notebooks --- _notebooks | 1 - 1 file changed, 1 deletion(-) delete mode 160000 _notebooks diff --git a/_notebooks b/_notebooks deleted file mode 160000 index 3321b468e7816..0000000000000 --- a/_notebooks +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3321b468e78167aaf056894e92ed6d649c76e89e From 2a0aedb2f674ed617192ed4eee4c312b522328bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 28 Jun 2021 00:39:16 +0200 Subject: [PATCH 52/64] update chlog --- CHANGELOG.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b93be00d977d0..e32a6b92ae97f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -315,6 +315,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed a DDP info message that was never shown ([#8111](https://github.com/PyTorchLightning/pytorch-lightning/pull/8111)) +- Fixes access to `callback_metrics` in ddp_spawn ([#7916](https://github.com/PyTorchLightning/pytorch-lightning/pull/7916)) + + +- Fixed moving batch to device before sending it to the `on_*_batch_start`/`on_*_batch_end` callbacks and model hooks ([#7378](https://github.com/PyTorchLightning/pytorch-lightning/pull/7378)) + + ## [1.3.7] - 2021-06-22 - Fixed a bug where skipping an optimizer while using amp causes amp to trigger an assertion error ([#7975](https://github.com/PyTorchLightning/pytorch-lightning/pull/7975)) @@ -324,12 +330,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed moving the best score to device in `EarlyStopping` callback for TPU devices ([#7959](https://github.com/PyTorchLightning/pytorch-lightning/pull/7959)) -- Fixed a bug where skipping an optimizer while using amp causes amp to trigger an assertion error ([#7975](https://github.com/PyTorchLightning/pytorch-lightning/pull/7975)) - - -- Fixed moving batch to device before sending it to the `on_*_batch_start`/`on_*_batch_end` callbacks and model hooks ([#7378](https://github.com/PyTorchLightning/pytorch-lightning/pull/7378)) - - ## [1.3.6] - 2021-06-15 ### Fixed @@ -340,9 +340,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `BaseFinetuning` callback to properly handle parent modules w/ parameters ([#7931](https://github.com/PyTorchLightning/pytorch-lightning/pull/7931)) -- Fixes access to `callback_metrics` in ddp_spawn ([#7916](https://github.com/PyTorchLightning/pytorch-lightning/pull/7916)) - - ## [1.3.5] - 2021-06-08 ### Added From f4a2f8cf6ba55564b90a62ce995ce0b15837d556 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 28 Jun 2021 00:57:00 +0200 Subject: [PATCH 53/64] release memory --- pytorch_lightning/loops/batch/training_batch_loop.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index 76051fc3f1e94..2f19273c0aa83 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -138,6 +138,10 @@ def advance(self, batch, batch_idx, dataloader_idx): if result: self.batch_outputs[0].append(result.training_step_output) + def on_run_end(self) -> Any: + # release memory + self._remaining_splits = None + def num_active_optimizers(self, batch_idx: Optional[int] = None) -> int: """Gets the number of active optimizers based on their frequency""" return len(self.get_active_optimizers(batch_idx)) From 8838b433b13eb2c107f52423e02c10eb1ec4fa47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 28 Jun 2021 00:57:55 +0200 Subject: [PATCH 54/64] fix type --- pytorch_lightning/loops/batch/training_batch_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index 2f19273c0aa83..593a34e967416 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -138,7 +138,7 @@ def advance(self, batch, batch_idx, dataloader_idx): if result: self.batch_outputs[0].append(result.training_step_output) - def on_run_end(self) -> Any: + def on_run_end(self) -> None: # release memory self._remaining_splits = None From 6d5c61cabcbde55259deb34c9bd58ff35b3fd570 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 28 Jun 2021 01:05:16 +0200 Subject: [PATCH 55/64] notebooks mess --- _notebooks | 1 + notebooks | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) create mode 160000 _notebooks delete mode 160000 notebooks diff --git a/_notebooks b/_notebooks new file mode 160000 index 0000000000000..29aea106edefc --- /dev/null +++ b/_notebooks @@ -0,0 +1 @@ +Subproject commit 29aea106edefc9d1904c0c17223a8ac2b15c48e7 diff --git a/notebooks b/notebooks deleted file mode 160000 index aeae8085b4833..0000000000000 --- a/notebooks +++ /dev/null @@ -1 +0,0 @@ -Subproject commit aeae8085b48339e9bd9ab61d81cc0dc8b0d48f9c From 
eec4ee2f77b5eb39965211a250598ed5d2320e88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 28 Jun 2021 01:05:51 +0200 Subject: [PATCH 56/64] debug --- pytorch_lightning/loops/epoch/evaluation_epoch_loop.py | 2 ++ pytorch_lightning/loops/epoch/training_epoch_loop.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index c4f2d80344dc4..e2cf586fcc313 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -114,6 +114,8 @@ def advance( # hook + store predictions self.on_evaluation_batch_end(output, batch, batch_idx, dataloader_idx) + del batch + # log batch metrics self.trainer.logger_connector.update_eval_step_metrics() diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 7751e57f1a2a4..7b9148de4f9b1 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -133,6 +133,9 @@ def advance(self, dataloader_iter: Iterator, **kwargs: Any) -> None: self.trainer.call_hook( 'on_train_batch_end', processed_batch_end_outputs, batch, self.iteration_count, self._dataloader_idx ) + + del batch + self.trainer.call_hook('on_batch_end') self.trainer.logger_connector.on_batch_end() From 968c967b451b6a694dd9cb9a503d23dc92552f54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 28 Jun 2021 01:05:51 +0200 Subject: [PATCH 57/64] Revert "debug" This reverts commit eec4ee2f77b5eb39965211a250598ed5d2320e88. --- pytorch_lightning/loops/epoch/evaluation_epoch_loop.py | 2 -- pytorch_lightning/loops/epoch/training_epoch_loop.py | 3 --- 2 files changed, 5 deletions(-) diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index e2cf586fcc313..c4f2d80344dc4 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -114,8 +114,6 @@ def advance( # hook + store predictions self.on_evaluation_batch_end(output, batch, batch_idx, dataloader_idx) - del batch - # log batch metrics self.trainer.logger_connector.update_eval_step_metrics() diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 7b9148de4f9b1..7751e57f1a2a4 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -133,9 +133,6 @@ def advance(self, dataloader_iter: Iterator, **kwargs: Any) -> None: self.trainer.call_hook( 'on_train_batch_end', processed_batch_end_outputs, batch, self.iteration_count, self._dataloader_idx ) - - del batch - self.trainer.call_hook('on_batch_end') self.trainer.logger_connector.on_batch_end() From fc2d6128a6520b90b39c91ebd71047fd107a9c4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 28 Jun 2021 02:52:23 +0200 Subject: [PATCH 58/64] teardown --- pytorch_lightning/loops/base.py | 1 + pytorch_lightning/loops/batch/training_batch_loop.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 1d976aa3cd079..cd125ba2a1729 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -92,6 +92,7 @@ def run(self, *args: Any, **kwargs: Any) -> 
Optional[Any]: self.on_advance_end() self.iteration_count += 1 except StopIteration: + self.teardown() break output = self.on_run_end() diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index 593a34e967416..78f4dd175ebc3 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -138,7 +138,7 @@ def advance(self, batch, batch_idx, dataloader_idx): if result: self.batch_outputs[0].append(result.training_step_output) - def on_run_end(self) -> None: + def teardown(self) -> None: # release memory self._remaining_splits = None From 856cd6623b3bd717caf694b3dfadd18d2692d58d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 1 Jul 2021 20:24:26 +0200 Subject: [PATCH 59/64] fix teardown bug --- pytorch_lightning/loops/epoch/training_epoch_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 7751e57f1a2a4..b772dc675d7c6 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -227,7 +227,7 @@ def on_run_end(self) -> List[List[STEP_OUTPUT]]: def teardown(self) -> None: """Frees memory of tracked epoch outputs.""" - self.epoch_output = None + self._epoch_output = None def _on_train_epoch_end_hook(self, processed_epoch_output: List[List[STEP_OUTPUT]]) -> None: """Runs ``on_train_epoch_end hook``.""" From a6e61019462b80d09d31b65bed289fa6e4dd15f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 1 Jul 2021 20:34:39 +0200 Subject: [PATCH 60/64] debug --- pytorch_lightning/loops/epoch/training_epoch_loop.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index b772dc675d7c6..94b98a3d4615f 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -110,11 +110,13 @@ def advance(self, dataloader_iter: Iterator, **kwargs: Any) -> None: # TRAINING_STEP + TRAINING_STEP_END # ------------------------------------ with self.trainer.profiler.profile("training_batch_to_device"): + print("before run", self.iteration_count, torch.cuda.memory_allocated()) batch = self.trainer.accelerator.batch_to_device(batch, dataloader_idx=self._dataloader_idx) with self.trainer.profiler.profile("run_training_batch"): batch_output = self.batch_loop.run(batch, self.iteration_count, self._dataloader_idx) self.batches_seen += 1 + print("after run", self.iteration_count, torch.cuda.memory_allocated()) # when returning -1 from train_step, we end epoch early if batch_output.signal == -1: @@ -150,6 +152,7 @@ def on_advance_end(self): Raises: StopIteration: if :attr:`done` evaluates to ``True`` to finish this epoch """ + print("advance end", self.iteration_count, torch.cuda.memory_allocated()) # ----------------------------------------- # VALIDATE IF NEEDED + CHECKPOINT CALLBACK # ----------------------------------------- From cde1622864373c675ce9f9a55887e7595071dde8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 1 Jul 2021 20:40:39 +0200 Subject: [PATCH 61/64] x --- pytorch_lightning/loops/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index cd125ba2a1729..1d976aa3cd079 
100644
--- a/pytorch_lightning/loops/base.py
+++ b/pytorch_lightning/loops/base.py
@@ -92,7 +92,6 @@ def run(self, *args: Any, **kwargs: Any) -> Optional[Any]:
                 self.on_advance_end()
                 self.iteration_count += 1
             except StopIteration:
-                self.teardown()
                 break
 
         output = self.on_run_end()

From 5ddeaec06911e96730aade1be6ee71d097b46b9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 1 Jul 2021 21:01:27 +0200
Subject: [PATCH 62/64] debug

---
 pytorch_lightning/loops/epoch/evaluation_epoch_loop.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py
index c4f2d80344dc4..f35bd998f1bfa 100644
--- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py
+++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py
@@ -100,6 +100,9 @@ def advance(
         if batch is None:
             raise StopIteration
 
+        assert self.num_dataloaders is not None
+        self.trainer.logger_connector.on_evaluation_batch_start(batch, batch_idx, dataloader_idx, self.num_dataloaders)
+
         with self.trainer.profiler.profile("evaluation_batch_to_device"):
             batch = self.trainer.accelerator.batch_to_device(batch, dataloader_idx=dataloader_idx)
 
@@ -172,9 +175,6 @@ def on_evaluation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx:
         """
         self.trainer.logger_connector.on_batch_start()
 
-        assert self.num_dataloaders is not None
-        self.trainer.logger_connector.on_evaluation_batch_start(batch, batch_idx, dataloader_idx, self.num_dataloaders)
-
         if self.trainer.testing:
             self.trainer.call_hook("on_test_batch_start", batch, batch_idx, dataloader_idx)
         else:

From c712b625eb2a88f32c7a39cfc0d2c05751718733 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 1 Jul 2021 21:31:34 +0200
Subject: [PATCH 63/64] Revert "debug"

This reverts commit a6e61019462b80d09d31b65bed289fa6e4dd15f6.

Revert "debug"

This reverts commit 5ddeaec06911e96730aade1be6ee71d097b46b9a.

debug

debug

Revert "debug"

This reverts commit 605be746f7daedf265b2c05a1c153ce543394435.

Revert "Revert "debug""

This reverts commit a7612d5410409ed886cfb609457349ecf44cbfa8.
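For context, the teardown-related commits above converge on a single pattern: a loop keeps per-run buffers and drops them once the run finishes, so batch and output references do not outlive the epoch. A minimal standalone sketch of that pattern (illustrative class, not the actual `Loop` API):

    class SketchLoop:
        """Tracks per-run outputs and frees them on teardown."""

        def __init__(self):
            self._outputs = []

        def run(self, batches):
            for batch in batches:
                self._outputs.append(batch * 2)  # stand-in for a training step
            return self._outputs

        def teardown(self):
            # release memory held by tracked outputs, mirroring
            # `TrainingEpochLoop.teardown()` setting `self._epoch_output = None`
            self._outputs = None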
debug x x x s tol x tol --- benchmarks/test_basic_parity.py | 4 ++-- pytorch_lightning/loops/epoch/evaluation_epoch_loop.py | 6 +++--- pytorch_lightning/loops/epoch/training_epoch_loop.py | 3 --- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/benchmarks/test_basic_parity.py b/benchmarks/test_basic_parity.py index bf2ddae2c0084..e01d45e4423ca 100644 --- a/benchmarks/test_basic_parity.py +++ b/benchmarks/test_basic_parity.py @@ -45,8 +45,8 @@ def assert_parity_absolute(pl_values, pt_values, norm_by: float = 1, max_diff: f @pytest.mark.parametrize( 'cls_model,max_diff_speed,max_diff_memory', [ - (ParityModuleRNN, 0.05, 0.0), - (ParityModuleMNIST, 0.25, 0.0), # todo: lower this thr + (ParityModuleRNN, 0.05, 0.001), + (ParityModuleMNIST, 0.25, 0.001), # todo: lower this thr ] ) @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index f35bd998f1bfa..c4f2d80344dc4 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -100,9 +100,6 @@ def advance( if batch is None: raise StopIteration - assert self.num_dataloaders is not None - self.trainer.logger_connector.on_evaluation_batch_start(batch, batch_idx, dataloader_idx, self.num_dataloaders) - with self.trainer.profiler.profile("evaluation_batch_to_device"): batch = self.trainer.accelerator.batch_to_device(batch, dataloader_idx=dataloader_idx) @@ -175,6 +172,9 @@ def on_evaluation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: """ self.trainer.logger_connector.on_batch_start() + assert self.num_dataloaders is not None + self.trainer.logger_connector.on_evaluation_batch_start(batch, batch_idx, dataloader_idx, self.num_dataloaders) + if self.trainer.testing: self.trainer.call_hook("on_test_batch_start", batch, batch_idx, dataloader_idx) else: diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 5d00246bbfa37..e0234d461545f 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -115,13 +115,11 @@ def advance(self, dataloader_iter: Iterator, **kwargs: Any) -> None: # TRAINING_STEP + TRAINING_STEP_END # ------------------------------------ with self.trainer.profiler.profile("training_batch_to_device"): - print("before run", self.iteration_count, torch.cuda.memory_allocated()) batch = self.trainer.accelerator.batch_to_device(batch, dataloader_idx=self._dataloader_idx) with self.trainer.profiler.profile("run_training_batch"): batch_output = self.batch_loop.run(batch, self.iteration_count, self._dataloader_idx) self.batches_seen += 1 - print("after run", self.iteration_count, torch.cuda.memory_allocated()) # when returning -1 from train_step, we end epoch early if batch_output.signal == -1: @@ -157,7 +155,6 @@ def on_advance_end(self): Raises: StopIteration: if :attr:`done` evaluates to ``True`` to finish this epoch """ - print("advance end", self.iteration_count, torch.cuda.memory_allocated()) # ----------------------------------------- # VALIDATE IF NEEDED + CHECKPOINT CALLBACK # ----------------------------------------- From 542efbd9e18191dc628d6702a9f7b77f071bfcbc Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 2 Jul 2021 18:12:40 +0200 Subject: [PATCH 64/64] Fix changelog --- CHANGELOG.md | 4 +--- 1 file changed, 1 insertion(+), 3 
deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b6c39e21d1876..26c863be63d83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -348,9 +348,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed a bug where `truncated_bptt_steps` would throw an AttributeError when the target RNN has multiple hidden states ([#8145](https://github.com/PyTorchLightning/pytorch-lightning/pull/8145)) -- Fixes access to `callback_metrics` in ddp_spawn ([#7916](https://github.com/PyTorchLightning/pytorch-lightning/pull/7916)) - - - Fixed moving batch to device before sending it to the `on_*_batch_start`/`on_*_batch_end` callbacks and model hooks ([#7378](https://github.com/PyTorchLightning/pytorch-lightning/pull/7378)) @@ -381,6 +378,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed setting a `DistributedSampler` when using a distributed plugin in a custom accelerator ([#7814](https://github.com/PyTorchLightning/pytorch-lightning/pull/7814)) - Improved `PyTorchProfiler` chrome traces names ([#8009](https://github.com/PyTorchLightning/pytorch-lightning/pull/8009)) - Fixed moving the best score to device in `EarlyStopping` callback for TPU devices ([#7959](https://github.com/PyTorchLightning/pytorch-lightning/pull/7959)) +- Fixes access to `callback_metrics` in ddp_spawn ([#7916](https://github.com/PyTorchLightning/pytorch-lightning/pull/7916)) ## [1.3.6] - 2021-06-15
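Taken together, the series settles on moving each batch to the target device, inside a profiled region, before any `on_*_batch_start` hook fires, and the batch-transfer hooks now receive a real integer dataloader index instead of `None`. A minimal sketch of user code that relies on this contract (the dict-shaped batch handling is hypothetical):

    import torch
    from pytorch_lightning import LightningModule

    class TransferAwareModel(LightningModule):

        def on_before_batch_transfer(self, batch, dataloader_idx):
            # dataloader_idx is now an int (0 for the first loader), never None
            assert isinstance(dataloader_idx, int)
            return batch

        def transfer_batch_to_device(self, batch, device, dataloader_idx):
            # hypothetical dict-shaped batch: move only the tensor fields
            if isinstance(batch, dict):
                return {k: v.to(device) if torch.is_tensor(v) else v for k, v in batch.items()}
            return super().transfer_batch_to_device(batch, device, dataloader_idx)

Because the transfer hooks run first, `on_train_batch_start` and `on_validation_batch_start` implementations can assume the batch already lives on `pl_module.device`.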