From 68af5783f993082cf581c5e7367d4ed19895d2c8 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 21 Jun 2022 15:35:05 +0100 Subject: [PATCH 1/4] Fixes deepspeed and `estimated_stepping_batches` --- src/pytorch_lightning/strategies/deepspeed.py | 1 + .../strategies/test_deepspeed_strategy.py | 25 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index 56933e1a23e75..8f7f2a6c58860 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -357,6 +357,7 @@ def setup_distributed(self): def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) + self.model_to_device() self.setup_optimizers(trainer) self.setup_precision_plugin() optimizers_to_device(self.optimizers, self.root_device) diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 2306aeb045c20..876815823ea23 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -1337,3 +1337,28 @@ def test_error_with_invalid_accelerator(tmpdir): model = BoringModel() with pytest.raises(MisconfigurationException, match="DeepSpeed strategy is only supported on GPU"): trainer.fit(model) + + +@RunIf(min_cuda_gpus=2, deepspeed=True, standalone=True) +def test_deepspeed_configure_optimizer_device_set(tmpdir): + """ + Test to ensure that the LM has access to the device within the ``configure_optimizer`` function, + and estimated_stepping_batches works correctly as a result. 
+ """ + + class TestModel(BoringModel): + def configure_optimizers(self): + assert self.trainer.estimated_stepping_batches == 1 + assert self.device.type == 'cuda' + raise SystemExit + + model = TestModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + accelerator="gpu", + devices=2, + strategy=DeepSpeedStrategy(), + ) + with pytest.raises(SystemExit): + trainer.fit(model) From 7262fa99ed919212ce7d7f83d48a9198f821f96e Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 21 Jun 2022 15:42:20 +0100 Subject: [PATCH 2/4] Swap to setting the device manually to exclude moving weights to device --- src/pytorch_lightning/strategies/deepspeed.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index 8f7f2a6c58860..bae617561f85a 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -357,7 +357,8 @@ def setup_distributed(self): def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) - self.model_to_device() + # we set the device so that optimizers can be created with distributed comms. + self.lightning_module._device = self.root_device self.setup_optimizers(trainer) self.setup_precision_plugin() optimizers_to_device(self.optimizers, self.root_device) From fc2bfeaba53cbaa5cf4a471f5a7880ff8aadccd2 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 21 Jun 2022 15:48:27 +0100 Subject: [PATCH 3/4] Add CHANGELOG.md --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2590a70e021e..b8f07b63a0ce6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -232,6 +232,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed issue where the CLI fails with certain torch objects ([#13153](https://github.com/PyTorchLightning/pytorch-lightning/pull/13153)) +- Fixed `estimated_stepping_batches` requiring distributed comms in `configure_optimizers` for the `DeepSpeedStrategy` ([#13350](https://github.com/PyTorchLightning/pytorch-lightning/pull/13350)) + + - From 519ef0131db7e600862f9c09892e7afa97810714 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Jun 2022 14:49:00 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_pytorch/strategies/test_deepspeed_strategy.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 876815823ea23..601e559ab3c18 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -1341,15 +1341,13 @@ def test_error_with_invalid_accelerator(tmpdir): @RunIf(min_cuda_gpus=2, deepspeed=True, standalone=True) def test_deepspeed_configure_optimizer_device_set(tmpdir): - """ - Test to ensure that the LM has access to the device within the ``configure_optimizer`` function, - and estimated_stepping_batches works correctly as a result. - """ + """Test to ensure that the LM has access to the device within the ``configure_optimizers`` function, and
+ estimated_stepping_batches works correctly as a result."""

 class TestModel(BoringModel): def configure_optimizers(self): assert self.trainer.estimated_stepping_batches == 1 - assert self.device.type == 'cuda' + assert self.device.type == "cuda" raise SystemExit model = TestModel()