From 14ace4e6f95cc3f8d4e0f06ad80930398e12468d Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 18:40:50 +0000
Subject: [PATCH 1/6] Give priority to plugins to set distributed mode, and then accelerator

---
 .../connectors/accelerator_connector.py |  7 +++---
 .../test_accelerator_connector.py       | 24 ++++++++++++++++++-
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index d32970d61fa9b..282813151031c 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -116,10 +116,11 @@ def __init__(
 
         self.parallel_device_ids = device_parser.parse_gpu_ids(self.gpus)
 
-        self.set_distributed_mode()
-        self.configure_slurm_ddp()
-
         self.handle_given_plugins(plugins)
+        if not self._distrib_type:
+            self.set_distributed_mode()
+
+        self.configure_slurm_ddp()
 
         self.accelerator = self.select_accelerator()
 
diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 76d4a597d8ecb..82b631807c8e9 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -23,7 +23,14 @@
 from pytorch_lightning.accelerators.cpu import CPUAccelerator
 from pytorch_lightning.accelerators.gpu import GPUAccelerator
 from pytorch_lightning.callbacks import Callback
-from pytorch_lightning.plugins import DDP2Plugin, DDPPlugin, DDPSpawnPlugin, PrecisionPlugin, SingleDevicePlugin
+from pytorch_lightning.plugins import (
+    DDP2Plugin,
+    DDPPlugin,
+    DDPShardedPlugin,
+    DDPSpawnPlugin,
+    PrecisionPlugin,
+    SingleDevicePlugin,
+)
 from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment
 from tests.helpers.boring_model import BoringModel
 
@@ -378,3 +385,18 @@ def on_fit_start(self, trainer, pl_module):
 
     with pytest.raises(SystemExit):
         trainer.fit(model)
+
+
+@pytest.mark.parametrize(
+    ["accelerator", "plugin"],
+    [('ddp_spawn', 'ddp_sharded'), (None, 'ddp_sharded')],
+)
+def test_plugin_accelerator_choice(accelerator, plugin):
+    """
+    Ensure that when a plugin and accelerator is passed in, that the plugin takes precedent.
+    """
+    trainer = Trainer(accelerator=accelerator, plugins=plugin, num_processes=2)
+    assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin)
+
+    trainer = Trainer(plugins=plugin, num_processes=2)
+    assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin)

From b64b7433fa0b13bd1ffb8b0153de47775f7dfd43 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 18:43:25 +0000
Subject: [PATCH 2/6] Add CHANGELOG.md

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7dad863d41293..15d6a919db76b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed incorrect yield logic for the amp autocast context manager ([#6080](https://github.com/PyTorchLightning/pytorch-lightning/pull/6080))
 
+- Fixed priority of plugin/accelerator when setting distributed mode ([#6089](https://github.com/PyTorchLightning/pytorch-lightning/pull/6089))
+
 ## [1.2.0] - 2021-02-18
 
 ### Added

From 14a9b2d713500f1a5f430fc3e14738f81ba8b3b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Fri, 19 Feb 2021 19:49:42 +0100
Subject: [PATCH 3/6] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 15d6a919db76b..07e48308175c4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed priority of plugin/accelerator when setting distributed mode ([#6089](https://github.com/PyTorchLightning/pytorch-lightning/pull/6089))
 
+
 ## [1.2.0] - 2021-02-18
 
 ### Added

From 9abea6d1ba2fb7d7082c0646986411a4ecc54170 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Fri, 19 Feb 2021 19:52:49 +0000
Subject: [PATCH 4/6] Remove very scary line

---
 pytorch_lightning/trainer/connectors/accelerator_connector.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index 282813151031c..5829cabcf2c84 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -117,7 +117,7 @@ def __init__(
         self.parallel_device_ids = device_parser.parse_gpu_ids(self.gpus)
 
         self.handle_given_plugins(plugins)
-        if not self._distrib_type:
+        if self._distrib_type is None:
             self.set_distributed_mode()
 
         self.configure_slurm_ddp()
@@ -197,7 +197,6 @@ def handle_given_plugins(
         )
 
         self._training_type_plugin = training_type
-        self._training_type_plugin = self.training_type_plugin
         self._precision_plugin = precision
         self._cluster_environment = cluster_environment or self.select_cluster_environment()
 

From 8cb04746a4b3772495c477bc75d8b2b65ee580c5 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Sat, 20 Feb 2021 00:08:12 +0000
Subject: [PATCH 5/6] Ensure we set cluster environment after slurm configured if necessary

---
 .../trainer/connectors/accelerator_connector.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index 5829cabcf2c84..759834d4a180d 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -122,6 +122,9 @@ def __init__(
 
         self.configure_slurm_ddp()
 
+        # set cluster env after slurm is configured if using slurm
+        self._cluster_environment = self.select_cluster_environment()
+
         self.accelerator = self.select_accelerator()
 
         # override dist backend when using tpus
@@ -198,7 +201,7 @@ def handle_given_plugins(
 
         self._training_type_plugin = training_type
         self._precision_plugin = precision
-        self._cluster_environment = cluster_environment or self.select_cluster_environment()
+        self._cluster_environment = cluster_environment
 
     @property
     def precision_plugin(self) -> PrecisionPlugin:

From b6b1611b2d426c602776b4ca294130add1842abc Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Sat, 20 Feb 2021 11:01:18 +0000
Subject: [PATCH 6/6] Simplify the fix with a reset

---
 .../trainer/connectors/accelerator_connector.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index 759834d4a180d..7021081d6cc90 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -116,14 +116,10 @@ def __init__(
 
         self.parallel_device_ids = device_parser.parse_gpu_ids(self.gpus)
 
-        self.handle_given_plugins(plugins)
-        if self._distrib_type is None:
-            self.set_distributed_mode()
-
+        self.set_distributed_mode()
         self.configure_slurm_ddp()
 
-        # set cluster env after slurm is configured if using slurm
-        self._cluster_environment = self.select_cluster_environment()
+        self.handle_given_plugins(plugins)
 
         self.accelerator = self.select_accelerator()
 
@@ -167,6 +163,9 @@ def handle_given_plugins(
 
         for plug in plugins:
             if isinstance(plug, str):
+                # Reset the distributed type as the user has overridden training type
+                # via the plugins argument
+                self._distrib_type = None
                 self.set_distributed_mode(plug)
 
             elif isinstance(plug, TrainingTypePlugin):
@@ -201,7 +200,7 @@ def handle_given_plugins(
 
         self._training_type_plugin = training_type
         self._precision_plugin = precision
-        self._cluster_environment = cluster_environment
+        self._cluster_environment = cluster_environment or self.select_cluster_environment()
 
     @property
     def precision_plugin(self) -> PrecisionPlugin:
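
Note: the behaviour this series converges on can be exercised directly from user code. The indented sketch below simply mirrors the test_plugin_accelerator_choice test added in PATCH 1/6; it assumes a PyTorch Lightning build that includes these patches and that fairscale is installed so the 'ddp_sharded' training-type plugin can be constructed.

    # Minimal sketch of the plugin-over-accelerator precedence established by this series.
    # Assumes PyTorch Lightning with these patches applied and fairscale installed
    # ('ddp_sharded' cannot be built without it).
    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins import DDPShardedPlugin

    # 'ddp_spawn' alone would select DDPSpawnPlugin; the plugin string passed via
    # `plugins` now takes precedence when the training type is resolved.
    trainer = Trainer(accelerator='ddp_spawn', plugins='ddp_sharded', num_processes=2)
    assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin)

    # Passing only the plugin selects the same training type.
    trainer = Trainer(plugins='ddp_sharded', num_processes=2)
    assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin)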