From 379697289a7517ed0222d7017d62fe80b750025f Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 29 Dec 2020 22:22:26 +0100 Subject: [PATCH 01/25] naive replace --- .../accelerators/accelerator_connector.py | 36 +++++++++---------- pytorch_lightning/trainer/deprecated_api.py | 28 +++++++-------- tests/deprecated_api/test_remove_1-4.py | 25 ++++++++----- 3 files changed, 48 insertions(+), 41 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index ce2a418cf2fa5..d0f3426402fef 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -15,7 +15,7 @@ import torch -from pytorch_lightning.utilities import _HOROVOD_AVAILABLE +from pytorch_lightning.utilities import _HOROVOD_AVAILABLE, DeviceType, DistributedType from pytorch_lightning import _logger as log from pytorch_lightning import accelerators from pytorch_lightning.accelerators.accelerator import Accelerator @@ -82,7 +82,8 @@ def on_trainer_init( self.trainer.sync_batchnorm = sync_batchnorm self.trainer.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) - self.trainer.on_tpu = self.trainer.tpu_cores is not None + if self.trainer.tpu_cores is not None: + self.trainer._device_type = DeviceType.TPU self.trainer.tpu_id = self.trainer.tpu_cores[0] if isinstance(self.trainer.tpu_cores, list) else None @@ -102,7 +103,8 @@ def on_trainer_init( self.trainer.root_gpu = device_parser.determine_root_gpu_device(self.trainer.data_parallel_device_ids) self.trainer.root_device = torch.device("cpu") - self.trainer.on_gpu = True if (self.trainer.data_parallel_device_ids and torch.cuda.is_available()) else False + if (self.trainer.data_parallel_device_ids and torch.cuda.is_available()): + self.trainer._device_type = DeviceType.GPU # tpu state flags self.trainer.use_tpu = False @@ -115,7 +117,6 @@ def on_trainer_init( # override dist backend when using tpus if self.trainer.on_tpu: self.trainer.distributed_backend = "tpu" - self.trainer.use_tpu = True # init flags for SLURM+DDP to work self.trainer.world_size = 1 @@ -293,9 +294,9 @@ def set_distributed_mode(self): self._set_horovod_backend() elif self.trainer.num_gpus == 0: if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1: - self.trainer.use_ddp = True # ddp_cpu + self.trainer._distrib_type = DistributedType.DDP elif self.trainer.num_gpus == 1: - self.trainer.use_single_gpu = True + self.trainer._device_type = DeviceType.GPU elif self.trainer.num_gpus > 1: rank_zero_warn( 'You requested multiple GPUs but did not specify a backend, e.g.' 
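
A minimal, self-contained sketch of the consolidation the hunks above and below perform: the mutually exclusive booleans (`on_cpu`, `on_gpu`, `on_tpu`, `use_tpu`, ...) collapse into a single enum-valued field, and the old names survive only as deprecated views over it. The class below is illustrative, not the real Trainer; the enum values are assumed to mirror `pytorch_lightning.utilities.DeviceType`, and `warnings.warn` stands in for `rank_zero_warn`.

    import warnings
    from enum import Enum

    class DeviceType(str, Enum):
        CPU = 'CPU'
        GPU = 'GPU'
        TPU = 'TPU'

    class _DeviceShim:
        def __init__(self):
            # single source of truth replacing the on_cpu / on_gpu / on_tpu booleans
            self._device_type = DeviceType.CPU

        @property
        def on_gpu(self) -> bool:
            # the old boolean becomes a derived view over the enum
            return self._device_type == DeviceType.GPU

        @on_gpu.setter
        def on_gpu(self, val: bool) -> None:
            # the real setter calls rank_zero_warn(...) with the same deprecation message
            warnings.warn("Internal: `on_gpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning)
            if val:
                self._device_type = DeviceType.GPU

    shim = _DeviceShim()
    shim.on_gpu = True  # warns, then maps onto the enum
    assert shim._device_type is DeviceType.GPU and shim.on_gpu
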
@@ -307,35 +308,34 @@ def set_distributed_mode(self): if self.trainer.distributed_backend == "dp": # do nothing if num_gpus == 0 if self.trainer.num_gpus == 1: - self.trainer.use_single_gpu = True - self.trainer.use_dp = True + self.trainer._device_type = DeviceType.GPU + self.trainer._distrib_type = DistributedType.DP elif self.trainer.num_gpus > 1: - self.trainer.use_dp = True + self.trainer._distrib_type = DistributedType.DP elif self.trainer.distributed_backend in ("ddp", "ddp_spawn"): if self.trainer.num_gpus == 0: if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1: - self.trainer.use_ddp = True # ddp_cpu + self.trainer._distrib_type = DistributedType.DDP elif self.trainer.num_gpus == 1: - self.trainer.use_single_gpu = True - self.trainer.use_ddp = True + self.trainer._device_type = DeviceType.GPU + self.trainer._distrib_type = DistributedType.DDP elif self.trainer.num_gpus > 1: - self.trainer.use_ddp = True + self.trainer._distrib_type = DistributedType.DDP self.trainer.num_processes = self.trainer.num_gpus elif self.trainer.distributed_backend == "ddp2": # do nothing if num_gpus == 0 if self.trainer.num_gpus >= 1: - self.trainer.use_ddp2 = True + self.trainer._distrib_type = DistributedType.DDP2 elif self.trainer.distributed_backend == "ddp_cpu": if self.trainer.num_gpus > 0: rank_zero_warn( 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' ) - self.trainer.use_ddp = True + self.trainer._distrib_type = DistributedType.DDP self.trainer.data_parallel_device_ids = None - self.trainer.on_gpu = False - self.trainer.on_cpu = True + self.trainer.self._device_type = DeviceType.CPU elif self.trainer.distributed_backend == "horovod": self._set_horovod_backend() @@ -355,7 +355,7 @@ def set_distributed_mode(self): def _set_horovod_backend(self): self.check_horovod() - self.trainer.use_horovod = True + self.trainer._distrib_type = DistributedType.HOROVOD # Initialize Horovod to get rank / size info hvd.init() diff --git a/pytorch_lightning/trainer/deprecated_api.py b/pytorch_lightning/trainer/deprecated_api.py index 7b4de47a1be2c..2db6dabd8e0d3 100644 --- a/pytorch_lightning/trainer/deprecated_api.py +++ b/pytorch_lightning/trainer/deprecated_api.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from pytorch_lightning.utilities import DistributedType, DeviceType +from pytorch_lightning.utilities import DistributedType, DeviceType, rank_zero_warn class DeprecatedDistDeviceAttributes: @@ -28,7 +28,7 @@ def on_cpu(self) -> bool: @on_cpu.setter def on_cpu(self, val: bool) -> None: - # rank_zero_warn("Internal: `on_cpu` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `on_cpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: self._device_type = DeviceType.CPU @@ -39,7 +39,7 @@ def on_tpu(self) -> bool: @on_tpu.setter def on_tpu(self, val: bool) -> None: - # rank_zero_warn("Internal: `on_tpu` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `on_tpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) # todo add logic that it cannot be set if TPU is missing if val: self._device_type = DeviceType.TPU @@ -51,7 +51,7 @@ def use_tpu(self) -> bool: @use_tpu.setter def use_tpu(self, val: bool) -> None: - # rank_zero_warn("Internal: `use_tpu` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `use_tpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) self.on_tpu = val @property @@ -61,7 +61,7 @@ def on_gpu(self) -> bool: @on_gpu.setter def on_gpu(self, val: bool) -> None: - # rank_zero_warn("Internal: `on_gpu` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `on_gpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) # todo add logic that it cannot be set if GPU is missing if val: self._device_type = DeviceType.GPU @@ -73,7 +73,7 @@ def use_dp(self) -> bool: @use_dp.setter def use_dp(self, val: bool) -> None: - # rank_zero_warn("Internal: `use_dp` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `use_dp` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: self._distrib_type = DistributedType.DP @@ -84,7 +84,7 @@ def use_ddp(self) -> bool: @use_ddp.setter def use_ddp(self, val: bool) -> None: - # rank_zero_warn("Internal: `use_ddp` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `use_ddp` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: self._distrib_type = DistributedType.DDP @@ -95,7 +95,7 @@ def use_ddp2(self) -> bool: @use_ddp2.setter def use_ddp2(self, val: bool) -> None: - # rank_zero_warn("Internal: `use_ddp2` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning) + rank_zero_warn("Internal: `use_ddp2` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: self._distrib_type = DistributedType.DDP2 @@ -108,9 +108,9 @@ def use_horovod(self) -> bool: @use_horovod.setter def use_horovod(self, val: bool) -> None: - # rank_zero_warn( - # "Internal: `use_horovod` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning - # ) + rank_zero_warn( + "Internal: `use_horovod` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning + ) if val: self._distrib_type = DistributedType.HOROVOD @@ -126,8 +126,8 @@ def use_single_gpu(self) -> bool: @use_single_gpu.setter def use_single_gpu(self, val: bool) -> None: - # rank_zero_warn( - # "Internal: `use_single_gpu` is deprecated in v1.1 and will be removed in v1.2.", DeprecationWarning, - # ) + rank_zero_warn( + "Internal: 
`use_single_gpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning, + ) if val: self._device_type = DeviceType.GPU diff --git a/tests/deprecated_api/test_remove_1-4.py b/tests/deprecated_api/test_remove_1-4.py index 9a7a970aecaf7..2d2d59be0f797 100644 --- a/tests/deprecated_api/test_remove_1-4.py +++ b/tests/deprecated_api/test_remove_1-4.py @@ -37,35 +37,42 @@ def test_v1_4_0_deprecated_imports(): from pytorch_lightning.utilities.xla_device_utils import XLADeviceUtils # noqa: F811 F401 -# todo: later add also checking deprecated warnings def test_v1_4_0_deprecated_trainer_attributes(): """Test that Trainer attributes works fine.""" trainer = Trainer() trainer._distrib_type = None trainer._device_type = None - trainer.on_cpu = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.on_cpu = True assert trainer.on_cpu - trainer.on_gpu = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.on_gpu = True assert trainer.on_gpu - trainer.on_tpu = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.on_tpu = True assert trainer.on_tpu trainer._device_type = None - trainer.use_tpu = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.use_tpu = True assert trainer.use_tpu - trainer.use_dp = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.use_dp = True assert trainer.use_dp - trainer.use_ddp = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.use_ddp = True assert trainer.use_ddp - trainer.use_ddp2 = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.use_ddp2 = True assert trainer.use_ddp2 - trainer.use_horovod = True + with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): + trainer.use_horovod = True assert trainer.use_horovod From 0cbc4cc9fc207ea3b07b1169c107ae3ff2255fec Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 29 Dec 2020 23:03:17 +0100 Subject: [PATCH 02/25] simplify --- .../accelerators/accelerator_connector.py | 60 ++++++++----------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index d0f3426402fef..9b5e6bfc9d0dc 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -283,11 +283,6 @@ def select_accelerator(self): return accelerator_backend def set_distributed_mode(self): - self.trainer.use_dp = False - self.trainer.use_ddp = False - self.trainer.use_ddp2 = False - self.trainer.use_horovod = False - self.trainer.use_single_gpu = False if self.trainer.distributed_backend is None: if self.has_horovodrun(): @@ -305,42 +300,37 @@ def set_distributed_mode(self): ) self.trainer.distributed_backend = "ddp_spawn" - if self.trainer.distributed_backend == "dp": - # do nothing if num_gpus == 0 - if self.trainer.num_gpus == 1: - self.trainer._device_type = DeviceType.GPU - self.trainer._distrib_type = DistributedType.DP - elif self.trainer.num_gpus > 1: - self.trainer._distrib_type = DistributedType.DP - - elif self.trainer.distributed_backend in ("ddp", "ddp_spawn"): - if self.trainer.num_gpus == 0: - if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1: - 
self.trainer._distrib_type = DistributedType.DDP - elif self.trainer.num_gpus == 1: - self.trainer._device_type = DeviceType.GPU - self.trainer._distrib_type = DistributedType.DDP - elif self.trainer.num_gpus > 1: - self.trainer._distrib_type = DistributedType.DDP - self.trainer.num_processes = self.trainer.num_gpus - - elif self.trainer.distributed_backend == "ddp2": - # do nothing if num_gpus == 0 - if self.trainer.num_gpus >= 1: - self.trainer._distrib_type = DistributedType.DDP2 - elif self.trainer.distributed_backend == "ddp_cpu": + if self.trainer.distributed_backend == "ddp_cpu": + self.trainer._distrib_type = DistributedType.DDP + self.trainer.data_parallel_device_ids = None if self.trainer.num_gpus > 0: rank_zero_warn( 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' ) - self.trainer._distrib_type = DistributedType.DDP - self.trainer.data_parallel_device_ids = None - self.trainer.self._device_type = DeviceType.CPU - elif self.trainer.distributed_backend == "horovod": + else: + self.trainer._distrib_type = DistributedType(self.trainer.distributed_backend) + + if self.trainer.num_gpus > 0 and 'cpu' not in self.trainer.distributed_backend: + self.trainer._device_type = DeviceType.GPU + + if self.trainer.num_gpus == 0 and self.trainer._distrib_type in (DistributedType.DP, DistributedType.DDP2): + rank_zero_warn( + 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' + ) + if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1: + self.trainer._distrib_type = DistributedType.DDP + else: + rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') + self.trainer._distrib_type = None + + if self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN): + self.trainer.num_processes = self.trainer.num_gpus + + if self.trainer.distributed_backend == "horovod": self._set_horovod_backend() # throw error to force user ddp or ddp2 choice - if self.trainer.num_nodes > 1 and not (self.trainer.use_ddp2 or self.trainer.use_ddp): + if self.trainer.num_nodes > 1 and self.trainer._distrib_type not in (DistributedType.DDP2, DistributedType.DDP): raise MisconfigurationException( 'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. ' 'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`' @@ -350,7 +340,7 @@ def set_distributed_mode(self): num_cores = self.trainer.tpu_cores if self.trainer.tpu_cores is not None else 0 rank_zero_info(f'TPU available: {_TPU_AVAILABLE}, using: {num_cores} TPU cores') - if torch.cuda.is_available() and not self.trainer.on_gpu: + if torch.cuda.is_available() and self.trainer._device_type != DeviceType.GPU: rank_zero_warn('GPU available but not used. 
Set the --gpus flag when calling the script.') def _set_horovod_backend(self): From 9b3c68d595c55f9dd0e3034b7942f6d17296faa4 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 29 Dec 2020 23:12:06 +0100 Subject: [PATCH 03/25] clean --- .../accelerators/accelerator_connector.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 9b5e6bfc9d0dc..167a7df01cfe1 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -83,7 +83,12 @@ def on_trainer_init( self.trainer.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) if self.trainer.tpu_cores is not None: - self.trainer._device_type = DeviceType.TPU + if _TPU_AVAILABLE: + self.trainer._device_type = DeviceType.TPU + else: + raise MisconfigurationException( + f"You have requested {self.trainer.tpu_cores} TPU cores but none is available." + ) self.trainer.tpu_id = self.trainer.tpu_cores[0] if isinstance(self.trainer.tpu_cores, list) else None @@ -101,20 +106,15 @@ def on_trainer_init( self.trainer.data_parallel_device_ids = device_parser.parse_gpu_ids(self.trainer.gpus) self.trainer.root_gpu = device_parser.determine_root_gpu_device(self.trainer.data_parallel_device_ids) - self.trainer.root_device = torch.device("cpu") - - if (self.trainer.data_parallel_device_ids and torch.cuda.is_available()): - self.trainer._device_type = DeviceType.GPU # tpu state flags - self.trainer.use_tpu = False self.trainer.tpu_local_core_rank = None self.trainer.tpu_global_core_rank = None # distributed backend choice self.set_distributed_mode() - # override dist backend when using tpus + # override dist backend when using TPUs if self.trainer.on_tpu: self.trainer.distributed_backend = "tpu" @@ -138,8 +138,10 @@ def on_trainer_init( def _map_deprecated_dist_backend(self, accelerator, distributed_backend): if distributed_backend is not None: - rank_zero_warn(DeprecationWarning('distributed_backend has been renamed to accelerator. ' - 'Deprecated in 1.0.0, will be removed in 1.2.0')) + rank_zero_warn( + '`distributed_backend` has been renamed to accelerator. Deprecated in 1.0.0, will be removed in 1.2.0', + DeprecationWarning + ) # temporary mapping until we remove all the distributed_backend references if accelerator is not None: @@ -344,7 +346,7 @@ def set_distributed_mode(self): rank_zero_warn('GPU available but not used. Set the --gpus flag when calling the script.') def _set_horovod_backend(self): - self.check_horovod() + self._check_horovod() self.trainer._distrib_type = DistributedType.HOROVOD # Initialize Horovod to get rank / size info @@ -353,7 +355,7 @@ def _set_horovod_backend(self): # Horovod assigns one local GPU per process self.trainer.root_gpu = hvd.local_rank() - def check_horovod(self): + def _check_horovod(self): """Raises a `MisconfigurationException` if the Trainer is not configured correctly for Horovod.""" if not _HOROVOD_AVAILABLE: raise MisconfigurationException( From 71a81b1bd44cc54f5b55fbc1455e10acfbab6737 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 29 Dec 2020 23:23:41 +0100 Subject: [PATCH 04/25] . 
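
The one-line change below replaces the bare `else:` with `elif self.trainer.distributed_backend:` before the backend string is converted to the enum. The guard matters because constructing the enum from None (or an empty string) raises ValueError, whereas a concrete backend name maps cleanly onto a member. A minimal sketch, with values assumed to mirror `pytorch_lightning.utilities.DistributedType`:

    from enum import Enum

    class DistributedType(str, Enum):
        DP = 'dp'
        DDP = 'ddp'
        DDP_SPAWN = 'ddp_spawn'
        DDP2 = 'ddp2'
        HOROVOD = 'horovod'

    assert DistributedType('ddp') is DistributedType.DDP  # value lookup works for a real backend

    distributed_backend = None
    try:
        DistributedType(distributed_backend)
    except ValueError:
        # reaching here is why the assignment is guarded by `elif self.trainer.distributed_backend:`
        pass
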
--- pytorch_lightning/accelerators/accelerator_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 167a7df01cfe1..fc97bdbe7b7f2 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -309,7 +309,7 @@ def set_distributed_mode(self): rank_zero_warn( 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' ) - else: + elif self.trainer.distributed_backend: self.trainer._distrib_type = DistributedType(self.trainer.distributed_backend) if self.trainer.num_gpus > 0 and 'cpu' not in self.trainer.distributed_backend: From 1c43e5ee5edfa740f7041913b4b64f1599b302dc Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 30 Dec 2020 00:40:57 +0100 Subject: [PATCH 05/25] fix --- .../accelerators/accelerator_connector.py | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index fc97bdbe7b7f2..d58c42bc26c4d 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -289,19 +289,16 @@ def set_distributed_mode(self): if self.trainer.distributed_backend is None: if self.has_horovodrun(): self._set_horovod_backend() - elif self.trainer.num_gpus == 0: - if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1: - self.trainer._distrib_type = DistributedType.DDP - elif self.trainer.num_gpus == 1: - self.trainer._device_type = DeviceType.GPU + elif self.trainer.num_gpus == 0 and (self.trainer.num_nodes > 1 or self.trainer.num_processes > 1): + self.trainer._distrib_type = DistributedType.DDP elif self.trainer.num_gpus > 1: rank_zero_warn( 'You requested multiple GPUs but did not specify a backend, e.g.' - ' `Trainer(accelerator="dp"|"ddp"|"ddp2")`.' - ' Setting `accelerator="ddp_spawn"` for you.' + ' `Trainer(accelerator="dp"|"ddp"|"ddp2")`. Setting `accelerator="ddp_spawn"` for you.' ) self.trainer.distributed_backend = "ddp_spawn" + # special case with DDP on CPUs if self.trainer.distributed_backend == "ddp_cpu": self.trainer._distrib_type = DistributedType.DDP self.trainer.data_parallel_device_ids = None @@ -309,13 +306,18 @@ def set_distributed_mode(self): rank_zero_warn( 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' ) - elif self.trainer.distributed_backend: + # set all other requested distrib. 
types adn if it was not set in the + elif self.trainer.distributed_backend and self.trainer._distrib_type is None: self.trainer._distrib_type = DistributedType(self.trainer.distributed_backend) - if self.trainer.num_gpus > 0 and 'cpu' not in self.trainer.distributed_backend: + # unless you request explicitly for CPU and some GPU are available use them + if (self.trainer.num_gpus > 0 + and not (self.trainer.distributed_backend and 'cpu' in self.trainer.distributed_backend)): self.trainer._device_type = DeviceType.GPU - if self.trainer.num_gpus == 0 and self.trainer._distrib_type in (DistributedType.DP, DistributedType.DDP2): + # DP and DDP2 cannot run without GPU + if (self.trainer.num_gpus == 0 + and self.trainer._distrib_type in (DistributedType.DP, DistributedType.DDP, DistributedType.DDP2)): rank_zero_warn( 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' ) @@ -325,9 +327,12 @@ def set_distributed_mode(self): rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') self.trainer._distrib_type = None - if self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN): + # for DDP overwrite nb processes by requested GPUs + if (self.trainer._device_type == DeviceType.GPU + and self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN)): self.trainer.num_processes = self.trainer.num_gpus + # Horovod si an extra case... if self.trainer.distributed_backend == "horovod": self._set_horovod_backend() From 64c73d57e7d3f257f839a797cf4241ed8ad0116c Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 30 Dec 2020 00:55:15 +0100 Subject: [PATCH 06/25] . --- pytorch_lightning/trainer/deprecated_api.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/trainer/deprecated_api.py b/pytorch_lightning/trainer/deprecated_api.py index 2db6dabd8e0d3..2c8377d2936c9 100644 --- a/pytorch_lightning/trainer/deprecated_api.py +++ b/pytorch_lightning/trainer/deprecated_api.py @@ -40,7 +40,6 @@ def on_tpu(self) -> bool: @on_tpu.setter def on_tpu(self, val: bool) -> None: rank_zero_warn("Internal: `on_tpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) - # todo add logic that it cannot be set if TPU is missing if val: self._device_type = DeviceType.TPU @@ -62,7 +61,6 @@ def on_gpu(self) -> bool: @on_gpu.setter def on_gpu(self, val: bool) -> None: rank_zero_warn("Internal: `on_gpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) - # todo add logic that it cannot be set if GPU is missing if val: self._device_type = DeviceType.GPU From 7af6832f69a75bf5a94b7770baa5e8ebe7723fea Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 30 Dec 2020 19:25:37 +0100 Subject: [PATCH 07/25] fix --- pytorch_lightning/accelerators/accelerator_connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index d58c42bc26c4d..2db6f1c2bdbde 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -321,7 +321,9 @@ def set_distributed_mode(self): rank_zero_warn( 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' 
) - if self.trainer.num_nodes > 1 or self.trainer.num_processes > 1: + # in some cases it yield in comarison None and int + if ((self.trainer.num_nodes and self.trainer.num_nodes > 1) + or (self.trainer.num_processes and self.trainer.num_processes > 1)): self.trainer._distrib_type = DistributedType.DDP else: rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.') From ff748549f60f37711324ec3e2998635d84d415c1 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 30 Dec 2020 19:47:39 +0100 Subject: [PATCH 08/25] fix --- pytorch_lightning/accelerators/accelerator_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 2db6f1c2bdbde..8700a6d5f6b31 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -317,11 +317,11 @@ def set_distributed_mode(self): # DP and DDP2 cannot run without GPU if (self.trainer.num_gpus == 0 - and self.trainer._distrib_type in (DistributedType.DP, DistributedType.DDP, DistributedType.DDP2)): + and self.trainer._distrib_type in (DistributedType.DP, DistributedType.DDP2)): rank_zero_warn( 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' ) - # in some cases it yield in comarison None and int + # todo: in some cases it yield in comarison None and int if ((self.trainer.num_nodes and self.trainer.num_nodes > 1) or (self.trainer.num_processes and self.trainer.num_processes > 1)): self.trainer._distrib_type = DistributedType.DDP From b3c8f96d8e1a6381f6e332133e724b2b863fa8bb Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 31 Dec 2020 11:46:47 +0100 Subject: [PATCH 09/25] fix --- pytorch_lightning/accelerators/accelerator_connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 8700a6d5f6b31..8cd579a22e926 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -317,7 +317,9 @@ def set_distributed_mode(self): # DP and DDP2 cannot run without GPU if (self.trainer.num_gpus == 0 - and self.trainer._distrib_type in (DistributedType.DP, DistributedType.DDP2)): + and self.trainer._distrib_type in ( + DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2 + )): rank_zero_warn( 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' 
) From f55383bbe572916f9d415a58fb35f2b3b94e0873 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 31 Dec 2020 11:51:21 +0100 Subject: [PATCH 10/25] flake8 --- pytorch_lightning/accelerators/accelerator_connector.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 8cd579a22e926..fea3df06b7df4 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -315,11 +315,9 @@ def set_distributed_mode(self): and not (self.trainer.distributed_backend and 'cpu' in self.trainer.distributed_backend)): self.trainer._device_type = DeviceType.GPU + _distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) # DP and DDP2 cannot run without GPU - if (self.trainer.num_gpus == 0 - and self.trainer._distrib_type in ( - DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2 - )): + if (self.trainer.num_gpus == 0 and self.trainer._distrib_type in _distrib_types): rank_zero_warn( 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.' ) From 60240a624734ff7af7654fbc413041c9f9f2da3d Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 10:20:15 +0100 Subject: [PATCH 11/25] text --- pytorch_lightning/accelerators/accelerator_connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index fea3df06b7df4..4e736a20856e9 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -279,7 +279,9 @@ def select_accelerator(self): accelerator_backend = accelerators.CPUAccelerator(self.trainer, cluster_env) else: raise MisconfigurationException( - f'Trainer(accelerator={self.trainer.distributed_backend} is not a supported backend' + f'Trainer(accelerator={self.trainer.distributed_backend} is not a supported backend for' + f' num_nodes={self.trainer.num_nodes}, num_gpus={self.trainer.num_gpus}' + f' and num_processes={self.trainer.num_processes}.' 
) return accelerator_backend From a70616e1d65dd421069123468fc233b8177cc12a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 14:35:41 +0100 Subject: [PATCH 12/25] 2 --- tests/backends/test_accelerator_connector.py | 10 +++++----- tests/checkpointing/test_torch_saving.py | 3 +-- tests/plugins/test_amp_plugin.py | 12 ++++++++---- tests/plugins/test_apex_plugin.py | 12 ++++++++---- tests/plugins/test_ddp_plugin.py | 12 ++++++------ tests/plugins/test_plugin.py | 4 ++-- tests/plugins/test_rpc_plugin.py | 2 +- tests/plugins/test_sharded_plugin.py | 4 ++-- 8 files changed, 33 insertions(+), 26 deletions(-) diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index b9b4263d0cf50..56f69c7970f15 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -242,7 +242,7 @@ def on_fit_start(self, trainer, pl_module): trainer = Trainer( fast_dev_run=True, accelerator='ddp_cpu', - num_processes=1, + num_processes=2, callbacks=[CB()], ) @@ -270,7 +270,7 @@ def on_fit_start(self, trainer, pl_module): trainer = Trainer( fast_dev_run=True, accelerator='ddp_cpu', - num_processes=1, + num_processes=2, callbacks=[CB()], ) @@ -307,7 +307,7 @@ def on_fit_start(self, trainer, pl_module): plugins=[CustomCluster()], fast_dev_run=True, accelerator='ddp_cpu', - num_processes=1, + num_processes=2, callbacks=[CB()], ) @@ -341,7 +341,7 @@ def on_fit_start(self, trainer, pl_module): trainer = Trainer( fast_dev_run=True, accelerator=Accel(), - num_processes=1, + num_processes=2, callbacks=[CB()] ) @@ -367,7 +367,7 @@ def on_fit_start(self, trainer, pl_module): trainer = Trainer( fast_dev_run=True, accelerator='ddp_cpu', - num_processes=1, + num_processes=2, callbacks=[CB()] ) diff --git a/tests/checkpointing/test_torch_saving.py b/tests/checkpointing/test_torch_saving.py index 493aa0dabe126..a15d425f5a0e7 100644 --- a/tests/checkpointing/test_torch_saving.py +++ b/tests/checkpointing/test_torch_saving.py @@ -43,8 +43,7 @@ def test_model_torch_save(tmpdir, enable_pl_optimizer): assert is_lightning_optimizer if enable_pl_optimizer else not is_lightning_optimizer -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") def test_model_torch_save_ddp_cpu(tmpdir): """Test to ensure torch save does not fail for model and trainer using cpu ddp.""" model = BoringModel() diff --git a/tests/plugins/test_amp_plugin.py b/tests/plugins/test_amp_plugin.py index 6c5a7b052d0d1..1e98740f99d62 100644 --- a/tests/plugins/test_amp_plugin.py +++ b/tests/plugins/test_amp_plugin.py @@ -21,8 +21,10 @@ "SLURM_LOCALID": "0" }) @mock.patch('torch.cuda.device_count', return_value=2) -@pytest.mark.parametrize(['ddp_backend', 'gpus', 'num_processes'], - [('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)]) +@pytest.mark.parametrize( + ['ddp_backend', 'gpus', 'num_processes'], + [('ddp_cpu', None, 2), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)], +) def test_amp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class CB(Callback): @@ -55,8 +57,10 @@ def on_fit_start(self, trainer, pl_module): "SLURM_LOCALID": "0" }) @mock.patch('torch.cuda.device_count', return_value=2) -@pytest.mark.parametrize(['ddp_backend', 'gpus', 'num_processes'], - [('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)]) 
+@pytest.mark.parametrize( + ['ddp_backend', 'gpus', 'num_processes'], + [('ddp_cpu', None, 2), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)], +) def test_amp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class MyNativeAMP(NativeAMPPlugin): pass diff --git a/tests/plugins/test_apex_plugin.py b/tests/plugins/test_apex_plugin.py index bfed1aefec0a1..c4198b97446c3 100644 --- a/tests/plugins/test_apex_plugin.py +++ b/tests/plugins/test_apex_plugin.py @@ -18,8 +18,10 @@ "SLURM_LOCALID": "0" }) @mock.patch('torch.cuda.device_count', return_value=2) -@pytest.mark.parametrize(['ddp_backend', 'gpus', 'num_processes'], - [('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)]) +@pytest.mark.parametrize( + ['ddp_backend', 'gpus', 'num_processes'], + [('ddp_cpu', None, 2), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)], +) def test_amp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class CB(Callback): @@ -52,8 +54,10 @@ def on_fit_start(self, trainer, pl_module): "SLURM_LOCALID": "0" }) @mock.patch('torch.cuda.device_count', return_value=2) -@pytest.mark.parametrize(['ddp_backend', 'gpus', 'num_processes'], - [('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)]) +@pytest.mark.parametrize( + ['ddp_backend', 'gpus', 'num_processes'], + [('ddp_cpu', None, 2), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)], +) def test_amp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class MyApexPlugin(ApexPlugin): pass diff --git a/tests/plugins/test_ddp_plugin.py b/tests/plugins/test_ddp_plugin.py index 4e51fc7c5ac21..fe8fc555ba06c 100644 --- a/tests/plugins/test_ddp_plugin.py +++ b/tests/plugins/test_ddp_plugin.py @@ -27,7 +27,7 @@ @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) def test_ddp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class CB(Callback): @@ -62,7 +62,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) def test_ddp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class MyDDP(DDPPlugin): @@ -101,7 +101,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed sharded plugin is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") @@ -139,7 +139,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) def test_ddp_invalid_choice_string_ddp_cpu(tmpdir, ddp_backend, gpus, 
num_processes): with pytest.raises(MisconfigurationException, match='not a supported lightning custom plugin'): @@ -166,7 +166,7 @@ def test_ddp_invalid_choice_string_ddp_cpu(tmpdir, ddp_backend, gpus, num_proces @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed sharded plugin is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") @@ -202,7 +202,7 @@ class MyDDP(DDPPlugin): @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) def test_ddp_choice_custom_ddp_cpu_custom_args( tmpdir, ddp_backend, gpus, num_processes diff --git a/tests/plugins/test_plugin.py b/tests/plugins/test_plugin.py index be9d95f09f03f..05789596879b4 100644 --- a/tests/plugins/test_plugin.py +++ b/tests/plugins/test_plugin.py @@ -38,7 +38,7 @@ @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) def test_custom_required_plugins(tmpdir, ddp_backend, gpus, num_processes): """ @@ -92,7 +92,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) def test_invalid_custom_required_plugins(tmpdir, ddp_backend, gpus, num_processes): """ diff --git a/tests/plugins/test_rpc_plugin.py b/tests/plugins/test_rpc_plugin.py index 87d64a7b8c686..a28cd4b50e4f4 100644 --- a/tests/plugins/test_rpc_plugin.py +++ b/tests/plugins/test_rpc_plugin.py @@ -26,7 +26,7 @@ @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) @pytest.mark.skipif(not _RPC_AVAILABLE, reason="RPC is not available") def test_rpc_choice(tmpdir, ddp_backend, gpus, num_processes): diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index c0761b7e03fcb..b4a09760bc31c 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -28,7 +28,7 @@ @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_choice_sharded(tmpdir, ddp_backend, gpus, num_processes): @@ -89,7 +89,7 @@ def test_invalid_apex_sharded(tmpdir): @mock.patch("torch.cuda.device_count", return_value=2) 
@pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], ) @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP") From 7780f3f7eb29f1280dca68d64de948a2db95f12b Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 14:47:05 +0100 Subject: [PATCH 13/25] max --- .../accelerators/accelerator_connector.py | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 4e736a20856e9..804810140bf79 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -81,16 +81,7 @@ def on_trainer_init( # sync-bn backend self.trainer.sync_batchnorm = sync_batchnorm - self.trainer.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) - if self.trainer.tpu_cores is not None: - if _TPU_AVAILABLE: - self.trainer._device_type = DeviceType.TPU - else: - raise MisconfigurationException( - f"You have requested {self.trainer.tpu_cores} TPU cores but none is available." - ) - - self.trainer.tpu_id = self.trainer.tpu_cores[0] if isinstance(self.trainer.tpu_cores, list) else None + self._parse_tpu_device_details(tpu_cores) if num_processes != 1 and distributed_backend != "ddp_cpu": rank_zero_warn("num_processes is only used for `accelerator='ddp_cpu'`. Ignoring it.") @@ -107,17 +98,9 @@ def on_trainer_init( self.trainer.data_parallel_device_ids = device_parser.parse_gpu_ids(self.trainer.gpus) self.trainer.root_gpu = device_parser.determine_root_gpu_device(self.trainer.data_parallel_device_ids) - # tpu state flags - self.trainer.tpu_local_core_rank = None - self.trainer.tpu_global_core_rank = None - # distributed backend choice self.set_distributed_mode() - # override dist backend when using TPUs - if self.trainer.on_tpu: - self.trainer.distributed_backend = "tpu" - # init flags for SLURM+DDP to work self.trainer.world_size = 1 self.trainer.interactive_ddp_procs = [] @@ -136,6 +119,23 @@ def on_trainer_init( self.trainer.replace_sampler_ddp = replace_sampler_ddp + def _parse_tpu_device_details(self, tpu_cores): + self.trainer.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) + if self.trainer.tpu_cores is not None: + if _TPU_AVAILABLE: + self.trainer._device_type = DeviceType.TPU + self.trainer.distributed_backend = "tpu" + else: + raise MisconfigurationException( + f"You have requested {self.trainer.tpu_cores} TPU cores but none is available." 
+ ) + + self.trainer.tpu_id = self.trainer.tpu_cores[0] if isinstance(self.trainer.tpu_cores, list) else None + + # tpu state flags + self.trainer.tpu_local_core_rank = None + self.trainer.tpu_global_core_rank = None + def _map_deprecated_dist_backend(self, accelerator, distributed_backend): if distributed_backend is not None: rank_zero_warn( @@ -313,9 +313,12 @@ def set_distributed_mode(self): self.trainer._distrib_type = DistributedType(self.trainer.distributed_backend) # unless you request explicitly for CPU and some GPU are available use them - if (self.trainer.num_gpus > 0 - and not (self.trainer.distributed_backend and 'cpu' in self.trainer.distributed_backend)): + _on_cpu = self.trainer.distributed_backend and 'cpu' in self.trainer.distributed_backend + if (self.trainer.num_gpus > 0 and not _on_cpu): self.trainer._device_type = DeviceType.GPU + elif self.trainer._device_type == DeviceType.CPU and self.trainer.num_processes is None: + # define the max CPU available + self.trainer.num_processes = os.cpu_count() _distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) # DP and DDP2 cannot run without GPU From 4a11e1ad427052b19a8c9b63bf5da2195af3c08c Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 14:55:57 +0100 Subject: [PATCH 14/25] max --- pytorch_lightning/accelerators/accelerator_connector.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 804810140bf79..e0b155e62b904 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -308,6 +308,10 @@ def set_distributed_mode(self): rank_zero_warn( 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.' ) + if self.trainer.num_processes is None: + # define the max CPU available + self.trainer.num_processes = os.cpu_count() + # set all other requested distrib. 
types adn if it was not set in the elif self.trainer.distributed_backend and self.trainer._distrib_type is None: self.trainer._distrib_type = DistributedType(self.trainer.distributed_backend) @@ -316,9 +320,6 @@ def set_distributed_mode(self): _on_cpu = self.trainer.distributed_backend and 'cpu' in self.trainer.distributed_backend if (self.trainer.num_gpus > 0 and not _on_cpu): self.trainer._device_type = DeviceType.GPU - elif self.trainer._device_type == DeviceType.CPU and self.trainer.num_processes is None: - # define the max CPU available - self.trainer.num_processes = os.cpu_count() _distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2) # DP and DDP2 cannot run without GPU From 5d3a550a58c1d767ca9f972cbff4d2d726d1d6c0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 15:09:15 +0100 Subject: [PATCH 15/25] 2 --- tests/backends/test_accelerator_connector.py | 1 + tests/plugins/test_sharded_plugin.py | 5 +++++ tests/trainer/properties/test_get_model.py | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index 56f69c7970f15..5ca29d27d292e 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -50,6 +50,7 @@ def on_fit_start(self, trainer, pl_module): trainer = Trainer( fast_dev_run=True, accelerator='ddp_cpu', + num_processes=2, callbacks=[CB()], ) diff --git a/tests/plugins/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py index b4a09760bc31c..d8334e24e0e83 100644 --- a/tests/plugins/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -129,6 +129,7 @@ def test_ddp_sharded_plugin_checkpoint_cpu(tmpdir): model = BoringModel() trainer = Trainer( accelerator='ddp_cpu', + num_processes=2, plugins=[DDPShardedPlugin()], fast_dev_run=True, ) @@ -208,6 +209,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint(tmpdir): model = BoringModel() trainer = Trainer( accelerator='ddp_cpu', + num_processes=2, plugins=[DDPShardedPlugin()], fast_dev_run=True, ) @@ -221,6 +223,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint(tmpdir): trainer = Trainer( accelerator='ddp_cpu', + num_processes=2, plugins=[DDPShardedPlugin()], fast_dev_run=True, resume_from_checkpoint=checkpoint_path @@ -291,6 +294,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_gpu_to_cpu(tmpdir): trainer = Trainer( plugins=[DDPShardedPlugin()], accelerator='ddp_cpu', + num_processes=2, fast_dev_run=True, resume_from_checkpoint=checkpoint_path ) @@ -308,6 +312,7 @@ def test_ddp_sharded_plugin_test(tmpdir): model = BoringModel() trainer = Trainer( accelerator='ddp_cpu', + num_processes=2, plugins=[DDPShardedPlugin()], fast_dev_run=True, ) diff --git a/tests/trainer/properties/test_get_model.py b/tests/trainer/properties/test_get_model.py index ca1301fb0dec6..16434f390b90a 100644 --- a/tests/trainer/properties/test_get_model.py +++ b/tests/trainer/properties/test_get_model.py @@ -61,7 +61,7 @@ def test_get_model_ddp_cpu(tmpdir): limit_val_batches=2, max_epochs=1, accelerator='ddp_cpu', - num_processes=2 + num_processes=2, ) trainer.fit(model) From 0beae5754363d9ca1bdfb382c725913bf24e13cd Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 16:14:38 +0100 Subject: [PATCH 16/25] 2 --- pytorch_lightning/plugins/plugin_connector.py | 4 ++-- tests/backends/test_accelerator_connector.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/pytorch_lightning/plugins/plugin_connector.py b/pytorch_lightning/plugins/plugin_connector.py index d66c25173cc77..596a630b1c959 100644 --- a/pytorch_lightning/plugins/plugin_connector.py +++ b/pytorch_lightning/plugins/plugin_connector.py @@ -31,8 +31,8 @@ def __init__(self, trainer): self.plugins = [] self.ddp_plugin = DDPPlugin() self.cloud_environment = None - self.amp_plugin = NativeAMPPlugin(trainer) - self.apex_plugin = ApexPlugin(trainer) + # self.amp_plugin = NativeAMPPlugin(trainer) + # self.apex_plugin = ApexPlugin(trainer) def on_trainer_init(self, plugins: Optional[Union[str, list]]): self.plugins = plugins diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index 5ca29d27d292e..dc8bf338d3eb3 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -252,7 +252,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { - "SLURM_NTASKS": "1", + "SLURM_NTASKS": "2", "SLURM_JOB_NAME": "SOME_NAME", "SLURM_NODEID": "0", "LOCAL_RANK": "0", @@ -280,7 +280,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { - "SLURM_NTASKS": "1", + "SLURM_NTASKS": "2", "SLURM_JOB_NAME": "SOME_NAME", "SLURM_NODEID": "0", "LOCAL_RANK": "0", @@ -317,7 +317,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { - "SLURM_NTASKS": "1", + "SLURM_NTASKS": "2", "SLURM_JOB_NAME": "SOME_NAME", "SLURM_NODEID": "0", "LOCAL_RANK": "0", @@ -351,7 +351,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { - "SLURM_NTASKS": "1", + "SLURM_NTASKS": "2", "SLURM_JOB_NAME": "SOME_NAME", "SLURM_NODEID": "0", "LOCAL_RANK": "0", @@ -369,7 +369,7 @@ def on_fit_start(self, trainer, pl_module): fast_dev_run=True, accelerator='ddp_cpu', num_processes=2, - callbacks=[CB()] + callbacks=[CB()], ) with pytest.raises(SystemExit): From 0e99cb99236c49a6b8ce45ddd619ff0dc6cb57a4 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 16:53:51 +0100 Subject: [PATCH 17/25] tpu --- pytorch_lightning/accelerators/accelerator_connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index e0b155e62b904..bda4cd05383f2 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -311,7 +311,9 @@ def set_distributed_mode(self): if self.trainer.num_processes is None: # define the max CPU available self.trainer.num_processes = os.cpu_count() - + # special case with TPUs + elif self.trainer.distributed_backend == 'tpu': + self.trainer._device_type = DeviceType.TPU # set all other requested distrib. 
types adn if it was not set in the elif self.trainer.distributed_backend and self.trainer._distrib_type is None: self.trainer._distrib_type = DistributedType(self.trainer.distributed_backend) From ed634353a48a5e738e739678349e554fd5b336ae Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 17:01:53 +0100 Subject: [PATCH 18/25] 2 --- benchmarks/test_sharded_parity.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index bd7f335a03720..2616f5a11d7f2 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -1,12 +1,12 @@ import os import platform import time -from typing import Union +from typing import Union, Type import pytest import torch -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import Trainer, seed_everything, LightningModule from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE @@ -20,9 +20,10 @@ def test_ddp_sharded_plugin_correctness_one_device(): plugin_parity_test( accelerator='ddp_cpu', + num_processes=2, max_percent_speed_diff=0.15, # slower speed due to one CPU doing additional sequential memory saving calls plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel + model_cls=SeedTrainLoaderModel, ) @@ -35,7 +36,7 @@ def test_ddp_sharded_plugin_correctness_one_gpu(): gpus=1, accelerator='ddp_spawn', plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel + model_cls=SeedTrainLoaderModel, ) @@ -50,7 +51,7 @@ def test_ddp_sharded_plugin_correctness_amp_one_gpu(): precision=16, accelerator='ddp_spawn', plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel + model_cls=SeedTrainLoaderModel, ) @@ -65,7 +66,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu(): accelerator='ddp_spawn', plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25 + max_percent_speed_diff=0.25, ) @@ -81,7 +82,7 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): accelerator='ddp_spawn', plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25 + max_percent_speed_diff=0.25, ) @@ -97,7 +98,7 @@ def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): accelerator='ddp_spawn', plugin='ddp_sharded', model_cls=SeedTrainLoaderModel, - max_percent_speed_diff=0.25 + max_percent_speed_diff=0.25, ) @@ -145,7 +146,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): gpus=2, accelerator='ddp_spawn', model_cls=SeedTrainLoaderMultipleOptimizersModel, - max_percent_speed_diff=0.25 # Increase speed diff since only 2 GPUs sharding 2 optimizers + max_percent_speed_diff=0.25, # Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -163,7 +164,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): gpus=2, accelerator='ddp_spawn', model_cls=SeedTrainLoaderManualModel, - max_percent_speed_diff=0.25 # Increase speed diff since only 2 GPUs sharding 2 optimizers + max_percent_speed_diff=0.25, # Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -259,13 +260,15 @@ def record_ddp_fit_model_stats(trainer, model, use_cuda): def plugin_parity_test( - model_cls: SeedTrainLoaderModel, + model_cls: Type[SeedTrainLoaderModel], plugin: Union[str, DDPPlugin], seed: int = 42, accelerator: str = 'ddp_spawn', gpus: int = 0, 
precision: int = 32, - max_percent_speed_diff: float = 0.1): + max_percent_speed_diff: float = 0.1, + **kwargs, +): """ Ensures that the trained model is identical to the standard DDP implementation. Also checks for speed/memory regressions, we should expect always less memory but performance to fluctuate. @@ -293,6 +296,7 @@ def plugin_parity_test( gpus=gpus, precision=precision, accelerator=accelerator, + **kwargs, ) max_memory_ddp, ddp_time = record_ddp_fit_model_stats( From 8fc771148d3ccfaa493ffe8a7ac69c55c49629b6 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 17:05:22 +0100 Subject: [PATCH 19/25] flake8 --- benchmarks/test_sharded_parity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 2616f5a11d7f2..5b0445c5b9b80 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -1,12 +1,12 @@ import os import platform import time -from typing import Union, Type +from typing import Type, Union import pytest import torch -from pytorch_lightning import Trainer, seed_everything, LightningModule +from pytorch_lightning import Trainer, seed_everything from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE From fc26e76004ead251acb0cb83e8da82f5851ec99a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 18:11:55 +0100 Subject: [PATCH 20/25] . --- benchmarks/test_sharded_parity.py | 1 + pytorch_lightning/accelerators/accelerator_connector.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 5b0445c5b9b80..cff3a0c0ecdb9 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -316,6 +316,7 @@ def plugin_parity_test( precision=precision, accelerator=accelerator, plugins=[plugin], + **kwargs, ) max_memory_custom, custom_model_time = record_ddp_fit_model_stats( diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index bda4cd05383f2..7417f889dd808 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -279,9 +279,9 @@ def select_accelerator(self): accelerator_backend = accelerators.CPUAccelerator(self.trainer, cluster_env) else: raise MisconfigurationException( - f'Trainer(accelerator={self.trainer.distributed_backend} is not a supported backend for' - f' num_nodes={self.trainer.num_nodes}, num_gpus={self.trainer.num_gpus}' - f' and num_processes={self.trainer.num_processes}.' + f'`Trainer(accelerator={self.trainer.distributed_backend}, num_nodes={self.trainer.num_nodes},' + f' num_processes={self.trainer.num_processes}, ...)` is not a supported backend for' + f' num_gpus={self.trainer.num_gpus}' ) return accelerator_backend From cbd05810c0d5fa39e1596d01017e0994ea9eaa4f Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 1 Jan 2021 19:18:33 +0100 Subject: [PATCH 21/25] . 
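
Alongside these benchmark cleanups: earlier in this series `plugin_parity_test` gained a `**kwargs` pass-through, which is how the `ddp_cpu` case now supplies its explicit `num_processes=2` to both runs. A trimmed, hypothetical version of that helper (the real one also records memory use and run time):

    from pytorch_lightning import Trainer, seed_everything

    def plugin_parity_test(model_cls, plugin, accelerator='ddp_spawn', gpus=0, precision=32, **kwargs):
        # kwargs such as num_processes=2 reach both Trainer runs unchanged,
        # so the plain-DDP baseline and the plugin run stay configured identically
        seed_everything(42)
        ddp_model = model_cls()
        Trainer(max_epochs=1, gpus=gpus, precision=precision,
                accelerator=accelerator, **kwargs).fit(ddp_model)

        seed_everything(42)
        custom_plugin_model = model_cls()
        Trainer(max_epochs=1, gpus=gpus, precision=precision,
                accelerator=accelerator, plugins=[plugin], **kwargs).fit(custom_plugin_model)
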
--- benchmarks/test_sharded_parity.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index cff3a0c0ecdb9..9688a603451f1 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -14,8 +14,7 @@ from tests.base.boring_model import BoringModel, RandomDataset -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_one_device(): plugin_parity_test( @@ -28,8 +27,7 @@ def test_ddp_sharded_plugin_correctness_one_device(): @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_one_gpu(): plugin_parity_test( @@ -42,8 +40,7 @@ def test_ddp_sharded_plugin_correctness_one_gpu(): @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP") @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_amp_one_gpu(): plugin_parity_test( @@ -57,8 +54,7 @@ def test_ddp_sharded_plugin_correctness_amp_one_gpu(): @pytest.mark.skip(reason="Not a critical test, skip till drone CI performance improves.") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu(): plugin_parity_test( @@ -71,8 +67,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu(): @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): @@ -87,8 +82,7 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") 
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): @@ -134,8 +128,7 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None): @pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): """ @@ -152,8 +145,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): @pytest.mark.skip(reason="Current issue with multiple optimizers and FairScale.") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif(platform.system() == "Windows", - reason="Distributed training is not supported on Windows") +@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): """ From 2f6f608156f3bf705c6c44354cc19cf6d348629f Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 2 Jan 2021 16:39:09 +0100 Subject: [PATCH 22/25] . @SeanNaren --- benchmarks/test_sharded_parity.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 9688a603451f1..1a9aed58b97c4 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -14,18 +14,6 @@ from tests.base.boring_model import BoringModel, RandomDataset -@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") -@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") -def test_ddp_sharded_plugin_correctness_one_device(): - plugin_parity_test( - accelerator='ddp_cpu', - num_processes=2, - max_percent_speed_diff=0.15, # slower speed due to one CPU doing additional sequential memory saving calls - plugin=DDPShardedPlugin(), - model_cls=SeedTrainLoaderModel, - ) - - @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine") @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") From 233515cc5880ff838d082336a2189b9267d659b2 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sun, 3 Jan 2021 01:42:48 +0100 Subject: [PATCH 23/25] chlog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 57105e252dfb0..d3baec790195d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed +- Fixed distributed setting and `ddp_cpu` only with `num_processes>1` ([#5297](https://github.com/PyTorchLightning/pytorch-lightning/pull/5297)) + ## [1.1.0] - 2020-12-09 From 713bc040f70b80dae677be57fca2c282c951499e Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 4 Jan 2021 14:48:57 +0100 Subject: [PATCH 24/25] Apply suggestions from code review --- benchmarks/test_sharded_parity.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 1a9aed58b97c4..0f58cb882bcf9 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -247,7 +247,6 @@ def plugin_parity_test( gpus: int = 0, precision: int = 32, max_percent_speed_diff: float = 0.1, - **kwargs, ): """ Ensures that the trained model is identical to the standard DDP implementation. @@ -276,7 +275,6 @@ def plugin_parity_test( gpus=gpus, precision=precision, accelerator=accelerator, - **kwargs, ) max_memory_ddp, ddp_time = record_ddp_fit_model_stats( @@ -296,7 +294,6 @@ def plugin_parity_test( precision=precision, accelerator=accelerator, plugins=[plugin], - **kwargs, ) max_memory_custom, custom_model_time = record_ddp_fit_model_stats( From ad7fdee3f78523f31eb114b1b8cecbcee5a6bcdb Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 4 Jan 2021 17:06:29 +0100 Subject: [PATCH 25/25] . --- pytorch_lightning/plugins/plugin_connector.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/plugins/plugin_connector.py b/pytorch_lightning/plugins/plugin_connector.py index 596a630b1c959..ccd128d87a26a 100644 --- a/pytorch_lightning/plugins/plugin_connector.py +++ b/pytorch_lightning/plugins/plugin_connector.py @@ -31,8 +31,6 @@ def __init__(self, trainer): self.plugins = [] self.ddp_plugin = DDPPlugin() self.cloud_environment = None - # self.amp_plugin = NativeAMPPlugin(trainer) - # self.apex_plugin = ApexPlugin(trainer) def on_trainer_init(self, plugins: Optional[Union[str, list]]): self.plugins = plugins
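
For orientation, the net effect of the connector changes in this series is that the selected backend is tracked by the `DistributedType` enum on the trainer (the private `_distrib_type` attribute), set either by the explicit branches in `set_distributed_mode` or by the string fallback `DistributedType(self.trainer.distributed_backend)` shown above. Below is a minimal sketch of that mapping, assuming the attribute and enum names from the diffs; the `ddp_cpu` expectation follows the branch in this series and the changelog entry added in PATCH 23/25 (#5297), and is an assumption about this code state rather than a statement about any released API.

from pytorch_lightning import Trainer
from pytorch_lightning.utilities import DistributedType

# DistributedType is a value-based enum, so it can be constructed straight from
# the backend string, which is what the fallback
# `DistributedType(self.trainer.distributed_backend)` in this series relies on.
assert DistributedType("ddp") == DistributedType.DDP

# Per the ddp_cpu branch in this series, requesting ddp_cpu with num_processes > 1
# should be treated as DDP on CPU. `_distrib_type` is private API and the exact
# value here is inferred from the diffs above, not guaranteed by released versions.
trainer = Trainer(accelerator="ddp_cpu", num_processes=2)
assert trainer._distrib_type == DistributedType.DDP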